In [1]:
import re
import string
import pandas as pd
import numpy as np
from datetime import datetime
%matplotlib inline

# Load Data

In [2]:
# Folder holding the chunked CSV exports read by the loading cells.
dir_data = '../data/nba_reg18/'

# Number of chunk files per kind (the loaders number them 1..N).
nf_subs, nf_coms = 7, 12  # submissions, comments

# Start from empty frames for submissions and comments.
ds_sub, ds_com = pd.DataFrame(), pd.DataFrame()

In [3]:
# Read every submissions chunk, stack them in one pass, then drop rows that are
# exact duplicates across chunks.
# DataFrame.append was removed in pandas 2.0, and growing the frame inside the loop
# re-copied and re-deduplicated every previously loaded row on each iteration.
sub_chunks = [
    pd.read_csv(dir_data + 'nba_submissions_reg18_' + str(ifile) + '.csv',
                index_col=0, parse_dates=['created'])
    for ifile in range(1, nf_subs + 1)
]
ds_sub = pd.concat(sub_chunks).drop_duplicates()

In [4]:
# Read every comments chunk, stack them in one pass, then drop rows that are
# exact duplicates across chunks.
# DataFrame.append was removed in pandas 2.0, and growing the frame inside the loop
# re-copied and re-deduplicated every previously loaded row on each iteration.
com_chunks = [
    pd.read_csv(dir_data + 'nba_comments_reg18_' + str(ifile) + '.csv',
                index_col=0, parse_dates=['created'])
    for ifile in range(1, nf_coms + 1)
]
ds_com = pd.concat(com_chunks).drop_duplicates()

In [5]:
# (rows, columns) of the submissions frame, then of the comments frame
for loaded in (ds_sub, ds_com):
    print(loaded.shape)

(92383, 6)
(2064767, 6)


### Standardize Team Names

In [6]:
# Team lookup table. Column names are supplied explicitly, so the first line of
# the 'teams' file is read as data: one (name, abbreviation) pair per row.
team_names = pd.read_csv('teams', names=['name', 'abbrs'])
team_fulls = team_names['name'].to_list()
team_abbrs = team_names['abbrs'].to_list()
# name -> abbreviation
team_dict = team_names.set_index('name')['abbrs'].to_dict()

In [7]:
def flair2team(flair, team_dict, team_fulls, team_abbrs):
    """Collapse a flair (team name, or player name with team abbr) to a team abbr.

    Args:
        flair: Flair text to normalise. Non-string values (e.g. the float NaN
            pandas uses for a missing flair, or None) are returned unchanged.
        team_dict: Mapping from team name to team abbreviation.
        team_fulls: Team names to search for, tried in order.
        team_abbrs: Team abbreviations to search for, tried in order and only
            after no team name matched.

    Returns:
        The abbreviation of the first team name found inside ``flair``; else the
        first abbreviation found inside ``flair``; else ``flair`` itself, so the
        caller can bucket it as "other" later.
    """
    # `name in flair` raises TypeError on NaN/None, so pass non-strings through
    # instead of forcing every caller to pre-fill a sentinel value.
    if not isinstance(flair, str):
        return flair

    # team names
    for name in team_fulls:
        if name in flair:
            return team_dict[name]

    # player names with team abbrs
    for abbr in team_abbrs:
        if abbr in flair:
            return abbr

    # otherwise return original (bind into "others" later)
    return flair

In [8]:
# pandas stores a missing flair as float NaN; swap in the 'NONE' placeholder string
ds_com['flair'] = ds_com['flair'].fillna('NONE')

In [9]:
# collapse each comment flair to a team abbreviation where one can be found
ds_com['flair'] = ds_com['flair'].map(
    lambda flair: flair2team(flair, team_dict, team_fulls, team_abbrs)
)

In [10]:
# keep only the comments whose flair resolved to a known team abbreviation,
# reporting the frame shape before and after
print('No. Total:', ds_com.shape, sep='\n')
com_has_team = ds_com['flair'].isin(team_abbrs)
ds_com = ds_com.loc[com_has_team, :]
print('No. for 31 Teams:', ds_com.shape, sep='\n')

No. Total:
(2064767, 6)
No. for 31 Teams:
(1562342, 6)


In [11]:
# now do the same for submissions:
# replace the float NaN pandas uses for a missing flair with 'NONE', then
# collapse each flair to a team abbreviation where one can be found
ds_sub['flair'] = (
    ds_sub['flair']
    .fillna('NONE')
    .map(lambda flair: flair2team(flair, team_dict, team_fulls, team_abbrs))
)

In [12]:
# keep only the submissions whose flair resolved to a known team abbreviation,
# reporting the frame shape before and after
print('No. Total:', ds_sub.shape, sep='\n')
sub_has_team = ds_sub['flair'].isin(team_abbrs)
ds_sub = ds_sub.loc[sub_has_team, :]
print('No. for 31 Teams:', ds_sub.shape, sep='\n')

No. Total:
(92383, 6)
No. for 31 Teams:
(55475, 6)


In [13]:
# look at comment/fan distribution
#ds_comment_by_fan = ds_com.groupby(['author'])['author'].count()

In [14]:
#print(ds_comment_by_fan.shape)

In [15]:
#ds_comment_by_fan.loc[ds_comment_by_fan < 50].hist(bins = 100, figsize=(20,10))

In [16]:
#print(ds_com.shape)

In [17]:
#users = ds_com[['author', 'flair']].drop_duplicates()

In [18]:
#users.groupby('flair')['author'].count().sort_values(ascending = False).plot.bar(figsize = (20,10))

In [19]:
#ds_sub_by_fan = ds_sub.groupby('author')['author'].count()

In [20]:
#ds_sub_by_fan.loc[ds_sub_by_fan<20].hist(bins = 100, figsize = (20,10))

In [21]:
# put submissions and comments together under one shared schema;
# a submission's title plays the role of a comment's text
sub_part = ds_sub.loc[:, ['author', 'flair', 'title', 'created']]
sub_part.columns = ['author', 'team', 'text', 'created']
com_part = ds_com.loc[:, ['author', 'flair', 'text', 'created']]
com_part.columns = ['author', 'team', 'text', 'created']
ds = pd.concat([sub_part, com_part], ignore_index=True)

In [22]:
print(ds.shape)

(1617817, 4)


# Text Preprocessing

In [23]:
# remove links (URL-like tokens); emojis and other non-ASCII characters are stripped later in replace_punctuation
ds['text'] = ds['text'].map(lambda x: re.sub(r'((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?', ' ', str(x)))

In [24]:
# Translation table sending every ASCII punctuation mark, plus newline, tab and
# carriage return, to a single space.
_TO_SPACE = str.maketrans({ch: ' ' for ch in string.punctuation + '\n\t\r'})


def replace_punctuation(text):
    """Return ``text`` reduced to plain ASCII words separated by spaces.

    Each punctuation mark and each \\n, \\t, \\r becomes one space (runs are not
    collapsed); any non-ASCII character, such as an emoji, is dropped outright.
    """
    spaced = text.translate(_TO_SPACE)
    # the encode/decode round trip discards everything outside ASCII
    return spaced.encode('ascii', 'ignore').decode('ascii')

In [25]:
ds['text'] = ds['text'].apply(replace_punctuation)

# Get Game Time Data (who posted during a team's games)

In [26]:
# load game data: skip the file's own first line and use our column names,
# then discard the 'box' and 'notes' columns
schedule_cols = ['date', 'time', 'visitor', 'vpts', 'home', 'hpts',
                 'box', 'ot', 'attend', 'notes']
gametime = pd.read_csv('../data/game_schedule_reg18.csv', names=schedule_cols, skiprows=1)
gametime = gametime.drop(['box', 'notes'], axis=1)

In [27]:
# strptime layout of "<date> <time>m", e.g. 'Tue Oct 17 2017 8:01pm'
form = '%a %b %d %Y %I:%M%p'


def dataplustime(ds):
    """Build a datetime from the 'date' and 'time' string fields of one row.

    An 'm' is appended to the time before parsing; presumably the schedule
    writes times with a bare trailing 'a'/'p', which this turns into the
    am/pm marker that %p expects.
    """
    stamp = ' '.join((ds['date'], ds['time'] + 'm'))
    return datetime.strptime(stamp, form)

In [28]:
# parse each row's date + time into a single 'datetime' column, then drop the
# two string columns it replaces
gametime['datetime'] = gametime.loc[:, ['date', 'time']].apply(dataplustime, axis='columns')
gametime = gametime.drop(columns=['date', 'time'])

In [29]:
# add overtimes to game time (10 min extension per overtime)
# first replace the float NaN of a missing 'ot' entry with the 'NAN' placeholder
gametime['ot'] = gametime['ot'].fillna('NAN')

In [30]:
# count how many OTs
def count_ots(ot):
    """Return the number of overtime periods encoded by an 'ot' cell.

    Args:
        ot: 'NAN' or a missing value (None / NaN) for no overtime, 'OT' for one
            overtime, or '<k>OT' for k overtimes.

    Returns:
        The overtime count as an int.

    Raises:
        ValueError: if the text does not start with a number.
    """
    # Missing cells count as no overtime even when the caller has not swapped
    # them for the 'NAN' placeholder first.
    if pd.isna(ot) or ot == 'NAN':
        return 0
    if ot == 'OT':
        return 1
    # Use the whole numeric prefix rather than only the first character, so a
    # two-digit count is not truncated; other text falls back to the first char.
    multiplier = ot[:-2] if ot.endswith('OT') else ot[0]
    return int(multiplier)

In [31]:
gametime['ot'] = gametime['ot'].apply(count_ots)

In [32]:
# standardize team names on both sides of each game
for side in ('visitor', 'home'):
    gametime[side] = gametime[side].apply(flair2team, args=[team_dict, team_fulls, team_abbrs])

In [33]:
# concat home/visitor games together: one row per (team, game), visitors first
tgame = pd.concat([
    gametime[[side, 'datetime', 'ot']].rename(columns={side: 'team'})
    for side in ('visitor', 'home')
])

In [34]:
# calculate end_time for games: start + fixed game length + time per overtime
duration = 2.5 #2.5 hrs per game
ot_duration = 10 #10 min per OT
regulation = pd.Timedelta(duration, unit='h')
overtime = pd.to_timedelta(tgame['ot'] * ot_duration, unit='m')
tgame['end_time'] = tgame['datetime'] + regulation + overtime
tgame = tgame.rename(columns={'datetime': 'start_time'}).drop('ot', axis=1)

In [35]:
# one (start_time, end_time) tuple per row
tgame['time_span'] = list(zip(tgame['start_time'], tgame['end_time']))
# per team, collect its games' time spans into a list (Series indexed by team)
ttgame = tgame.groupby('team')['time_span'].apply(list)

In [36]:
# For every team in ttgame add a 0/1 column to ds: 1 when the entry was created
# strictly inside one of that team's (start, end) game windows, 0 otherwise.
# NOTE(review): this assumes ds['created'] and the schedule times are expressed
# in the same time zone — confirm against both data sources.
posted_at = ds['created']
for team, spans in ttgame.items():
    during_game = np.zeros(len(ds), dtype=bool)
    for game_start, game_end in spans:
        during_game |= ((posted_at > game_start) & (posted_at < game_end)).to_numpy()
    ds[team] = during_game.astype('int64')

# Prep for Training

In [42]:
# one row per (author, team): all of that author's texts joined with single spaces
ds_user = ds.groupby(['author', 'team'])['text'].agg(' '.join).reset_index()

In [53]:
# Unique team abbreviations without 'SEA', in a fixed (sorted) order.
# Built in one expression so the cell can be re-run safely: list.remove raised
# ValueError once 'SEA' was gone and only dropped its first occurrence, and
# list(set(...)) ordering changes from one kernel session to the next.
# NOTE(review): 'SEA' is presumably excluded because it has no game-time column
# in ds — confirm against the schedule.
team_abbrs = sorted(set(team_abbrs) - {'SEA'})

In [56]:
# count game posts: per author, how many entries carry each team's game-time flag
game_posts = ds.groupby('author')[team_abbrs].sum()

In [77]:
# normalize the count to each user's total count
# 'total' is summed over the team columns only, so re-running this cell does not
# fold a previously computed 'total' column back into the sum.
game_posts['total'] = game_posts[team_abbrs].sum(axis=1)
game_posts[team_abbrs] = game_posts[team_abbrs].div(game_posts['total'], axis=0)
# authors with no game-time posts divide 0 by 0 -> NaN; report them as 0
game_posts = game_posts.fillna(0)

In [85]:
# attach to user dataset
# ds_user is re-indexed by author and joined with game_posts on that index, so
# each (author, team) row gains the per-team game-time shares and 'total'.
# Re-running this cell fails: 'author' is no longer a column after the first run.
ds_user = ds_user.set_index('author').join(game_posts)

In [87]:
# feature extraction using tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

In [88]:
# TF-IDF over single words only, English stop words removed, terms appearing in
# more than 60% of documents ignored, vocabulary capped at 1000 features.
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 1), stop_words='english', max_df=.6)

In [89]:
from sklearn.model_selection import train_test_split

In [96]:
# 80/20 split with a fixed seed; 'team' is the label, 'total' is left out of the features
features = ds_user.drop(columns=['team', 'total'])
labels = ds_user['team']
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20, random_state=0)

In [101]:
# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are merely transformed with them.
train_txt = tfidf.fit_transform(X_train['text'])
test_txt = tfidf.transform(X_test['text'])

In [110]:
# (documents, features); read from the sparse matrix directly rather than
# materialising a dense copy via .A just to look at its shape
train_txt.shape

(54945, 1000)

In [117]:
# dense TF-IDF columns followed by the remaining (non-text) feature columns
train_extra = X_train.drop(columns='text').to_numpy()
train_all = np.concatenate((train_txt.toarray(), train_extra), axis=1)

In [118]:
# dense TF-IDF columns followed by the remaining (non-text) feature columns
test_extra = X_test.drop(columns='text').to_numpy()
test_all = np.concatenate((test_txt.toarray(), test_extra), axis=1)

In [119]:
from sklearn.naive_bayes import MultinomialNB

In [121]:
# Naive Bayes on the combined matrix: TF-IDF text columns plus the non-text columns.
tfidf_nb = MultinomialNB()
tfidf_nb.fit(train_all, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [122]:
tfidf_nb.score(train_all, y_train)

0.2880698880698881

In [123]:
tfidf_nb.score(test_all, y_test)

0.278663463638349

In [124]:
# Naive Bayes on the TF-IDF text features alone.
# This rebinds tfidf_nb, so the model fitted on train_all is no longer reachable
# under that name from here on.
tfidf_nb = MultinomialNB()
tfidf_nb.fit(train_txt, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [125]:
tfidf_nb.score(train_txt, y_train)

0.28382928382928385

In [126]:
tfidf_nb.score(test_txt, y_test)

0.27407730945621317

In [None]:
# Chance baseline: pair every label with a label drawn from a seeded shuffle of
# the same column (reset_index makes the shuffled values line up by position).
# NOTE(review): this is built from the per-post frame ds, whereas the classifiers
# are scored on the per-user frame ds_user — confirm that is the intended baseline.
ds_base = ds['team'].to_frame()
ds_base['pred'] = ds_base['team'].sample(frac=1, random_state=0).reset_index(drop=True)

In [None]:
(ds_base['team'] == ds_base['pred']).mean()

In [None]:
1/31

In [None]:
# Class probabilities for the test users.
# cvec_nb and test_raw are not defined anywhere in this notebook, so the cell
# failed on a fresh kernel; use the most recently fitted model (the text-only
# tfidf_nb) together with its matching test matrix instead.
y_test_prob = tfidf_nb.predict_proba(test_txt)

In [None]:
y_test_prob.shape

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the true labels in the classifier's own class order, so column i
# here always matches column i of predict_proba. Fitting the encoder on y_test
# alone shifts the columns whenever a team is missing from the test split.
# Labels the model never saw in training encode as an all-zero row.
enc = OneHotEncoder(categories=[tfidf_nb.classes_], handle_unknown='ignore')
y_test_ohe = enc.fit_transform(y_test.values.reshape(-1, 1))

In [None]:
enc.categories_

In [None]:
# class order behind the predict_proba columns (cvec_nb is not defined in this
# notebook; tfidf_nb is the fitted model)
tfidf_nb.classes_

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# The ROC loop indexes the columns of y_test_prob, so take the class count from
# that matrix rather than from the labels present in the full post-level data,
# which need not match what the model was trained on.
n_classes = y_test_prob.shape[1]

In [None]:
y_test_prob.shape

In [None]:
y_test_ohe.shape

In [None]:
y_test_ohe = y_test_ohe.toarray()

In [None]:
# Compute ROC curve and ROC area for each class (one-vs-rest), keyed by column index
fpr, tpr, roc_auc = {}, {}, {}
for cls_idx in range(n_classes):
    curve = roc_curve(y_test_ohe[:, cls_idx], y_test_prob[:, cls_idx])
    fpr[cls_idx], tpr[cls_idx] = curve[0], curve[1]
    roc_auc[cls_idx] = auc(fpr[cls_idx], tpr[cls_idx])

In [None]:
# Micro-average: flatten the one-hot labels and the predicted probabilities into
# one long binary problem and compute a single ROC curve / AUC over it.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_ohe.ravel(), y_test_prob.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [None]:
roc_auc