In [1]:
import re
import string
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

# Load Data

In [2]:
# Directory holding the nba_*_reg18_*.csv shards
dir_data = '../data/nba_reg18/'

# Number of CSV shards per table
nf_subs = 7   # submissions
nf_coms = 12  # comments

# Empty accumulators; the loading cells fill them
ds_sub, ds_com = pd.DataFrame(), pd.DataFrame()

In [3]:
# Read every submissions shard, stack them once, then de-duplicate once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# append + drop_duplicates inside the loop re-copied the growing frame on every
# iteration. One concat followed by one drop_duplicates keeps the same rows
# (first occurrence wins, original index labels preserved).
sub_frames = [
    pd.read_csv(dir_data + 'nba_submissions_reg18_' + str(isub) + '.csv',
                index_col = 0, parse_dates = ['created'])
    for isub in range(1, nf_subs+1)
]
ds_sub = pd.concat(sub_frames).drop_duplicates()

In [4]:
# Read every comments shard, stack them once, then de-duplicate once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# append + drop_duplicates inside the loop re-copied the growing frame on every
# iteration. One concat followed by one drop_duplicates keeps the same rows
# (first occurrence wins, original index labels preserved).
com_frames = [
    pd.read_csv(dir_data + 'nba_comments_reg18_' + str(icom) + '.csv',
                index_col = 0, parse_dates = ['created'])
    for icom in range(1, nf_coms+1)
]
ds_com = pd.concat(com_frames).drop_duplicates()

In [5]:
print(ds_sub.shape)
print(ds_com.shape)

(92383, 6)
(2064767, 6)


In [6]:
# Stack submissions and comments into one frame with a shared schema:
# a submission's title takes the place of a comment's text, and the flair
# column is renamed to "team" in both parts.
sub_part = ds_sub[['author', 'flair', 'title', 'created']].rename(columns = {'flair':'team', 'title':'text'})
com_part = ds_com[['author', 'flair', 'text', 'created']].rename(columns = {'flair':'team'})
ds = pd.concat([sub_part, com_part], ignore_index = True)

In [7]:
# drop empty texts
# NOTE(review): dropna() without `subset` removes rows with a missing value in ANY
# of the four columns (author, team, text, created), not only rows with empty text.
# Rows without a flair are therefore already gone before the later 'NONE' fill runs.
# If only empty texts should be dropped, subset=['text'] would do that -- TODO confirm intent.
ds.dropna(inplace = True)

In [8]:
print(ds.shape)

(1731127, 4)


### Standardize Team Names

In [9]:
# Team lookup table: a two-column file without a header row (full name, abbreviation)
team_names = pd.read_csv('teams', names = ['name', 'abbrs'])
team_fulls = team_names['name'].to_list()
team_abbrs = team_names['abbrs'].to_list()
# full name -> abbreviation
team_dict = dict(zip(team_fulls, team_abbrs))

In [10]:
def team2abbr(flair, team_dict, team_fulls, team_abbrs):
    """Reduce a raw flair to a team abbreviation.

    Flairs are either team names or player names that carry a team abbreviation.
    Matching is by case-sensitive substring: full team names are tried first (in
    the order of ``team_fulls``), then abbreviations (in the order of ``team_abbrs``).

    flair: the raw flair value.
    team_dict: mapping full team name -> abbreviation.
    team_fulls: iterable of full team names.
    team_abbrs: iterable of team abbreviations.

    Returns the abbreviation of the first match. If nothing matches, or if
    ``flair`` is not a string (e.g. float NaN / None for a missing flair), the
    input is returned unchanged so the caller can bin it into "others" later.
    """
    # A missing flair is a float NaN in pandas; `name in flair` would raise TypeError.
    if not isinstance(flair, str):
        return flair

    # team names
    for name in team_fulls:
        if name in flair:
            return team_dict[name]

    # player names with team abbrs
    for abbr in team_abbrs:
        if abbr in flair:
            return abbr

    # otherwise return original (bind into "others" later)
    return flair

In [11]:
# replace missing flairs (pandas float NaN) with a placeholder string
ds['team'] = ds['team'].fillna('NONE')

In [12]:
# collapse every flair to its team abbreviation (unmatched flairs stay as they are)
ds['team'] = ds['team'].apply(lambda flair: team2abbr(flair, team_dict, team_fulls, team_abbrs))

In [13]:
print('No. Total:')
print(ds.shape)
# Keep only rows whose flair resolved to a known team abbreviation.
# .copy() makes the result an independent frame, so the later
# ds['text'] = ... assignment does not trigger SettingWithCopyWarning.
ds = ds.loc[ds['team'].isin(team_abbrs), :].copy()
print('No. for 31 Teams:')
print(ds.shape)

No. Total:
(1731127, 4)
No. for 31 Teams:
(1617815, 4)


In [14]:
#ds.to_csv('ds_orig.csv')

# Text Preprocessing

In [15]:
def preprocess_one(text, b_punct):
    """Clean one piece of text (submission title or comment body).

    Steps, in order:
      1. replace URL-like tokens with a single space,
      2. drop every non-ASCII character (emojis such as the basketball included),
      3. turn newline / tab / carriage return into spaces,
      4. if ``b_punct`` is true, turn every ASCII punctuation character into a space.

    text: any value; it is converted with ``str()`` first.
    b_punct: whether step 4 is applied.
    Returns the cleaned string.
    """
    # The scheme is optional, so any dotted token ("nba.com", but also "3.5") matches.
    # NOTE(review): "&amp;" inside the character classes looks like an HTML-escaped "&";
    # its only effect beyond "&" is that ";" is accepted as a URL character.
    no_links = re.sub(r'((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?', ' ', str(text))

    ascii_only = no_links.encode('ascii', 'ignore').decode('ascii')

    # One translation table for everything that becomes a space.
    to_blank = "\n\t\r" + (string.punctuation if b_punct else "")
    return ascii_only.translate(str.maketrans(to_blank, ' ' * len(to_blank)))

In [16]:
#prep_text = ds['text'].apply(lambda x: preprocess_one(x, False))

In [17]:
#corpus = ' '.join(prep_text['text'].to_list())
#f = open('txt_corpus.txt', 'w')
#f.write(corpus)
#f.close()

In [18]:
# clean every text, punctuation removal included
ds['text'] = ds['text'].apply(preprocess_one, args = (True,))

# Use spaCy for named entity recognition

In [24]:
# see the other script: ../scripts/ner1.py

In [25]:
# NER output is stored in three CSV parts; the first column of each is the row index
ner_files = ('entity1.csv', 'entity2.csv', 'entity3.csv')
ner1, ner2, ner3 = (pd.read_csv(fname, index_col = 0) for fname in ner_files)

In [26]:
ner = pd.concat([ner1,ner2,ner3])

In [27]:
# Attach the NER columns to the posts. DataFrame.join aligns on the row index.
# NOTE(review): presumably ner's index carries the same row labels as ds -- confirm
# against ../scripts/ner1.py.
# Not idempotent: a second run finds the joined columns already present in ds.
ds = ds.join(ner)

In [28]:
# Keep only posts with a non-null entity value.
# (The name ds_sub is reused here; it no longer refers to the submissions table.)
has_entity = ds['entity'].notna()
ds_sub = ds[has_entity]

In [29]:
print(ds.shape[0])
print(ds_sub.shape[0])

1617815
700020


In [30]:
# one row per (author, team): all of that user's texts joined with single spaces
ds_user = ds_sub.groupby(['author', 'team'])['text'].agg(' '.join).reset_index()

In [31]:
# one row per (author, team): all of that user's entities joined with single spaces
ds_user_ents = ds_sub.groupby(['author', 'team'])['entity'].agg(' '.join).reset_index()

In [32]:
# Add the per-user entity string as a column. The join is on the row index, so it
# relies on ds_user and ds_user_ents having been built by the same groupby keys and
# reset_index(), which gives both frames the same row order.
# Not idempotent: a second run finds an 'entity' column already present in ds_user.
ds_user = ds_user.join(ds_user_ents['entity'])

# Visit History Data

In [33]:
# Read the five bot*_final.csv parts and stack them in one pass.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; a single
# concat also avoids re-copying the growing frame on every iteration.
ds_history = pd.concat(
    [pd.read_csv('bot'+str(i)+'_final.csv', index_col = 0) for i in range(1,6)]
)

In [34]:
# attach each user's history row; users are matched on author == user
ds_all = pd.merge(ds_user, ds_history, how = 'left', left_on = 'author', right_on = 'user')

In [35]:
# Drop every row with a missing value in any column. After the left merge this
# includes users without a match in ds_history (their right-hand columns are NaN).
ds_all.dropna(inplace = True)

In [36]:
print(ds_all.shape)

(50691, 7)


In [None]:
ds_all.groupby('team')['author'].count().sort_values(ascending=False).plot.bar(figsize=(16,8))

# Training and Testing set

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
# 80/20 split with a fixed seed; features are the three per-user columns, target is the team.
# NOTE(review): no stratify= is passed, so class proportions may differ between the two sets.
X_train, X_test, y_train, y_test = train_test_split(ds_all[['text', 'entity', 'history']], 
                                                    ds_all['team'], test_size=0.20, random_state=0)

# Model with TF-IDF and MultinomialNB (Text only)

In [39]:
# feature extraction using tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 1), stop_words='english', max_df=.6)

In [41]:
# Fit the vectorizer on the training texts only, then reuse it for the test texts.
# NOTE: tfidf, train_txt and test_txt are reassigned by the later model sections
# (entity, TextRank, history), so they only hold the text features until then.
train_txt = tfidf.fit_transform(X_train['text'])
test_txt = tfidf.transform(X_test['text'])

## With SMOTE

In [42]:
from imblearn.over_sampling import SMOTE
X_train_resampled, y_train_resampled = SMOTE(random_state=0).fit_resample(train_txt, y_train)

In [43]:
from sklearn.naive_bayes import MultinomialNB

In [44]:
tfidf_smote_nb = MultinomialNB()
tfidf_smote_nb.fit(X_train_resampled, y_train_resampled)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [45]:
# accuracy on balanced training set
tfidf_smote_nb.score(X_train_resampled, y_train_resampled)

0.4891134609378486

In [46]:
# accuracy on training set
tfidf_smote_nb.score(train_txt, y_train)

0.41509666600907474

In [47]:
# accuracy on testing set
tfidf_smote_nb.score(test_txt, y_test)

0.3049610415228326

In [48]:
pred_txt_logp = tfidf_smote_nb.predict_log_proba(test_txt)

# Model with TF-IDF and MultinomialNB (Entity only)

In [None]:
# feature extraction using tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 1), stop_words='english', max_df=.6)

In [None]:
train_txt = tfidf.fit_transform(X_train['entity'])
test_txt = tfidf.transform(X_test['entity'])

## With SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
X_train_resampled, y_train_resampled = SMOTE(random_state=0).fit_resample(train_txt, y_train)

In [None]:
ent_tfidf_smote_nb = MultinomialNB()
ent_tfidf_smote_nb.fit(X_train_resampled, y_train_resampled)

In [None]:
# accuracy on balanced (SMOTE-resampled) training set
ent_tfidf_smote_nb.score(X_train_resampled, y_train_resampled)

In [None]:
# accuracy on training set
ent_tfidf_smote_nb.score(train_txt, y_train)

In [None]:
# accuracy on testing set
ent_tfidf_smote_nb.score(test_txt, y_test)

In [None]:
pred_ent_logp = ent_tfidf_smote_nb.predict_log_proba(test_txt)

# Entity + TextRank + CountVectorizer + MultinomialNB

In [None]:
# Read the keyword file in one go; the context manager closes the handle
# even if read() raises.
with open('kwords2.txt','r') as f:
    kwords = f.read()

In [None]:
vocabulary = kwords.split('@')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer(vocabulary=vocabulary[:3000])

In [None]:
train_txt = vectorizer.fit_transform(X_train['entity'])
test_txt = vectorizer.transform(X_test['entity'])

In [None]:
from imblearn.over_sampling import SMOTE
X_train_resampled, y_train_resampled = SMOTE(random_state=0).fit_resample(train_txt, y_train)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
ent_txtrk_smote_nb = MultinomialNB()
ent_txtrk_smote_nb.fit(X_train_resampled, y_train_resampled)

In [None]:
# accuracy on balanced (SMOTE-resampled) training set
ent_txtrk_smote_nb.score(X_train_resampled, y_train_resampled)

In [None]:
# accuracy on training set
ent_txtrk_smote_nb.score(train_txt, y_train)

In [None]:
# accuracy on testing set
ent_txtrk_smote_nb.score(test_txt, y_test)

In [None]:
pred_txtrk_logp = ent_txtrk_smote_nb.predict_log_proba(test_txt)

# Model with TF-IDF and MultinomialNB (Visit History only)

In [None]:
# feature extraction using tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 1), stop_words='english', max_df=.6)

In [None]:
train_txt = tfidf.fit_transform(X_train['history'])
test_txt = tfidf.transform(X_test['history'])

In [None]:
from imblearn.over_sampling import SMOTE
X_train_resampled, y_train_resampled = SMOTE(random_state=0).fit_resample(train_txt, y_train)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
hist_smote_nb = MultinomialNB()
hist_smote_nb.fit(X_train_resampled, y_train_resampled)

In [None]:
# accuracy on balanced (SMOTE-resampled) training set
hist_smote_nb.score(X_train_resampled, y_train_resampled)

In [None]:
# accuracy on training set
hist_smote_nb.score(train_txt, y_train)

In [None]:
# accuracy on testing set
hist_smote_nb.score(test_txt, y_test)

In [None]:
pred_hist_logp = hist_smote_nb.predict_log_proba(test_txt)

## Ensemble: Text predict_prob + Entity predict_prob + Visit History

In [None]:
print('Correlation between two predict_logp:')
print(np.corrcoef(pred_txt_logp.reshape(1,-1), pred_ent_logp.reshape(1,-1)))

In [None]:
print('Correlation between two predict_logp:')
print(np.corrcoef(pred_txt_logp.reshape(1,-1), pred_txtrk_logp.reshape(1,-1)))

In [None]:
print('Correlation between two predict_logp:')
print(np.corrcoef(pred_txt_logp.reshape(1,-1), pred_hist_logp.reshape(1,-1)))

In [None]:
from matplotlib import cm

In [None]:
# prediction pattern
plt.imshow(pred_txt_logp[:31,:], norm = cm.colors.Normalize(vmax=pred_txt_logp.max(), vmin=pred_txt_logp.min()))

In [None]:
plt.imshow(pred_ent_logp[:31,:], norm = cm.colors.Normalize(vmax=pred_ent_logp.max(), vmin=pred_ent_logp.min()))

In [None]:
# Colour scale from this matrix's own range, like the text and entity plots
# (the scale was previously taken from pred_ent_logp, a copy-paste slip).
plt.imshow(pred_txtrk_logp[:31,:], norm = cm.colors.Normalize(vmax=pred_txtrk_logp.max(), vmin=pred_txtrk_logp.min()))

In [None]:
# Colour scale from this matrix's own range, like the text and entity plots
# (the scale was previously taken from pred_ent_logp, a copy-paste slip).
plt.imshow(pred_hist_logp[:31,:], norm = cm.colors.Normalize(vmax=pred_hist_logp.max(), vmin=pred_hist_logp.min()))

In [None]:
def custom_normalize(np_data):
    """Min-max scale an array to [0, 1] using its global minimum and maximum.

    np_data: numpy array of any shape.
    Returns an array of the same shape. A constant input (max == min) yields
    all zeros instead of a division by zero (NaN plus RuntimeWarning).
    """
    mx = np_data.max()
    mn = np_data.min()
    span = mx - mn
    # Guard only the scalar case; a non-scalar range (e.g. from a container whose
    # max()/min() reduce per column) stays on the original code path.
    if np.ndim(span) == 0 and span == 0:
        return np.zeros_like(np_data, dtype=float)
    return (np_data-mn)/span

In [None]:
pred_comb = custom_normalize(pred_ent_logp) + custom_normalize(pred_txt_logp) + custom_normalize(pred_txtrk_logp) + custom_normalize(pred_hist_logp)

In [None]:
y_comb = y_test.to_frame().copy()

In [None]:
# Per test row, take the column with the highest summed score and map its index
# back to a class label.
# NOTE(review): the entity model's classes_ is used for the sum of all four models;
# this assumes every model was fit on the same label set so the column order
# matches -- confirm.
y_comb['pred'] = ent_tfidf_smote_nb.classes_[pred_comb.argmax(axis = 1)]

In [None]:
y_comb['cor'] = y_comb['pred'] == y_comb['team']

In [None]:
y_comb['cor'].mean()

In [None]:
# to do: fit the best linear combination of probs using training data

## Multi-class AUC

In [50]:
# prediction probability of the text model on the test set
# On a top-to-bottom run, test_txt has been reassigned by the entity, TextRank and
# history sections by now, so tfidf_smote_nb.predict_proba(test_txt) would score
# the text model on features from a different vocabulary. pred_txt_logp was
# computed while test_txt still held the text features; exponentiating the
# log-probabilities gives the probabilities.
y_test_prob = np.exp(pred_txt_logp)

In [None]:
# Probabilities of the SMOTE-trained text model on the test set.
# Taken from pred_txt_logp because test_txt no longer holds the text features
# once the later model sections have run.
# NOTE(review): y_test_prob comes from this same model, so the two ROC curves
# compared further down are identical; a model trained without SMOTE is not
# defined in this notebook.
y_test_smote_prob = np.exp(pred_txt_logp)

In [51]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_curve, auc

In [52]:
# One-hot encode the true labels with the columns pinned to the classifier's class
# order. Learning the categories from y_test alone would produce fewer or shifted
# columns whenever a class is absent from the test split, and the per-class ROC
# loop pairs column i of this matrix with column i of the probabilities.
enc = OneHotEncoder(categories = [tfidf_smote_nb.classes_])
y_test_ohe = enc.fit_transform(y_test.values.reshape(-1, 1)).toarray()

In [53]:
# Compute ROC curve and ROC area for each class (one-vs-rest)
# The number of classes is the width of the probability matrix, not a hard-coded 31.
n_classes = y_test_prob.shape[1]
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    # column i of the one-hot labels against column i of the predicted probabilities
    fpr[i], tpr[i], _ = roc_curve(y_test_ohe[:, i], y_test_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

In [70]:
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_ohe.ravel(), y_test_prob.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [None]:
print(roc_auc['micro'])

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
plt.figure()
lw = 2
plt.plot(fpr[2], tpr[2], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Per-class ROC curve and area for the second probability matrix (one-vs-rest).
# The number of classes is the width of the probability matrix, not a hard-coded 31.
n_classes = y_test_smote_prob.shape[1]
fpr_smote = dict()
tpr_smote = dict()
roc_auc_smote = dict()
for i in range(n_classes):
    # column i of the one-hot labels against column i of the predicted probabilities
    fpr_smote[i], tpr_smote[i], _ = roc_curve(y_test_ohe[:, i], y_test_smote_prob[:, i])
    roc_auc_smote[i] = auc(fpr_smote[i], tpr_smote[i])

In [None]:
# Overlay the ROC curve of class index 2 from both probability matrices.
# NOTE(review): y_test_prob and y_test_smote_prob are both produced by
# tfidf_smote_nb, so the two curves coincide; both legend entries also use the
# same 'ROC curve (area = ...)' template and differ only by colour.
plt.figure()
lw = 2
plt.plot(fpr[2], tpr[2], color='darkred',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot(fpr_smote[2], tpr_smote[2], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc_smote[2])
# diagonal = chance level
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

## Model for the webapp, trained on all data

In [None]:
test_txt

In [None]:
# Webapp model: a fresh MultinomialNB fit on the TF-IDF features of every per-user
# text in ds_user (no train/test split, no SMOTE).
# NOTE: fit_transform refits the shared `tfidf` object, so after this cell it holds
# the vocabulary of ds_user['text'], not that of any earlier section.
tfidf_nb_app = MultinomialNB()
X_all = tfidf.fit_transform(ds_user['text'])
tfidf_nb_app.fit(X_all, ds_user['team'])

In [None]:
tfidf_nb_app.score(X_all, ds_user['team'])

In [None]:
# dump both model and tf-idf to files
from joblib import dump

In [None]:
dump(tfidf_nb_app, 'tfidf_nb_app.joblib') 

In [None]:
dump(tfidf, 'tfidf.joblib')