In [None]:
# Result tables keyed by window length: one for accuracy, one for macro F1.
# Every entry starts at 0 and is overwritten when the notebook is run for that
# window length.
_window_sizes = (5, 6, 8, 10, 12, 15)
acc_dict = {size: 0 for size in _window_sizes}
f_dict = {size: 0 for size in _window_sizes}

In [None]:
import pandas as pd
import numpy as np
import json
# Window length of the mention dataset analysed in this run. It is also the key
# under which this run's accuracy / F1 are stored in acc_dict and f_dict.
window_length = 15
# Build the dataset path from window_length so the file that gets loaded and
# the key the scores are recorded under cannot drift apart.
# NOTE(review): assumes the other window sizes use the same
# football_<window>.json naming - confirm against the dataset directory.
path = './dataset/FOOTBALL/football_{}.json'.format(window_length)

In [None]:
import fasttext
from wordcloud import WordCloud
from nltk.corpus import stopwords
# English stop words from NLTK, held as a set; passed to WordCloud as its
# stopwords argument further down.
english_stopwords = stopwords.words('english')
stop = set(english_stopwords)



In [None]:
# Load the whole dataset into memory. JSON text is UTF-8, so the encoding is
# stated explicitly rather than left to the platform's locale default (which
# can fail or mis-decode non-ASCII mentions on some systems).
with open(path, encoding='utf-8') as f:
    data_json = json.load(f)

In [None]:
# Number of top-level entries in the loaded JSON object.
len(data_json)

In [None]:
# Print the first six records to get a feel for the record layout.
for position, key in enumerate(data_json.keys()):
    if position > 5:
        break
    print(data_json[key])

In [None]:
# Flatten each JSON record into one row: the fields of its 'label' dict, the
# first two entries of 'teams', the raw mention tokens, and the tokens joined
# into a single space-separated string.
column_names = ['Name', 'Position', 'Race', 'Reference', 'Team1',
                'Team2', 'Year', 'Mention_tokens', 'Mention_text']

df_lists = []
for record in data_json.values():
    info = record['label']
    df_lists.append([
        info['player'],
        info['position'],
        info['race'],
        info['reference'],
        info['teams'][0],
        info['teams'][1],
        info['year'],
        record['mention'],
        ' '.join(record['mention']),
    ])

data_df = pd.DataFrame(df_lists, columns=column_names)

In [None]:
# Distinct values in the Race column. The label mapping applied later in the
# notebook only covers 'white' and 'nonwhite'.
data_df.Race.unique()

In [None]:
# Row count of the full frame, before filtering by position.
len(data_df)

In [None]:
# Keep only rows whose Position is 'QB'. The explicit .copy() makes the result
# an independent frame, so the column assignments made later in the notebook
# do not raise SettingWithCopyWarning.
data_df = data_df.loc[data_df['Position'] == 'QB'].copy()

In [None]:
# Preview the frame after keeping only Position == 'QB'.
data_df.head()

In [None]:
# Row count after keeping only Position == 'QB'.
len(data_df)

#### Check for missing Race information on players

In [None]:
# Preview the rows again before checking the Race column for nulls.
data_df.head()

In [None]:
# True if at least one Race value is null.
data_df.Race.isnull().values.any()

In [None]:
# Number of null Race values.
data_df.Race.isnull().sum()

No missing data

#### Wordclouds

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Cap on how many mentions per group are fed into each word cloud.
n_posts = 10000

# One lower-cased, space-joined text blob per Race value.
race_text = {
    race: ' '.join(
        data_df.loc[data_df['Race'] == race, 'Mention_text'].str.lower().values[:n_posts]
    )
    for race in ('white', 'nonwhite')
}
whites = race_text['white']
nonwhites = race_text['nonwhite']

# Both clouds share the same rendering options.
cloud_options = dict(max_font_size=None, stopwords=stop, scale=2, colormap='Dark2')
wordcloud_W = WordCloud(**cloud_options).generate(whites)
wordcloud_NW = WordCloud(**cloud_options).generate(nonwhites)

# Side-by-side panels: white mentions on the left, non-white on the right.
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
panels = (
    (wordcloud_W, 'Top words for WHITE player mentions'),
    (wordcloud_NW, 'Top words for NON-WHITE player mentions'),
)
for axis, (cloud, title) in zip(ax, panels):
    axis.imshow(cloud)
    axis.set_title(title, fontsize=20)
    axis.axis("off")

plt.show()

Wordclouds seem very similar - not much can be said visually

### Building a basic text classifier (logistic regression; Naive Bayes kept as a commented-out alternative)

In [None]:
# Binary target: 1 for 'white', 0 for 'nonwhite'. Any other Race value is
# mapped to NaN by Series.map.
data_df['label'] = data_df.Race.map({'white': 1, 'nonwhite': 0})

# preprocessing

# Lower-case the text, then strip everything that is neither a word character
# nor whitespace. The pattern is a regular expression, so it is written as a
# raw string and passed with regex=True: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the punctuation in place.
data_df['Mention_text'] = (
    data_df.Mention_text
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

In [None]:
# first without stemming
# from nltk.stem import PorterStemmer

# stemmer = PorterStemmer()

# Vectorization
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the cleaned mention text (default tokenisation).
mention_corpus = data_df['Mention_text']
count_vect = CountVectorizer()
counts = count_vect.fit_transform(mention_corpus)

In [None]:
# TFIDF
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the count matrix and re-weight that same matrix
# in a single step.
transformer = TfidfTransformer()
counts_tfidf = transformer.fit_transform(counts)

In [None]:
# train test split: 80% train / 20% test, with a fixed seed for repeatability.
# NOTE(review): counts_tfidf was produced by a vectorizer and a TF-IDF
# transformer fitted on every row, so vocabulary and IDF statistics from the
# test rows are already baked into the training features. Consider splitting
# the raw text first and fitting those steps on the training part only.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    counts_tfidf,
    data_df['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# training the classifier: L2-regularised logistic regression (the Naive Bayes
# variant is kept below as a commented-out alternative)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# model = MultinomialNB().fit(X_train, y_train)
scikit_log_reg = LogisticRegression(
    C=5,
    penalty='l2',
    solver='liblinear',
    max_iter=100,
    random_state=0,
)
# fit() returns the estimator itself, so `model` and `scikit_log_reg` are the
# same fitted object.
scikit_log_reg.fit(X_train, y_train)
model = scikit_log_reg

In [None]:
import numpy as np

# Predict on the held-out split, then compute the share of correct predictions
# once; record it under the current window length and print it.
predicted = model.predict(X_test)
accuracy = np.mean(predicted == y_test)

acc_dict[window_length] = accuracy
print(accuracy)

In [None]:
from sklearn.metrics import confusion_matrix

# Counts of true labels against predicted labels on the test split.
test_confusion = confusion_matrix(y_test, predicted)
print(test_confusion)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Macro-averaged F1 on the test split, computed once; record it under the
# current window length and print it.
macro_f1 = f1_score(y_test, predicted, average='macro')
f_dict[window_length] = macro_f1
print(macro_f1)

In [None]:
# f1_score(y_test, predicted, average='weighted')

In [None]:
# recall_score(y_test, predicted)

In [None]:
# precision_score(y_test, predicted)

In [None]:
# Accuracy per window length. Only the entry for the current window_length is
# written by this run; the others keep whatever value they already had
# (initially 0).
acc_dict

In [None]:
# Macro F1 per window length. Only the entry for the current window_length is
# written by this run; the others keep whatever value they already had
# (initially 0).
f_dict