In [1]:
from embeddings_loader import *
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score


In [2]:
def computeAllScores(y_true, y_pred):
    print("F1 score: ", f1_score(y_true, y_pred, average='macro'))
    print("Accuracy score: ", accuracy_score(y_true, y_pred))
    print("Recall score: ", recall_score(y_true, y_pred, average='macro'))
    print("Precision score: ", precision_score(y_true, y_pred, average='macro'))
    print("Confusion matrix: ")
    print(confusion_matrix(y_true, y_pred))

In [3]:
train_labels, dev_labels, test_labels = load_labels()

In [4]:
label_replacement = {
    'Hope_speech': 0,
    'Non_hope_speech': 1,
    'not-English': 2,
}

In [5]:
# Replace labels with numbers
train_labels = [label_replacement[label] for label in train_labels]
dev_labels = [label_replacement[label] for label in dev_labels]
test_labels = [label_replacement[label] for label in test_labels]

### Glove Twitter 25

In [6]:
gt25_train, gt25_dev, gt25_test = load_glove_twitter_25()

In [7]:
# Set all NaN values to 0
gt25_train = np.nan_to_num(gt25_train)
gt25_dev = np.nan_to_num(gt25_dev)
gt25_test = np.nan_to_num(gt25_test)

In [8]:
k_means = KMeans(n_clusters=3, random_state=0).fit(gt25_train, train_labels)

In [9]:
train_preds = k_means.predict(gt25_train)
dev_preds = k_means.predict(gt25_dev)
test_preds = k_means.predict(gt25_test)

In [10]:
computeAllScores(train_labels, train_preds)

F1 score:  0.147388082222974
Accuracy score:  0.18153062121078992
Recall score:  0.32618430603949866
Precision score:  0.3622779694158384
Confusion matrix: 
[[1099   70  793]
 [8657 3027 9094]
 [   4   12    6]]


In [11]:
computeAllScores(dev_labels, dev_preds)

F1 score:  0.14283631492082952
Accuracy score:  0.17622230038691522
Recall score:  0.2107389958708279
Precision score:  0.36097332451499115
Confusion matrix: 
[[ 133    8  131]
 [1083  368 1118]
 [   0    2    0]]


In [12]:
computeAllScores(test_labels, test_preds)

F1 score:  0.15016522555352096
Accuracy score:  0.18657765284609978
Recall score:  0.23452063247204014
Precision score:  0.36523584905660383
Confusion matrix: 
[[ 138    5  107]
 [1080  393 1120]
 [   1    2    0]]


### FastText 300 

In [13]:
ft300_train, ft300_dev, ft300_test = load_fasttext_300()

In [14]:
# Set all NaN values to 0
ft300_train = np.nan_to_num(ft300_train)
ft300_dev = np.nan_to_num(ft300_dev)
ft300_test = np.nan_to_num(ft300_test)

In [15]:
k_means = KMeans(n_clusters=3, random_state=0).fit(ft300_train, train_labels)

In [16]:
train_preds = k_means.predict(ft300_train)
dev_preds = k_means.predict(ft300_dev)
test_preds = k_means.predict(ft300_test)

In [17]:
computeAllScores(train_labels, train_preds)

F1 score:  0.09425327100502011
Accuracy score:  0.09138037079342765
Recall score:  0.3564043982385188
Precision score:  0.34223776163350256
Confusion matrix: 
[[  624   115  1223]
 [ 5633  1441 13704]
 [    7     0    15]]


In [18]:
computeAllScores(dev_labels, dev_preds)

F1 score:  0.10600388101230933
Accuracy score:  0.09672880759760816
Recall score:  0.30960108839176
Precision score:  0.3454517868120958
Confusion matrix: 
[[  98   18  156]
 [ 663  176 1730]
 [   1    0    1]]


In [19]:
computeAllScores(test_labels, test_preds)

F1 score:  0.09230425202211685
Accuracy score:  0.08995080815179199
Recall score:  0.34402810986844923
Precision score:  0.34024789171993636
Confusion matrix: 
[[  74   15  161]
 [ 691  180 1722]
 [   1    0    2]]


### Word2Vec 300

In [20]:
w2v300_train, w2v300_dev, w2v300_test = load_word2vec_300()

In [21]:
# Set all NaN values to 0
w2v300_train = np.nan_to_num(w2v300_train)
w2v300_dev = np.nan_to_num(w2v300_dev)
w2v300_test = np.nan_to_num(w2v300_test)

In [22]:
k_means = KMeans(n_clusters=3, random_state=0).fit(w2v300_train, train_labels)

In [23]:
train_preds = k_means.predict(w2v300_train)
dev_preds = k_means.predict(w2v300_dev)
test_preds = k_means.predict(w2v300_test)

In [24]:
computeAllScores(train_labels, train_preds)

F1 score:  0.259209304347983
Accuracy score:  0.601704595378262
Recall score:  0.3596312774201172
Precision score:  0.3066791913582484
Confusion matrix: 
[[   24  1452   486]
 [ 1540 13663  5575]
 [    1    12     9]]


In [25]:
computeAllScores(dev_labels, dev_preds)

F1 score:  0.2591562570724056
Accuracy score:  0.5993668659866338
Recall score:  0.3909218319480381
Precision score:  0.30496220828068415
Confusion matrix: 
[[   3  195   74]
 [ 178 1700  691]
 [   1    0    1]]


In [26]:
computeAllScores(test_labels, test_preds)

F1 score:  0.25488634436540464
Accuracy score:  0.5948699929725931
Recall score:  0.4408071303080945
Precision score:  0.30158778930235264
Confusion matrix: 
[[   1  193   56]
 [ 194 1690  709]
 [   0    1    2]]


### TF-IDF PCA (1000 Dims)

In [27]:
tfidf_pca_train, tfidf_pca_dev, tfidf_pca_test = load_tfidf_pca()

In [28]:
k_means = KMeans(n_clusters=3, random_state=0).fit(tfidf_pca_train, train_labels)

In [29]:
train_preds = k_means.predict(tfidf_pca_train)
dev_preds = k_means.predict(tfidf_pca_dev)
test_preds = k_means.predict(tfidf_pca_test)

In [30]:
computeAllScores(train_labels, train_preds)

F1 score:  0.1378300930462873
Accuracy score:  0.13904753536596082
Recall score:  0.36377233556457167
Precision score:  0.3566092863759667
Confusion matrix: 
[[  362   318  1282]
 [ 1736  2786 16256]
 [    1     4    17]]


In [31]:
computeAllScores(dev_labels, dev_preds)

F1 score:  0.14548665436659838
Accuracy score:  0.13823425958494548
Recall score:  0.2810823239377495
Precision score:  0.36888950434125073
Confusion matrix: 
[[  58   36  178]
 [ 226  334 2009]
 [   1    0    1]]


In [32]:
computeAllScores(test_labels, test_preds)

F1 score:  0.15596801860050574
Accuracy score:  0.15038650737877723
Recall score:  0.45904974932510606
Precision score:  0.37446859668065735
Confusion matrix: 
[[  59   34  157]
 [ 226  366 2001]
 [   0    0    3]]
