In [1]:
from embeddings_loader import *
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score


In [2]:
# Gold labels for the train, dev and test splits, in that order.
# load_labels is not imported by name, so it presumably arrives through the
# star import of embeddings_loader - confirm there what it returns.
train_labels, dev_labels, test_labels = load_labels()

In [3]:
def computeAllScores(y_pred_train, y_pred_dev, y_pred_test):
    """Print accuracy, macro F1/precision/recall and confusion matrices per split.

    Each prediction sequence is scored against the matching module-level
    ground truth (``train_labels``, ``dev_labels``, ``test_labels``), which is
    read when the function is called. Nothing is returned; every result is
    printed.

    Args:
        y_pred_train: Predicted labels for the training split.
        y_pred_dev: Predicted labels for the dev split.
        y_pred_test: Predicted labels for the test split.
    """
    splits = (
        ("Train", train_labels, y_pred_train),
        ("Dev", dev_labels, y_pred_dev),
        ("Test", test_labels, y_pred_test),
    )
    # Accuracy is called without extra arguments; the other three scores are
    # macro-averaged over the classes.
    scorers = (
        ("Accuracy", accuracy_score, {}),
        ("F1", f1_score, {"average": "macro"}),
        ("Precision", precision_score, {"average": "macro"}),
        ("Recall", recall_score, {"average": "macro"}),
    )

    # Metric-major order: one metric is reported for all splits before the next.
    for metric_name, scorer, options in scorers:
        for split_name, y_true, y_pred in splits:
            print(f"{metric_name} {split_name}: ", scorer(y_true, y_pred, **options))

    # Confusion matrices come last, one per split.
    for split_name, y_true, y_pred in splits:
        print(f"Confusion Matrix {split_name}: ")
        print(confusion_matrix(y_true, y_pred))

In [4]:
# Integer code for each class-name string: codes follow the order of the names
# below, starting at 0.
label_replacement = dict(
    zip(('Hope_speech', 'Non_hope_speech', 'not-English'), range(3))
)

In [5]:
# Replace the class-name strings with their integer codes.
# The three label names are overwritten with values derived from themselves, so
# on a second run of this cell they already hold integers. Values that are
# already valid codes are therefore passed through unchanged, which keeps the
# cell re-runnable instead of failing with a KeyError.
_label_codes = set(label_replacement.values())


def _encode_labels(labels):
    """Map class-name strings to integer codes, keeping existing codes as is.

    Args:
        labels: Iterable of class-name strings and/or already-encoded codes.

    Returns:
        A list with one integer code per input label, in the same order.

    Raises:
        KeyError: If a value is neither a key of ``label_replacement`` nor one
            of its codes.
    """
    return [
        label if label in _label_codes else label_replacement[label]
        for label in labels
    ]


train_labels = _encode_labels(train_labels)
dev_labels = _encode_labels(dev_labels)
test_labels = _encode_labels(test_labels)

### Glove Twitter 25

In [6]:
# Feature matrices for the three splits, in train/dev/test order.
# presumably 25-dimensional GloVe Twitter embeddings (per the section title and
# the loader name) - confirm in embeddings_loader.
gt25_train, gt25_dev, gt25_test = load_glove_twitter_25()

In [7]:
# Clean all three GloVe feature matrices in one pass.
# With its default arguments np.nan_to_num maps NaN to 0.0 and also swaps
# +/-inf for the largest-magnitude finite values of the dtype.
gt25_train, gt25_dev, gt25_test = [
    np.nan_to_num(split) for split in (gt25_train, gt25_dev, gt25_test)
]

In [8]:
# Nearest-centroid classifier with default settings, trained on the GloVe
# Twitter 25 training features. fit() returns the estimator itself, so calling
# it as a statement leaves nearest_centroid bound to the fitted model.
nearest_centroid = NearestCentroid()
nearest_centroid.fit(gt25_train, train_labels)

In [9]:
# Predict every split with the model fitted on the GloVe training features;
# results are unpacked in train/dev/test order.
train_preds, dev_preds, test_preds = [
    nearest_centroid.predict(features)
    for features in (gt25_train, gt25_dev, gt25_test)
]

In [10]:
# Print accuracy, macro F1/precision/recall and the confusion matrices for the
# train/dev/test predictions produced in the previous cell.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.47970301379492136
Accuracy Dev:  0.48997537812170244
Accuracy Test:  0.4764581869290232
F1 Train:  0.28734117594971553
F1 Dev:  0.2980994632664367
F1 Test:  0.28793564454140674
Precision Train:  0.36637950347720394
Precision Dev:  0.3707864273623076
Precision Test:  0.3677315352310515
Recall Train:  0.5636679644990306
Recall Dev:  0.7329662110075638
Recall Test:  0.5130018425675965
Confusion Matrix Train: 
[[1443  455   64]
 [8757 9465 2556]
 [   5    6   11]]
Confusion Matrix Dev: 
[[ 200   64    8]
 [1074 1191  304]
 [   0    0    2]]
Confusion Matrix Test: 
[[ 189   57    4]
 [1090 1166  337]
 [   1    1    1]]


### FastText 300 

In [11]:
# Feature matrices for the three splits, in train/dev/test order.
# presumably 300-dimensional FastText embeddings (per the section title and
# the loader name) - confirm in embeddings_loader.
ft300_train, ft300_dev, ft300_test = load_fasttext_300()

In [12]:
# Clean all three FastText feature matrices in one pass.
# With its default arguments np.nan_to_num maps NaN to 0.0 and also swaps
# +/-inf for the largest-magnitude finite values of the dtype.
ft300_train, ft300_dev, ft300_test = [
    np.nan_to_num(split) for split in (ft300_train, ft300_dev, ft300_test)
]

In [13]:
# Train a fresh default nearest-centroid classifier on the FastText training
# features. This rebinds nearest_centroid, so the model fitted in the previous
# section is no longer reachable under that name.
nearest_centroid = NearestCentroid()
nearest_centroid.fit(ft300_train, train_labels)

In [14]:
# Predict every split with the model fitted on the FastText training features;
# results are unpacked in train/dev/test order.
train_preds, dev_preds, test_preds = [
    nearest_centroid.predict(features)
    for features in (ft300_train, ft300_dev, ft300_test)
]

In [15]:
# Report all metrics for the FastText-based predictions from the previous cell
# (train, dev and test).
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.6240664265003075
Accuracy Dev:  0.6215265564544495
Accuracy Test:  0.6289529163738581
F1 Train:  0.3472252050771638
F1 Dev:  0.35197322985265717
F1 Test:  0.35126624153239455
Precision Train:  0.3790820077272104
Precision Dev:  0.38347846298226734
Precision Test:  0.38232228019527303
Recall Train:  0.5981714645664077
Recall Dev:  0.6160523473694655
Recall Test:  0.5663835111625316
Confusion Matrix Train: 
[[ 1423   524    15]
 [ 7154 12772   852]
 [    4     8    10]]
Confusion Matrix Dev: 
[[ 201   68    3]
 [ 895 1565  109]
 [   1    0    1]]
Confusion Matrix Test: 
[[ 187   61    2]
 [ 876 1602  115]
 [   1    1    1]]


### Word2Vec 300

In [16]:
# Feature matrices for the three splits, in train/dev/test order.
# presumably 300-dimensional Word2Vec embeddings (per the section title and
# the loader name) - confirm in embeddings_loader.
w2v300_train, w2v300_dev, w2v300_test = load_word2vec_300()

In [17]:
# Clean all three Word2Vec feature matrices in one pass.
# With its default arguments np.nan_to_num maps NaN to 0.0 and also swaps
# +/-inf for the largest-magnitude finite values of the dtype.
w2v300_train, w2v300_dev, w2v300_test = [
    np.nan_to_num(split) for split in (w2v300_train, w2v300_dev, w2v300_test)
]

In [18]:
# Train a fresh default nearest-centroid classifier on the Word2Vec training
# features. This rebinds nearest_centroid, so the model fitted in the previous
# section is no longer reachable under that name.
nearest_centroid = NearestCentroid()
nearest_centroid.fit(w2v300_train, train_labels)

In [19]:
# Predict every split with the model fitted on the Word2Vec training features;
# results are unpacked in train/dev/test order.
train_preds, dev_preds, test_preds = [
    nearest_centroid.predict(features)
    for features in (w2v300_train, w2v300_dev, w2v300_test)
]

In [20]:
# Report all metrics for the Word2Vec-based predictions from the previous cell
# (train, dev and test).
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.6600913803707934
Accuracy Dev:  0.6630320084417869
Accuracy Test:  0.6654954321855235
F1 Train:  0.36425988909189205
F1 Dev:  0.3729726712182186
F1 Test:  0.37511734210431696
Precision Train:  0.3855406107287565
Precision Dev:  0.3916325969283716
Precision Test:  0.3956594876316804
Recall Train:  0.5980371562170766
Recall Dev:  0.6335545798700952
Recall Test:  0.6026436988473239
Confusion Matrix Train: 
[[ 1435   511    16]
 [ 6446 13581   751]
 [    2    11     9]]
Confusion Matrix Dev: 
[[ 203   68    1]
 [ 791 1681   97]
 [   0    1    1]]
Confusion Matrix Test: 
[[ 206   44    0]
 [ 809 1687   97]
 [   1    1    1]]


### TF-IDF PCA (1000 Dims)

In [21]:
# Feature matrices for the three splits, in train/dev/test order.
# presumably TF-IDF vectors reduced with PCA to 1000 dimensions (per the section
# title and the loader name) - confirm in embeddings_loader.
# Unlike the word-embedding sections above, no np.nan_to_num step follows.
tfidf_pca_train, tfidf_pca_dev, tfidf_pca_test = load_tfidf_pca()

In [22]:
# Train a fresh default nearest-centroid classifier on the TF-IDF/PCA training
# features. This rebinds nearest_centroid, so the model fitted in the previous
# section is no longer reachable under that name.
nearest_centroid = NearestCentroid()
nearest_centroid.fit(tfidf_pca_train, train_labels)

In [23]:
# Predict every split with the model fitted on the TF-IDF/PCA training
# features; results are unpacked in train/dev/test order.
train_preds, dev_preds, test_preds = [
    nearest_centroid.predict(features)
    for features in (tfidf_pca_train, tfidf_pca_dev, tfidf_pca_test)
]

In [24]:
# Report all metrics for the TF-IDF/PCA-based predictions from the previous
# cell (train, dev and test).
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.7730867234865126
Accuracy Dev:  0.7583538515652479
Accuracy Test:  0.7614195361911454
F1 Train:  0.42529839422773147
F1 Dev:  0.4121132839539076
F1 Test:  0.4092289643846722
Precision Train:  0.41369761860520576
Precision Dev:  0.40898646970928704
Precision Test:  0.40602382493910955
Recall Train:  0.7393047956501185
Recall Dev:  0.4846486959906578
Recall Test:  0.48699781462912967
Confusion Matrix Train: 
[[ 1298   635    29]
 [ 3737 16282   759]
 [    0     5    17]]
Confusion Matrix Dev: 
[[ 187   78    7]
 [ 516 1969   84]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 173   73    4]
 [ 506 1994   93]
 [   0    3    0]]


### Sentence Transformer Faster No PCA

In [25]:
# Feature matrices for the three splits, in train/dev/test order.
# presumably sentence-transformer embeddings of the "faster" model without PCA
# (per the section title and the loader name) - confirm in embeddings_loader.
# NOTE(review): the generic names train/dev/test are rebound by every Sentence
# Transformer section, so they only hold this variant until the next load.
train, dev, test = load_sent_trans_fast_no_pca()

In [26]:
# Train a fresh default nearest-centroid classifier on whatever `train`
# currently holds (loaded in this section's first cell). Rebinding
# nearest_centroid discards the previous section's model under that name.
nearest_centroid = NearestCentroid()
nearest_centroid.fit(train, train_labels)

In [27]:
# Predict the three splits currently bound to train/dev/test; results are
# unpacked in that same order.
train_preds, dev_preds, test_preds = [
    nearest_centroid.predict(features) for features in (train, dev, test)
]

In [28]:
# Report all metrics for the predictions from the previous cell (train, dev and
# test), i.e. for the features loaded at the top of this section.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.7556014409981548
Accuracy Dev:  0.7530777347871966
Accuracy Test:  0.7484188334504568
F1 Train:  0.43042788904055795
F1 Dev:  0.42970491901065877
F1 Test:  0.4228869061367528
Precision Train:  0.42396122188485214
Precision Dev:  0.4256519431690493
Precision Test:  0.41997956405441167
Recall Train:  0.7840566908790092
Recall Dev:  0.6667711362474145
Recall Test:  0.623343531730728
Confusion Matrix Train: 
[[ 1532   367    63]
 [ 3918 15649  1211]
 [    1     3    18]]
Confusion Matrix Dev: 
[[ 203   59   10]
 [ 471 1937  161]
 [   0    1    1]]
Confusion Matrix Test: 
[[ 198   46    6]
 [ 515 1931  147]
 [   0    2    1]]


### Sentence Transformer Faster PCA

In [29]:
# Feature matrices for the three splits, in train/dev/test order.
# presumably the "faster" sentence-transformer embeddings after PCA (per the
# section title and the loader name) - confirm in embeddings_loader.
# NOTE(review): this rebinds the generic names train/dev/test that the previous
# section also used.
train, dev, test = load_sent_trans_fast_pca()

In [30]:
# Train a fresh default nearest-centroid classifier on whatever `train`
# currently holds (loaded in this section's first cell). Rebinding
# nearest_centroid discards the previous section's model under that name.
nearest_centroid = NearestCentroid()
nearest_centroid.fit(train, train_labels)

In [31]:
# Predict the three splits currently bound to train/dev/test; results are
# unpacked in that same order.
train_preds, dev_preds, test_preds = [
    nearest_centroid.predict(features) for features in (train, dev, test)
]

In [32]:
# Report all metrics for the predictions from the previous cell (train, dev and
# test), i.e. for the features loaded at the top of this section.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.7510763553290573
Accuracy Dev:  0.748153359127682
Accuracy Test:  0.7452565003513704
F1 Train:  0.42866522750059516
F1 Dev:  0.42754941404575364
F1 Test:  0.4222541821900796
Precision Train:  0.4236775774544243
Precision Dev:  0.4249705883392217
Precision Test:  0.4201772845295531
Recall Train:  0.781942745966091
Recall Dev:  0.6638588677978003
Recall Test:  0.62218657068175
Confusion Matrix Train: 
[[ 1529   363    70]
 [ 3906 15549  1323]
 [    1     3    18]]
Confusion Matrix Dev: 
[[ 202   58   12]
 [ 473 1924  172]
 [   0    1    1]]
Confusion Matrix Test: 
[[ 198   46    6]
 [ 512 1922  159]
 [   0    2    1]]


### Sentence Transformer Better No PCA

In [33]:
# Feature matrices for the three splits, in train/dev/test order.
# presumably sentence-transformer embeddings of the "better" model without PCA
# (per the section title and the loader name) - confirm in embeddings_loader.
# NOTE(review): this rebinds the generic names train/dev/test that the previous
# section also used.
train, dev, test = load_sent_trans_better_no_pca()

In [34]:
# Train a fresh default nearest-centroid classifier on whatever `train`
# currently holds (loaded in this section's first cell). Rebinding
# nearest_centroid discards the previous section's model under that name.
nearest_centroid = NearestCentroid()
nearest_centroid.fit(train, train_labels)

In [35]:
# Predict the three splits currently bound to train/dev/test; results are
# unpacked in that same order.
train_preds, dev_preds, test_preds = [
    nearest_centroid.predict(features) for features in (train, dev, test)
]

In [36]:
# Report all metrics for the predictions from the previous cell (train, dev and
# test), i.e. for the features loaded at the top of this section.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.7425094455671734
Accuracy Dev:  0.7463946535349982
Accuracy Test:  0.7371749824314828
F1 Train:  0.43146778920882695
F1 Dev:  0.4350685830431395
F1 Test:  0.43059630707966057
Precision Train:  0.429967689934548
Precision Dev:  0.43422554551193193
Precision Test:  0.43090798999843744
Recall Train:  0.7491589003178095
Recall Dev:  0.5098220487104924
Recall Test:  0.621639456656811
Confusion Matrix Train: 
[[ 1533   324   105]
 [ 3526 15352  1900]
 [    2     4    16]]
Confusion Matrix Dev: 
[[ 214   45   13]
 [ 441 1908  220]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 200   39   11]
 [ 445 1897  251]
 [   0    2    1]]


### Sentence Transformer Better PCA

In [37]:
# Feature matrices for the three splits, in train/dev/test order.
# presumably the "better" sentence-transformer embeddings after PCA (per the
# section title and the loader name) - confirm in embeddings_loader.
# NOTE(review): this rebinds the generic names train/dev/test that the previous
# section also used.
train, dev, test = load_sent_trans_better_pca()

In [38]:
# Train a fresh default nearest-centroid classifier on whatever `train`
# currently holds (loaded in this section's first cell). Rebinding
# nearest_centroid discards the previous section's model under that name.
nearest_centroid = NearestCentroid()
nearest_centroid.fit(train, train_labels)

In [39]:
# Predict the three splits currently bound to train/dev/test; results are
# unpacked in that same order.
train_preds, dev_preds, test_preds = [
    nearest_centroid.predict(features) for features in (train, dev, test)
]

In [40]:
# Report all metrics for the predictions from the previous cell (train, dev and
# test), i.e. for the features loaded at the top of this section.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.7367981723925842
Accuracy Dev:  0.7390080900457263
Accuracy Test:  0.7340126493323963
F1 Train:  0.42968197136857356
F1 Dev:  0.43296881250700076
F1 Test:  0.4294071270047755
Precision Train:  0.4297372460239302
Precision Dev:  0.4339658081101865
Precision Test:  0.4306354203541127
Recall Train:  0.7467656570134403
Recall Dev:  0.5060015150474358
Recall Test:  0.619277713502164
Confusion Matrix Train: 
[[ 1531   321   110]
 [ 3525 15224  2029]
 [    2     4    16]]
Confusion Matrix Dev: 
[[ 213   45   14]
 [ 440 1888  241]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 199   39   12]
 [ 444 1889  260]
 [   0    2    1]]
