In [43]:
from embeddings_loader import *
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score


In [59]:
def computeAllScores(y_true, y_pred):
    print("F1 score: ", f1_score(y_true, y_pred, average='macro'))
    print("Accuracy score: ", accuracy_score(y_true, y_pred))
    print("Recall score: ", recall_score(y_true, y_pred, average='macro'))
    print("Precision score: ", precision_score(y_true, y_pred, average='macro'))
    print("Confusion matrix: ")
    print(confusion_matrix(y_true, y_pred))

In [47]:
train_labels, dev_labels, test_labels = load_labels()

In [48]:
label_replacement = {
    'Hope_speech': 0,
    'Non_hope_speech': 1,
    'not-English': 2,
}

In [49]:
# Replace labels with numbers
train_labels = [label_replacement[label] for label in train_labels]
dev_labels = [label_replacement[label] for label in dev_labels]
test_labels = [label_replacement[label] for label in test_labels]

### Glove Twitter 25

In [50]:
gt25_train, gt25_dev, gt25_test = load_glove_twitter_25()

In [51]:
# Set all NaN values to 0
gt25_train = np.nan_to_num(gt25_train)
gt25_dev = np.nan_to_num(gt25_dev)
gt25_test = np.nan_to_num(gt25_test)

In [52]:
k_means = KMeans(n_clusters=3, random_state=0).fit(gt25_train, train_labels)

In [53]:
dev_preds = k_means.predict(gt25_dev)
test_preds = k_means.predict(gt25_test)

In [60]:
computeAllScores(dev_labels, dev_preds)

F1 score:  0.14283631492082952
Accuracy score:  0.17622230038691522
Recall score:  0.2107389958708279
Precision score:  0.36097332451499115
Confusion matrix: 
[[ 133    8  131]
 [1083  368 1118]
 [   0    2    0]]


In [63]:
computeAllScores(test_labels, test_preds)

F1 score:  0.15016522555352096
Accuracy score:  0.18657765284609978
Recall score:  0.23452063247204014
Precision score:  0.36523584905660383
Confusion matrix: 
[[ 138    5  107]
 [1080  393 1120]
 [   1    2    0]]


### FastText 300 

In [31]:
ft300_train, ft300_dev, ft300_test = load_fasttext_300()

In [32]:
# Set all NaN values to 0
ft300_train = np.nan_to_num(ft300_train)
ft300_dev = np.nan_to_num(ft300_dev)
ft300_test = np.nan_to_num(ft300_test)

In [33]:
k_means = KMeans(n_clusters=3, random_state=0).fit(ft300_train, train_labels)

In [34]:
dev_preds = k_means.predict(ft300_dev)
test_preds = k_means.predict(ft300_test)

In [61]:
computeAllScores(dev_labels, dev_preds)

F1 score:  0.14283631492082952
Accuracy score:  0.17622230038691522
Recall score:  0.2107389958708279
Precision score:  0.36097332451499115
Confusion matrix: 
[[ 133    8  131]
 [1083  368 1118]
 [   0    2    0]]


In [62]:
computeAllScores(test_labels, test_preds)

F1 score:  0.15016522555352096
Accuracy score:  0.18657765284609978
Recall score:  0.23452063247204014
Precision score:  0.36523584905660383
Confusion matrix: 
[[ 138    5  107]
 [1080  393 1120]
 [   1    2    0]]


### Word2Vec 300

In [66]:
w2v300_train, w2v300_dev, w2v300_test = load_word2vec_300()

In [67]:
# Set all NaN values to 0
w2v300_train = np.nan_to_num(w2v300_train)
w2v300_dev = np.nan_to_num(w2v300_dev)
w2v300_test = np.nan_to_num(w2v300_test)

In [68]:
k_means = KMeans(n_clusters=3, random_state=0).fit(w2v300_train, train_labels)

In [69]:
dev_preds = k_means.predict(w2v300_dev)
test_preds = k_means.predict(w2v300_test)

In [70]:
computeAllScores(dev_labels, dev_preds)

F1 score:  0.2591562570724056
Accuracy score:  0.5993668659866338
Recall score:  0.3909218319480381
Precision score:  0.30496220828068415
Confusion matrix: 
[[   3  195   74]
 [ 178 1700  691]
 [   1    0    1]]


In [71]:
computeAllScores(test_labels, test_preds)

F1 score:  0.25488634436540464
Accuracy score:  0.5948699929725931
Recall score:  0.4408071303080945
Precision score:  0.30158778930235264
Confusion matrix: 
[[   1  193   56]
 [ 194 1690  709]
 [   0    1    2]]
