In [1]:
from embeddings_loader import *
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score


In [2]:
train_labels, dev_labels, test_labels = load_labels()

In [3]:
def computeAllScores(y_pred_train, y_pred_dev, y_pred_test):
    print("Accuracy Train: ", accuracy_score(train_labels, y_pred_train))
    print("Accuracy Dev: ", accuracy_score(dev_labels, y_pred_dev))
    print("Accuracy Test: ", accuracy_score(test_labels, y_pred_test))
    print("F1 Train: ", f1_score(train_labels, y_pred_train, average='macro'))
    print("F1 Dev: ", f1_score(dev_labels, y_pred_dev, average='macro'))
    print("F1 Test: ", f1_score(test_labels, y_pred_test, average='macro'))
    print("Precision Train: ", precision_score(train_labels, y_pred_train, average='macro'))
    print("Precision Dev: ", precision_score(dev_labels, y_pred_dev, average='macro'))
    print("Precision Test: ", precision_score(test_labels, y_pred_test, average='macro'))
    print("Recall Train: ", recall_score(train_labels, y_pred_train, average='macro'))
    print("Recall Dev: ", recall_score(dev_labels, y_pred_dev, average='macro'))
    print("Recall Test: ", recall_score(test_labels, y_pred_test, average='macro'))
    # Confusion Matrix
    print("Confusion Matrix Train: ")
    print(confusion_matrix(train_labels, y_pred_train))
    print("Confusion Matrix Dev: ")
    print(confusion_matrix(dev_labels, y_pred_dev))
    print("Confusion Matrix Test: ")
    print(confusion_matrix(test_labels, y_pred_test))

In [4]:
label_replacement = {
    'Hope_speech': 0,
    'Non_hope_speech': 1,
    'not-English': 2,
}

In [5]:
# Replace labels with numbers
train_labels = [label_replacement[label] for label in train_labels]
dev_labels = [label_replacement[label] for label in dev_labels]
test_labels = [label_replacement[label] for label in test_labels]

### Glove Twitter 25

In [6]:
gt25_train, gt25_dev, gt25_test = load_glove_twitter_25()

In [7]:
# Set all NaN values to 0
gt25_train = np.nan_to_num(gt25_train)
gt25_dev = np.nan_to_num(gt25_dev)
gt25_test = np.nan_to_num(gt25_test)

In [8]:
k_medoids = KMedoids(n_clusters=3, random_state=0).fit(gt25_train, train_labels)

In [9]:
train_preds = k_medoids.predict(gt25_train)
dev_preds = k_medoids.predict(gt25_dev)
test_preds = k_medoids.predict(gt25_test)

In [10]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.39056321940075567
Accuracy Dev:  0.3939500527611678
Accuracy Test:  0.3882642304989459
F1 Train:  0.23299111756158597
F1 Dev:  0.24149580109249755
F1 Test:  0.2308150733779275
Precision Train:  0.34398741977465647
Precision Dev:  0.34627506487379095
Precision Test:  0.34233578200791315
Recall Train:  0.2999336290008057
Recall Dev:  0.2527047603782657
Recall Test:  0.3445951064832669
Confusion Matrix Train: 
[[ 629  488  845]
 [6474 8257 6047]
 [   6   12    4]]
Confusion Matrix Dev: 
[[  98   75   99]
 [ 802 1022  745]
 [   0    2    0]]
Confusion Matrix Test: 
[[  76   63  111]
 [ 823 1028  742]
 [   1    1    1]]


### FastText 300 

In [11]:
ft300_train, ft300_dev, ft300_test = load_fasttext_300()

In [12]:
# Set all NaN values to 0
ft300_train = np.nan_to_num(ft300_train)
ft300_dev = np.nan_to_num(ft300_dev)
ft300_test = np.nan_to_num(ft300_test)

In [13]:
k_medoids = KMedoids(n_clusters=3, random_state=0).fit(ft300_train, train_labels)

In [14]:
train_preds = k_medoids.predict(ft300_train)
dev_preds = k_medoids.predict(ft300_dev)
test_preds = k_medoids.predict(ft300_test)

In [15]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.30151129074773747
Accuracy Dev:  0.3119943721421034
Accuracy Test:  0.3021784961349262
F1 Train:  0.18952357012854415
F1 Dev:  0.19813893483270875
F1 Test:  0.19204232944421282
Precision Train:  0.32599741864358023
Precision Dev:  0.33029516357735705
Precision Test:  0.3300046219375368
Recall Train:  0.28628109207736036
Recall Dev:  0.40982844091315
Recall Test:  0.2527183442601877
Confusion Matrix Train: 
[[  850   651   461]
 [10313  6010  4455]
 [   12     7     3]]
Confusion Matrix Dev: 
[[ 117   80   75]
 [1267  769  533]
 [   0    1    1]]
Confusion Matrix Test: 
[[ 118   75   57]
 [1303  742  548]
 [   2    1    0]]


### Word2Vec 300

In [16]:
w2v300_train, w2v300_dev, w2v300_test = load_word2vec_300()

In [17]:
# Set all NaN values to 0
w2v300_train = np.nan_to_num(w2v300_train)
w2v300_dev = np.nan_to_num(w2v300_dev)
w2v300_test = np.nan_to_num(w2v300_test)

In [18]:
k_medoids = KMedoids(n_clusters=3, random_state=0).fit(w2v300_train, train_labels)

In [19]:
train_preds = k_medoids.predict(w2v300_train)
dev_preds = k_medoids.predict(w2v300_dev)
test_preds = k_medoids.predict(w2v300_test)

In [20]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.3707494947719884
Accuracy Dev:  0.38199085473091804
Accuracy Test:  0.3713984539704849
F1 Train:  0.23889330096627073
F1 Dev:  0.2492302440397095
F1 Test:  0.23993587762796267
Precision Train:  0.35736725425615096
Precision Dev:  0.3628587705486155
Precision Test:  0.3609947692708624
Recall Train:  0.35835533026148364
Recall Dev:  0.442223551545959
Recall Test:  0.2659951150533488
Confusion Matrix Train: 
[[ 859  374  729]
 [6313 7574 6891]
 [   5   11    6]]
Confusion Matrix Dev: 
[[123  47 102]
 [788 962 819]
 [  0   1   1]]
Confusion Matrix Test: 
[[108  36 106]
 [780 949 864]
 [  1   2   0]]


### TF-IDF PCA (1000 Dims)

In [21]:
tfidf_pca_train, tfidf_pca_dev, tfidf_pca_test = load_tfidf_pca()

In [22]:
k_medoids = KMedoids(n_clusters=3, random_state=0).fit(tfidf_pca_train, train_labels)

In [23]:
train_preds = k_medoids.predict(tfidf_pca_train)
dev_preds = k_medoids.predict(tfidf_pca_dev)
test_preds = k_medoids.predict(tfidf_pca_test)

In [24]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.08619629206572357
Accuracy Dev:  0.09567358424199789
Accuracy Test:  0.08784258608573436
F1 Train:  0.05290406083158065
F1 Dev:  0.05821294810058855
F1 Test:  0.053832902670111975
Precision Train:  0.028732097355241192
Precision Dev:  0.03189119474733263
Precision Test:  0.02928086202857812
Recall Train:  0.3333333333333333
Recall Dev:  0.3333333333333333
Recall Test:  0.3333333333333333
Confusion Matrix Train: 
[[ 1962     0     0]
 [20778     0     0]
 [   22     0     0]]
Confusion Matrix Dev: 
[[ 272    0    0]
 [2569    0    0]
 [   2    0    0]]
Confusion Matrix Test: 
[[ 250    0    0]
 [2593    0    0]
 [   3    0    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
