In [1]:
from embeddings_loader import *
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from utils import *
import numpy as np

--ip=127.0.0.1


In [2]:
# Load the gold labels of the train / dev / test splits. They are class-name
# strings at this point; a later cell maps them to integer ids.
train_labels, dev_labels, test_labels = load_labels()

In [3]:
# Integer id of every class name; the id is the position of the name in the tuple.
label_replacement = {
    class_name: class_id
    for class_id, class_name in enumerate(
        ("Hope_speech", "Non_hope_speech", "not-English")
    )
}

In [4]:
# Map the class-name strings of every split to their integer ids.
# NOTE: the string labels are overwritten, so running this cell a second time
# raises KeyError (the integer ids are not keys of label_replacement).
to_class_id = label_replacement.__getitem__
train_labels = list(map(to_class_id, train_labels))
dev_labels = list(map(to_class_id, dev_labels))
test_labels = list(map(to_class_id, test_labels))

In [5]:
# Hyperparameter grid shared by every embedding type: 2 values of C x 4 kernels.
svm_param_grid = {
    "C": [1, 0.5],
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
}
svm = SVC()
# NOTE(review): on this single-label task micro-averaged F1 is the same number
# as accuracy (see the printed scores below), so model selection favours the
# majority class; a macro-averaged scorer may be the better criterion - confirm.
gridsearch = GridSearchCV(svm, param_grid=svm_param_grid, scoring="f1_micro")

### Glove Twitter 25

In [10]:
# Load the GloVe Twitter 25 features for the train / dev / test splits.
gt25_train, gt25_dev, gt25_test = load_glove_twitter_25()

In [11]:
# Replace NaN entries with 0 in every split (np.nan_to_num additionally maps
# +/-inf to very large finite numbers).
gt25_train, gt25_dev, gt25_test = map(
    np.nan_to_num, (gt25_train, gt25_dev, gt25_test)
)

In [12]:
# Grid-search C and kernel on the GloVe Twitter 25 training features.
grid_results = gridsearch.fit(gt25_train, train_labels)
# Winning hyperparameters and the matching estimator, used by the cells below.
best_params, svm = grid_results.best_params_, grid_results.best_estimator_

In [13]:
# Display the hyperparameters selected by the grid search above.
best_params

{'C': 1, 'kernel': 'poly'}

In [14]:
# `svm` is grid_results.best_estimator_. GridSearchCV (refit=True, the default)
# has already refit it on the full training set with the best parameters, so
# calling svm.fit(gt25_train, train_labels) again would only repeat the same
# training. Persist the fitted model directly.
save_model(svm, "svm_gt25.joblib")

In [15]:
# Predict class ids for the three GloVe Twitter 25 splits with the fitted SVM.
train_preds, dev_preds, test_preds = [
    svm.predict(features) for features in (gt25_train, gt25_dev, gt25_test)
]

In [16]:
# Print accuracy, F1 and recall (weighted / macro / micro) and the confusion
# matrices for all three splits. Only predictions are passed in; the gold
# labels are presumably looked up inside the helper - TODO confirm.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9129250505228012
Accuracy Dev:  0.9036229335209286
Accuracy Test:  0.9111033028812369
Weighted F1 Train:  0.8714427733234466
Weighted F1 Dev:  0.85787410059692
Weighted F1 Test:  0.8687225094212346
Macro F1 Train:  0.3737144052960122
Macro F1 Dev:  0.31645725548164577
Macro F1 Test:  0.31782803211374644
Micro F1 Train:  0.9129250505228012
Micro F1 Dev:  0.9036229335209286
Micro F1 Test:  0.9111033028812369
Weighted Recall Train:  0.9129250505228012
Weighted Recall Dev:  0.9036229335209286
Weighted Recall Test:  0.9111033028812369
Macro Recall Train:  0.3636363636363636
Macro Recall Dev:  0.3333333333333333
Macro Recall Test:  0.3333333333333333
Micro Recall Train:  0.9129250505228012
Micro Recall Dev:  0.9036229335209286
Micro Recall Test:  0.9111033028812369
Confusion Matrix Train: 
[[    0  1962     0]
 [    0 20778     0]
 [    0    20     2]]
Confusion Matrix Dev: 
[[   0  272    0]
 [   0 2569    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[   0  250    0]
 [

### FastText 300 

In [17]:
# Load the FastText 300 features for the train / dev / test splits.
ft300_train, ft300_dev, ft300_test = load_fasttext_300()

In [18]:
# Replace NaN entries with 0 in every split (np.nan_to_num additionally maps
# +/-inf to very large finite numbers).
ft300_train, ft300_dev, ft300_test = map(
    np.nan_to_num, (ft300_train, ft300_dev, ft300_test)
)

In [19]:
# Grid-search C and kernel on the FastText 300 training features.
grid_results = gridsearch.fit(ft300_train, train_labels)
# Winning hyperparameters and the matching estimator, used by the cells below.
best_params, svm = grid_results.best_params_, grid_results.best_estimator_

In [20]:
# Display the hyperparameters selected by the grid search above.
best_params

{'C': 1, 'kernel': 'poly'}

In [21]:
# `svm` is grid_results.best_estimator_. GridSearchCV (refit=True, the default)
# has already refit it on the full training set with the best parameters, so
# calling svm.fit(ft300_train, train_labels) again would only repeat the same
# training. Persist the fitted model directly.
save_model(svm, "svm_ft300.joblib")

In [22]:
# Predict class ids for the three FastText 300 splits with the fitted SVM.
train_preds, dev_preds, test_preds = [
    svm.predict(features) for features in (ft300_train, ft300_dev, ft300_test)
]

In [23]:
# Print accuracy, F1 and recall (weighted / macro / micro) and the confusion
# matrices for all three splits. Only predictions are passed in; the gold
# labels are presumably looked up inside the helper - TODO confirm.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9160003514629645
Accuracy Dev:  0.9064368624692226
Accuracy Test:  0.9139142656359803
Weighted F1 Train:  0.8789113587421983
Weighted F1 Dev:  0.8646111464182938
Weighted F1 Test:  0.8754497618294961
Macro F1 Train:  0.4441772568901276
Macro F1 Dev:  0.3359733530717987
Macro F1 Test:  0.33896803544776205
Micro F1 Train:  0.9160003514629645
Micro F1 Dev:  0.9064368624692226
Micro F1 Test:  0.9139142656359803
Weighted Recall Train:  0.9160003514629645
Weighted Recall Dev:  0.9064368624692226
Weighted Recall Test:  0.9139142656359803
Macro Recall Train:  0.4057999352925797
Macro Recall Dev:  0.34313725490196073
Macro Recall Test:  0.34400000000000003
Micro Recall Train:  0.9160003514629645
Micro Recall Dev:  0.9064368624692226
Micro Recall Test:  0.9139142656359803
Confusion Matrix Train: 
[[   70  1892     0]
 [    2 20776     0]
 [    0    18     4]]
Confusion Matrix Dev: 
[[   8  264    0]
 [   0 2569    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[   8  242    0]

### Word2Vec 300

In [24]:
# Load the Word2Vec 300 features for the train / dev / test splits.
w2v300_train, w2v300_dev, w2v300_test = load_word2vec_300()

In [25]:
# Replace NaN entries with 0 in every split (np.nan_to_num additionally maps
# +/-inf to very large finite numbers).
w2v300_train, w2v300_dev, w2v300_test = map(
    np.nan_to_num, (w2v300_train, w2v300_dev, w2v300_test)
)

In [26]:
# Grid-search C and kernel on the Word2Vec 300 training features.
grid_results = gridsearch.fit(w2v300_train, train_labels)
# Winning hyperparameters and the matching estimator, used by the cells below.
best_params, svm = grid_results.best_params_, grid_results.best_estimator_

In [27]:
# Display the hyperparameters selected by the grid search above.
best_params

{'C': 1, 'kernel': 'rbf'}

In [28]:
# `svm` is grid_results.best_estimator_. GridSearchCV (refit=True, the default)
# has already refit it on the full training set with the best parameters, so
# calling svm.fit(w2v300_train, train_labels) again would only repeat the same
# training. Persist the fitted model directly.
save_model(svm, "svm_w2v300.joblib")

In [29]:
# Predict class ids for the three Word2Vec 300 splits with the fitted SVM.
train_preds, dev_preds, test_preds = [
    svm.predict(features) for features in (w2v300_train, w2v300_dev, w2v300_test)
]

In [30]:
# Print accuracy, F1 and recall (weighted / macro / micro) and the confusion
# matrices for all three splits. Only predictions are passed in; the gold
# labels are presumably looked up inside the helper - TODO confirm.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9223266848255866
Accuracy Dev:  0.9124164614843475
Accuracy Test:  0.9188334504567814
Weighted F1 Train:  0.8939727630972623
Weighted F1 Dev:  0.8809587172617952
Weighted F1 Test:  0.8880065866005644
Macro F1 Train:  0.49237757557525813
Macro F1 Dev:  0.38477388926519657
Macro F1 Test:  0.37905510003381515
Micro F1 Train:  0.9223266848255866
Micro F1 Dev:  0.9124164614843475
Micro F1 Test:  0.9188334504567814
Weighted Recall Train:  0.9223266848255866
Weighted Recall Dev:  0.9124164614843475
Weighted Recall Test:  0.9188334504567814
Macro Recall Train:  0.43318795616347455
Macro Recall Dev:  0.37054501637167125
Macro Recall Test:  0.366281012983674
Micro Recall Train:  0.9223266848255866
Micro Recall Dev:  0.9124164614843475
Micro Recall Test:  0.9188334504567814
Confusion Matrix Train: 
[[  233  1729     0]
 [   21 20757     0]
 [    0    18     4]]
Confusion Matrix Dev: 
[[  31  241    0]
 [   6 2563    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  25  225    0

### TF-IDF PCA (1000 Dims)

In [31]:
# Load the PCA-reduced TF-IDF features (1000 dims per the section heading)
# for the train / dev / test splits.
tfidf_pca_train, tfidf_pca_dev, tfidf_pca_test = load_tfidf_pca()

In [32]:
# Grid-search C and kernel on the TF-IDF PCA training features.
grid_results = gridsearch.fit(tfidf_pca_train, train_labels)
# Winning hyperparameters and the matching estimator, used by the cells below.
best_params, svm = grid_results.best_params_, grid_results.best_estimator_

In [33]:
# Display the hyperparameters selected by the grid search above.
best_params

{'C': 1, 'kernel': 'rbf'}

In [34]:
# `svm` is grid_results.best_estimator_. GridSearchCV (refit=True, the default)
# has already refit it on the full training set with the best parameters, so
# calling svm.fit(tfidf_pca_train, train_labels) again would only repeat the
# same training. Persist the fitted model directly.
save_model(svm, "svm_tfidf_pca.joblib")

In [35]:
# Predict class ids for the three TF-IDF PCA splits with the fitted SVM.
train_preds, dev_preds, test_preds = [
    svm.predict(features)
    for features in (tfidf_pca_train, tfidf_pca_dev, tfidf_pca_test)
]

In [36]:
# Print accuracy, F1 and recall (weighted / macro / micro) and the confusion
# matrices for all three splits. Only predictions are passed in; the gold
# labels are presumably looked up inside the helper - TODO confirm.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9616466039891046
Accuracy Dev:  0.9208582483292297
Accuracy Test:  0.9262122276879832
Weighted F1 Train:  0.9566986328646805
Weighted F1 Dev:  0.901044745187201
Weighted F1 Test:  0.9073273263576856
Macro F1 Train:  0.5687947207530685
Macro F1 Dev:  0.44320306023291806
Macro F1 Test:  0.44130901164458214
Micro F1 Train:  0.9616466039891046
Micro F1 Dev:  0.9208582483292297
Micro F1 Test:  0.9262122276879832
Weighted Recall Train:  0.9616466039891046
Weighted Recall Dev:  0.9208582483292297
Weighted Recall Test:  0.9262122276879832
Macro Recall Train:  0.5254710517250024
Macro Recall Dev:  0.4120098993275784
Macro Recall Test:  0.40994318035737237
Micro Recall Train:  0.9616466039891046
Micro Recall Dev:  0.9208582483292297
Micro Recall Test:  0.9262122276879832
Confusion Matrix Train: 
[[ 1133   829     0]
 [   22 20756     0]
 [    0    22     0]]
Confusion Matrix Dev: 
[[  66  206    0]
 [  17 2552    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  59  191    0]


### Sentence Transformer Faster No PCA

In [37]:
# Load the "faster" sentence-transformer features (no PCA) for the
# train / dev / test splits. The generic names train/dev/test are rebound by
# every following section.
train, dev, test = load_sent_trans_fast_no_pca()

In [39]:
# Grid-search C and kernel on the current `train` features.
grid_results = gridsearch.fit(train, train_labels)
# Winning hyperparameters and the matching estimator, used by the cells below.
best_params, svm = grid_results.best_params_, grid_results.best_estimator_

In [40]:
# Display the hyperparameters selected by the grid search above.
best_params

{'C': 1, 'kernel': 'poly'}

In [41]:
# `svm` is grid_results.best_estimator_. GridSearchCV (refit=True, the default)
# has already refit it on the full training set with the best parameters, so
# calling svm.fit(train, train_labels) again would only repeat the same
# training. Persist the fitted model directly.
save_model(svm, "svm_faster_no_pca.joblib")

In [42]:
# Predict class ids for the current train / dev / test features.
train_preds, dev_preds, test_preds = [
    svm.predict(features) for features in (train, dev, test)
]

In [43]:
# Print accuracy, F1 and recall (weighted / macro / micro) and the confusion
# matrices for all three splits. Only predictions are passed in; the gold
# labels are presumably looked up inside the helper - TODO confirm.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9683683331868904
Accuracy Dev:  0.9289482940555751
Accuracy Test:  0.9300773014757554
Weighted F1 Train:  0.965583633829597
Weighted F1 Dev:  0.9175831506540932
Weighted F1 Test:  0.9175972917708787
Macro F1 Train:  0.6933108884007634
Macro F1 Dev:  0.48970569660224833
Macro F1 Test:  0.47472252317322744
Micro F1 Train:  0.9683683331868904
Micro F1 Dev:  0.9289482940555752
Micro F1 Test:  0.9300773014757555
Weighted Recall Train:  0.9683683331868904
Weighted Recall Dev:  0.9289482940555751
Weighted Recall Test:  0.9300773014757554
Macro Recall Train:  0.6164685353144213
Macro Recall Dev:  0.45553650615559577
Macro Recall Test:  0.4414767965034066
Micro Recall Train:  0.9683683331868904
Micro Recall Dev:  0.9289482940555751
Micro Recall Test:  0.9300773014757554
Confusion Matrix Train: 
[[ 1315   647     0]
 [   55 20723     0]
 [    0    18     4]]
Confusion Matrix Dev: 
[[ 103  169    0]
 [  31 2538    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  84  166    0]


### Sentence Transformer Faster PCA

In [44]:
# Load the "faster" sentence-transformer features with PCA for the
# train / dev / test splits (rebinds train/dev/test from the previous section).
train, dev, test = load_sent_trans_fast_pca()

In [45]:
# Grid-search C and kernel on the current `train` features.
grid_results = gridsearch.fit(train, train_labels)
# Winning hyperparameters and the matching estimator, used by the cells below.
best_params, svm = grid_results.best_params_, grid_results.best_estimator_

In [46]:
# Display the hyperparameters selected by the grid search above.
best_params

{'C': 1, 'kernel': 'poly'}

In [47]:
# `svm` is grid_results.best_estimator_. GridSearchCV (refit=True, the default)
# has already refit it on the full training set with the best parameters, so
# calling svm.fit(train, train_labels) again would only repeat the same
# training. Persist the fitted model directly.
save_model(svm, "svm_faster_pca.joblib")

In [48]:
# Predict class ids for the current train / dev / test features.
train_preds, dev_preds, test_preds = [
    svm.predict(features) for features in (train, dev, test)
]

In [49]:
# Print accuracy, F1 and recall (weighted / macro / micro) and the confusion
# matrices for all three splits. Only predictions are passed in; the gold
# labels are presumably looked up inside the helper - TODO confirm.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9725858887619717
Accuracy Dev:  0.9264861062258178
Accuracy Test:  0.9311314125087843
Weighted F1 Train:  0.9705355307772235
Weighted F1 Dev:  0.914160810577925
Weighted F1 Test:  0.9191963921425864
Macro F1 Train:  0.7796292301529198
Macro F1 Dev:  0.48140988937209633
Macro F1 Test:  0.47909539378809396
Micro F1 Train:  0.9725858887619717
Micro F1 Dev:  0.9264861062258178
Micro F1 Test:  0.9311314125087843
Weighted Recall Train:  0.9725858887619717
Weighted Recall Dev:  0.9264861062258178
Weighted Recall Test:  0.9311314125087843
Macro Recall Train:  0.6905509763428469
Macro Recall Dev:  0.4480538128057763
Macro Recall Test:  0.44547679650340655
Micro Recall Train:  0.9725858887619717
Micro Recall Dev:  0.9264861062258178
Micro Recall Test:  0.9311314125087843
Confusion Matrix Train: 
[[ 1393   569     0]
 [   41 20737     0]
 [    0    14     8]]
Confusion Matrix Dev: 
[[  97  175    0]
 [  32 2537    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  87  163    0]


### Sentence Transformer Better No PCA

In [50]:
# Load the "better" sentence-transformer features (no PCA) for the
# train / dev / test splits (rebinds train/dev/test from the previous section).
train, dev, test = load_sent_trans_better_no_pca()

In [51]:
# Grid-search C and kernel on the current `train` features.
grid_results = gridsearch.fit(train, train_labels)
# Winning hyperparameters and the matching estimator, used by the cells below.
best_params, svm = grid_results.best_params_, grid_results.best_estimator_

In [52]:
# Display the hyperparameters selected by the grid search above.
best_params

{'C': 1, 'kernel': 'poly'}

In [53]:
# `svm` is grid_results.best_estimator_. GridSearchCV (refit=True, the default)
# has already refit it on the full training set with the best parameters, so
# calling svm.fit(train, train_labels) again would only repeat the same
# training. Persist the fitted model directly.
save_model(svm, "svm_better_no_pca.joblib")

In [54]:
# Predict class ids for the current train / dev / test features.
train_preds, dev_preds, test_preds = [
    svm.predict(features) for features in (train, dev, test)
]

In [55]:
# Print accuracy, F1 and recall (weighted / macro / micro) and the confusion
# matrices for all three splits. Only predictions are passed in; the gold
# labels are presumably looked up inside the helper - TODO confirm.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.968807661892628
Accuracy Dev:  0.9289482940555751
Accuracy Test:  0.9367533380182712
Weighted F1 Train:  0.9662076090426754
Weighted F1 Dev:  0.9182301485196132
Weighted F1 Test:  0.9259832108595365
Macro F1 Train:  0.6950040817203854
Macro F1 Dev:  0.4920824923513611
Macro F1 Test:  0.4954968722619557
Micro F1 Train:  0.968807661892628
Micro F1 Dev:  0.9289482940555752
Micro F1 Test:  0.9367533380182712
Weighted Recall Train:  0.968807661892628
Weighted Recall Dev:  0.9289482940555751
Weighted Recall Test:  0.9367533380182712
Macro Recall Train:  0.6197060025288719
Macro Recall Dev:  0.45882372022378437
Macro Recall Test:  0.4583766550970562
Micro Recall Train:  0.968807661892628
Micro Recall Dev:  0.9289482940555751
Micro Recall Test:  0.9367533380182712
Confusion Matrix Train: 
[[ 1335   627     0]
 [   65 20713     0]
 [    0    18     4]]
Confusion Matrix Dev: 
[[ 106  166    0]
 [  34 2535    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  96  154    0]
 [  2

### Sentence Transformer Better PCA

In [56]:
# Load the "better" sentence-transformer features with PCA for the
# train / dev / test splits (rebinds train/dev/test from the previous section).
train, dev, test = load_sent_trans_better_pca()

In [57]:
# Grid-search C and kernel on the current `train` features.
grid_results = gridsearch.fit(train, train_labels)
# Winning hyperparameters and the matching estimator, used by the cells below.
best_params, svm = grid_results.best_params_, grid_results.best_estimator_

In [58]:
# Display the hyperparameters selected by the grid search above.
best_params

{'C': 1, 'kernel': 'rbf'}

In [59]:
# `svm` is grid_results.best_estimator_. GridSearchCV (refit=True, the default)
# has already refit it on the full training set with the best parameters, so
# calling svm.fit(train, train_labels) again would only repeat the same
# training. Persist the fitted model directly.
save_model(svm, "svm_better_pca.joblib")

In [60]:
# Predict class ids for the current train / dev / test features.
train_preds, dev_preds, test_preds = [
    svm.predict(features) for features in (train, dev, test)
]

In [61]:
# Print accuracy, F1 and recall (weighted / macro / micro) and the confusion
# matrices for all three splits. Only predictions are passed in; the gold
# labels are presumably looked up inside the helper - TODO confirm.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9546612775678763
Accuracy Dev:  0.9293000351741119
Accuracy Test:  0.9385101897399859
Weighted F1 Train:  0.9487966054803231
Weighted F1 Dev:  0.9183144016502238
Weighted F1 Test:  0.92705676635867
Macro F1 Train:  0.6515929615669237
Macro F1 Dev:  0.4917810557866642
Macro F1 Test:  0.496461013462327
Micro F1 Train:  0.9546612775678763
Micro F1 Dev:  0.9293000351741119
Micro F1 Test:  0.9385101897399859
Weighted Recall Train:  0.9546612775678763
Weighted Recall Dev:  0.9293000351741119
Weighted Recall Test:  0.9385101897399859
Macro Recall Train:  0.5685385175910176
Macro Recall Dev:  0.4578577343744037
Macro Recall Test:  0.456609847024039
Micro Recall Train:  0.9546612775678763
Micro Recall Dev:  0.9293000351741119
Micro Recall Test:  0.9385101897399859
Confusion Matrix Train: 
[[ 1036   926     0]
 [   88 20690     0]
 [    0    18     4]]
Confusion Matrix Dev: 
[[ 105  167    0]
 [  32 2537    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  94  156    0]
 [  16

## Only 2 Class Augmented Data Sentence Transformer Better

In [7]:
# Load features AND labels for the "sent_trans_augmented_no_pca" setup. This
# rebinds train_labels / dev_labels / test_labels, replacing the integer labels
# used by the sections above with fresh (string) labels mapped in the next cell.
# NOTE(review): the meaning of the second positional argument (True) is not
# visible here - presumably it selects the augmented data; confirm in the loader.
train, dev, test, train_labels, dev_labels, test_labels = load_only_2_class("sent_trans_augmented_no_pca", True)

In [8]:
# Map the class-name strings of every split to their integer ids.
# NOTE: the string labels are overwritten, so running this cell a second time
# raises KeyError (the integer ids are not keys of label_replacement).
to_class_id = label_replacement.__getitem__
train_labels = list(map(to_class_id, train_labels))
dev_labels = list(map(to_class_id, dev_labels))
test_labels = list(map(to_class_id, test_labels))

In [64]:
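# Grid search is disabled in this section: the already-fitted model is loaded
# from disk in a later cell instead of being searched for again.
# grid_results = gridsearch.fit(train, train_labels)
# best_params = grid_results.best_params_
# svm = grid_results.best_estimator_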

In [65]:
# best_params

{'C': 1, 'kernel': 'poly'}

In [6]:
# Reuse the previously saved model for this setup instead of retraining it.
# NOTE(review): load_model is presumably joblib/pickle based (".joblib" file);
# only load model files from a trusted source.
svm = load_model("svm_sent_trans_augmented_no_pca.joblib")

In [66]:
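# Training and saving are disabled here: the model stored as
# "svm_sent_trans_augmented_no_pca.joblib" is loaded with load_model instead.
# svm = svm.fit(train, train_labels)
# save_model(svm, "svm_sent_trans_augmented_no_pca.joblib")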

In [9]:
# Predict class ids for the current train / dev / test features with the loaded SVM.
train_preds, dev_preds, test_preds = [
    svm.predict(features) for features in (train, dev, test)
]

In [10]:
# Print accuracy, F1 and recall (weighted / macro / micro) and the confusion
# matrices for all three splits.
# NOTE(review): aug=True presumably makes the helper score against the labels
# of the augmented data set - confirm in the helper.
computeAllScores(train_preds, dev_preds, test_preds, aug=True)

Accuracy Train:  0.9897598036902459
Accuracy Dev:  0.9085473091804432
Accuracy Test:  0.9139142656359803
Weighted F1 Train:  0.9896126358869287
Weighted F1 Dev:  0.9119448461070339
Weighted F1 Test:  0.9180450838463163
Macro F1 Train:  0.7625364949429466
Macro F1 Dev:  0.5062863886703384
Macro F1 Test:  0.5091925620523928
Micro F1 Train:  0.9897598036902459
Micro F1 Dev:  0.9085473091804432
Micro F1 Test:  0.9139142656359803
Weighted Recall Train:  0.9897598036902459
Weighted Recall Dev:  0.9085473091804432
Weighted Recall Test:  0.9139142656359803
Macro Recall Train:  0.7206539847009079
Macro Recall Dev:  0.5225210656469672
Macro Recall Test:  0.5319460084843811
Micro Recall Train:  0.9897598036902459
Micro Recall Dev:  0.9085473091804432
Micro Recall Test:  0.9139142656359803
Confusion Matrix Train: 
[[21490    92     0]
 [  324 20454     0]
 [    1    17     4]]
Confusion Matrix Dev: 
[[ 171  101    0]
 [ 157 2412    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 164   86    0]
 [