In [1]:
from embeddings_loader import *
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from utils import *

--ip=127.0.0.1


In [2]:
# Fetch the label sequences for the three data splits.
(
    train_labels,
    dev_labels,
    test_labels,
) = load_labels()

In [3]:
# Integer code for each string label; the position in the tuple is the code.
label_replacement = {
    name: code
    for code, name in enumerate(('Hope_speech', 'Non_hope_speech', 'not-English'))
}

In [4]:
# Replace string labels with their integer codes.
# Values that are already ints are passed through unchanged, so re-running this
# cell after the labels were encoded no longer raises KeyError. Unknown string
# labels still raise KeyError, exactly as before.
train_labels = [
    label if isinstance(label, int) else label_replacement[label]
    for label in train_labels
]
dev_labels = [
    label if isinstance(label, int) else label_replacement[label]
    for label in dev_labels
]
test_labels = [
    label if isinstance(label, int) else label_replacement[label]
    for label in test_labels
]

In [5]:
# Base estimator; every tuned hyper-parameter is supplied by the grid below.
perceptron = Perceptron()

# Hyper-parameter search shared by all embedding sections of this notebook.
# The grid is split in two because `alpha` only scales the regularisation term:
# with penalty=None it has no effect, so crossing it with the five alpha values
# fitted each unpenalised model five times. When an unpenalised model wins,
# best_params_ therefore contains no "alpha" key.
# NOTE(review): for single-label multiclass data micro-F1 equals accuracy, so on
# this heavily imbalanced label set the search favours majority-class
# predictors; consider scoring="f1_macro".
gridsearch = GridSearchCV(
    perceptron,
    param_grid=[
        {
            "penalty": ["l2", "l1"],
            "alpha": [0.0001, 0.001, 0.01, 0.1, 1.0],
            "eta0": [0.0001, 0.001, 0.01, 0.1, 1.0],
            "max_iter": [100, 1000, 10000],
        },
        {
            "penalty": [None],
            "eta0": [0.0001, 0.001, 0.01, 0.1, 1.0],
            "max_iter": [100, 1000, 10000],
        },
    ],
    scoring="f1_micro",
)

### Glove Twitter 25

In [6]:
# Load the GloVe Twitter 25 features for the train/dev/test splits.
(
    gt25_train,
    gt25_dev,
    gt25_test,
) = load_glove_twitter_25()

In [7]:
# Replace NaN entries with 0 in every split (nan_to_num also maps +/-inf to
# large finite values).
gt25_train, gt25_dev, gt25_test = (
    np.nan_to_num(split) for split in (gt25_train, gt25_dev, gt25_test)
)

In [8]:
# Run the cross-validated hyper-parameter search on the GloVe Twitter 25
# training features.
grid_results = gridsearch.fit(gt25_train, train_labels)
# Keep the winning estimator and the parameter combination that produced it.
perceptron, best_params = grid_results.best_estimator_, grid_results.best_params_

In [9]:
# Display the hyper-parameters selected by the grid search.
best_params

{'alpha': 0.01, 'eta0': 0.1, 'max_iter': 100, 'penalty': 'l2'}

In [10]:
# `perceptron` is grid_results.best_estimator_, which GridSearchCV (default
# refit=True) has already refit on the full training split with the best
# parameters, so the extra fit() call was redundant and is dropped.
save_model(perceptron, "perceptron_gt25.joblib")

In [11]:
# Predict labels for every split with the selected perceptron.
train_preds, dev_preds, test_preds = map(
    perceptron.predict, (gt25_train, gt25_dev, gt25_test)
)

In [12]:
# Print accuracy, F1, recall and confusion matrices for the three splits.
# NOTE(review): no ground-truth labels are passed in, so computeAllScores
# presumably loads them itself -- confirm they use the same encoding.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9132325806168176
Accuracy Dev:  0.9050298979950756
Accuracy Test:  0.9132115249472944
Weighted F1 Train:  0.8744351101760547
Weighted F1 Dev:  0.8619334282377243
Weighted F1 Test:  0.8744332920127035
Macro F1 Train:  0.33004307939745064
Macro F1 Dev:  0.3286755723057514
Macro F1 Test:  0.3362613040174669
Micro F1 Train:  0.9132325806168176
Micro F1 Dev:  0.9050298979950756
Micro F1 Test:  0.9132115249472944
Weighted Recall Train:  0.9132325806168176
Weighted Recall Dev:  0.9050298979950756
Weighted Recall Test:  0.9132115249472944
Macro Recall Train:  0.3390163908368149
Macro Recall Dev:  0.3393310321403766
Macro Recall Test:  0.3425381154390024
Micro Recall Train:  0.9132325806168176
Micro Recall Dev:  0.9050298979950756
Micro Recall Test:  0.9132115249472944
Confusion Matrix Train: 
[[   36  1926     0]
 [   27 20751     0]
 [    0    22     0]]
Confusion Matrix Dev: 
[[   5  267    0]
 [   1 2568    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[   7  243    0]
 

### FastText 300 

In [13]:
# Load the FastText 300 features for the train/dev/test splits.
(
    ft300_train,
    ft300_dev,
    ft300_test,
) = load_fasttext_300()

In [14]:
# Replace NaN entries with 0 in every split (nan_to_num also maps +/-inf to
# large finite values).
ft300_train, ft300_dev, ft300_test = (
    np.nan_to_num(split) for split in (ft300_train, ft300_dev, ft300_test)
)

In [15]:
# Run the cross-validated hyper-parameter search on the FastText 300 training
# features.
grid_results = gridsearch.fit(ft300_train, train_labels)
# Keep the winning estimator and the parameter combination that produced it.
perceptron, best_params = grid_results.best_estimator_, grid_results.best_params_

In [16]:
# Display the hyper-parameters selected by the grid search.
best_params

{'alpha': 0.001, 'eta0': 0.0001, 'max_iter': 100, 'penalty': 'l1'}

In [17]:
# `perceptron` is grid_results.best_estimator_, which GridSearchCV (default
# refit=True) has already refit on the full training split with the best
# parameters, so the extra fit() call was redundant and is dropped.
save_model(perceptron, "perceptron_ft300.joblib")

In [18]:
# Predict labels for every split with the selected perceptron.
train_preds, dev_preds, test_preds = map(
    perceptron.predict, (ft300_train, ft300_dev, ft300_test)
)

In [19]:
# Print accuracy, F1, recall and confusion matrices for the three splits.
# NOTE(review): no ground-truth labels are passed in, so computeAllScores
# presumably loads them itself -- confirm they use the same encoding.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9128371847816537
Accuracy Dev:  0.9036229335209286
Accuracy Test:  0.9111033028812369
Weighted F1 Train:  0.8712416640051998
Weighted F1 Dev:  0.85787410059692
Weighted F1 Test:  0.8687225094212346
Macro F1 Train:  0.31814423518603585
Macro F1 Dev:  0.31645725548164577
Macro F1 Test:  0.31782803211374644
Micro F1 Train:  0.9128371847816537
Micro F1 Dev:  0.9036229335209286
Micro F1 Test:  0.9111033028812369
Weighted Recall Train:  0.9128371847816537
Weighted Recall Dev:  0.9036229335209286
Weighted Recall Test:  0.9111033028812369
Macro Recall Train:  0.3333333333333333
Macro Recall Dev:  0.3333333333333333
Macro Recall Test:  0.3333333333333333
Micro Recall Train:  0.9128371847816537
Micro Recall Dev:  0.9036229335209286
Micro Recall Test:  0.9111033028812369
Confusion Matrix Train: 
[[    0  1962     0]
 [    0 20778     0]
 [    0    22     0]]
Confusion Matrix Dev: 
[[   0  272    0]
 [   0 2569    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[   0  250    0]
 

### Word2Vec 300

In [20]:
# Load the Word2Vec 300 features for the train/dev/test splits.
(
    w2v300_train,
    w2v300_dev,
    w2v300_test,
) = load_word2vec_300()

In [21]:
# Replace NaN entries with 0 in every split (nan_to_num also maps +/-inf to
# large finite values).
w2v300_train, w2v300_dev, w2v300_test = (
    np.nan_to_num(split) for split in (w2v300_train, w2v300_dev, w2v300_test)
)

In [22]:
# Run the cross-validated hyper-parameter search on the Word2Vec 300 training
# features.
grid_results = gridsearch.fit(w2v300_train, train_labels)
# Keep the winning estimator and the parameter combination that produced it.
perceptron, best_params = grid_results.best_estimator_, grid_results.best_params_

In [23]:
# Display the hyper-parameters selected by the grid search.
best_params

{'alpha': 0.01, 'eta0': 1.0, 'max_iter': 100, 'penalty': 'l2'}

In [24]:
# `perceptron` is grid_results.best_estimator_, which GridSearchCV (default
# refit=True) has already refit on the full training split with the best
# parameters, so the extra fit() call was redundant and is dropped.
save_model(perceptron, "perceptron_w2v300.joblib")

In [25]:
# Predict labels for every split with the selected perceptron.
train_preds, dev_preds, test_preds = map(
    perceptron.predict, (w2v300_train, w2v300_dev, w2v300_test)
)

In [26]:
# Print accuracy, F1, recall and confusion matrices for the three splits.
# NOTE(review): no ground-truth labels are passed in, so computeAllScores
# presumably loads them itself -- confirm they use the same encoding.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9126175204287849
Accuracy Dev:  0.9032711924023918
Accuracy Test:  0.9111033028812369
Weighted F1 Train:  0.871132047293869
Weighted F1 Dev:  0.8576986484131379
Weighted F1 Test:  0.8687225094212346
Macro F1 Train:  0.3181042073427511
Macro F1 Dev:  0.31639253372759196
Macro F1 Test:  0.31782803211374644
Micro F1 Train:  0.9126175204287849
Micro F1 Dev:  0.9032711924023918
Micro F1 Test:  0.9111033028812369
Weighted Recall Train:  0.9126175204287849
Weighted Recall Dev:  0.9032711924023918
Weighted Recall Test:  0.9111033028812369
Macro Recall Train:  0.33325312028748355
Macro Recall Dev:  0.33320358115998444
Macro Recall Test:  0.3333333333333333
Micro Recall Train:  0.9126175204287849
Micro Recall Dev:  0.9032711924023918
Micro Recall Test:  0.9111033028812369
Confusion Matrix Train: 
[[    0  1962     0]
 [    5 20773     0]
 [    0    22     0]]
Confusion Matrix Dev: 
[[   0  272    0]
 [   1 2568    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[   0  250    0]

### TF-IDF PCA (1000 Dims)

In [27]:
# Load the PCA-reduced TF-IDF features for the train/dev/test splits.
(
    tfidf_pca_train,
    tfidf_pca_dev,
    tfidf_pca_test,
) = load_tfidf_pca()

In [28]:
# Run the cross-validated hyper-parameter search on the TF-IDF PCA training
# features.
grid_results = gridsearch.fit(tfidf_pca_train, train_labels)
# Keep the winning estimator and the parameter combination that produced it.
perceptron, best_params = grid_results.best_estimator_, grid_results.best_params_

In [29]:
# Display the hyper-parameters selected by the grid search.
best_params

{'alpha': 0.0001, 'eta0': 0.1, 'max_iter': 100, 'penalty': 'l2'}

In [30]:
# `perceptron` is grid_results.best_estimator_, which GridSearchCV (default
# refit=True) has already refit on the full training split with the best
# parameters, so the extra fit() call was redundant and is dropped.
save_model(perceptron, "perceptron_tfidf_pca.joblib")

In [31]:
# Predict labels for every split with the selected perceptron.
train_preds, dev_preds, test_preds = map(
    perceptron.predict, (tfidf_pca_train, tfidf_pca_dev, tfidf_pca_test)
)

In [32]:
# Print accuracy, F1, recall and confusion matrices for the three splits.
# NOTE(review): no ground-truth labels are passed in, so computeAllScores
# presumably loads them itself -- confirm they use the same encoding.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9260170459537826
Accuracy Dev:  0.9060851213506859
Accuracy Test:  0.9114546732255797
Weighted F1 Train:  0.9211818014286257
Weighted F1 Dev:  0.9006572083433716
Weighted F1 Test:  0.9062913943906947
Macro F1 Train:  0.495687024158139
Macro F1 Dev:  0.46626192408040185
Macro F1 Test:  0.4625893576809945
Micro F1 Train:  0.9260170459537826
Micro F1 Dev:  0.9060851213506859
Micro F1 Test:  0.9114546732255797
Weighted Recall Train:  0.9260170459537826
Weighted Recall Dev:  0.9060851213506859
Weighted Recall Test:  0.9114546732255797
Macro Recall Train:  0.48144035138924757
Macro Recall Dev:  0.4492940909333761
Macro Recall Test:  0.44430183828255565
Micro Recall Train:  0.9260170459537826
Micro Recall Dev:  0.9060851213506859
Micro Recall Test:  0.9114546732255797
Confusion Matrix Train: 
[[  833  1090    39]
 [  471 20244    63]
 [    0    21     1]]
Confusion Matrix Dev: 
[[ 105  165    2]
 [  90 2471    8]
 [   0    2    0]]
Confusion Matrix Test: 
[[  92  154    4]


### Sentence Transformer Faster No PCA

In [33]:
# Load the features returned by load_sent_trans_fast_no_pca for the three
# splits. The generic names train/dev/test are reused by later sections.
(
    train,
    dev,
    test,
) = load_sent_trans_fast_no_pca()

In [34]:
# Run the cross-validated hyper-parameter search on the current training
# features.
grid_results = gridsearch.fit(train, train_labels)
# Keep the winning estimator and the parameter combination that produced it.
perceptron, best_params = grid_results.best_estimator_, grid_results.best_params_

In [35]:
# Display the hyper-parameters selected by the grid search.
best_params

{'alpha': 0.0001, 'eta0': 0.0001, 'max_iter': 100, 'penalty': 'l1'}

In [36]:
# `perceptron` is grid_results.best_estimator_, which GridSearchCV (default
# refit=True) has already refit on the full training split with the best
# parameters, so the extra fit() call was redundant and is dropped.
save_model(perceptron, "perceptron_faster_no_pca.joblib")

In [37]:
# Predict labels for every split with the selected perceptron.
train_preds, dev_preds, test_preds = map(perceptron.predict, (train, dev, test))

In [38]:
# Print accuracy, F1, recall and confusion matrices for the three splits.
# NOTE(review): no ground-truth labels are passed in, so computeAllScores
# presumably loads them itself -- confirm they use the same encoding.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9169229417450137
Accuracy Dev:  0.9074920858248329
Accuracy Test:  0.9167252283907238
Weighted F1 Train:  0.882225042636173
Weighted F1 Dev:  0.8688417753470915
Weighted F1 Test:  0.885613807882912
Macro F1 Train:  0.3542029696889917
Macro F1 Dev:  0.34937899054295435
Macro F1 Test:  0.3734899506114149
Micro F1 Train:  0.9169229417450135
Micro F1 Dev:  0.9074920858248328
Micro F1 Test:  0.9167252283907238
Weighted Recall Train:  0.9169229417450137
Weighted Recall Dev:  0.9074920858248329
Weighted Recall Test:  0.9167252283907238
Macro Recall Train:  0.3519028742174077
Macro Recall Dev:  0.35010093955838467
Macro Recall Test:  0.36310014140635044
Micro Recall Train:  0.9169229417450137
Micro Recall Dev:  0.9074920858248329
Micro Recall Test:  0.9167252283907238
Confusion Matrix Train: 
[[  111  1851     0]
 [   18 20760     0]
 [    0    22     0]]
Confusion Matrix Dev: 
[[  14  258    0]
 [   3 2566    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  23  227    0]
 

### Sentence Transformer Faster PCA

In [39]:
# Load the features returned by load_sent_trans_fast_pca for the three splits.
# This overwrites the train/dev/test features of the previous section.
(
    train,
    dev,
    test,
) = load_sent_trans_fast_pca()

In [40]:
# Run the cross-validated hyper-parameter search on the current training
# features.
grid_results = gridsearch.fit(train, train_labels)
# Keep the winning estimator and the parameter combination that produced it.
perceptron, best_params = grid_results.best_estimator_, grid_results.best_params_

In [41]:
# Display the hyper-parameters selected by the grid search.
best_params

{'alpha': 0.0001, 'eta0': 0.0001, 'max_iter': 100, 'penalty': 'l1'}

In [42]:
# `perceptron` is grid_results.best_estimator_, which GridSearchCV (default
# refit=True) has already refit on the full training split with the best
# parameters, so the extra fit() call was redundant and is dropped.
save_model(perceptron, "perceptron_faster_pca.joblib")

In [43]:
# Predict labels for every split with the selected perceptron.
train_preds, dev_preds, test_preds = map(perceptron.predict, (train, dev, test))

In [44]:
# Print accuracy, F1, recall and confusion matrices for the three splits.
# NOTE(review): no ground-truth labels are passed in, so computeAllScores
# presumably loads them itself -- confirm they use the same encoding.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9221509533432914
Accuracy Dev:  0.9124164614843475
Accuracy Test:  0.9223471539002108
Weighted F1 Train:  0.8983445474134106
Weighted F1 Dev:  0.8858697352656212
Weighted F1 Test:  0.9014003653283681
Macro F1 Train:  0.4077915563624817
Macro F1 Dev:  0.40242015423112165
Macro F1 Test:  0.4244990220535703
Micro F1 Train:  0.9221509533432914
Micro F1 Dev:  0.9124164614843475
Micro F1 Test:  0.9223471539002108
Weighted Recall Train:  0.9221509533432914
Weighted Recall Dev:  0.9124164614843475
Weighted Recall Test:  0.9223471539002108
Macro Recall Train:  0.38504391210455585
Macro Recall Dev:  0.38259813462169606
Macro Recall Test:  0.39768607790204397
Micro Recall Train:  0.9221509533432914
Micro Recall Dev:  0.9124164614843475
Micro Recall Test:  0.9223471539002108
Confusion Matrix Train: 
[[  314  1636    12]
 [  101 20676     1]
 [    0    22     0]]
Confusion Matrix Dev: 
[[  42  230    0]
 [  17 2552    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  50  199    1

### Sentence Transformer Better No PCA

In [45]:
# Load the features returned by load_sent_trans_better_no_pca for the three
# splits. This overwrites the train/dev/test features of the previous section.
(
    train,
    dev,
    test,
) = load_sent_trans_better_no_pca()

In [46]:
# Run the cross-validated hyper-parameter search on the current training
# features.
grid_results = gridsearch.fit(train, train_labels)
# Keep the winning estimator and the parameter combination that produced it.
perceptron, best_params = grid_results.best_estimator_, grid_results.best_params_

In [47]:
# Display the hyper-parameters selected by the grid search.
best_params

{'alpha': 0.0001, 'eta0': 0.001, 'max_iter': 100, 'penalty': 'l2'}

In [48]:
# `perceptron` is grid_results.best_estimator_, which GridSearchCV (default
# refit=True) has already refit on the full training split with the best
# parameters, so the extra fit() call was redundant and is dropped.
save_model(perceptron, "perceptron_better_no_pca.joblib")

In [49]:
# Predict labels for every split with the selected perceptron.
train_preds, dev_preds, test_preds = map(perceptron.predict, (train, dev, test))

In [50]:
# Print accuracy, F1, recall and confusion matrices for the three splits.
# NOTE(review): no ground-truth labels are passed in, so computeAllScores
# presumably loads them itself -- confirm they use the same encoding.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9280379580001757
Accuracy Dev:  0.9215617305663032
Accuracy Test:  0.9244553759662685
Weighted F1 Train:  0.9215964199546101
Weighted F1 Dev:  0.9146653507073376
Weighted F1 Test:  0.9203758493574914
Macro F1 Train:  0.49062131325026287
Macro F1 Dev:  0.491946364227066
Macro F1 Test:  0.4961027624687639
Micro F1 Train:  0.9280379580001757
Micro F1 Dev:  0.9215617305663032
Micro F1 Test:  0.9244553759662685
Weighted Recall Train:  0.9280379580001757
Weighted Recall Dev:  0.9215617305663032
Weighted Recall Test:  0.9244553759662685
Macro Recall Train:  0.4682736553178208
Macro Recall Dev:  0.47034351887894116
Macro Recall Test:  0.4803825684535287
Micro Recall Train:  0.9280379580001757
Micro Recall Dev:  0.9215617305663032
Micro Recall Test:  0.9244553759662685
Confusion Matrix Train: 
[[  841  1112     9]
 [  492 20283     3]
 [    0    22     0]]
Confusion Matrix Dev: 
[[ 119  153    0]
 [  68 2501    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 118  131    1]
 

### Sentence Transformer Better PCA

In [51]:
# Load the features returned by load_sent_trans_better_pca for the three
# splits. This overwrites the train/dev/test features of the previous section.
(
    train,
    dev,
    test,
) = load_sent_trans_better_pca()

In [52]:
# Run the cross-validated hyper-parameter search on the current training
# features.
grid_results = gridsearch.fit(train, train_labels)
# Keep the winning estimator and the parameter combination that produced it.
perceptron, best_params = grid_results.best_estimator_, grid_results.best_params_

In [53]:
# Display the hyper-parameters selected by the grid search.
best_params

{'alpha': 0.0001, 'eta0': 0.001, 'max_iter': 100, 'penalty': 'l2'}

In [54]:
# `perceptron` is grid_results.best_estimator_, which GridSearchCV (default
# refit=True) has already refit on the full training split with the best
# parameters, so the extra fit() call was redundant and is dropped.
save_model(perceptron, "perceptron_better_pca.joblib")

In [55]:
# Predict labels for every split with the selected perceptron.
train_preds, dev_preds, test_preds = map(perceptron.predict, (train, dev, test))

In [56]:
# Print accuracy, F1, recall and confusion matrices for the three splits.
# NOTE(review): no ground-truth labels are passed in, so computeAllScores
# presumably loads them itself -- confirm they use the same encoding.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.8921008698708374
Accuracy Dev:  0.8793527963418923
Accuracy Test:  0.8903724525650035
Weighted F1 Train:  0.9022027060688871
Weighted F1 Dev:  0.8894659086974668
Weighted F1 Test:  0.9018371137612198
Macro F1 Train:  0.5393647762286241
Macro F1 Dev:  0.4762443582331762
Macro F1 Test:  0.4907040375334903
Micro F1 Train:  0.8921008698708374
Micro F1 Dev:  0.8793527963418923
Micro F1 Test:  0.8903724525650035
Weighted Recall Train:  0.8921008698708374
Weighted Recall Dev:  0.8793527963418923
Weighted Recall Test:  0.8903724525650035
Macro Recall Train:  0.5983692163729094
Macro Recall Dev:  0.5073686831680901
Macro Recall Test:  0.5353808972875691
Micro Recall Train:  0.8921008698708374
Micro Recall Dev:  0.8793527963418923
Micro Recall Test:  0.8903724525650035
Confusion Matrix Train: 
[[ 1280   672    10]
 [ 1733 19021    24]
 [    0    17     5]]
Confusion Matrix Dev: 
[[ 167  104    1]
 [ 233 2333    3]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 174   74    2]
 [

## Only 2 Class Augmented Data Sentence Transformer Better

In [6]:
# Load features and labels via load_only_2_class. This overwrites both the
# train/dev/test features and the label lists used by the earlier sections.
(
    train,
    dev,
    test,
    train_labels,
    dev_labels,
    test_labels,
) = load_only_2_class("sent_trans_augmented_no_pca", True)

In [7]:
# Replace string labels with their integer codes.
# Values that are already ints are passed through unchanged, so re-running this
# cell after the labels were encoded no longer raises KeyError. Unknown string
# labels still raise KeyError, exactly as before.
train_labels = [
    label if isinstance(label, int) else label_replacement[label]
    for label in train_labels
]
dev_labels = [
    label if isinstance(label, int) else label_replacement[label]
    for label in dev_labels
]
test_labels = [
    label if isinstance(label, int) else label_replacement[label]
    for label in test_labels
]

In [8]:
# Run the cross-validated hyper-parameter search on the current training
# features.
grid_results = gridsearch.fit(train, train_labels)
# Keep the winning estimator and the parameter combination that produced it.
perceptron, best_params = grid_results.best_estimator_, grid_results.best_params_

In [9]:
# Display the hyper-parameters selected by the grid search.
best_params

{'alpha': 0.0001, 'eta0': 0.001, 'max_iter': 100, 'penalty': 'l2'}

In [10]:
# Optional: reload the previously saved model instead of re-running the grid search.
# perceptron = load_model("perceptron_sent_trans_augmented_no_pca.joblib")

In [11]:
# `perceptron` is grid_results.best_estimator_, which GridSearchCV (default
# refit=True) has already refit on the full training split with the best
# parameters, so the extra fit() call was redundant and is dropped.
save_model(perceptron, "perceptron_sent_trans_augmented_no_pca.joblib")

In [12]:
# Predict labels for every split with the selected perceptron.
train_preds, dev_preds, test_preds = map(perceptron.predict, (train, dev, test))

In [13]:
# Print accuracy, F1, recall and confusion matrices for the three splits.
# NOTE(review): no ground-truth labels are passed in; aug=True presumably makes
# computeAllScores use the augmented label set -- confirm in utils.
computeAllScores(train_preds, dev_preds, test_preds, aug=True)

Accuracy Train:  0.8696616488131754
Accuracy Dev:  0.7942314456559972
Accuracy Test:  0.7856640899508082
Weighted F1 Train:  0.869626865029039
Weighted F1 Dev:  0.8293696731261618
Weighted F1 Test:  0.8259145649125239
Macro F1 Train:  0.6233691899550546
Macro F1 Dev:  0.42151561526827547
Macro F1 Test:  0.41278390939380527
Micro F1 Train:  0.8696616488131754
Micro F1 Dev:  0.7942314456559972
Micro F1 Test:  0.7856640899508082
Weighted Recall Train:  0.8696616488131754
Weighted Recall Dev:  0.7942314456559972
Weighted Recall Test:  0.7856640899508082
Macro Recall Train:  0.6702302488609636
Macro Recall Dev:  0.4945962036040574
Macro Recall Test:  0.4982774135492994
Micro Recall Train:  0.8696616488131754
Micro Recall Dev:  0.7942314456559972
Micro Recall Test:  0.7856640899508082
Confusion Matrix Train: 
[[19880  1676    26]
 [ 3768 16972    38]
 [    2    14     6]]
Confusion Matrix Dev: 
[[ 184   86    2]
 [ 493 2074    2]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 175   75    0]
 