In [22]:
# imports
# NOTE: the star imports stay in their original positions because their order
# decides which module wins when names collide.
from embeddings_loader import *
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from pathlib import Path
import numpy as np
import pandas as pd
from utils import *

In [23]:
# Load the labels for the training and dev splits.
# NOTE(review): load_labels is not imported by name; it presumably comes from
# one of the star imports (embeddings_loader / utils) — confirm.
train_labels, dev_labels = load_labels()

In [24]:
# Integer code assigned to each string class label.
label_replacement = dict(OFF=0, NOT=1)

In [25]:
# Encode the string labels as integers via the label_replacement mapping.
# A label missing from the mapping raises KeyError.
train_labels = list(map(label_replacement.__getitem__, train_labels))
dev_labels = list(map(label_replacement.__getitem__, dev_labels))

In [26]:
# Hyper-parameter combinations explored by the grid search.
perceptron_param_grid = {
    'eta0': [1e-4, 1e-3, 1e-2, 1e-1],
    'penalty': ['l1', 'l2'],
    'alpha': [0.0001, 0.05],
    'early_stopping': [True, False],
}

# Base estimator handed to the search, and the search object that is reused
# for every embedding type below. Candidates are ranked by macro-averaged F1.
perceptron = Perceptron(max_iter=1000)
gridsearch = GridSearchCV(
    estimator=perceptron,
    param_grid=perceptron_param_grid,
    scoring="f1_macro",
)

### GloVe Twitter 25

In [27]:
# Load the GloVe Twitter 25 features for the train, dev and test splits.
# NOTE(review): array shapes are not visible here; presumably one row per sample — confirm in the loader.
gt25_train, gt25_dev, gt25_test = load_glove_twitter_25()

In [28]:
# Replace NaN entries with 0 in every GloVe split (np.nan_to_num returns new arrays).
gt25_train, gt25_dev, gt25_test = (
    np.nan_to_num(split) for split in (gt25_train, gt25_dev, gt25_test)
)

In [29]:
# Run the grid search on the GloVe training features, then keep the winning
# parameter set and the corresponding estimator.
grid_results = gridsearch.fit(gt25_train, train_labels)
best_params, perceptron = grid_results.best_params_, grid_results.best_estimator_

In [30]:
# Display the hyper-parameters selected by the grid search for GloVe Twitter 25.
best_params

{'alpha': 0.05, 'early_stopping': True, 'eta0': 0.001, 'penalty': 'l2'}

In [31]:
# `perceptron` is the search's best_estimator_. GridSearchCV (refit=True by
# default) has already retrained it on the full training set, so a second
# fit() would only repeat that training. Persist the fitted model directly.
save_model(perceptron, "perceptron_gt25.joblib")

In [32]:
# Predict labels for the GloVe train, dev and test splits (in that order).
train_preds, dev_preds, test_preds = (
    perceptron.predict(features)
    for features in (gt25_train, gt25_dev, gt25_test)
)

In [33]:
# Print the evaluation metrics for the train, dev and test predictions.
# NOTE(review): only predictions are passed; computeAllScores presumably
# obtains the gold labels itself — confirm in utils.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.585630664652568
Accuracy Dev:  0.5981873111782477
Accuracy Test:  0.622093023255814
Weighted F1 Train:  0.5908412471086509
Weighted F1 Dev:  0.6003885835333231
Weighted F1 Test:  0.6404192619397714
Macro F1 Train:  0.5849045595792605
Macro F1 Dev:  0.5980439991759641
Macro F1 Test:  0.6097409769181683
Micro F1 Train:  0.585630664652568
Micro F1 Dev:  0.5981873111782477
Micro F1 Test:  0.622093023255814
Weighted Recall Train:  0.585630664652568
Weighted Recall Dev:  0.5981873111782477
Weighted Recall Test:  0.622093023255814
Macro Recall Train:  0.6469830197519814
Macro Recall Dev:  0.654844090446145
Macro Recall Test:  0.6753360215053763
Micro Recall Train:  0.585630664652568
Micro Recall Dev:  0.5981873111782477
Micro Recall Test:  0.622093023255814
Confusion Matrix Train: 
[[2880  605]
 [3784 3323]]
Confusion Matrix Dev: 
[[767 148]
 [916 817]]
Confusion Matrix Test: 
[[191  49]
 [276 344]]


### FastText 300 

In [34]:
# Load the FastText 300 features for the train, dev and test splits.
# NOTE(review): array shapes are not visible here; presumably one row per sample — confirm in the loader.
ft300_train, ft300_dev, ft300_test = load_fasttext_300()

In [35]:
# Replace NaN entries with 0 in every FastText split (np.nan_to_num returns new arrays).
ft300_train, ft300_dev, ft300_test = (
    np.nan_to_num(split) for split in (ft300_train, ft300_dev, ft300_test)
)

In [36]:
# Run the grid search on the FastText training features, then keep the winning
# parameter set and the corresponding estimator.
grid_results = gridsearch.fit(ft300_train, train_labels)
best_params, perceptron = grid_results.best_params_, grid_results.best_estimator_

In [37]:
# Display the hyper-parameters selected by the grid search for FastText 300.
best_params

{'alpha': 0.05, 'early_stopping': True, 'eta0': 0.0001, 'penalty': 'l2'}

In [38]:
# `perceptron` is the search's best_estimator_. GridSearchCV (refit=True by
# default) has already retrained it on the full training set, so a second
# fit() would only repeat that training. Persist the fitted model directly.
save_model(perceptron, "perceptron_ft300.joblib")

In [39]:
# Predict labels for the FastText train, dev and test splits (in that order).
train_preds, dev_preds, test_preds = (
    perceptron.predict(features)
    for features in (ft300_train, ft300_dev, ft300_test)
)

In [40]:
# Print the evaluation metrics for the train, dev and test predictions.
# NOTE(review): only predictions are passed; computeAllScores presumably
# obtains the gold labels itself — confirm in utils.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.7381986404833837
Accuracy Dev:  0.724320241691843
Accuracy Test:  0.7872093023255814
Weighted F1 Train:  0.7001254342558083
Weighted F1 Dev:  0.6871177002155713
Weighted F1 Test:  0.7567358517910278
Macro F1 Train:  0.632889377231782
Macro F1 Dev:  0.6290419770713154
Macro F1 Test:  0.6696688502637318
Micro F1 Train:  0.7381986404833837
Micro F1 Dev:  0.7243202416918428
Micro F1 Test:  0.7872093023255814
Weighted Recall Train:  0.7381986404833837
Weighted Recall Dev:  0.724320241691843
Weighted Recall Test:  0.7872093023255814
Macro Recall Train:  0.628547985204233
Macro Recall Dev:  0.6276597958623821
Macro Recall Test:  0.6506720430107527
Micro Recall Train:  0.7381986404833837
Micro Recall Dev:  0.724320241691843
Micro Recall Test:  0.7872093023255814
Confusion Matrix Train: 
[[1073 2412]
 [ 361 6746]]
Confusion Matrix Dev: 
[[ 288  627]
 [ 103 1630]]
Confusion Matrix Test: 
[[ 82 158]
 [ 25 595]]


### Word2Vec 300

In [41]:
# Load the Word2Vec 300 features for the train, dev and test splits.
# NOTE(review): array shapes are not visible here; presumably one row per sample — confirm in the loader.
w2v300_train, w2v300_dev, w2v300_test = load_word2vec_300()

In [42]:
# Replace NaN entries with 0 in every Word2Vec split (np.nan_to_num returns new arrays).
w2v300_train, w2v300_dev, w2v300_test = (
    np.nan_to_num(split) for split in (w2v300_train, w2v300_dev, w2v300_test)
)

In [43]:
# Run the grid search on the Word2Vec training features, then keep the winning
# parameter set and the corresponding estimator.
grid_results = gridsearch.fit(w2v300_train, train_labels)
best_params, perceptron = grid_results.best_params_, grid_results.best_estimator_

In [44]:
# Display the hyper-parameters selected by the grid search for Word2Vec 300.
best_params

{'alpha': 0.05, 'early_stopping': True, 'eta0': 0.0001, 'penalty': 'l2'}

In [45]:
# `perceptron` is the search's best_estimator_. GridSearchCV (refit=True by
# default) has already retrained it on the full training set, so a second
# fit() would only repeat that training. Persist the fitted model directly.
save_model(perceptron, "perceptron_w2v300.joblib")

In [46]:
# Predict labels for the Word2Vec train, dev and test splits (in that order).
train_preds, dev_preds, test_preds = (
    perceptron.predict(features)
    for features in (w2v300_train, w2v300_dev, w2v300_test)
)

In [47]:
# Print the evaluation metrics for the train, dev and test predictions.
# NOTE(review): only predictions are passed; computeAllScores presumably
# obtains the gold labels itself — confirm in utils.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.7263972809667674
Accuracy Dev:  0.7095921450151057
Accuracy Test:  0.7848837209302325
Weighted F1 Train:  0.6723591195145725
Weighted F1 Dev:  0.6536572679809516
Weighted F1 Test:  0.7518819846038707
Macro F1 Train:  0.5924708661350419
Macro F1 Dev:  0.5824990573838471
Macro F1 Test:  0.6616614104742482
Micro F1 Train:  0.7263972809667674
Micro F1 Dev:  0.7095921450151056
Micro F1 Test:  0.7848837209302325
Weighted Recall Train:  0.7263972809667674
Weighted Recall Dev:  0.7095921450151057
Weighted Recall Test:  0.7848837209302325
Macro Recall Train:  0.6005966998810355
Macro Recall Dev:  0.5960310778554514
Macro Recall Test:  0.6439516129032258
Micro Recall Train:  0.7263972809667674
Micro Recall Dev:  0.7095921450151057
Micro Recall Test:  0.7848837209302325
Confusion Matrix Train: 
[[ 811 2674]
 [ 224 6883]]
Confusion Matrix Dev: 
[[ 209  706]
 [  63 1670]]
Confusion Matrix Test: 
[[ 78 162]
 [ 23 597]]


### Sentence Transformer

In [48]:
# Load the sentence-transformer features for the train, dev and test splits.
# Unlike the word-embedding sections, no NaN replacement is applied afterwards.
# NOTE(review): presumably these embeddings never contain NaN — confirm in the loader.
train, dev, test = load_sent_trans()

In [49]:
# Run the grid search on the sentence-transformer training features, then keep
# the winning parameter set and the corresponding estimator.
grid_results = gridsearch.fit(train, train_labels)
best_params, perceptron = grid_results.best_params_, grid_results.best_estimator_

In [50]:
# Display the hyper-parameters selected by the grid search for the sentence-transformer features.
best_params

{'alpha': 0.0001, 'early_stopping': True, 'eta0': 0.0001, 'penalty': 'l2'}

In [51]:
# `perceptron` is the search's best_estimator_. GridSearchCV (refit=True by
# default) has already retrained it on the full training set, so a second
# fit() would only repeat that training. Persist the fitted model directly.
# NOTE(review): the file name does not follow the perceptron_<embedding>.joblib
# pattern used by the other sections; it is kept as-is because other code may
# load the model by this name.
save_model(perceptron, "perceptron_better_no_pca.joblib")

In [52]:
# Predict labels for the sentence-transformer train, dev and test splits (in that order).
train_preds, dev_preds, test_preds = (
    perceptron.predict(features) for features in (train, dev, test)
)

In [53]:
# Print the evaluation metrics for the train, dev and test predictions.
# NOTE(review): only predictions are passed; computeAllScores presumably
# obtains the gold labels itself — confirm in utils.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.7333836858006042
Accuracy Dev:  0.7163897280966768
Accuracy Test:  0.7883720930232558
Weighted F1 Train:  0.6787661460722013
Weighted F1 Dev:  0.6636018076047253
Weighted F1 Test:  0.7492491158447048
Macro F1 Train:  0.5996362545136912
Macro F1 Dev:  0.5951699233536694
Macro F1 Test:  0.6539495560507977
Micro F1 Train:  0.7333836858006041
Micro F1 Dev:  0.7163897280966769
Micro F1 Test:  0.7883720930232558
Weighted Recall Train:  0.7333836858006042
Weighted Recall Dev:  0.7163897280966768
Weighted Recall Test:  0.7883720930232558
Macro Recall Train:  0.6066802608780439
Macro Recall Dev:  0.6050933502344398
Macro Recall Test:  0.6374327956989247
Micro Recall Train:  0.7333836858006042
Micro Recall Dev:  0.7163897280966768
Micro Recall Test:  0.7883720930232558
Confusion Matrix Train: 
[[ 823 2662]
 [ 162 6945]]
Confusion Matrix Dev: 
[[ 224  691]
 [  60 1673]]
Confusion Matrix Test: 
[[ 71 169]
 [ 13 607]]


In [54]:
# Build the submission table from the most recent test predictions:
# a running row id plus the predicted (integer-encoded) label.
df = pd.DataFrame({
    "id": range(len(test_preds)),
    "label": test_preds,
})

In [55]:
# Preview the first rows of the submission table.
df.head()

Unnamed: 0,id,label
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [56]:
# Write the submission table to disk without the DataFrame index.
# The output directory is created first, because to_csv raises OSError if the
# parent directory is missing (e.g. on a fresh checkout).
results_path = Path("../Results/Perceptron_SBERT.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(results_path, index=False)