In [22]:
# imports
from embeddings_loader import *
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
import numpy as np
from utils import *

--ip=127.0.0.1


In [3]:
train_labels, dev_labels, _ = load_labels()

In [4]:
label_replacement = {
    'NOT': 0,
    'OFF': 1,
}

In [5]:
# Replace labels with numbers
train_labels = [label_replacement[label] for label in train_labels]
dev_labels = [label_replacement[label] for label in dev_labels]
# test_labels = [label_replacement[label] for label in test_labels]

In [16]:
perceptron = Perceptron(max_iter=1000)
gridsearch = GridSearchCV(perceptron, param_grid = {
	'eta0': [1e-4, 1e-3, 1e-2, 1e-1],
    'penalty': ['l1', 'l2'],
	'alpha': [0.0001, 0.05],
    'early_stopping': [True, False]
}, scoring = "f1_macro")

### Glove Twitter 25

In [17]:
gt25_train, gt25_dev, gt25_test = load_glove_twitter_25()

In [18]:
# Set all NaN values to 0
gt25_train = np.nan_to_num(gt25_train)
gt25_dev = np.nan_to_num(gt25_dev)
gt25_test = np.nan_to_num(gt25_test)

In [19]:
grid_results = gridsearch.fit(gt25_train, train_labels)
best_params = grid_results.best_params_
perceptron = grid_results.best_estimator_

In [20]:
best_params

{'alpha': 0.05, 'early_stopping': True, 'eta0': 0.001, 'penalty': 'l2'}

In [24]:
perceptron = perceptron.fit(gt25_train, train_labels)
save_model(perceptron, "perceptron_gt25.joblib")

In [25]:
train_preds = perceptron.predict(gt25_train)
dev_preds = perceptron.predict(gt25_dev)
test_preds = perceptron.predict(gt25_test)

In [26]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.6642749244712991
Accuracy Dev:  0.6620090634441088
Accuracy Test:  0.7093023255813954
Weighted F1 Train:  0.6648779590343605
Weighted F1 Dev:  0.6604994820482796
Weighted F1 Test:  0.7193010097372466
Macro F1 Train:  0.6211941352861997
Macro F1 Dev:  0.6230801613880299
Macro F1 Test:  0.6665301766924756
Micro F1 Train:  0.6642749244712991
Micro F1 Dev:  0.6620090634441088
Micro F1 Test:  0.7093023255813953
Weighted Recall Train:  0.6642749244712991
Weighted Recall Dev:  0.6620090634441088
Weighted Recall Test:  0.7093023255813954
Macro Recall Train:  0.6216467729696045
Macro Recall Dev:  0.6220972507323288
Macro Recall Test:  0.684744623655914
Micro Recall Train:  0.6642749244712991
Micro Recall Dev:  0.6620090634441088
Micro Recall Test:  0.7093023255813954
Confusion Matrix Train: 
[[5304 1803]
 [1753 1732]]
Confusion Matrix Dev: 
[[1302  431]
 [ 464  451]]
Confusion Matrix Test: 
[[459 161]
 [ 89 151]]


### FastText 300 

In [27]:
ft300_train, ft300_dev, ft300_test = load_fasttext_300()

In [28]:
# Set all NaN values to 0
ft300_train = np.nan_to_num(ft300_train)
ft300_dev = np.nan_to_num(ft300_dev)
ft300_test = np.nan_to_num(ft300_test)

In [33]:
grid_results = gridsearch.fit(ft300_train, train_labels)
best_params = grid_results.best_params_
perceptron = grid_results.best_estimator_

In [34]:
best_params

{'alpha': 0.0001, 'early_stopping': True, 'eta0': 0.01, 'penalty': 'l2'}

In [35]:
perceptron = perceptron.fit(ft300_train, train_labels)
save_model(perceptron, "perceptron_ft300.joblib")

In [36]:
train_preds = perceptron.predict(ft300_train)
dev_preds = perceptron.predict(ft300_dev)
test_preds = perceptron.predict(ft300_test)

In [37]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.6876888217522659
Accuracy Dev:  0.6714501510574018
Accuracy Test:  0.7430232558139535
Weighted F1 Train:  0.5772865318196864
Weighted F1 Dev:  0.5567151908973788
Weighted F1 Test:  0.6577642270906181
Macro F1 Train:  0.4558171836617875
Macro F1 Dev:  0.44833967425661775
Macro F1 Test:  0.5070738507423977
Micro F1 Train:  0.6876888217522659
Micro F1 Dev:  0.6714501510574018
Micro F1 Test:  0.7430232558139535
Weighted Recall Train:  0.6876888217522659
Weighted Recall Dev:  0.6714501510574018
Weighted Recall Test:  0.7430232558139535
Macro Recall Train:  0.525979498863347
Macro Recall Dev:  0.525106026064281
Macro Recall Test:  0.5434139784946236
Micro Recall Train:  0.6876888217522659
Micro Recall Dev:  0.6714501510574018
Micro Recall Test:  0.7430232558139535
Confusion Matrix Train: 
[[7099    8]
 [3300  185]]
Confusion Matrix Dev: 
[[1731    2]
 [ 868   47]]
Confusion Matrix Test: 
[[617   3]
 [218  22]]


### Word2Vec 300

In [38]:
w2v300_train, w2v300_dev, w2v300_test = load_word2vec_300()

In [39]:
# Set all NaN values to 0
w2v300_train = np.nan_to_num(w2v300_train)
w2v300_dev = np.nan_to_num(w2v300_dev)
w2v300_test = np.nan_to_num(w2v300_test)

In [41]:
grid_results = gridsearch.fit(w2v300_train, train_labels)
best_params = grid_results.best_params_
perceptron = grid_results.best_estimator_

In [42]:
best_params

{'alpha': 0.05, 'early_stopping': True, 'eta0': 0.0001, 'penalty': 'l2'}

In [43]:
perceptron = perceptron.fit(w2v300_train, train_labels)
save_model(perceptron, "perceptron_w2v300.joblib")

In [44]:
train_preds = perceptron.predict(w2v300_train)
dev_preds = perceptron.predict(w2v300_dev)
test_preds = perceptron.predict(w2v300_test)

In [45]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.7302681268882175
Accuracy Dev:  0.7145015105740181
Accuracy Test:  0.7883720930232558
Weighted F1 Train:  0.6774233816683244
Weighted F1 Dev:  0.6571334989434339
Weighted F1 Test:  0.7569970254191454
Macro F1 Train:  0.5989470933589953
Macro F1 Dev:  0.5858158188312484
Macro F1 Test:  0.6693313953488372
Micro F1 Train:  0.7302681268882174
Micro F1 Dev:  0.7145015105740181
Micro F1 Test:  0.7883720930232558
Weighted Recall Train:  0.7302681268882175
Weighted Recall Dev:  0.7145015105740181
Weighted Recall Test:  0.7883720930232558
Macro Recall Train:  0.6053822700718006
Macro Recall Dev:  0.5995238680830803
Macro Recall Test:  0.6502016129032258
Micro Recall Train:  0.7302681268882175
Micro Recall Dev:  0.7145015105740181
Micro Recall Test:  0.7883720930232558
Confusion Matrix Train: 
[[6898  209]
 [2648  837]]
Confusion Matrix Dev: 
[[1684   49]
 [ 707  208]]
Confusion Matrix Test: 
[[597  23]
 [159  81]]


### Sentence Transformer

In [46]:
train, dev, test = load_sent_trans()

In [48]:
grid_results = gridsearch.fit(train, train_labels)
best_params = grid_results.best_params_
perceptron = grid_results.best_estimator_

In [49]:
best_params

{'alpha': 0.05, 'early_stopping': True, 'eta0': 0.0001, 'penalty': 'l2'}

In [50]:
perceptron = perceptron.fit(train, train_labels)
save_model(perceptron, "perceptron_better_no_pca.joblib")

In [51]:
train_preds = perceptron.predict(train)
dev_preds = perceptron.predict(dev)
test_preds = perceptron.predict(test)

In [52]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.7553814199395771
Accuracy Dev:  0.7292296072507553
Accuracy Test:  0.7383720930232558
Weighted F1 Train:  0.7451591918833306
Weighted F1 Dev:  0.7199455155362555
Weighted F1 Test:  0.739509717429957
Macro F1 Train:  0.7020568519302173
Macro F1 Dev:  0.6821624955240131
Macro F1 Test:  0.6777529639233202
Micro F1 Train:  0.7553814199395771
Micro F1 Dev:  0.7292296072507554
Micro F1 Test:  0.7383720930232558
Weighted Recall Train:  0.7553814199395771
Weighted Recall Dev:  0.7292296072507553
Weighted Recall Test:  0.7383720930232558
Macro Recall Train:  0.6915849126459879
Macro Recall Dev:  0.6747429360627359
Macro Recall Test:  0.6793682795698924
Micro Recall Train:  0.7553814199395771
Micro Recall Dev:  0.7292296072507553
Micro Recall Test:  0.7383720930232558
Confusion Matrix Train: 
[[6241  866]
 [1725 1760]]
Confusion Matrix Dev: 
[[1475  258]
 [ 459  456]]
Confusion Matrix Test: 
[[504 116]
 [109 131]]
