In [1]:
from embeddings_loader import *
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from utils import *

--ip=127.0.0.1


In [2]:
# Load the label sequences for the train / dev / test splits.
# load_labels comes from one of the star imports in the first cell.
train_labels, dev_labels, test_labels = load_labels()

In [3]:
# Integer class id assigned to each string label.
label_replacement = dict(NOT=0, OFF=1)

In [4]:
# Replace string labels with their integer class ids.
def _encode_labels(labels):
    """Map every label in ``labels`` through ``label_replacement``.

    Labels that are already one of the integer ids are kept unchanged. This
    cell overwrites its own inputs, so without that allowance a second
    execution would fail with a KeyError on the already-encoded values.

    Args:
        labels: iterable of hashable labels (strings or already-encoded ids).

    Returns:
        A new list with one integer class id per input label.

    Raises:
        KeyError: if a label is neither a key nor a value of
            ``label_replacement``.
    """
    known_ids = set(label_replacement.values())
    encoded = [label_replacement.get(label, label) for label in labels]
    unknown = {label for label in encoded if label not in known_ids}
    if unknown:
        raise KeyError(f"Unexpected label(s): {sorted(unknown, key=repr)}")
    return encoded


train_labels = _encode_labels(train_labels)
dev_labels = _encode_labels(dev_labels)
test_labels = _encode_labels(test_labels)

In [5]:
# Base estimator handed to the grid search.
support_vector_classifier = SVC()

# Hyper-parameters that are meaningful for every kernel.
_shared_grid = {
    "C": [0.1, 1, 10],
    "class_weight": ['balanced', None],
}

# One sub-grid per kernel family. SVC only uses `gamma` for the rbf / poly /
# sigmoid kernels and `degree` for the poly kernel, so crossing them with every
# kernel (192 candidates) just refits identical models. The split grid covers
# the same set of distinct models with 78 candidates.
# Note: best_params_ only contains the keys of the winning sub-grid.
gridsearch = GridSearchCV(
    support_vector_classifier,
    param_grid=[
        {"kernel": ["linear"], **_shared_grid},
        {
            "kernel": ["rbf", "sigmoid"],
            "gamma": ['scale', 'auto'],
            **_shared_grid,
        },
        {
            "kernel": ["poly"],
            "gamma": ['scale', 'auto'],
            "degree": [1, 2, 3, 4],
            **_shared_grid,
        },
    ],
    scoring="f1_macro",  # model selection by macro-averaged F1
)

### Glove Twitter 25

In [6]:
# Load the GloVe Twitter 25 feature sets for the train / dev / test splits.
gt25_train, gt25_dev, gt25_test = load_glove_twitter_25()

In [7]:
# Sanitise the features: with its default arguments np.nan_to_num replaces NaN
# with 0 and +/-inf with very large finite numbers.
gt25_train, gt25_dev, gt25_test = (
    np.nan_to_num(features)
    for features in (gt25_train, gt25_dev, gt25_test)
)

In [8]:
# Run the grid search on the GloVe Twitter 25 training features.
grid_results = gridsearch.fit(gt25_train, train_labels)
# Hyper-parameter combination with the best score.
best_params = grid_results.best_params_
# Keep the winning estimator; this rebinds the name that held the unfitted SVC().
support_vector_classifier = grid_results.best_estimator_

In [None]:
# Show the hyper-parameters selected by the grid search.
# NOTE(review): the saved output of this cell lists alpha / eta0 / max_iter / penalty,
# none of which are in the SVC grid — it looks stale; re-run to refresh.
best_params

{'alpha': 0.01, 'eta0': 0.1, 'max_iter': 100, 'penalty': 'l2'}

In [None]:
# GridSearchCV is built with its default refit=True, so best_estimator_ (bound to
# support_vector_classifier by the grid-search cell) has already been retrained on the
# full GloVe Twitter 25 training split. Fitting it again on the same data would only
# repeat that training, so the model is persisted directly.
save_model(support_vector_classifier, "support_vector_classifier_gt25.joblib")

In [None]:
# Predict with the selected model on each GloVe Twitter 25 split.
train_preds, dev_preds, test_preds = (
    support_vector_classifier.predict(features)
    for features in (gt25_train, gt25_dev, gt25_test)
)

In [None]:
# Report the evaluation metrics for the three prediction sets.
# NOTE(review): only predictions are passed, so computeAllScores presumably obtains the
# gold labels itself — confirm it uses the same 0/1 encoding as train_labels. The saved
# output shows 3x3 confusion matrices although label_replacement defines two classes,
# so it appears to come from a different run.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9132325806168176
Accuracy Dev:  0.9050298979950756
Accuracy Test:  0.9132115249472944
Weighted F1 Train:  0.8744351101760547
Weighted F1 Dev:  0.8619334282377243
Weighted F1 Test:  0.8744332920127035
Macro F1 Train:  0.33004307939745064
Macro F1 Dev:  0.3286755723057514
Macro F1 Test:  0.3362613040174669
Micro F1 Train:  0.9132325806168176
Micro F1 Dev:  0.9050298979950756
Micro F1 Test:  0.9132115249472944
Weighted Recall Train:  0.9132325806168176
Weighted Recall Dev:  0.9050298979950756
Weighted Recall Test:  0.9132115249472944
Macro Recall Train:  0.3390163908368149
Macro Recall Dev:  0.3393310321403766
Macro Recall Test:  0.3425381154390024
Micro Recall Train:  0.9132325806168176
Micro Recall Dev:  0.9050298979950756
Micro Recall Test:  0.9132115249472944
Confusion Matrix Train: 
[[   36  1926     0]
 [   27 20751     0]
 [    0    22     0]]
Confusion Matrix Dev: 
[[   5  267    0]
 [   1 2568    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[   7  243    0]
 

### FastText 300 

In [None]:
# Load the FastText 300 feature sets for the train / dev / test splits.
ft300_train, ft300_dev, ft300_test = load_fasttext_300()

In [None]:
# Sanitise the features: with its default arguments np.nan_to_num replaces NaN
# with 0 and +/-inf with very large finite numbers.
ft300_train, ft300_dev, ft300_test = (
    np.nan_to_num(features)
    for features in (ft300_train, ft300_dev, ft300_test)
)

In [None]:
# Re-run the same grid search object on the FastText 300 training features.
grid_results = gridsearch.fit(ft300_train, train_labels)
# Hyper-parameter combination with the best score for this embedding.
best_params = grid_results.best_params_
# Rebind the name to the winning estimator for this embedding.
support_vector_classifier = grid_results.best_estimator_

In [None]:
# Show the hyper-parameters selected by the grid search.
# NOTE(review): the saved output of this cell lists alpha / eta0 / max_iter / penalty,
# none of which are in the SVC grid — it looks stale; re-run to refresh.
best_params

{'alpha': 0.001, 'eta0': 0.0001, 'max_iter': 100, 'penalty': 'l1'}

In [None]:
# GridSearchCV is built with its default refit=True, so best_estimator_ (bound to
# support_vector_classifier by the grid-search cell) has already been retrained on the
# full FastText 300 training split. Fitting it again on the same data would only
# repeat that training, so the model is persisted directly.
save_model(support_vector_classifier, "support_vector_classifier_ft300.joblib")

In [None]:
# Predict with the selected model on each FastText 300 split.
train_preds, dev_preds, test_preds = (
    support_vector_classifier.predict(features)
    for features in (ft300_train, ft300_dev, ft300_test)
)

In [None]:
# Report the evaluation metrics for the three prediction sets.
# NOTE(review): only predictions are passed, so computeAllScores presumably obtains the
# gold labels itself — confirm it uses the same 0/1 encoding as train_labels. The saved
# output shows 3x3 confusion matrices although label_replacement defines two classes,
# so it appears to come from a different run.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9128371847816537
Accuracy Dev:  0.9036229335209286
Accuracy Test:  0.9111033028812369
Weighted F1 Train:  0.8712416640051998
Weighted F1 Dev:  0.85787410059692
Weighted F1 Test:  0.8687225094212346
Macro F1 Train:  0.31814423518603585
Macro F1 Dev:  0.31645725548164577
Macro F1 Test:  0.31782803211374644
Micro F1 Train:  0.9128371847816537
Micro F1 Dev:  0.9036229335209286
Micro F1 Test:  0.9111033028812369
Weighted Recall Train:  0.9128371847816537
Weighted Recall Dev:  0.9036229335209286
Weighted Recall Test:  0.9111033028812369
Macro Recall Train:  0.3333333333333333
Macro Recall Dev:  0.3333333333333333
Macro Recall Test:  0.3333333333333333
Micro Recall Train:  0.9128371847816537
Micro Recall Dev:  0.9036229335209286
Micro Recall Test:  0.9111033028812369
Confusion Matrix Train: 
[[    0  1962     0]
 [    0 20778     0]
 [    0    22     0]]
Confusion Matrix Dev: 
[[   0  272    0]
 [   0 2569    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[   0  250    0]
 

### Word2Vec 300

In [None]:
# Load the Word2Vec 300 feature sets for the train / dev / test splits.
w2v300_train, w2v300_dev, w2v300_test = load_word2vec_300()

In [None]:
# Sanitise the features: with its default arguments np.nan_to_num replaces NaN
# with 0 and +/-inf with very large finite numbers.
w2v300_train, w2v300_dev, w2v300_test = (
    np.nan_to_num(features)
    for features in (w2v300_train, w2v300_dev, w2v300_test)
)

In [None]:
# Re-run the same grid search object on the Word2Vec 300 training features.
grid_results = gridsearch.fit(w2v300_train, train_labels)
# Hyper-parameter combination with the best score for this embedding.
best_params = grid_results.best_params_
# Rebind the name to the winning estimator for this embedding.
support_vector_classifier = grid_results.best_estimator_

In [None]:
# Show the hyper-parameters selected by the grid search.
# NOTE(review): the saved output of this cell lists alpha / eta0 / max_iter / penalty,
# none of which are in the SVC grid — it looks stale; re-run to refresh.
best_params

{'alpha': 0.01, 'eta0': 1.0, 'max_iter': 100, 'penalty': 'l2'}

In [None]:
# GridSearchCV is built with its default refit=True, so best_estimator_ (bound to
# support_vector_classifier by the grid-search cell) has already been retrained on the
# full Word2Vec 300 training split. Fitting it again on the same data would only
# repeat that training, so the model is persisted directly.
save_model(support_vector_classifier, "support_vector_classifier_w2v300.joblib")

In [None]:
# Predict with the selected model on each Word2Vec 300 split.
train_preds, dev_preds, test_preds = (
    support_vector_classifier.predict(features)
    for features in (w2v300_train, w2v300_dev, w2v300_test)
)

In [None]:
# Report the evaluation metrics for the three prediction sets.
# NOTE(review): only predictions are passed, so computeAllScores presumably obtains the
# gold labels itself — confirm it uses the same 0/1 encoding as train_labels. The saved
# output shows 3x3 confusion matrices although label_replacement defines two classes,
# so it appears to come from a different run.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9126175204287849
Accuracy Dev:  0.9032711924023918
Accuracy Test:  0.9111033028812369
Weighted F1 Train:  0.871132047293869
Weighted F1 Dev:  0.8576986484131379
Weighted F1 Test:  0.8687225094212346
Macro F1 Train:  0.3181042073427511
Macro F1 Dev:  0.31639253372759196
Macro F1 Test:  0.31782803211374644
Micro F1 Train:  0.9126175204287849
Micro F1 Dev:  0.9032711924023918
Micro F1 Test:  0.9111033028812369
Weighted Recall Train:  0.9126175204287849
Weighted Recall Dev:  0.9032711924023918
Weighted Recall Test:  0.9111033028812369
Macro Recall Train:  0.33325312028748355
Macro Recall Dev:  0.33320358115998444
Macro Recall Test:  0.3333333333333333
Micro Recall Train:  0.9126175204287849
Micro Recall Dev:  0.9032711924023918
Micro Recall Test:  0.9111033028812369
Confusion Matrix Train: 
[[    0  1962     0]
 [    5 20773     0]
 [    0    22     0]]
Confusion Matrix Dev: 
[[   0  272    0]
 [   1 2568    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[   0  250    0]

### Sentence Transformer

In [None]:
# Load the sentence-transformer feature sets for the train / dev / test splits.
# Unlike the word-embedding sections, no np.nan_to_num clean-up is applied to these.
train, dev, test = load_sent_trans()

In [None]:
# Re-run the same grid search object on the sentence-transformer training features.
grid_results = gridsearch.fit(train, train_labels)
# Hyper-parameter combination with the best score for this embedding.
best_params = grid_results.best_params_
# Rebind the name to the winning estimator for this embedding.
support_vector_classifier = grid_results.best_estimator_

In [None]:
# Show the hyper-parameters selected by the grid search.
# NOTE(review): the saved output of this cell lists alpha / eta0 / max_iter / penalty,
# none of which are in the SVC grid — it looks stale; re-run to refresh.
best_params

{'alpha': 0.0001, 'eta0': 0.001, 'max_iter': 100, 'penalty': 'l2'}

In [None]:
# GridSearchCV is built with its default refit=True, so best_estimator_ (bound to
# support_vector_classifier by the grid-search cell) has already been retrained on the
# full sentence-transformer training split. Fitting it again on the same data would
# only repeat that training, so the model is persisted directly.
save_model(support_vector_classifier, "support_vector_classifier_better_no_pca.joblib")

In [None]:
# Predict with the selected model on each sentence-transformer split.
train_preds, dev_preds, test_preds = (
    support_vector_classifier.predict(features)
    for features in (train, dev, test)
)

In [None]:
# Report the evaluation metrics for the three prediction sets.
# NOTE(review): only predictions are passed, so computeAllScores presumably obtains the
# gold labels itself — confirm it uses the same 0/1 encoding as train_labels. The saved
# output shows 3x3 confusion matrices although label_replacement defines two classes,
# so it appears to come from a different run.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9280379580001757
Accuracy Dev:  0.9215617305663032
Accuracy Test:  0.9244553759662685
Weighted F1 Train:  0.9215964199546101
Weighted F1 Dev:  0.9146653507073376
Weighted F1 Test:  0.9203758493574914
Macro F1 Train:  0.49062131325026287
Macro F1 Dev:  0.491946364227066
Macro F1 Test:  0.4961027624687639
Micro F1 Train:  0.9280379580001757
Micro F1 Dev:  0.9215617305663032
Micro F1 Test:  0.9244553759662685
Weighted Recall Train:  0.9280379580001757
Weighted Recall Dev:  0.9215617305663032
Weighted Recall Test:  0.9244553759662685
Macro Recall Train:  0.4682736553178208
Macro Recall Dev:  0.47034351887894116
Macro Recall Test:  0.4803825684535287
Micro Recall Train:  0.9280379580001757
Micro Recall Dev:  0.9215617305663032
Micro Recall Test:  0.9244553759662685
Confusion Matrix Train: 
[[  841  1112     9]
 [  492 20283     3]
 [    0    22     0]]
Confusion Matrix Dev: 
[[ 119  153    0]
 [  68 2501    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 118  131    1]
 