In [1]:
from embeddings_loader import *
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from utils import *

--ip=127.0.0.1


In [3]:
# Load the labels for the train and dev splits.
# NOTE(review): load_labels comes from one of the star imports above
# (embeddings_loader or utils) — confirm which, and what label type it returns.
train_labels, dev_labels = load_labels()

In [4]:
# Numeric encoding of the string class labels: 'NOT' -> 1, 'OFF' -> 0.
# NOTE(review): presumably 'OFF' = offensive and 'NOT' = not offensive — confirm.
label_replacement = dict([("NOT", 1), ("OFF", 0)])

In [5]:
# Encode the string labels as integers via label_replacement.
# A label missing from the mapping raises KeyError. Note that this cell rebinds
# train_labels/dev_labels, so running it a second time (labels already numeric)
# also raises KeyError — re-run the label-loading cell first.
train_labels = list(map(label_replacement.__getitem__, train_labels))
dev_labels = list(map(label_replacement.__getitem__, dev_labels))

In [6]:
# Base estimator handed to the grid search (default SVC hyper-parameters).
support_vector_classifier = SVC()

# Exhaustive search over kernel x C (3 x 3 = 9 candidates), ranked by
# macro-averaged F1. The number of CV folds is left at the scikit-learn default.
# The same search object is reused for every embedding type below.
gridsearch = GridSearchCV(
    estimator=support_vector_classifier,
    param_grid={
        "kernel": ["linear", "poly", "rbf"],
        "C": [0.1, 1, 10],
    },
    scoring="f1_macro",
)

### GloVe Twitter 25

In [6]:
# Load the GloVe Twitter 25 features for the train, dev and test splits.
# NOTE(review): loader comes from a star import (presumably embeddings_loader) — confirm.
gt25_train, gt25_dev, gt25_test = load_glove_twitter_25()

In [7]:
# Replace NaN entries with 0 in every split so the SVM receives finite inputs.
# np.nan_to_num additionally maps +/-inf to very large finite numbers.
gt25_train, gt25_dev, gt25_test = (
    np.nan_to_num(split) for split in (gt25_train, gt25_dev, gt25_test)
)

In [8]:
# Cross-validated grid search on the GloVe Twitter 25 training features.
# fit() returns the fitted search object; keep the winning model and its
# hyper-parameters for the following cells.
grid_results = gridsearch.fit(gt25_train, train_labels)
support_vector_classifier, best_params = (
    grid_results.best_estimator_,
    grid_results.best_params_,
)

In [9]:
# Display the hyper-parameters selected by the grid search.
best_params

{'C': 10, 'kernel': 'rbf'}

In [10]:
# support_vector_classifier is grid_results.best_estimator_, which GridSearchCV
# has already refit on the full training set (refit=True is the default).
# Fitting it again on the same data only repeats the most expensive training
# step and produces the same model, so the fitted estimator is saved directly.
save_model(support_vector_classifier, "support_vector_classifier_gt25.joblib")

In [11]:
# Predict labels for the train, dev and test splits (in that order) with the
# selected GloVe Twitter 25 model.
train_preds, dev_preds, test_preds = map(
    support_vector_classifier.predict,
    (gt25_train, gt25_dev, gt25_test),
)

In [12]:
# Print accuracy, F1, recall and confusion matrices for the train/dev/test predictions.
# NOTE(review): only predictions are passed, so the helper presumably loads the
# gold labels itself — confirm in utils that it uses the same 0/1 encoding.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.7475453172205438
Accuracy Dev:  0.725453172205438
Accuracy Test:  0.7895348837209303
Weighted F1 Train:  0.7168966782693725
Weighted F1 Dev:  0.6928039176176084
Weighted F1 Test:  0.767706758577008
Macro F1 Train:  0.6563681230451947
Macro F1 Dev:  0.6377377521478258
Macro F1 Test:  0.6901863141406482
Micro F1 Train:  0.7475453172205438
Micro F1 Dev:  0.725453172205438
Micro F1 Test:  0.7895348837209303
Weighted Recall Train:  0.7475453172205438
Weighted Recall Dev:  0.725453172205438
Weighted Recall Test:  0.7895348837209303
Macro Recall Train:  0.647065727628448
Macro Recall Dev:  0.6339418992933699
Macro Recall Test:  0.6701612903225806
Micro Recall Train:  0.7475453172205438
Micro Recall Dev:  0.725453172205438
Micro Recall Test:  0.7895348837209303
Confusion Matrix Train: 
[[1231 2254]
 [ 420 6687]]
Confusion Matrix Dev: 
[[ 309  606]
 [ 121 1612]]
Confusion Matrix Test: 
[[ 96 144]
 [ 37 583]]


### FastText 300 

In [7]:
# Load the FastText 300 features for the train, dev and test splits.
# NOTE(review): loader comes from a star import (presumably embeddings_loader) — confirm.
ft300_train, ft300_dev, ft300_test = load_fasttext_300()

In [8]:
# Replace NaN entries with 0 in every split so the SVM receives finite inputs.
# np.nan_to_num additionally maps +/-inf to very large finite numbers.
ft300_train, ft300_dev, ft300_test = (
    np.nan_to_num(split) for split in (ft300_train, ft300_dev, ft300_test)
)

In [9]:
# Cross-validated grid search on the FastText 300 training features.
# fit() returns the fitted search object; keep the winning model and its
# hyper-parameters for the following cells.
grid_results = gridsearch.fit(ft300_train, train_labels)
support_vector_classifier, best_params = (
    grid_results.best_estimator_,
    grid_results.best_params_,
)

In [None]:
# Display the hyper-parameters selected by the grid search.
best_params

{'C': 10, 'kernel': 'rbf'}

: 

In [17]:
# support_vector_classifier is grid_results.best_estimator_, which GridSearchCV
# has already refit on the full training set (refit=True is the default).
# Fitting it again on the same data only repeats the most expensive training
# step and produces the same model, so the fitted estimator is saved directly.
save_model(support_vector_classifier, "support_vector_classifier_ft300.joblib")

In [18]:
# Predict labels for the train, dev and test splits (in that order) with the
# selected FastText 300 model.
train_preds, dev_preds, test_preds = map(
    support_vector_classifier.predict,
    (ft300_train, ft300_dev, ft300_test),
)

In [19]:
# Print accuracy, F1, recall and confusion matrices for the train/dev/test predictions.
# NOTE(review): only predictions are passed, so the helper presumably loads the
# gold labels itself — confirm in utils that it uses the same 0/1 encoding.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.8601774924471299
Accuracy Dev:  0.7760574018126888
Accuracy Test:  0.8174418604651162
Weighted F1 Train:  0.8551616403383264
Weighted F1 Dev:  0.7664600545579516
Weighted F1 Test:  0.8057443596218022
Macro F1 Train:  0.8312885571082529
Macro F1 Dev:  0.7336122300626585
Macro F1 Test:  0.7464770220156786
Micro F1 Train:  0.8601774924471299
Micro F1 Dev:  0.7760574018126888
Micro F1 Test:  0.8174418604651161
Weighted Recall Train:  0.8601774924471299
Weighted Recall Dev:  0.7760574018126888
Weighted Recall Test:  0.8174418604651162
Macro Recall Train:  0.8138407200127423
Macro Recall Dev:  0.7216100826451493
Macro Recall Test:  0.7252688172043011
Micro Recall Train:  0.8601774924471299
Micro Recall Dev:  0.7760574018126888
Micro Recall Test:  0.8174418604651162
Confusion Matrix Train: 
[[2364 1121]
 [ 360 6747]]
Confusion Matrix Dev: 
[[ 499  416]
 [ 177 1556]]
Confusion Matrix Test: 
[[124 116]
 [ 41 579]]


In [None]:
# Export the test-set predictions as (row index, predicted label) pairs.
# test_preds is whatever the most recently run prediction cell produced; in
# notebook order that is the FastText 300 model. Labels are written in the
# classifier's numeric encoding, not as the original strings.
save_list = [[row_id, label] for row_id, label in enumerate(test_preds)]
df_pred = pd.DataFrame(save_list, columns=["id", "label"])
df_pred.to_csv("svm_pred.csv", index=False)

### Word2Vec 300

In [20]:
# Load the Word2Vec 300 features for the train, dev and test splits.
# NOTE(review): loader comes from a star import (presumably embeddings_loader) — confirm.
w2v300_train, w2v300_dev, w2v300_test = load_word2vec_300()

In [21]:
# Replace NaN entries with 0 in every split so the SVM receives finite inputs.
# np.nan_to_num additionally maps +/-inf to very large finite numbers.
w2v300_train, w2v300_dev, w2v300_test = (
    np.nan_to_num(split) for split in (w2v300_train, w2v300_dev, w2v300_test)
)

In [22]:
# Cross-validated grid search on the Word2Vec 300 training features.
# fit() returns the fitted search object; keep the winning model and its
# hyper-parameters for the following cells.
grid_results = gridsearch.fit(w2v300_train, train_labels)
support_vector_classifier, best_params = (
    grid_results.best_estimator_,
    grid_results.best_params_,
)

In [23]:
# Display the hyper-parameters selected by the grid search.
best_params

{'C': 10, 'kernel': 'rbf'}

In [24]:
# support_vector_classifier is grid_results.best_estimator_, which GridSearchCV
# has already refit on the full training set (refit=True is the default).
# Fitting it again on the same data only repeats the most expensive training
# step and produces the same model, so the fitted estimator is saved directly.
save_model(support_vector_classifier, "support_vector_classifier_w2v300.joblib")

In [25]:
# Predict labels for the train, dev and test splits (in that order) with the
# selected Word2Vec 300 model.
train_preds, dev_preds, test_preds = map(
    support_vector_classifier.predict,
    (w2v300_train, w2v300_dev, w2v300_test),
)

In [26]:
# Print accuracy, F1, recall and confusion matrices for the train/dev/test predictions.
# NOTE(review): only predictions are passed, so the helper presumably loads the
# gold labels itself — confirm in utils that it uses the same 0/1 encoding.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9217333836858006
Accuracy Dev:  0.7654833836858006
Accuracy Test:  0.8023255813953488
Weighted F1 Train:  0.9203809589771392
Weighted F1 Dev:  0.7607127363748155
Weighted F1 Test:  0.7911509698035536
Macro F1 Train:  0.9084625105713686
Macro F1 Dev:  0.7309235035474431
Macro F1 Test:  0.7287046109082271
Micro F1 Train:  0.9217333836858006
Micro F1 Dev:  0.7654833836858006
Micro F1 Test:  0.8023255813953488
Weighted Recall Train:  0.9217333836858006
Weighted Recall Dev:  0.7654833836858006
Weighted Recall Test:  0.8023255813953488
Macro Recall Train:  0.8963435326256026
Macro Recall Dev:  0.7238488486121227
Macro Recall Test:  0.7109543010752688
Micro Recall Train:  0.9217333836858006
Micro Recall Dev:  0.7654833836858006
Micro Recall Test:  0.8023255813953488
Confusion Matrix Train: 
[[2865  620]
 [ 209 6898]]
Confusion Matrix Dev: 
[[ 539  376]
 [ 245 1488]]
Confusion Matrix Test: 
[[121 119]
 [ 51 569]]


### Sentence Transformer

In [27]:
# Load the sentence-transformer features for the train, dev and test splits.
# NOTE(review): loader comes from a star import (presumably embeddings_loader) — confirm.
# Unlike the word-embedding sections, no NaN cleanup is applied to these features.
train, dev, test = load_sent_trans()

In [28]:
# Cross-validated grid search on the sentence-transformer training features.
# fit() returns the fitted search object; keep the winning model and its
# hyper-parameters for the following cells.
grid_results = gridsearch.fit(train, train_labels)
support_vector_classifier, best_params = (
    grid_results.best_estimator_,
    grid_results.best_params_,
)

In [29]:
# Display the hyper-parameters selected by the grid search.
best_params

{'C': 10, 'kernel': 'linear'}

In [30]:
# support_vector_classifier is grid_results.best_estimator_, which GridSearchCV
# has already refit on the full training set (refit=True is the default).
# Fitting it again on the same data only repeats the most expensive training
# step and produces the same model, so the fitted estimator is saved directly.
save_model(support_vector_classifier, "support_vector_classifier_better_no_pca.joblib")

In [31]:
# Predict labels for the train, dev and test splits (in that order) with the
# selected sentence-transformer model.
train_preds, dev_preds, test_preds = map(
    support_vector_classifier.predict,
    (train, dev, test),
)

In [32]:
# Print accuracy, F1, recall and confusion matrices for the train/dev/test predictions.
# NOTE(review): only predictions are passed, so the helper presumably loads the
# gold labels itself — confirm in utils that it uses the same 0/1 encoding.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.8033421450151057
Accuracy Dev:  0.7643504531722054
Accuracy Test:  0.8069767441860465
Weighted F1 Train:  0.7958439702076071
Weighted F1 Dev:  0.7569223018374682
Weighted F1 Test:  0.7991354013093129
Macro F1 Train:  0.7618534956734898
Macro F1 Dev:  0.7246019412427074
Macro F1 Test:  0.7418332802870287
Micro F1 Train:  0.8033421450151057
Micro F1 Dev:  0.7643504531722054
Micro F1 Test:  0.8069767441860464
Weighted Recall Train:  0.8033421450151057
Weighted Recall Dev:  0.7643504531722054
Weighted Recall Test:  0.8069767441860465
Macro Recall Train:  0.7480900779012508
Macro Recall Dev:  0.7152453655967888
Macro Recall Test:  0.7269489247311828
Micro Recall Train:  0.8033421450151057
Micro Recall Dev:  0.7643504531722054
Micro Recall Test:  0.8069767441860465
Confusion Matrix Train: 
[[2044 1441]
 [ 642 6465]]
Confusion Matrix Dev: 
[[ 509  406]
 [ 218 1515]]
Confusion Matrix Test: 
[[131 109]
 [ 57 563]]
