In [1]:
from embeddings_loader import *
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from utils import *

--ip=127.0.0.1


In [2]:
# Fetch the label sequences for the train and dev splits.
# NOTE(review): load_labels arrives through one of the star imports above; the
# cells below index label_replacement with these values, so they are expected
# to be the strings 'NOT' / 'OFF' — confirm in the defining module.
train_labels, dev_labels = load_labels()

In [3]:
# Lookup table from the textual class label to its integer id.
label_replacement = dict(NOT=0, OFF=1)

In [4]:
# Encode the string class labels as integers through the label_replacement
# lookup. Direct indexing is kept so an unexpected label raises KeyError.
train_labels = list(map(label_replacement.__getitem__, train_labels))
dev_labels = list(map(label_replacement.__getitem__, dev_labels))

In [5]:
# Base estimator handed to the grid search; the hyper-parameters explored are
# listed in the grid below.
support_vector_classifier = SVC()

# Settings explored for every kernel.
shared_grid = {
    "C": [0.1, 1, 10],
    "class_weight": ['balanced', None],
}

# One sub-grid per kernel family so that every candidate is a distinct model:
# "gamma" only affects the rbf / poly / sigmoid kernels and "degree" only the
# poly kernel, so crossing them with every kernel would refit identical models
# (192 candidates in a single flat grid versus 78 here).
gridsearch = GridSearchCV(
    support_vector_classifier,
    param_grid=[
        {**shared_grid, "kernel": ["linear"]},
        {**shared_grid, "kernel": ["rbf", "sigmoid"], "gamma": ['scale', 'auto']},
        {
            **shared_grid,
            "kernel": ["poly"],
            "gamma": ['scale', 'auto'],
            "degree": [1, 2, 3, 4],
        },
    ],
    scoring="f1_macro",  # candidates are ranked by macro-averaged F1
)

### Glove Twitter 25

In [6]:
# Load the GloVe Twitter 25 features, unpacked as train / dev / test.
# NOTE(review): the loader arrives through a star import; the shape and dtype
# of the returned objects are not visible here — confirm in its module.
gt25_train, gt25_dev, gt25_test = load_glove_twitter_25()

In [7]:
# Replace NaN entries with 0 in every GloVe split. With its default arguments
# np.nan_to_num also maps +/-inf to very large finite values.
gt25_train, gt25_dev, gt25_test = map(
    np.nan_to_num,
    (gt25_train, gt25_dev, gt25_test),
)

In [None]:
# grid_results = gridsearch.fit(gt25_train, train_labels)
# best_params = grid_results.best_params_
# support_vector_classifier = grid_results.best_estimator_

In [None]:
# best_params

{'alpha': 0.01, 'eta0': 0.1, 'max_iter': 100, 'penalty': 'l2'}

In [10]:
# Load the persisted GloVe classifier rather than re-running the grid search.
# NOTE(review): the ".joblib" extension suggests a joblib/pickle-backed loader
# (confirm in the module defining load_model). Such files can execute code on
# load and may misbehave across library versions — only load trusted artifacts.
support_vector_classifier = load_model("support_vector_classifier_gt25.joblib")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
# support_vector_classifier = support_vector_classifier.fit(gt25_train, train_labels)
# save_model(support_vector_classifier, "support_vector_classifier_gt25.joblib")

In [11]:
# Run the loaded classifier over each GloVe split; the results are unpacked
# in train / dev / test order.
train_preds, dev_preds, test_preds = map(
    support_vector_classifier.predict,
    (gt25_train, gt25_dev, gt25_test),
)

In [12]:
# Print the evaluation metrics for the GloVe train and dev predictions.
# NOTE(review): no ground-truth labels are passed in, so computeAllScores must
# obtain them itself — confirm it uses the same label encoding as this notebook.
computeAllScores(train_preds, dev_preds)

Accuracy Train:  0.7475453172205438
Accuracy Dev:  0.725453172205438
Weighted F1 Train:  0.7168966782693725
Weighted F1 Dev:  0.6928039176176084
Macro F1 Train:  0.6563681230451947
Macro F1 Dev:  0.6377377521478258
Micro F1 Train:  0.7475453172205438
Micro F1 Dev:  0.725453172205438
Weighted Recall Train:  0.7475453172205438
Weighted Recall Dev:  0.725453172205438
Macro Recall Train:  0.647065727628448
Macro Recall Dev:  0.6339418992933699
Micro Recall Train:  0.7475453172205438
Micro Recall Dev:  0.725453172205438
Confusion Matrix Train: 
[[1231 2254]
 [ 420 6687]]
Confusion Matrix Dev: 
[[ 309  606]
 [ 121 1612]]


### FastText 300 

In [17]:
# Load the FastText 300 features, unpacked as train / dev / test.
# NOTE(review): the loader arrives through a star import; the shape and dtype
# of the returned objects are not visible here — confirm in its module.
ft300_train, ft300_dev, ft300_test = load_fasttext_300()

In [18]:
# Replace NaN entries with 0 in every FastText split. With its default
# arguments np.nan_to_num also maps +/-inf to very large finite values.
ft300_train, ft300_dev, ft300_test = map(
    np.nan_to_num,
    (ft300_train, ft300_dev, ft300_test),
)

In [19]:
# grid_results = gridsearch.fit(ft300_train, train_labels)
# best_params = grid_results.best_params_
# support_vector_classifier = grid_results.best_estimator_

In [20]:
# best_params

In [25]:
# Load the persisted FastText classifier rather than re-running the grid search.
# NOTE(review): the ".joblib" extension suggests a joblib/pickle-backed loader
# (confirm in the module defining load_model). Such files can execute code on
# load and may misbehave across library versions — only load trusted artifacts.
support_vector_classifier = load_model("support_vector_classifier_ft300.joblib")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [26]:
# support_vector_classifier = support_vector_classifier.fit(ft300_train, train_labels)
# save_model(support_vector_classifier, "support_vector_classifier_ft300.joblib")

In [27]:
# Run the loaded classifier over each FastText split; the results are
# unpacked in train / dev / test order.
train_preds, dev_preds, test_preds = map(
    support_vector_classifier.predict,
    (ft300_train, ft300_dev, ft300_test),
)

In [28]:
# Print the evaluation metrics for the FastText train and dev predictions.
# NOTE(review): no ground-truth labels are passed in, so computeAllScores must
# obtain them itself — confirm it uses the same label encoding as this notebook.
computeAllScores(train_preds, dev_preds)

Accuracy Train:  0.8601774924471299
Accuracy Dev:  0.7760574018126888
Weighted F1 Train:  0.8551616403383264
Weighted F1 Dev:  0.7664600545579516
Macro F1 Train:  0.8312885571082529
Macro F1 Dev:  0.7336122300626585
Micro F1 Train:  0.8601774924471299
Micro F1 Dev:  0.7760574018126888
Weighted Recall Train:  0.8601774924471299
Weighted Recall Dev:  0.7760574018126888
Macro Recall Train:  0.8138407200127423
Macro Recall Dev:  0.7216100826451493
Micro Recall Train:  0.8601774924471299
Micro Recall Dev:  0.7760574018126888
Confusion Matrix Train: 
[[2364 1121]
 [ 360 6747]]
Confusion Matrix Dev: 
[[ 499  416]
 [ 177 1556]]


### Word2Vec 300

In [29]:
# Load the Word2Vec 300 features, unpacked as train / dev / test.
# NOTE(review): the loader arrives through a star import; the shape and dtype
# of the returned objects are not visible here — confirm in its module.
w2v300_train, w2v300_dev, w2v300_test = load_word2vec_300()

In [30]:
# Replace NaN entries with 0 in every Word2Vec split. With its default
# arguments np.nan_to_num also maps +/-inf to very large finite values.
w2v300_train, w2v300_dev, w2v300_test = map(
    np.nan_to_num,
    (w2v300_train, w2v300_dev, w2v300_test),
)

In [31]:
# grid_results = gridsearch.fit(w2v300_train, train_labels)
# best_params = grid_results.best_params_
# support_vector_classifier = grid_results.best_estimator_

In [32]:
# best_params

In [33]:
# Load the persisted Word2Vec classifier rather than re-running the grid search.
# NOTE(review): the ".joblib" extension suggests a joblib/pickle-backed loader
# (confirm in the module defining load_model). Such files can execute code on
# load and may misbehave across library versions — only load trusted artifacts.
support_vector_classifier = load_model("support_vector_classifier_w2v300.joblib")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [34]:
# support_vector_classifier = support_vector_classifier.fit(w2v300_train, train_labels)
# save_model(support_vector_classifier, "support_vector_classifier_w2v300.joblib")

In [35]:
# Run the loaded classifier over each Word2Vec split; the results are
# unpacked in train / dev / test order.
train_preds, dev_preds, test_preds = map(
    support_vector_classifier.predict,
    (w2v300_train, w2v300_dev, w2v300_test),
)

In [36]:
# Print the evaluation metrics for the Word2Vec train and dev predictions.
# NOTE(review): no ground-truth labels are passed in, so computeAllScores must
# obtain them itself — confirm it uses the same label encoding as this notebook.
computeAllScores(train_preds, dev_preds)

Accuracy Train:  0.9217333836858006
Accuracy Dev:  0.7654833836858006
Weighted F1 Train:  0.9203809589771392
Weighted F1 Dev:  0.7607127363748155
Macro F1 Train:  0.9084625105713686
Macro F1 Dev:  0.7309235035474431
Micro F1 Train:  0.9217333836858006
Micro F1 Dev:  0.7654833836858006
Weighted Recall Train:  0.9217333836858006
Weighted Recall Dev:  0.7654833836858006
Macro Recall Train:  0.8963435326256026
Macro Recall Dev:  0.7238488486121227
Micro Recall Train:  0.9217333836858006
Micro Recall Dev:  0.7654833836858006
Confusion Matrix Train: 
[[2865  620]
 [ 209 6898]]
Confusion Matrix Dev: 
[[ 539  376]
 [ 245 1488]]


### Sentence Transformer

In [37]:
# Load the sentence-transformer features, unpacked as train / dev / test.
# Unlike the sections above, no NaN clean-up is applied to these splits.
# NOTE(review): the loader arrives through a star import; the shape and dtype
# of the returned objects are not visible here — confirm in its module.
train, dev, test = load_sent_trans()

In [38]:
# grid_results = gridsearch.fit(train, train_labels)
# best_params = grid_results.best_params_
# support_vector_classifier = grid_results.best_estimator_

In [39]:
# best_params

In [40]:
# Load the persisted sentence-transformer classifier rather than re-running
# the grid search.
# NOTE(review): the ".joblib" extension suggests a joblib/pickle-backed loader
# (confirm in the module defining load_model). Such files can execute code on
# load and may misbehave across library versions — only load trusted artifacts.
support_vector_classifier = load_model("support_vector_classifier_better_no_pca.joblib")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [41]:
# support_vector_classifier = support_vector_classifier.fit(train, train_labels)
# save_model(support_vector_classifier, "support_vector_classifier_better_no_pca.joblib")

In [42]:
# Run the loaded classifier over each sentence-transformer split; the results
# are unpacked in train / dev / test order.
train_preds, dev_preds, test_preds = map(
    support_vector_classifier.predict,
    (train, dev, test),
)

In [43]:
# Print the evaluation metrics for the sentence-transformer train and dev
# predictions.
# NOTE(review): no ground-truth labels are passed in, so computeAllScores must
# obtain them itself — confirm it uses the same label encoding as this notebook.
computeAllScores(train_preds, dev_preds)

Accuracy Train:  0.8033421450151057
Accuracy Dev:  0.7643504531722054
Weighted F1 Train:  0.7958439702076071
Weighted F1 Dev:  0.7569223018374682
Macro F1 Train:  0.7618534956734898
Macro F1 Dev:  0.7246019412427074
Micro F1 Train:  0.8033421450151057
Micro F1 Dev:  0.7643504531722054
Weighted Recall Train:  0.8033421450151057
Weighted Recall Dev:  0.7643504531722054
Macro Recall Train:  0.7480900779012508
Macro Recall Dev:  0.7152453655967888
Micro Recall Train:  0.8033421450151057
Micro Recall Dev:  0.7643504531722054
Confusion Matrix Train: 
[[2044 1441]
 [ 642 6465]]
Confusion Matrix Dev: 
[[ 509  406]
 [ 218 1515]]
