In [1]:
from embeddings_loader import *
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from utils import *

In [2]:
train_labels, dev_labels, test_labels = load_labels()

In [3]:
label_replacement = {
    'Hope_speech': 0,
    'Non_hope_speech': 1,
    'not-English': 2,
}

In [4]:
# Replace labels with numbers
train_labels = [label_replacement[label] for label in train_labels]
dev_labels = [label_replacement[label] for label in dev_labels]
test_labels = [label_replacement[label] for label in test_labels]

In [5]:
extratrees_classifier = ExtraTreesClassifier(verbose=3,n_jobs=-1)
gridsearch = GridSearchCV(extratrees_classifier, param_grid = {
    "n_estimators": [100, 125, 150],
    "max_depth": [5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "bootstrap": [True, False],
}, scoring = "f1_micro")

### Glove Twitter 25

In [6]:
gt25_train, gt25_dev, gt25_test = load_glove_twitter_25()

In [7]:
# Set all NaN values to 0
gt25_train = np.nan_to_num(gt25_train)
gt25_dev = np.nan_to_num(gt25_dev)
gt25_test = np.nan_to_num(gt25_test)

In [8]:
grid_results = gridsearch.fit(gt25_train, train_labels)
best_params = grid_results.best_params_
extratrees_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  77 out of 100 | elapsed:    2.0s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent w

building tree 1 of 125building tree 2 of 125
building tree 3 of 125
building tree 4 of 125

building tree 5 of 125
building tree 6 of 125
building tree 7 of 125building tree 8 of 125

building tree 9 of 125
building tree 10 of 125building tree 11 of 125
building tree 12 of 125

building tree 13 of 125
building tree 14 of 125
building tree 15 of 125
building tree 16 of 125building tree 17 of 125
building tree 18 of 125building tree 19 of 125
building tree 20 of 125


building tree 21 of 125
building tree 22 of 125
building tree 23 of 125
building tree 24 of 125
building tree 25 of 125
building tree 26 of 125
building tree 27 of 125building tree 28 of 125building tree 29 of 125building tree 30 of 125building tree 31 of 125building tree 32 of 125





building tree 33 of 125
building tree 34 of 125
building tree 35 of 125
building tree 36 of 125
building tree 37 of 125building tree 38 of 125
building tree 39 of 125

building tree 40 of 125
building tree 41 of 125
building tree 42 of 125bu

[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    0.5s finished


In [9]:
extratrees_classifier = extratrees_classifier.fit(gt25_train, train_labels)
save_model(extratrees_classifier, "extratrees_classifier_gt25.joblib")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 125
building tree 2 of 125building tree 3 of 125

building tree 4 of 125
building tree 5 of 125
building tree 6 of 125
building tree 7 of 125building tree 8 of 125

building tree 9 of 125
building tree 10 of 125building tree 11 of 125
building tree 12 of 125

building tree 13 of 125building tree 14 of 125
building tree 15 of 125building tree 16 of 125

building tree 17 of 125

building tree 18 of 125building tree 19 of 125building tree 20 of 125


building tree 21 of 125
building tree 22 of 125
building tree 23 of 125building tree 24 of 125

building tree 25 of 125
building tree 26 of 125building tree 27 of 125
building tree 28 of 125building tree 29 of 125


building tree 30 of 125building tree 31 of 125building tree 32 of 125
building tree 33 of 125


building tree 34 of 125building tree 35 of 125

building tree 36 of 125
building tree 37 of 125
building tree 38 of 125building tree 39 of 125

building tree 40 of 125
building tree 41 of 125

[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s



building tree 42 of 125
building tree 43 of 125building tree 44 of 125building tree 45 of 125
building tree 46 of 125


building tree 47 of 125
building tree 48 of 125
building tree 49 of 125
building tree 50 of 125
building tree 51 of 125
building tree 52 of 125building tree 53 of 125

building tree 54 of 125
building tree 55 of 125
building tree 56 of 125
building tree 57 of 125
building tree 58 of 125building tree 59 of 125

building tree 60 of 125
building tree 61 of 125
building tree 62 of 125
building tree 63 of 125building tree 64 of 125
building tree 65 of 125

building tree 66 of 125
building tree 67 of 125building tree 68 of 125building tree 69 of 125

building tree 70 of 125

building tree 71 of 125
building tree 72 of 125building tree 73 of 125

building tree 74 of 125
building tree 75 of 125
building tree 76 of 125
building tree 77 of 125
building tree 78 of 125building tree 79 of 125

building tree 80 of 125
building tree 81 of 125
building tree 82 of 125
building tree 8

[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    0.4s finished


In [10]:
train_preds = extratrees_classifier.predict(gt25_train)
dev_preds = extratrees_classifier.predict(gt25_dev)
test_preds = extratrees_classifier.predict(gt25_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed:    0.0s finished


In [11]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9461822335471399
Accuracy Dev:  0.9074920858248329
Accuracy Test:  0.9142656359803233
Weighted F1 Train:  0.9349348669511506
Weighted F1 Dev:  0.8688417753470915
Weighted F1 Test:  0.8798319907587914
Macro F1 Train:  0.7551710367647836
Macro F1 Dev:  0.34937899054295435
Macro F1 Test:  0.35535868094007633
Micro F1 Train:  0.9461822335471399
Micro F1 Dev:  0.9074920858248328
Micro F1 Test:  0.9142656359803233
Weighted Recall Train:  0.9461822335471399
Weighted Recall Dev:  0.9074920858248329
Weighted Recall Test:  0.9142656359803233
Macro Recall Train:  0.6571983026785698
Macro Recall Dev:  0.35010093955838467
Macro Recall Test:  0.35256202596734804
Micro Recall Train:  0.9461822335471399
Micro Recall Dev:  0.9074920858248329
Micro Recall Test:  0.9142656359803233
Confusion Matrix Train: 
[[  747  1215     0]
 [    1 20777     0]
 [    0     9    13]]
Confusion Matrix Dev: 
[[  14  258    0]
 [   3 2566    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  15  235    0

### FastText 300 

In [12]:
ft300_train, ft300_dev, ft300_test = load_fasttext_300()

In [13]:
# Set all NaN values to 0
ft300_train = np.nan_to_num(ft300_train)
ft300_dev = np.nan_to_num(ft300_dev)
ft300_test = np.nan_to_num(ft300_test)

In [14]:
grid_results = gridsearch.fit(ft300_train, train_labels)
best_params = grid_results.best_params_
extratrees_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n

building tree 1 of 150
building tree 2 of 150
building tree 3 of 150building tree 4 of 150building tree 5 of 150


building tree 6 of 150building tree 7 of 150

building tree 8 of 150
building tree 9 of 150
building tree 10 of 150
building tree 11 of 150
building tree 12 of 150
building tree 13 of 150
building tree 14 of 150
building tree 15 of 150
building tree 16 of 150building tree 17 of 150

building tree 18 of 150building tree 19 of 150

building tree 20 of 150
building tree 21 of 150
building tree 22 of 150
building tree 23 of 150
building tree 24 of 150


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s


building tree 25 of 150
building tree 26 of 150
building tree 27 of 150
building tree 28 of 150
building tree 29 of 150
building tree 30 of 150
building tree 31 of 150building tree 32 of 150

building tree 33 of 150
building tree 34 of 150
building tree 35 of 150
building tree 36 of 150
building tree 37 of 150
building tree 38 of 150
building tree 39 of 150
building tree 40 of 150
building tree 41 of 150
building tree 42 of 150
building tree 43 of 150
building tree 44 of 150
building tree 45 of 150
building tree 46 of 150
building tree 47 of 150
building tree 48 of 150
building tree 49 of 150
building tree 50 of 150
building tree 51 of 150
building tree 52 of 150
building tree 53 of 150
building tree 54 of 150
building tree 55 of 150
building tree 56 of 150
building tree 57 of 150
building tree 58 of 150
building tree 59 of 150
building tree 60 of 150
building tree 61 of 150
building tree 62 of 150
building tree 63 of 150
building tree 64 of 150
building tree 65 of 150
building tree 66

[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:    1.9s


building tree 122 of 150
building tree 123 of 150
building tree 124 of 150
building tree 125 of 150
building tree 126 of 150
building tree 127 of 150
building tree 128 of 150
building tree 129 of 150
building tree 130 of 150
building tree 131 of 150
building tree 132 of 150
building tree 133 of 150
building tree 134 of 150
building tree 135 of 150
building tree 136 of 150
building tree 137 of 150
building tree 138 of 150
building tree 139 of 150
building tree 140 of 150
building tree 141 of 150
building tree 142 of 150
building tree 143 of 150
building tree 144 of 150
building tree 145 of 150
building tree 146 of 150
building tree 147 of 150
building tree 148 of 150
building tree 149 of 150
building tree 150 of 150
{'bootstrap': False, 'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 150}


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    2.6s finished


In [15]:
extratrees_classifier = extratrees_classifier.fit(ft300_train, train_labels)
save_model(extratrees_classifier, "extratrees_classifier_ft300.joblib")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 150building tree 2 of 150

building tree 3 of 150
building tree 4 of 150
building tree 5 of 150
building tree 6 of 150
building tree 7 of 150
building tree 8 of 150
building tree 9 of 150
building tree 10 of 150
building tree 11 of 150
building tree 12 of 150
building tree 13 of 150
building tree 14 of 150
building tree 15 of 150
building tree 16 of 150
building tree 17 of 150
building tree 18 of 150
building tree 19 of 150building tree 20 of 150
building tree 21 of 150

building tree 22 of 150
building tree 23 of 150
building tree 24 of 150
building tree 25 of 150


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s


building tree 26 of 150
building tree 27 of 150
building tree 28 of 150
building tree 29 of 150
building tree 30 of 150
building tree 31 of 150
building tree 32 of 150
building tree 33 of 150
building tree 34 of 150
building tree 35 of 150
building tree 36 of 150
building tree 37 of 150
building tree 38 of 150
building tree 39 of 150
building tree 40 of 150
building tree 41 of 150
building tree 42 of 150
building tree 43 of 150
building tree 44 of 150
building tree 45 of 150
building tree 46 of 150
building tree 47 of 150
building tree 48 of 150
building tree 49 of 150
building tree 50 of 150
building tree 51 of 150
building tree 52 of 150
building tree 53 of 150
building tree 54 of 150
building tree 55 of 150
building tree 56 of 150
building tree 57 of 150
building tree 58 of 150
building tree 59 of 150
building tree 60 of 150
building tree 61 of 150
building tree 62 of 150
building tree 63 of 150
building tree 64 of 150
building tree 65 of 150
building tree 66 of 150
building tree 67

[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:    1.7s



building tree 121 of 150
building tree 122 of 150
building tree 123 of 150
building tree 124 of 150
building tree 125 of 150
building tree 126 of 150
building tree 127 of 150
building tree 128 of 150
building tree 129 of 150
building tree 130 of 150
building tree 131 of 150
building tree 132 of 150
building tree 133 of 150
building tree 134 of 150
building tree 135 of 150
building tree 136 of 150
building tree 137 of 150
building tree 138 of 150
building tree 139 of 150
building tree 140 of 150
building tree 141 of 150
building tree 142 of 150
building tree 143 of 150
building tree 144 of 150
building tree 145 of 150
building tree 146 of 150
building tree 147 of 150
building tree 148 of 150
building tree 149 of 150
building tree 150 of 150


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    2.4s finished


In [16]:
train_preds = extratrees_classifier.predict(ft300_train)
dev_preds = extratrees_classifier.predict(ft300_dev)
test_preds = extratrees_classifier.predict(ft300_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 150 out of 150 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 150 out of 150 | elapsed:    0.0s finished


In [17]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9793515508303312
Accuracy Dev:  0.910306014773127
Accuracy Test:  0.9142656359803233
Weighted F1 Train:  0.9780982316568629
Weighted F1 Dev:  0.8760740085049812
Weighted F1 Test:  0.8814839424576891
Macro F1 Train:  0.8778169616921719
Macro F1 Dev:  0.3706990280915947
Macro F1 Test:  0.3617832621013957
Micro F1 Train:  0.9793515508303311
Micro F1 Dev:  0.910306014773127
Micro F1 Test:  0.9142656359803233
Weighted Recall Train:  0.9793515508303312
Weighted Recall Dev:  0.910306014773127
Weighted Recall Test:  0.9142656359803233
Macro Recall Train:  0.8009119516403597
Macro Recall Dev:  0.3620963371724712
Macro Recall Test:  0.35617637228435534
Micro Recall Train:  0.9793515508303312
Micro Recall Dev:  0.910306014773127
Micro Recall Test:  0.9142656359803233
Confusion Matrix Train: 
[[ 1504   458     0]
 [    4 20774     0]
 [    0     8    14]]
Confusion Matrix Dev: 
[[  24  248    0]
 [   5 2564    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  18  232    0]
 [   

### Word2Vec 300

In [18]:
w2v300_train, w2v300_dev, w2v300_test = load_word2vec_300()

In [19]:
# Set all NaN values to 0
w2v300_train = np.nan_to_num(w2v300_train)
w2v300_dev = np.nan_to_num(w2v300_dev)
w2v300_test = np.nan_to_num(w2v300_test)

In [20]:
grid_results = gridsearch.fit(w2v300_train, train_labels)
best_params = grid_results.best_params_
extratrees_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  77 out of 100 | elapsed:    0.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent w

building tree 1 of 100building tree 2 of 100building tree 3 of 100
building tree 4 of 100

building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100

building tree 9 of 100building tree 10 of 100

building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s


building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100building tree 51 of 100

building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.6s finished


In [21]:
extratrees_classifier = extratrees_classifier.fit(w2v300_train, train_labels)
save_model(extratrees_classifier, "extratrees_classifier_w2v300.joblib")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100building tree 23 of 100
building tree 24 of 100

building tree 25 of 100


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s


building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100building tree 42 of 100
building tree 43 of 100

building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.6s finished


In [22]:
train_preds = extratrees_classifier.predict(w2v300_train)
dev_preds = extratrees_classifier.predict(w2v300_dev)
test_preds = extratrees_classifier.predict(w2v300_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


In [23]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9792636850891837
Accuracy Dev:  0.9110094970102005
Accuracy Test:  0.9153197470133521
Weighted F1 Train:  0.977992775313842
Weighted F1 Dev:  0.8770454454786019
Weighted F1 Test:  0.8837323726046391
Macro F1 Train:  0.8775578892123784
Macro F1 Dev:  0.3730377141935493
Macro F1 Test:  0.3686607545767011
Micro F1 Train:  0.9792636850891837
Micro F1 Dev:  0.9110094970102005
Micro F1 Test:  0.9153197470133521
Weighted Recall Train:  0.9792636850891837
Weighted Recall Dev:  0.9110094970102005
Weighted Recall Test:  0.9153197470133521
Macro Recall Train:  0.800418310253607
Macro Recall Dev:  0.3634515795418985
Macro Recall Test:  0.36017637228435534
Micro Recall Train:  0.9792636850891837
Micro Recall Dev:  0.9110094970102005
Micro Recall Test:  0.9153197470133521
Confusion Matrix Train: 
[[ 1501   461     0]
 [    3 20775     0]
 [    0     8    14]]
Confusion Matrix Dev: 
[[  25  247    0]
 [   4 2565    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  21  229    0]
 [ 

### TF-IDF PCA (1000 Dims)

In [24]:
tfidf_pca_train, tfidf_pca_dev, tfidf_pca_test = load_tfidf_pca()

In [25]:
grid_results = gridsearch.fit(tfidf_pca_train, train_labels)
best_params = grid_results.best_params_
extratrees_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n

building tree 1 of 100
building tree 2 of 100
building tree 3 of 100building tree 4 of 100
building tree 5 of 100

building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100building tree 11 of 100
building tree 12 of 100

building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.6s


building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.2s finished


In [26]:
extratrees_classifier = extratrees_classifier.fit(tfidf_pca_train, train_labels)
save_model(extratrees_classifier, "extratrees_classifier_tfidf_pca.joblib")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100building tree 4 of 100

building tree 5 of 100building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100

building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.5s


building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.9s finished


In [27]:
train_preds = extratrees_classifier.predict(tfidf_pca_train)
dev_preds = extratrees_classifier.predict(tfidf_pca_dev)
test_preds = extratrees_classifier.predict(tfidf_pca_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


In [28]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9649855021527106
Accuracy Dev:  0.9096025325360535
Accuracy Test:  0.9125087842586086
Weighted F1 Train:  0.9607841927629873
Weighted F1 Dev:  0.8729535576261955
Weighted F1 Test:  0.8770468487884773
Macro F1 Train:  0.6580133466597712
Macro F1 Dev:  0.36067359558730266
Macro F1 Test:  0.3476464839077867
Micro F1 Train:  0.9649855021527106
Micro F1 Dev:  0.9096025325360535
Micro F1 Test:  0.9125087842586086
Weighted Recall Train:  0.9649855021527106
Weighted Recall Dev:  0.9096025325360535
Weighted Recall Test:  0.9125087842586086
Macro Recall Train:  0.5800970145681074
Macro Recall Dev:  0.3563581427121257
Macro Recall Test:  0.3483049235120195
Micro Recall Train:  0.9649855021527106
Micro Recall Dev:  0.9096025325360535
Micro Recall Test:  0.9125087842586086
Confusion Matrix Train: 
[[ 1185   777     0]
 [    1 20777     0]
 [    0    19     3]]
Confusion Matrix Dev: 
[[  19  253    0]
 [   2 2567    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  12  238    0]
 

### Sentence Transformer Faster No PCA

In [29]:
train, dev, test = load_sent_trans_fast_no_pca()

In [30]:
grid_results = gridsearch.fit(train, train_labels)
best_params = grid_results.best_params_
extratrees_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n

building tree 1 of 125building tree 2 of 125

building tree 3 of 125
building tree 4 of 125
building tree 5 of 125
building tree 6 of 125
building tree 7 of 125
building tree 8 of 125
building tree 9 of 125
building tree 10 of 125
building tree 11 of 125
building tree 12 of 125
building tree 13 of 125
building tree 14 of 125
building tree 15 of 125
building tree 16 of 125
building tree 17 of 125
building tree 18 of 125
building tree 19 of 125
building tree 20 of 125
building tree 21 of 125
building tree 22 of 125
building tree 23 of 125
building tree 24 of 125
building tree 25 of 125


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s


building tree 26 of 125
building tree 27 of 125
building tree 28 of 125
building tree 29 of 125
building tree 30 of 125
building tree 31 of 125
building tree 32 of 125
building tree 33 of 125
building tree 34 of 125
building tree 35 of 125
building tree 36 of 125
building tree 37 of 125
building tree 38 of 125
building tree 39 of 125
building tree 40 of 125
building tree 41 of 125
building tree 42 of 125
building tree 43 of 125
building tree 44 of 125
building tree 45 of 125
building tree 46 of 125
building tree 47 of 125
building tree 48 of 125
building tree 49 of 125
building tree 50 of 125
building tree 51 of 125
building tree 52 of 125
building tree 53 of 125
building tree 54 of 125
building tree 55 of 125
building tree 56 of 125
building tree 57 of 125
building tree 58 of 125
building tree 59 of 125
building tree 60 of 125
building tree 61 of 125
building tree 62 of 125
building tree 63 of 125
building tree 64 of 125
building tree 65 of 125
building tree 66 of 125
building tree 67

[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    2.4s finished


In [31]:
extratrees_classifier = extratrees_classifier.fit(train, train_labels)
save_model(extratrees_classifier, "extratrees_classifier_sent_trans_fast_no_pca.joblib")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 125
building tree 2 of 125
building tree 3 of 125
building tree 4 of 125
building tree 5 of 125building tree 6 of 125
building tree 7 of 125
building tree 8 of 125building tree 9 of 125

building tree 10 of 125

building tree 11 of 125
building tree 12 of 125
building tree 13 of 125
building tree 14 of 125
building tree 15 of 125
building tree 16 of 125
building tree 17 of 125building tree 18 of 125

building tree 19 of 125building tree 20 of 125

building tree 21 of 125
building tree 22 of 125
building tree 23 of 125
building tree 24 of 125


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s


building tree 25 of 125
building tree 26 of 125
building tree 27 of 125
building tree 28 of 125
building tree 29 of 125
building tree 30 of 125building tree 31 of 125

building tree 32 of 125
building tree 33 of 125
building tree 34 of 125
building tree 35 of 125
building tree 36 of 125
building tree 37 of 125
building tree 38 of 125
building tree 39 of 125
building tree 40 of 125
building tree 41 of 125
building tree 42 of 125
building tree 43 of 125
building tree 44 of 125
building tree 45 of 125
building tree 46 of 125
building tree 47 of 125
building tree 48 of 125
building tree 49 of 125
building tree 50 of 125
building tree 51 of 125
building tree 52 of 125
building tree 53 of 125
building tree 54 of 125
building tree 55 of 125
building tree 56 of 125
building tree 57 of 125
building tree 58 of 125
building tree 59 of 125
building tree 60 of 125
building tree 61 of 125
building tree 62 of 125
building tree 63 of 125
building tree 64 of 125
building tree 65 of 125
building tree 66

[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    2.3s finished


In [32]:
train_preds = extratrees_classifier.predict(train)
dev_preds = extratrees_classifier.predict(dev)
test_preds = extratrees_classifier.predict(test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed:    0.0s finished


In [33]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9972761620244267
Accuracy Dev:  0.9162856137882518
Accuracy Test:  0.9205903021784961
Weighted F1 Train:  0.9972468361543093
Weighted F1 Dev:  0.889897353531116
Weighted F1 Test:  0.8953040049779076
Macro F1 Train:  0.9518600370966007
Macro F1 Dev:  0.4105513533649203
Macro F1 Test:  0.4043680193248746
Micro F1 Train:  0.9972761620244267
Micro F1 Dev:  0.9162856137882518
Micro F1 Test:  0.920590302178496
Weighted Recall Train:  0.9972761620244267
Weighted Recall Dev:  0.9162856137882518
Weighted Recall Test:  0.9205903021784961
Macro Recall Train:  0.9148661324321713
Macro Recall Dev:  0.38731262259672267
Macro Recall Test:  0.38258593649569356
Micro Recall Train:  0.9972761620244267
Micro Recall Dev:  0.9162856137882518
Micro Recall Test:  0.9205903021784961
Confusion Matrix Train: 
[[ 1907    55     0]
 [    2 20776     0]
 [    0     5    17]]
Confusion Matrix Dev: 
[[  45  227    0]
 [   9 2560    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  38  212    0]
 [

### Sentence Transformer Faster PCA

In [34]:
train, dev, test = load_sent_trans_fast_pca()

In [35]:
grid_results = gridsearch.fit(train, train_labels)
best_params = grid_results.best_params_
extratrees_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n

building tree 1 of 100building tree 2 of 100

building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100building tree 9 of 100

building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s


building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100building tree 66 

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.1s finished


In [36]:
extratrees_classifier = extratrees_classifier.fit(train, train_labels)
save_model(extratrees_classifier, "extratrees_classifier_sent_trans_fast_pca.joblib")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100building tree 7 of 100

building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100building tree 21 of 100

building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100building tree 32 of 100

building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.1s finished


In [37]:
train_preds = extratrees_classifier.predict(train)
dev_preds = extratrees_classifier.predict(dev)
test_preds = extratrees_classifier.predict(test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


In [38]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9920042175555751
Accuracy Dev:  0.911712979247274
Accuracy Test:  0.9153197470133521
Weighted F1 Train:  0.9918290647267677
Weighted F1 Dev:  0.8785236329654765
Weighted F1 Test:  0.8837323726046391
Macro F1 Train:  0.9493824201821658
Macro F1 Dev:  0.37719073895867566
Macro F1 Test:  0.3686607545767011
Micro F1 Train:  0.9920042175555751
Micro F1 Dev:  0.911712979247274
Micro F1 Test:  0.9153197470133521
Weighted Recall Train:  0.9920042175555751
Weighted Recall Dev:  0.911712979247274
Weighted Recall Test:  0.9153197470133521
Macro Recall Train:  0.9093065410253404
Macro Recall Dev:  0.36590255993405535
Macro Recall Test:  0.36017637228435534
Micro Recall Train:  0.9920042175555751
Micro Recall Dev:  0.911712979247274
Micro Recall Test:  0.9153197470133521
Confusion Matrix Train: 
[[ 1785   177     0]
 [    1 20777     0]
 [    0     4    18]]
Confusion Matrix Dev: 
[[  27  245    0]
 [   4 2565    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  21  229    0]
 [ 

### Sentence Transformer Better No PCA

In [6]:
train, dev, test = load_sent_trans_better_no_pca()

In [7]:
grid_results = gridsearch.fit(train, train_labels)
best_params = grid_results.best_params_
extratrees_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.9s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n

building tree 1 of 125
building tree 2 of 125
building tree 3 of 125building tree 4 of 125
building tree 5 of 125
building tree 6 of 125
building tree 7 of 125
building tree 8 of 125building tree 9 of 125
building tree 10 of 125

building tree 11 of 125

building tree 12 of 125
building tree 13 of 125
building tree 14 of 125
building tree 15 of 125
building tree 16 of 125
building tree 17 of 125
building tree 18 of 125
building tree 19 of 125
building tree 20 of 125
building tree 21 of 125
building tree 22 of 125
building tree 23 of 125
building tree 24 of 125


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.3s


building tree 25 of 125
building tree 26 of 125
building tree 27 of 125
building tree 28 of 125
building tree 29 of 125
building tree 30 of 125
building tree 31 of 125
building tree 32 of 125
building tree 33 of 125
building tree 34 of 125
building tree 35 of 125
building tree 36 of 125
building tree 37 of 125
building tree 38 of 125
building tree 39 of 125
building tree 40 of 125
building tree 41 of 125
building tree 42 of 125
building tree 43 of 125
building tree 44 of 125
building tree 45 of 125
building tree 46 of 125
building tree 47 of 125
building tree 48 of 125
building tree 49 of 125
building tree 50 of 125
building tree 51 of 125
building tree 52 of 125
building tree 53 of 125
building tree 54 of 125
building tree 55 of 125
building tree 56 of 125
building tree 57 of 125
building tree 58 of 125
building tree 59 of 125
building tree 60 of 125
building tree 61 of 125
building tree 62 of 125
building tree 63 of 125
building tree 64 of 125
building tree 65 of 125
building tree 66

[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    4.0s finished


In [8]:
extratrees_classifier = extratrees_classifier.fit(train, train_labels)
save_model(extratrees_classifier, "extratrees_classifier_sent_trans_better_no_pca.joblib")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 125
building tree 2 of 125
building tree 3 of 125
building tree 4 of 125
building tree 5 of 125
building tree 6 of 125
building tree 7 of 125building tree 8 of 125
building tree 9 of 125
building tree 10 of 125
building tree 11 of 125
building tree 12 of 125

building tree 13 of 125
building tree 14 of 125
building tree 15 of 125
building tree 16 of 125
building tree 17 of 125
building tree 18 of 125
building tree 19 of 125
building tree 20 of 125
building tree 21 of 125
building tree 22 of 125
building tree 23 of 125
building tree 24 of 125


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.3s


building tree 25 of 125
building tree 26 of 125
building tree 27 of 125
building tree 28 of 125
building tree 29 of 125
building tree 30 of 125
building tree 31 of 125
building tree 32 of 125
building tree 33 of 125
building tree 34 of 125
building tree 35 of 125
building tree 36 of 125
building tree 37 of 125
building tree 38 of 125
building tree 39 of 125
building tree 40 of 125
building tree 41 of 125
building tree 42 of 125
building tree 43 of 125
building tree 44 of 125
building tree 45 of 125
building tree 46 of 125
building tree 47 of 125
building tree 48 of 125
building tree 49 of 125
building tree 50 of 125
building tree 51 of 125
building tree 52 of 125
building tree 53 of 125
building tree 54 of 125
building tree 55 of 125
building tree 56 of 125
building tree 57 of 125
building tree 58 of 125
building tree 59 of 125
building tree 60 of 125
building tree 61 of 125
building tree 62 of 125
building tree 63 of 125
building tree 64 of 125
building tree 65 of 125
building tree 66

[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    4.0s finished


In [9]:
train_preds = extratrees_classifier.predict(train)
dev_preds = extratrees_classifier.predict(dev)
test_preds = extratrees_classifier.predict(test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed:    0.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed:    0.0s finished


In [10]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9973200948950004
Accuracy Dev:  0.915933872669715
Accuracy Test:  0.9244553759662685
Weighted F1 Train:  0.9972907491048535
Weighted F1 Dev:  0.8896404064704567
Weighted F1 Test:  0.9032851892348208
Macro F1 Train:  0.9519529233144347
Macro F1 Dev:  0.4102049449147391
Macro F1 Test:  0.42862619808306707
Micro F1 Train:  0.9973200948950004
Micro F1 Dev:  0.915933872669715
Micro F1 Test:  0.9244553759662685
Weighted Recall Train:  0.9973200948950004
Weighted Recall Dev:  0.915933872669715
Weighted Recall Test:  0.9244553759662685
Macro Recall Train:  0.9148821750413413
Macro Recall Dev:  0.38718287042337374
Macro Recall Test:  0.3996621673736984
Micro Recall Train:  0.9973200948950004
Micro Recall Dev:  0.915933872669715
Micro Recall Test:  0.9244553759662685
Confusion Matrix Train: 
[[ 1907    55     0]
 [    1 20777     0]
 [    0     5    17]]
Confusion Matrix Dev: 
[[  45  227    0]
 [  10 2559    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  51  199    0]
 [  

### Sentence Transformer Better PCA

In [11]:
train, dev, test = load_sent_trans_better_pca()

In [12]:
grid_results = gridsearch.fit(train, train_labels)
best_params = grid_results.best_params_
extratrees_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n

building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100

building tree 10 of 100
building tree 11 of 100building tree 12 of 100

building tree 13 of 100building tree 14 of 100
building tree 15 of 100

building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100building tree 22 of 100
building tree 23 of 100

building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s


building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100building tree 64 of 100

building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100building tree 72 

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.4s finished


In [13]:
extratrees_classifier = extratrees_classifier.fit(train, train_labels)
save_model(extratrees_classifier, "extratrees_classifier_sent_trans_better_pca.joblib")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 100building tree 2 of 100

building tree 3 of 100
building tree 4 of 100
building tree 5 of 100building tree 6 of 100

building tree 7 of 100
building tree 8 of 100
building tree 9 of 100building tree 10 of 100
building tree 11 of 100
building tree 12 of 100

building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s


building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100building tree 68 of 100

building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.3s finished


In [14]:
train_preds = extratrees_classifier.predict(train)
dev_preds = extratrees_classifier.predict(dev)
test_preds = extratrees_classifier.predict(test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


In [15]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9927510763553291
Accuracy Dev:  0.9110094970102005
Accuracy Test:  0.9149683766690091
Weighted F1 Train:  0.9926037024897008
Weighted F1 Dev:  0.8770454454786019
Weighted F1 Test:  0.8829870711655989
Macro F1 Train:  0.9417848145568062
Macro F1 Dev:  0.3730377141935493
Macro F1 Test:  0.3663843047558257
Micro F1 Train:  0.9927510763553291
Micro F1 Dev:  0.9110094970102005
Micro F1 Test:  0.914968376669009
Weighted Recall Train:  0.9927510763553291
Weighted Recall Dev:  0.9110094970102005
Weighted Recall Test:  0.9149683766690091
Macro Recall Train:  0.8972131298493604
Macro Recall Dev:  0.3634515795418985
Macro Recall Test:  0.358843038951022
Micro Recall Train:  0.9927510763553291
Micro Recall Dev:  0.9110094970102005
Micro Recall Test:  0.9149683766690091
Confusion Matrix Train: 
[[ 1803   159     0]
 [    1 20777     0]
 [    0     5    17]]
Confusion Matrix Dev: 
[[  25  247    0]
 [   4 2565    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  20  230    0]
 [  