In [1]:
from embeddings_loader import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from utils import *

In [2]:
train_labels, dev_labels, test_labels = load_labels()

In [3]:
label_replacement = {
    'Hope_speech': 0,
    'Non_hope_speech': 1,
    'not-English': 2,
}

In [4]:
# Replace labels with numbers
train_labels = [label_replacement[label] for label in train_labels]
dev_labels = [label_replacement[label] for label in dev_labels]
test_labels = [label_replacement[label] for label in test_labels]

In [5]:
random_forest_classifier = RandomForestClassifier(verbose=3,n_jobs=os.cpu_count()//2)
gridsearch = GridSearchCV(random_forest_classifier, param_grid = {
    "n_estimators": [100, 125, 150],
    "max_depth": [5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "bootstrap": [True, False],
}, scoring = "f1_micro")

### Glove Twitter 25

In [6]:
gt25_train, gt25_dev, gt25_test = load_glove_twitter_25()

In [7]:
# Set all NaN values to 0
gt25_train = np.nan_to_num(gt25_train)
gt25_dev = np.nan_to_num(gt25_dev)
gt25_test = np.nan_to_num(gt25_test)

In [8]:
grid_results = gridsearch.fit(gt25_train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    1.7s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    2.4s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  28 tasks      | elapsed:    0.2s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  28 tasks      | elapsed:    0.2s
[Parallel(n_jobs=6)]: Done 100

building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100


[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.7s


building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    3.5s finished


In [9]:
random_forest_classifier = random_forest_classifier.fit(gt25_train, train_labels)
save_model(random_forest_classifier, "random_forest_gt25")

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100


[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.8s


building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    3.5s finished


In [10]:
train_preds = random_forest_classifier.predict(gt25_train)
dev_preds = random_forest_classifier.predict(gt25_dev)
test_preds = random_forest_classifier.predict(gt25_test)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished


In [11]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9961778402600826
Accuracy Dev:  0.9120647203658108
Accuracy Test:  0.9177793394237527
Weighted F1 Train:  0.9960377218304128
Weighted F1 Dev:  0.8816645610136377
Weighted F1 Test:  0.8902444007904476
Macro F1 Train:  0.8531412457258272
Macro F1 Dev:  0.3878787159931522
Macro F1 Test:  0.38965443707667785
Micro F1 Train:  0.9961778402600826
Micro F1 Dev:  0.9120647203658108
Micro F1 Test:  0.9177793394237527
Weighted Recall Train:  0.9961778402600826
Weighted Recall Dev:  0.9120647203658108
Weighted Recall Test:  0.9177793394237527
Macro Recall Train:  0.7918427663027853
Macro Recall Dev:  0.37260674024378143
Macro Recall Test:  0.37312405193469594
Micro Recall Train:  0.9961778402600826
Micro Recall Dev:  0.9120647203658108
Micro Recall Test:  0.9177793394237527
Confusion Matrix Train: 
[[ 1897    65     0]
 [    9 20769     0]
 [    0    13     9]]
Confusion Matrix Dev: 
[[  33  239    0]
 [   9 2560    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  31  219    0]

### FastText 300 

In [12]:
ft300_train, ft300_dev, ft300_test = load_fasttext_300()

In [13]:
# Set all NaN values to 0
ft300_train = np.nan_to_num(ft300_train)
ft300_dev = np.nan_to_num(ft300_dev)
ft300_test = np.nan_to_num(ft300_test)

In [14]:
grid_results = gridsearch.fit(ft300_train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  28 tasks      | elapsed:    0.9s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    2.7s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  28 tasks      | elapsed:    0.8s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    2.5s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  28 tasks      | elapsed:    0.8s
[Parallel(n_jobs=6)]: Done 100

building tree 1 of 125
building tree 2 of 125
building tree 3 of 125
building tree 4 of 125
building tree 5 of 125
building tree 6 of 125
building tree 7 of 125
building tree 8 of 125
building tree 9 of 125
building tree 10 of 125
building tree 11 of 125
building tree 12 of 125
building tree 13 of 125
building tree 14 of 125
building tree 15 of 125
building tree 16 of 125
building tree 17 of 125
building tree 18 of 125
building tree 19 of 125
building tree 20 of 125
building tree 21 of 125
building tree 22 of 125
building tree 23 of 125
building tree 24 of 125
building tree 25 of 125
building tree 26 of 125
building tree 27 of 125
building tree 28 of 125
building tree 29 of 125
building tree 30 of 125


[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    2.4s


building tree 31 of 125
building tree 32 of 125
building tree 33 of 125
building tree 34 of 125
building tree 35 of 125
building tree 36 of 125
building tree 37 of 125
building tree 38 of 125
building tree 39 of 125
building tree 40 of 125
building tree 41 of 125
building tree 42 of 125
building tree 43 of 125
building tree 44 of 125
building tree 45 of 125
building tree 46 of 125
building tree 47 of 125
building tree 48 of 125
building tree 49 of 125
building tree 50 of 125
building tree 51 of 125
building tree 52 of 125
building tree 53 of 125
building tree 54 of 125
building tree 55 of 125
building tree 56 of 125
building tree 57 of 125
building tree 58 of 125
building tree 59 of 125
building tree 60 of 125
building tree 61 of 125
building tree 62 of 125
building tree 63 of 125
building tree 64 of 125
building tree 65 of 125
building tree 66 of 125
building tree 67 of 125
building tree 68 of 125
building tree 69 of 125
building tree 70 of 125
building tree 71 of 125
building tree 72

[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:   12.9s finished


In [15]:
random_forest_classifier = random_forest_classifier.fit(ft300_train, train_labels)
save_model(random_forest_classifier, "random_forest_ft300")

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.


building tree 1 of 125
building tree 2 of 125
building tree 3 of 125
building tree 4 of 125
building tree 5 of 125
building tree 6 of 125
building tree 7 of 125
building tree 8 of 125
building tree 9 of 125
building tree 10 of 125
building tree 11 of 125
building tree 12 of 125
building tree 13 of 125
building tree 14 of 125
building tree 15 of 125
building tree 16 of 125
building tree 17 of 125
building tree 18 of 125
building tree 19 of 125
building tree 20 of 125
building tree 21 of 125
building tree 22 of 125
building tree 23 of 125
building tree 24 of 125
building tree 25 of 125
building tree 26 of 125
building tree 27 of 125
building tree 28 of 125
building tree 29 of 125


[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    2.3s


building tree 30 of 125
building tree 31 of 125
building tree 32 of 125
building tree 33 of 125
building tree 34 of 125
building tree 35 of 125
building tree 36 of 125
building tree 37 of 125
building tree 38 of 125
building tree 39 of 125
building tree 40 of 125
building tree 41 of 125
building tree 42 of 125
building tree 43 of 125
building tree 44 of 125
building tree 45 of 125
building tree 46 of 125
building tree 47 of 125
building tree 48 of 125
building tree 49 of 125
building tree 50 of 125
building tree 51 of 125
building tree 52 of 125
building tree 53 of 125
building tree 54 of 125
building tree 55 of 125
building tree 56 of 125
building tree 57 of 125
building tree 58 of 125
building tree 59 of 125
building tree 60 of 125
building tree 61 of 125
building tree 62 of 125
building tree 63 of 125
building tree 64 of 125
building tree 65 of 125
building tree 66 of 125
building tree 67 of 125
building tree 68 of 125
building tree 69 of 125
building tree 70 of 125
building tree 71

[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:   13.1s finished


In [16]:
train_preds = random_forest_classifier.predict(ft300_train)
dev_preds = random_forest_classifier.predict(ft300_dev)
test_preds = random_forest_classifier.predict(ft300_test)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:    0.0s finished


In [17]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9971882962832791
Accuracy Dev:  0.915933872669715
Accuracy Test:  0.9167252283907238
Weighted F1 Train:  0.9971669194625724
Weighted F1 Dev:  0.887528096347266
Weighted F1 Test:  0.8871656133086411
Macro F1 Train:  0.9517079357268635
Macro F1 Dev:  0.4026115116779588
Macro F1 Test:  0.37953054021320193
Micro F1 Train:  0.9971882962832791
Micro F1 Dev:  0.915933872669715
Micro F1 Test:  0.9167252283907238
Weighted Recall Train:  0.9971882962832791
Weighted Recall Dev:  0.915933872669715
Weighted Recall Test:  0.9167252283907238
Macro Recall Train:  0.9168341239436195
Macro Recall Dev:  0.3817041803097261
Macro Recall Test:  0.36671448772335774
Micro Recall Train:  0.9971882962832791
Micro Recall Dev:  0.915933872669715
Micro Recall Test:  0.9167252283907238
Confusion Matrix Train: 
[[ 1920    42     0]
 [   17 20761     0]
 [    0     5    17]]
Confusion Matrix Dev: 
[[  40  232    0]
 [   5 2564    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  26  224    0]
 [   

### Word2Vec 300

In [18]:
w2v300_train, w2v300_dev, w2v300_test = load_word2vec_300()

In [19]:
# Set all NaN values to 0
w2v300_train = np.nan_to_num(w2v300_train)
w2v300_dev = np.nan_to_num(w2v300_dev)
w2v300_test = np.nan_to_num(w2v300_test)

In [20]:
grid_results = gridsearch.fit(w2v300_train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.6s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.6s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.6s
[Parallel(n_jobs=6)]: Done 100

building tree 1 of 125
building tree 2 of 125
building tree 3 of 125
building tree 4 of 125
building tree 5 of 125
building tree 6 of 125
building tree 7 of 125
building tree 8 of 125
building tree 9 of 125
building tree 10 of 125
building tree 11 of 125
building tree 12 of 125
building tree 13 of 125
building tree 14 of 125
building tree 15 of 125
building tree 16 of 125
building tree 17 of 125
building tree 18 of 125
building tree 19 of 125
building tree 20 of 125
building tree 21 of 125
building tree 22 of 125
building tree 23 of 125
building tree 24 of 125
building tree 25 of 125
building tree 26 of 125
building tree 27 of 125
building tree 28 of 125
building tree 29 of 125
building tree 30 of 125


[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    2.4s


building tree 31 of 125
building tree 32 of 125
building tree 33 of 125
building tree 34 of 125
building tree 35 of 125
building tree 36 of 125
building tree 37 of 125
building tree 38 of 125
building tree 39 of 125
building tree 40 of 125
building tree 41 of 125
building tree 42 of 125
building tree 43 of 125
building tree 44 of 125
building tree 45 of 125
building tree 46 of 125
building tree 47 of 125
building tree 48 of 125
building tree 49 of 125
building tree 50 of 125
building tree 51 of 125
building tree 52 of 125
building tree 53 of 125
building tree 54 of 125
building tree 55 of 125
building tree 56 of 125
building tree 57 of 125
building tree 58 of 125
building tree 59 of 125
building tree 60 of 125
building tree 61 of 125
building tree 62 of 125
building tree 63 of 125
building tree 64 of 125
building tree 65 of 125
building tree 66 of 125
building tree 67 of 125
building tree 68 of 125
building tree 69 of 125
building tree 70 of 125
building tree 71 of 125
building tree 72

[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:   13.0s finished


In [21]:
random_forest_classifier = random_forest_classifier.fit(w2v300_train, train_labels)
save_model(random_forest_classifier, "random_forest_w2v300")

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.


building tree 1 of 125
building tree 2 of 125
building tree 3 of 125
building tree 4 of 125
building tree 5 of 125
building tree 6 of 125
building tree 7 of 125
building tree 8 of 125
building tree 9 of 125
building tree 10 of 125
building tree 11 of 125
building tree 12 of 125
building tree 13 of 125
building tree 14 of 125
building tree 15 of 125
building tree 16 of 125
building tree 17 of 125
building tree 18 of 125
building tree 19 of 125
building tree 20 of 125
building tree 21 of 125
building tree 22 of 125
building tree 23 of 125
building tree 24 of 125
building tree 25 of 125
building tree 26 of 125
building tree 27 of 125
building tree 28 of 125
building tree 29 of 125
building tree 30 of 125


[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    2.4s


building tree 31 of 125
building tree 32 of 125
building tree 33 of 125
building tree 34 of 125
building tree 35 of 125
building tree 36 of 125
building tree 37 of 125
building tree 38 of 125
building tree 39 of 125
building tree 40 of 125
building tree 41 of 125
building tree 42 of 125
building tree 43 of 125
building tree 44 of 125
building tree 45 of 125
building tree 46 of 125
building tree 47 of 125
building tree 48 of 125
building tree 49 of 125
building tree 50 of 125
building tree 51 of 125
building tree 52 of 125
building tree 53 of 125
building tree 54 of 125
building tree 55 of 125
building tree 56 of 125
building tree 57 of 125
building tree 58 of 125
building tree 59 of 125
building tree 60 of 125
building tree 61 of 125
building tree 62 of 125
building tree 63 of 125
building tree 64 of 125
building tree 65 of 125
building tree 66 of 125
building tree 67 of 125
building tree 68 of 125
building tree 69 of 125
building tree 70 of 125
building tree 71 of 125
building tree 72

[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:   13.0s finished


In [22]:
train_preds = random_forest_classifier.predict(w2v300_train)
dev_preds = random_forest_classifier.predict(w2v300_dev)
test_preds = random_forest_classifier.predict(w2v300_test)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:    0.0s finished


In [23]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9971004305421316
Accuracy Dev:  0.9152303904326415
Accuracy Test:  0.9177793394237527
Weighted F1 Train:  0.99708621722539
Weighted F1 Dev:  0.8878822534320885
Weighted F1 Test:  0.8893126805653159
Macro F1 Train:  0.9515525194232911
Macro F1 Dev:  0.40503587331122287
Macro F1 Test:  0.38601851851851854
Micro F1 Train:  0.9971004305421316
Micro F1 Dev:  0.9152303904326415
Micro F1 Test:  0.9177793394237527
Weighted Recall Train:  0.9971004305421316
Weighted Recall Dev:  0.9152303904326415
Weighted Recall Test:  0.9177793394237527
Macro Recall Train:  0.9184944113427927
Macro Recall Dev:  0.3836361520084874
Macro Recall Test:  0.37071448772335774
Micro Recall Train:  0.9971004305421316
Micro Recall Dev:  0.9152303904326415
Micro Recall Test:  0.9177793394237527
Confusion Matrix Train: 
[[ 1931    31     0]
 [   30 20748     0]
 [    0     5    17]]
Confusion Matrix Dev: 
[[  42  230    0]
 [   9 2560    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  29  221    0]
 

### TF-IDF PCA (1000 Dims)

In [24]:
tfidf_pca_train, tfidf_pca_dev, tfidf_pca_test = load_tfidf_pca()

In [25]:
grid_results = gridsearch.fit(tfidf_pca_train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    1.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    4.9s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    1.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    4.9s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    1.1s
[Parallel(n_jobs=6)]: Done 100

building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100


[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    6.8s


building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   29.3s finished


In [26]:
random_forest_classifier = random_forest_classifier.fit(tfidf_pca_train, train_labels)
save_model(random_forest_classifier, "random_forest_tfidf_pca")

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100


[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    6.7s


building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   28.9s finished


In [27]:
train_preds = random_forest_classifier.predict(tfidf_pca_train)
dev_preds = random_forest_classifier.predict(tfidf_pca_dev)
test_preds = random_forest_classifier.predict(tfidf_pca_test)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished


In [28]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9869080045690185
Accuracy Dev:  0.9120647203658108
Accuracy Test:  0.9153197470133521
Weighted F1 Train:  0.9861261764325336
Weighted F1 Dev:  0.8792571015609638
Weighted F1 Test:  0.884199677360386
Macro F1 Train:  0.6944467325127307
Macro F1 Dev:  0.3792473749429335
Macro F1 Test:  0.37048550583015355
Micro F1 Train:  0.9869080045690185
Micro F1 Dev:  0.9120647203658108
Micro F1 Test:  0.9153197470133521
Weighted Recall Train:  0.9869080045690185
Weighted Recall Dev:  0.9120647203658108
Weighted Recall Test:  0.9153197470133521
Macro Recall Train:  0.6498928320703469
Macro Recall Dev:  0.3671280501301338
Macro Recall Test:  0.36138115439002444
Micro Recall Train:  0.9869080045690185
Micro Recall Dev:  0.9120647203658108
Micro Recall Test:  0.9153197470133521
Confusion Matrix Train: 
[[ 1685   277     0]
 [    1 20777     0]
 [    0    20     2]]
Confusion Matrix Dev: 
[[  28  244    0]
 [   4 2565    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  22  228    0]
 

### Sentence Transformer Faster No PCA

In [29]:
train, dev, test = load_sent_trans_fast_no_pca()

In [30]:
grid_results = gridsearch.fit(train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.7s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    3.0s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.6s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    2.9s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.7s
[Parallel(n_jobs=6)]: Done 100

building tree 1 of 125
building tree 2 of 125
building tree 3 of 125
building tree 4 of 125
building tree 5 of 125
building tree 6 of 125
building tree 7 of 125
building tree 8 of 125
building tree 9 of 125
building tree 10 of 125
building tree 11 of 125
building tree 12 of 125
building tree 13 of 125
building tree 14 of 125
building tree 15 of 125
building tree 16 of 125
building tree 17 of 125
building tree 18 of 125
building tree 19 of 125
building tree 20 of 125
building tree 21 of 125
building tree 22 of 125
building tree 23 of 125
building tree 24 of 125
building tree 25 of 125
building tree 26 of 125
building tree 27 of 125
building tree 28 of 125


[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    4.6s


building tree 29 of 125
building tree 30 of 125
building tree 31 of 125
building tree 32 of 125
building tree 33 of 125
building tree 34 of 125
building tree 35 of 125
building tree 36 of 125
building tree 37 of 125
building tree 38 of 125
building tree 39 of 125
building tree 40 of 125
building tree 41 of 125
building tree 42 of 125
building tree 43 of 125
building tree 44 of 125
building tree 45 of 125
building tree 46 of 125
building tree 47 of 125
building tree 48 of 125
building tree 49 of 125
building tree 50 of 125
building tree 51 of 125
building tree 52 of 125
building tree 53 of 125
building tree 54 of 125
building tree 55 of 125
building tree 56 of 125
building tree 57 of 125
building tree 58 of 125
building tree 59 of 125
building tree 60 of 125
building tree 61 of 125
building tree 62 of 125
building tree 63 of 125
building tree 64 of 125
building tree 65 of 125
building tree 66 of 125
building tree 67 of 125
building tree 68 of 125
building tree 69 of 125
building tree 70

[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:   23.1s finished


In [31]:
random_forest_classifier = random_forest_classifier.fit(train, train_labels)
save_model(random_forest_classifier, "random_forest_faster_no_pca")

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.


building tree 1 of 125
building tree 2 of 125
building tree 3 of 125
building tree 4 of 125
building tree 5 of 125
building tree 6 of 125
building tree 7 of 125
building tree 8 of 125
building tree 9 of 125
building tree 10 of 125
building tree 11 of 125
building tree 12 of 125
building tree 13 of 125
building tree 14 of 125
building tree 15 of 125
building tree 16 of 125
building tree 17 of 125
building tree 18 of 125
building tree 19 of 125
building tree 20 of 125
building tree 21 of 125
building tree 22 of 125
building tree 23 of 125
building tree 24 of 125
building tree 25 of 125
building tree 26 of 125
building tree 27 of 125
building tree 28 of 125
building tree 29 of 125
building tree 30 of 125


[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    4.1s


building tree 31 of 125
building tree 32 of 125
building tree 33 of 125
building tree 34 of 125
building tree 35 of 125
building tree 36 of 125
building tree 37 of 125
building tree 38 of 125
building tree 39 of 125
building tree 40 of 125
building tree 41 of 125
building tree 42 of 125
building tree 43 of 125
building tree 44 of 125
building tree 45 of 125
building tree 46 of 125
building tree 47 of 125
building tree 48 of 125
building tree 49 of 125
building tree 50 of 125
building tree 51 of 125
building tree 52 of 125
building tree 53 of 125
building tree 54 of 125
building tree 55 of 125
building tree 56 of 125
building tree 57 of 125
building tree 58 of 125
building tree 59 of 125
building tree 60 of 125
building tree 61 of 125
building tree 62 of 125
building tree 63 of 125
building tree 64 of 125
building tree 65 of 125
building tree 66 of 125
building tree 67 of 125
building tree 68 of 125
building tree 69 of 125
building tree 70 of 125
building tree 71 of 125
building tree 72

[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:   22.5s finished


In [32]:
train_preds = random_forest_classifier.predict(train)
dev_preds = random_forest_classifier.predict(dev)
test_preds = random_forest_classifier.predict(test)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 125 out of 125 | elapsed:    0.0s finished


In [33]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9962217731306564
Accuracy Dev:  0.9169890960253254
Accuracy Test:  0.9230498945888967
Weighted F1 Train:  0.99613126467102
Weighted F1 Dev:  0.8927910434135883
Weighted F1 Test:  0.9011103445089564
Macro F1 Train:  0.8947023966423554
Macro F1 Dev:  0.41981426078284717
Macro F1 Test:  0.4226508803225042
Micro F1 Train:  0.9962217731306564
Micro F1 Dev:  0.9169890960253254
Micro F1 Test:  0.9230498945888967
Weighted Recall Train:  0.9962217731306564
Weighted Recall Dev:  0.9169890960253254
Weighted Recall Test:  0.9230498945888967
Macro Recall Train:  0.8361882621460279
Macro Recall Dev:  0.3941465550797976
Macro Recall Test:  0.39553361614603416
Micro Recall Train:  0.9962217731306564
Micro Recall Dev:  0.9169890960253254
Micro Recall Test:  0.9230498945888967
Confusion Matrix Train: 
[[ 1890    72     0]
 [    4 20774     0]
 [    0    10    12]]
Confusion Matrix Dev: 
[[  51  221    0]
 [  13 2556    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  48  202    0]
 [

### Sentence Transformer Faster PCA

In [34]:
train, dev, test = load_sent_trans_fast_pca()

In [35]:
grid_results = gridsearch.fit(train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  28 tasks      | elapsed:    0.8s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    2.5s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.6s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.6s
[Parallel(n_jobs=6)]: Done 100

KeyboardInterrupt: 

In [None]:
random_forest_classifier = random_forest_classifier.fit(train, train_labels)
save_model(random_forest_classifier, "random_forest_faster_pca")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 125
building tree 2 of 125
building tree 3 of 125
building tree 4 of 125
building tree 5 of 125
building tree 6 of 125
building tree 7 of 125
building tree 8 of 125
building tree 9 of 125
building tree 10 of 125
building tree 11 of 125
building tree 12 of 125
building tree 13 of 125
building tree 14 of 125
building tree 15 of 125
building tree 16 of 125
building tree 17 of 125
building tree 18 of 125
building tree 19 of 125
building tree 20 of 125
building tree 21 of 125
building tree 22 of 125
building tree 23 of 125


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.9s


building tree 24 of 125
building tree 25 of 125
building tree 26 of 125
building tree 27 of 125
building tree 28 of 125
building tree 29 of 125
building tree 30 of 125
building tree 31 of 125
building tree 32 of 125
building tree 33 of 125
building tree 34 of 125
building tree 35 of 125
building tree 36 of 125
building tree 37 of 125
building tree 38 of 125
building tree 39 of 125
building tree 40 of 125
building tree 41 of 125
building tree 42 of 125
building tree 43 of 125
building tree 44 of 125
building tree 45 of 125
building tree 46 of 125
building tree 47 of 125
building tree 48 of 125
building tree 49 of 125
building tree 50 of 125
building tree 51 of 125
building tree 52 of 125
building tree 53 of 125
building tree 54 of 125
building tree 55 of 125
building tree 56 of 125
building tree 57 of 125
building tree 58 of 125
building tree 59 of 125
building tree 60 of 125
building tree 61 of 125
building tree 62 of 125
building tree 63 of 125
building tree 64 of 125
building tree 65

[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   10.6s finished


In [None]:
train_preds = random_forest_classifier.predict(train)
dev_preds = random_forest_classifier.predict(dev)
test_preds = random_forest_classifier.predict(test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed:    0.0s finished


In [None]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9964853703540989
Accuracy Dev:  0.9187478016180092
Accuracy Test:  0.9226985242445538
F1 Train:  0.9403243190373994
F1 Dev:  0.4244107560905512
F1 Test:  0.420772801976991
Precision Train:  0.9985610068943402
Precision Dev:  0.5872578127973811
Precision Test:  0.5697885497705965
Recall Train:  0.8966725559142908
Recall Dev:  0.39698679199200115
Recall Test:  0.39420028281270086
Confusion Matrix Train: 
[[ 1889    73     0]
 [    1 20777     0]
 [    0     6    16]]
Confusion Matrix Dev: 
[[  53  219    0]
 [  10 2559    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  47  203    0]
 [  13 2579    1]
 [   0    3    0]]


  _warn_prf(average, modifier, msg_start, len(result))


### Sentence Transformer Better No PCA

In [None]:
train, dev, test = load_sent_trans_better_no_pca()

In [None]:
grid_results = gridsearch.fit(train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.9s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.7s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.3s
[Parallel(n

building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100

[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    1.6s



building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 6

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   14.6s finished


In [None]:
random_forest_classifier = random_forest_classifier.fit(train, train_labels)
save_model(random_forest_classifier, "random_forest_better_no_pca")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100

building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    1.6s


building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   14.8s finished


In [None]:
train_preds = random_forest_classifier.predict(train)
dev_preds = random_forest_classifier.predict(dev)
test_preds = random_forest_classifier.predict(test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


In [None]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9957824444249187
Accuracy Dev:  0.9194512838550827
Accuracy Test:  0.9276177090653549
F1 Train:  0.880759214599236
F1 Dev:  0.4367787231716227
F1 Test:  0.44731653253333886
Precision Train:  0.9976629039044457
Precision Dev:  0.5693698974762925
Precision Test:  0.5822705010992631
Recall Train:  0.8196615470628829
Recall Dev:  0.4071079385432647
Recall Test:  0.41407173158503663
Confusion Matrix Train: 
[[ 1882    80     0]
 [    5 20773     0]
 [    0    11    11]]
Confusion Matrix Dev: 
[[  62  210    0]
 [  17 2552    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  62  188    0]
 [  14 2578    1]
 [   0    3    0]]


  _warn_prf(average, modifier, msg_start, len(result))


### Sentence Transformer Better PCA

In [None]:
train, dev, test = load_sent_trans_better_pca()

In [None]:
grid_results = gridsearch.fit(train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.7s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.7s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n

building tree 1 of 150
building tree 2 of 150
building tree 3 of 150
building tree 4 of 150building tree 5 of 150building tree 6 of 150
building tree 7 of 150
building tree 8 of 150
building tree 9 of 150
building tree 10 of 150


building tree 11 of 150
building tree 12 of 150
building tree 13 of 150
building tree 14 of 150
building tree 15 of 150
building tree 16 of 150
building tree 17 of 150
building tree 18 of 150
building tree 19 of 150
building tree 20 of 150
building tree 21 of 150
building tree 22 of 150
building tree 23 of 150
building tree 24 of 150


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.9s


building tree 25 of 150
building tree 26 of 150
building tree 27 of 150
building tree 28 of 150
building tree 29 of 150
building tree 30 of 150
building tree 31 of 150
building tree 32 of 150
building tree 33 of 150
building tree 34 of 150
building tree 35 of 150
building tree 36 of 150
building tree 37 of 150
building tree 38 of 150
building tree 39 of 150
building tree 40 of 150
building tree 41 of 150
building tree 42 of 150
building tree 43 of 150
building tree 44 of 150
building tree 45 of 150
building tree 46 of 150
building tree 47 of 150
building tree 48 of 150
building tree 49 of 150
building tree 50 of 150
building tree 51 of 150
building tree 52 of 150
building tree 53 of 150
building tree 54 of 150
building tree 55 of 150
building tree 56 of 150
building tree 57 of 150
building tree 58 of 150
building tree 59 of 150
building tree 60 of 150
building tree 61 of 150
building tree 62 of 150
building tree 63 of 150
building tree 64 of 150
building tree 65 of 150
building tree 66

[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:    9.1s


building tree 120 of 150
building tree 121 of 150
building tree 122 of 150
building tree 123 of 150
building tree 124 of 150building tree 125 of 150

building tree 126 of 150
building tree 127 of 150
building tree 128 of 150
building tree 129 of 150
building tree 130 of 150
building tree 131 of 150
building tree 132 of 150
building tree 133 of 150
building tree 134 of 150
building tree 135 of 150
building tree 136 of 150
building tree 137 of 150
building tree 138 of 150
building tree 139 of 150
building tree 140 of 150
building tree 141 of 150
building tree 142 of 150
building tree 143 of 150
building tree 144 of 150
building tree 145 of 150
building tree 146 of 150
building tree 147 of 150
building tree 148 of 150
building tree 149 of 150
building tree 150 of 150
{'bootstrap': False, 'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 150}


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   12.8s finished


In [None]:
random_forest_classifier = random_forest_classifier.fit(train, train_labels)
save_model(random_forest_classifier, "random_forest_better_pca")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 150
building tree 2 of 150
building tree 3 of 150
building tree 4 of 150
building tree 5 of 150
building tree 6 of 150
building tree 7 of 150
building tree 8 of 150
building tree 9 of 150
building tree 10 of 150
building tree 11 of 150
building tree 12 of 150
building tree 13 of 150
building tree 14 of 150
building tree 15 of 150
building tree 16 of 150building tree 17 of 150

building tree 18 of 150
building tree 19 of 150
building tree 20 of 150
building tree 21 of 150
building tree 22 of 150
building tree 23 of 150


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.9s


building tree 24 of 150
building tree 25 of 150
building tree 26 of 150
building tree 27 of 150
building tree 28 of 150
building tree 29 of 150
building tree 30 of 150
building tree 31 of 150
building tree 32 of 150
building tree 33 of 150
building tree 34 of 150
building tree 35 of 150
building tree 36 of 150
building tree 37 of 150
building tree 38 of 150
building tree 39 of 150
building tree 40 of 150
building tree 41 of 150
building tree 42 of 150
building tree 43 of 150
building tree 44 of 150
building tree 45 of 150
building tree 46 of 150
building tree 47 of 150
building tree 48 of 150
building tree 49 of 150
building tree 50 of 150
building tree 51 of 150
building tree 52 of 150
building tree 53 of 150
building tree 54 of 150
building tree 55 of 150
building tree 56 of 150
building tree 57 of 150
building tree 58 of 150
building tree 59 of 150
building tree 60 of 150
building tree 61 of 150
building tree 62 of 150
building tree 63 of 150
building tree 64 of 150
building tree 65

[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:    9.2s


building tree 117 of 150
building tree 118 of 150
building tree 119 of 150
building tree 120 of 150
building tree 121 of 150
building tree 122 of 150
building tree 123 of 150
building tree 124 of 150
building tree 125 of 150
building tree 126 of 150
building tree 127 of 150
building tree 128 of 150
building tree 129 of 150
building tree 130 of 150
building tree 131 of 150
building tree 132 of 150
building tree 133 of 150
building tree 134 of 150
building tree 135 of 150
building tree 136 of 150
building tree 137 of 150
building tree 138 of 150
building tree 139 of 150
building tree 140 of 150
building tree 141 of 150
building tree 142 of 150
building tree 143 of 150
building tree 144 of 150
building tree 145 of 150
building tree 146 of 150
building tree 147 of 150
building tree 148 of 150
building tree 149 of 150
building tree 150 of 150


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   13.2s finished


In [None]:
train_preds = random_forest_classifier.predict(train)
dev_preds = random_forest_classifier.predict(dev)
test_preds = random_forest_classifier.predict(test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 150 out of 150 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 150 out of 150 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 150 out of 150 | elapsed:    0.0s finished


In [None]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9946401897900009
Accuracy Dev:  0.915933872669715
Accuracy Test:  0.9234012649332396
F1 Train:  0.8782145491415165
F1 Dev:  0.41166802418366877
F1 Test:  0.4212288912148154
Precision Train:  0.9978901022795669
Precision Dev:  0.5750599278766431
Precision Test:  0.5742318553250665
Recall Train:  0.8146288775403373
Recall Dev:  0.38827860844610323
Recall Test:  0.3944573852680293
Confusion Matrix Train: 
[[ 1852   110     0]
 [    1 20777     0]
 [    0    11    11]]
Confusion Matrix Dev: 
[[  46  226    0]
 [  11 2558    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  47  203    0]
 [  12 2581    0]
 [   0    3    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
