In [1]:
from embeddings_loader import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score


In [2]:
train_labels, dev_labels, test_labels = load_labels()

In [3]:
def computeAllScores(y_pred_train, y_pred_dev, y_pred_test):
    print("Accuracy Train: ", accuracy_score(train_labels, y_pred_train))
    print("Accuracy Dev: ", accuracy_score(dev_labels, y_pred_dev))
    print("Accuracy Test: ", accuracy_score(test_labels, y_pred_test))
    print("F1 Train: ", f1_score(train_labels, y_pred_train, average='macro'))
    print("F1 Dev: ", f1_score(dev_labels, y_pred_dev, average='macro'))
    print("F1 Test: ", f1_score(test_labels, y_pred_test, average='macro'))
    print("Precision Train: ", precision_score(train_labels, y_pred_train, average='macro'))
    print("Precision Dev: ", precision_score(dev_labels, y_pred_dev, average='macro'))
    print("Precision Test: ", precision_score(test_labels, y_pred_test, average='macro'))
    print("Recall Train: ", recall_score(train_labels, y_pred_train, average='macro'))
    print("Recall Dev: ", recall_score(dev_labels, y_pred_dev, average='macro'))
    print("Recall Test: ", recall_score(test_labels, y_pred_test, average='macro'))
    # Confusion Matrix
    print("Confusion Matrix Train: ")
    print(confusion_matrix(train_labels, y_pred_train))
    print("Confusion Matrix Dev: ")
    print(confusion_matrix(dev_labels, y_pred_dev))
    print("Confusion Matrix Test: ")
    print(confusion_matrix(test_labels, y_pred_test))

In [4]:
label_replacement = {
    'Hope_speech': 0,
    'Non_hope_speech': 1,
    'not-English': 2,
}

In [5]:
# Replace labels with numbers
train_labels = [label_replacement[label] for label in train_labels]
dev_labels = [label_replacement[label] for label in dev_labels]
test_labels = [label_replacement[label] for label in test_labels]

In [6]:
random_forest_classifier = RandomForestClassifier(verbose=3,n_jobs=-1)
gridsearch = GridSearchCV(random_forest_classifier, param_grid = {
    "n_estimators": [100, 125, 150],
    "max_depth": [5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "bootstrap": [True, False],
}, scoring = "f1_micro")

### Glove Twitter 25

In [7]:
gt25_train, gt25_dev, gt25_test = load_glove_twitter_25()

In [8]:
# Set all NaN values to 0
gt25_train = np.nan_to_num(gt25_train)
gt25_dev = np.nan_to_num(gt25_dev)
gt25_test = np.nan_to_num(gt25_test)

In [9]:
grid_results = gridsearch.fit(gt25_train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  77 out of 100 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent w

building tree 1 of 150building tree 2 of 150

building tree 3 of 150building tree 4 of 150
building tree 5 of 150
building tree 6 of 150
building tree 7 of 150
building tree 8 of 150
building tree 9 of 150

building tree 10 of 150
building tree 11 of 150
building tree 12 of 150
building tree 13 of 150
building tree 14 of 150
building tree 15 of 150
building tree 16 of 150
building tree 17 of 150
building tree 18 of 150
building tree 19 of 150
building tree 20 of 150
building tree 21 of 150
building tree 22 of 150
building tree 23 of 150
building tree 24 of 150
building tree 25 of 150
building tree 26 of 150


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.4s


building tree 27 of 150
building tree 28 of 150
building tree 29 of 150
building tree 30 of 150
building tree 31 of 150
building tree 32 of 150
building tree 33 of 150
building tree 34 of 150
building tree 35 of 150
building tree 36 of 150
building tree 37 of 150
building tree 38 of 150
building tree 39 of 150
building tree 40 of 150
building tree 41 of 150
building tree 42 of 150
building tree 43 of 150
building tree 44 of 150
building tree 45 of 150
building tree 46 of 150
building tree 47 of 150
building tree 48 of 150
building tree 49 of 150
building tree 50 of 150
building tree 51 of 150building tree 52 of 150

building tree 53 of 150
building tree 54 of 150
building tree 55 of 150
building tree 56 of 150
building tree 57 of 150
building tree 58 of 150
building tree 59 of 150building tree 60 of 150

building tree 61 of 150
building tree 62 of 150
building tree 63 of 150
building tree 64 of 150
building tree 65 of 150
building tree 66 of 150
building tree 67 of 150
building tree 68

[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:    3.1s


building tree 123 of 150
building tree 124 of 150
building tree 125 of 150
building tree 126 of 150
building tree 127 of 150
building tree 128 of 150
building tree 129 of 150building tree 130 of 150

building tree 131 of 150
building tree 132 of 150
building tree 133 of 150
building tree 134 of 150
building tree 135 of 150
building tree 136 of 150
building tree 137 of 150
building tree 138 of 150
building tree 139 of 150
building tree 140 of 150
building tree 141 of 150
building tree 142 of 150
building tree 143 of 150
building tree 144 of 150
building tree 145 of 150
building tree 146 of 150
building tree 147 of 150
building tree 148 of 150
building tree 149 of 150
building tree 150 of 150
{'bootstrap': False, 'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 150}


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    4.4s finished


In [10]:
random_forest_classifier = random_forest_classifier.fit(gt25_train, train_labels)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 150building tree 2 of 150building tree 3 of 150

building tree 4 of 150
building tree 5 of 150
building tree 6 of 150
building tree 7 of 150
building tree 8 of 150

building tree 9 of 150building tree 10 of 150

building tree 11 of 150
building tree 12 of 150
building tree 13 of 150
building tree 14 of 150
building tree 15 of 150
building tree 16 of 150building tree 17 of 150
building tree 18 of 150
building tree 19 of 150

building tree 20 of 150
building tree 21 of 150
building tree 22 of 150
building tree 23 of 150


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s


building tree 24 of 150
building tree 25 of 150
building tree 26 of 150
building tree 27 of 150
building tree 28 of 150
building tree 29 of 150
building tree 30 of 150
building tree 31 of 150
building tree 32 of 150
building tree 33 of 150
building tree 34 of 150building tree 35 of 150

building tree 36 of 150
building tree 37 of 150
building tree 38 of 150
building tree 39 of 150building tree 40 of 150

building tree 41 of 150
building tree 42 of 150
building tree 43 of 150
building tree 44 of 150
building tree 45 of 150
building tree 46 of 150
building tree 47 of 150
building tree 48 of 150
building tree 49 of 150
building tree 50 of 150
building tree 51 of 150
building tree 52 of 150
building tree 53 of 150
building tree 54 of 150
building tree 55 of 150
building tree 56 of 150
building tree 57 of 150
building tree 58 of 150
building tree 59 of 150
building tree 60 of 150
building tree 61 of 150
building tree 62 of 150
building tree 63 of 150
building tree 64 of 150
building tree 65

[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:    3.7s


building tree 119 of 150

building tree 120 of 150
building tree 121 of 150
building tree 122 of 150
building tree 123 of 150
building tree 124 of 150
building tree 125 of 150
building tree 126 of 150
building tree 127 of 150
building tree 128 of 150
building tree 129 of 150
building tree 130 of 150
building tree 131 of 150
building tree 132 of 150
building tree 133 of 150
building tree 134 of 150
building tree 135 of 150
building tree 136 of 150
building tree 137 of 150building tree 138 of 150

building tree 139 of 150
building tree 140 of 150
building tree 141 of 150
building tree 142 of 150
building tree 143 of 150
building tree 144 of 150
building tree 145 of 150
building tree 146 of 150
building tree 147 of 150
building tree 148 of 150
building tree 149 of 150
building tree 150 of 150


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    5.0s finished


In [11]:
train_preds = random_forest_classifier.predict(gt25_train)
dev_preds = random_forest_classifier.predict(gt25_dev)
test_preds = random_forest_classifier.predict(gt25_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 150 out of 150 | elapsed:    0.3s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 150 out of 150 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 150 out of 150 | elapsed:    0.0s finished


In [12]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9971882962832791
Accuracy Dev:  0.9120647203658108
Accuracy Test:  0.9188334504567814
F1 Train:  0.9516899316899318
F1 Dev:  0.39439642357113125
F1 Test:  0.39761368086354176
Precision Train:  0.997402127058732
Precision Dev:  0.5517150017901898
Precision Test:  0.555589352070121
Recall Train:  0.9157571595506567
Recall Dev:  0.3769896923346996
Recall Test:  0.37832883404036516
Confusion Matrix Train: 
[[ 1913    49     0]
 [   10 20768     0]
 [    0     5    17]]
Confusion Matrix Dev: 
[[  37  235    0]
 [  13 2556    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  35  215    0]
 [  12 2580    1]
 [   0    3    0]]


  _warn_prf(average, modifier, msg_start, len(result))


### FastText 300 

In [13]:
ft300_train, ft300_dev, ft300_test = load_fasttext_300()

In [14]:
# Set all NaN values to 0
ft300_train = np.nan_to_num(ft300_train)
ft300_dev = np.nan_to_num(ft300_dev)
ft300_test = np.nan_to_num(ft300_test)

In [15]:
grid_results = gridsearch.fit(ft300_train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.5s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.5s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n

building tree 1 of 100building tree 2 of 100

building tree 3 of 100building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100

building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    1.3s


building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   12.1s finished


In [16]:
random_forest_classifier = random_forest_classifier.fit(ft300_train, train_labels)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100building tree 5 of 100

building tree 6 of 100building tree 7 of 100
building tree 8 of 100

building tree 9 of 100
building tree 10 of 100
building tree 11 of 100building tree 12 of 100

building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    2.7s


building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   13.2s finished


In [17]:
train_preds = random_forest_classifier.predict(ft300_train)
dev_preds = random_forest_classifier.predict(ft300_dev)
test_preds = random_forest_classifier.predict(ft300_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.7s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.7s finished


In [18]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9970564976715579
Accuracy Dev:  0.9155821315511783
Accuracy Test:  0.9174279690794097
F1 Train:  0.9312560321351931
F1 Dev:  0.4038357568978374
F1 Test:  0.38199310974085227
Precision Train:  0.9970426355330705
Precision Dev:  0.5902678393957465
Precision Test:  0.5636671750550755
Recall Train:  0.885591938694594
Recall Dev:  0.3826701661591067
Recall Test:  0.36817637228435535
Confusion Matrix Train: 
[[ 1914    48     0]
 [   12 20766     0]
 [    0     7    15]]
Confusion Matrix Dev: 
[[  41  231    0]
 [   7 2562    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  27  223    0]
 [   8 2584    1]
 [   0    3    0]]


  _warn_prf(average, modifier, msg_start, len(result))


### Word2Vec 300

In [19]:
w2v300_train, w2v300_dev, w2v300_test = load_word2vec_300()

In [20]:
# Set all NaN values to 0
w2v300_train = np.nan_to_num(w2v300_train)
w2v300_dev = np.nan_to_num(w2v300_dev)
w2v300_test = np.nan_to_num(w2v300_test)

In [21]:
grid_results = gridsearch.fit(w2v300_train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.2s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.6s
[Parallel(n

In [None]:
random_forest_classifier = random_forest_classifier.fit(w2v300_train, train_labels)

In [None]:
train_preds = random_forest_classifier.predict(w2v300_train)
dev_preds = random_forest_classifier.predict(w2v300_dev)
test_preds = random_forest_classifier.predict(w2v300_test)

In [None]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9126175204287849
Accuracy Dev:  0.9032711924023918
Accuracy Test:  0.9111033028812369
F1 Train:  0.3181042073427511
F1 Dev:  0.31639253372759196
F1 Test:  0.31782803211374644
Precision Train:  0.30427267800383767
Precision Dev:  0.3011963406052076
Precision Test:  0.3037011009604123
Recall Train:  0.33325312028748355
Recall Dev:  0.33320358115998444
Recall Test:  0.3333333333333333
Confusion Matrix Train: 
[[    0  1962     0]
 [    5 20773     0]
 [    0    22     0]]
Confusion Matrix Dev: 
[[   0  272    0]
 [   1 2568    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[   0  250    0]
 [   0 2593    0]
 [   0    3    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### TF-IDF PCA (1000 Dims)

In [None]:
tfidf_pca_train, tfidf_pca_dev, tfidf_pca_test = load_tfidf_pca()

In [None]:
grid_results = gridsearch.fit(tfidf_pca_train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

In [None]:
random_forest_classifier = random_forest_classifier.fit(tfidf_pca_train, train_labels)

In [None]:
train_preds = random_forest_classifier.predict(tfidf_pca_train)
dev_preds = random_forest_classifier.predict(tfidf_pca_dev)
test_preds = random_forest_classifier.predict(tfidf_pca_test)

In [None]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9260170459537826
Accuracy Dev:  0.9060851213506859
Accuracy Test:  0.9114546732255797
F1 Train:  0.495687024158139
F1 Dev:  0.46626192408040185
F1 Test:  0.4625893576809945
Precision Train:  0.5321623773425325
Precision Dev:  0.49171866798856945
Precision Test:  0.49091571794921957
Recall Train:  0.48144035138924757
Recall Dev:  0.4492940909333761
Recall Test:  0.44430183828255565
Confusion Matrix Train: 
[[  833  1090    39]
 [  471 20244    63]
 [    0    21     1]]
Confusion Matrix Dev: 
[[ 105  165    2]
 [  90 2471    8]
 [   0    2    0]]
Confusion Matrix Test: 
[[  92  154    4]
 [  81 2502   10]
 [   0    3    0]]


### Sentence Transformer Faster No PCA

In [None]:
train, dev, test = load_sent_trans_fast_no_pca()

In [None]:
grid_results = gridsearch.fit(train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

In [None]:
random_forest_classifier = random_forest_classifier.fit(train, train_labels)

In [None]:
train_preds = random_forest_classifier.predict(train)
dev_preds = random_forest_classifier.predict(dev)
test_preds = random_forest_classifier.predict(test)

In [None]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9169229417450137
Accuracy Dev:  0.9074920858248329
Accuracy Test:  0.9167252283907238
F1 Train:  0.3542029696889917
F1 Dev:  0.34937899054295435
F1 Test:  0.3734899506114149
Precision Train:  0.5925699491412861
Precision Dev:  0.5771755269694573
Precision Test:  0.5616635101010101
Recall Train:  0.3519028742174077
Recall Dev:  0.35010093955838467
Recall Test:  0.36310014140635044
Confusion Matrix Train: 
[[  111  1851     0]
 [   18 20760     0]
 [    0    22     0]]
Confusion Matrix Dev: 
[[  14  258    0]
 [   3 2566    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  23  227    0]
 [   7 2586    0]
 [   0    3    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Sentence Transformer Faster PCA

In [None]:
train, dev, test = load_sent_trans_fast_pca()

In [None]:
grid_results = gridsearch.fit(train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

In [None]:
random_forest_classifier = random_forest_classifier.fit(train, train_labels)

In [None]:
train_preds = random_forest_classifier.predict(train)
dev_preds = random_forest_classifier.predict(dev)
test_preds = random_forest_classifier.predict(test)

In [None]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9221509533432914
Accuracy Dev:  0.9124164614843475
Accuracy Test:  0.9223471539002108
F1 Train:  0.4077915563624817
F1 Dev:  0.40242015423112165
F1 Test:  0.4244990220535703
Precision Train:  0.5607966386905193
Precision Dev:  0.5428436911487758
Precision Test:  0.5541845834480713
Recall Train:  0.38504391210455585
Recall Dev:  0.38259813462169606
Recall Test:  0.39768607790204397
Confusion Matrix Train: 
[[  314  1636    12]
 [  101 20676     1]
 [    0    22     0]]
Confusion Matrix Dev: 
[[  42  230    0]
 [  17 2552    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  50  199    1]
 [  18 2575    0]
 [   0    3    0]]


  _warn_prf(average, modifier, msg_start, len(result))


### Sentence Transformer Better No PCA

In [None]:
train, dev, test = load_sent_trans_better_no_pca()

In [None]:
grid_results = gridsearch.fit(train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

In [None]:
random_forest_classifier = random_forest_classifier.fit(train, train_labels)

In [None]:
train_preds = random_forest_classifier.predict(train)
dev_preds = random_forest_classifier.predict(dev)
test_preds = random_forest_classifier.predict(test)

In [None]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9280379580001757
Accuracy Dev:  0.9215617305663032
Accuracy Test:  0.9244553759662685
F1 Train:  0.49062131325026287
F1 Dev:  0.491946364227066
F1 Test:  0.4961027624687639
Precision Train:  0.5259863782306411
Precision Dev:  0.5260017342095655
Precision Test:  0.5171391563803459
Recall Train:  0.4682736553178208
Recall Dev:  0.47034351887894116
Recall Test:  0.4803825684535287
Confusion Matrix Train: 
[[  841  1112     9]
 [  492 20283     3]
 [    0    22     0]]
Confusion Matrix Dev: 
[[ 119  153    0]
 [  68 2501    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 118  131    1]
 [  78 2513    2]
 [   0    3    0]]


  _warn_prf(average, modifier, msg_start, len(result))


### Sentence Transformer Better PCA

In [None]:
train, dev, test = load_sent_trans_better_pca()

In [None]:
grid_results = gridsearch.fit(train, train_labels)
best_params = grid_results.best_params_
random_forest_classifier = grid_results.best_estimator_
print(best_params)

In [None]:
random_forest_classifier = random_forest_classifier.fit(train, train_labels)

In [None]:
train_preds = random_forest_classifier.predict(train)
dev_preds = random_forest_classifier.predict(dev)
test_preds = random_forest_classifier.predict(test)

In [None]:
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.8921008698708374
Accuracy Dev:  0.8793527963418923
Accuracy Test:  0.8903724525650035
F1 Train:  0.5393647762286241
F1 Dev:  0.4762443582331762
F1 Test:  0.4907040375334903
Precision Train:  0.5060246695278755
Precision Dev:  0.45801318846521805
Precision Test:  0.46707986534296103
Recall Train:  0.5983692163729094
Recall Dev:  0.5073686831680901
Recall Test:  0.5353808972875691
Confusion Matrix Train: 
[[ 1280   672    10]
 [ 1733 19021    24]
 [    0    17     5]]
Confusion Matrix Dev: 
[[ 167  104    1]
 [ 233 2333    3]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 174   74    2]
 [ 228 2360    5]
 [   0    3    0]]
