In [1]:
from embeddings_loader import *
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from utils import *


In [2]:
# Load the gold labels for the train, dev and test splits (in that order).
# They are raw label names at this point; a later cell maps them to integer ids
# through label_replacement.
# NOTE(review): load_labels is not imported explicitly — presumably provided by one
# of the star imports at the top of the notebook; confirm which module owns it.
train_labels, dev_labels, test_labels = load_labels()

In [3]:
# Class name -> integer id. The position of each name in the tuple is its id,
# so the order below must not change.
label_replacement = {
    class_name: class_id
    for class_id, class_name in enumerate(
        ('Hope_speech', 'Non_hope_speech', 'not-English')
    )
}

In [4]:
# Replace labels with numbers.
#
# The label lists are overwritten in place, so a plain
# `label_replacement[label]` lookup raises KeyError when this cell is executed a
# second time (the labels are already ints by then). The helper below passes
# already-encoded ids through unchanged, which makes the cell safe to re-run,
# while still failing loudly on labels that are neither a known name nor a known id.
def _encode_labels(labels):
    """Map label names to their integer ids via ``label_replacement``.

    Args:
        labels: Iterable of label names (keys of ``label_replacement``) and/or
            already-encoded ids (values of ``label_replacement``).

    Returns:
        list: One integer id per input label, in the original order.

    Raises:
        KeyError: If a label is neither a key nor a value of ``label_replacement``.
    """
    encoded_ids = set(label_replacement.values())
    return [
        label if label in encoded_ids else label_replacement[label]
        for label in labels
    ]


train_labels = _encode_labels(train_labels)
dev_labels = _encode_labels(dev_labels)
test_labels = _encode_labels(test_labels)

### GloVe Twitter 25

In [5]:
# Load the GloVe Twitter 25 features for the train, dev and test splits (in that order).
# NOTE(review): the loader is not imported explicitly — presumably provided by the
# star import from embeddings_loader; the "25" presumably is the vector size — confirm.
gt25_train, gt25_dev, gt25_test = load_glove_twitter_25()

In [6]:
# Sanitise all three splits: np.nan_to_num turns NaN into 0 (and, with its default
# arguments, also replaces +/-inf with very large finite values).
# `np` is not imported explicitly in this notebook — presumably re-exported by one
# of the star imports.
gt25_train, gt25_dev, gt25_test = (
    np.nan_to_num(split) for split in (gt25_train, gt25_dev, gt25_test)
)

In [7]:
# Fit a QDA classifier with default hyper-parameters on the GloVe Twitter 25
# training features; `fit` mutates and returns the same estimator object.
quadratic_discriminant_analysis = QuadraticDiscriminantAnalysis()
quadratic_discriminant_analysis.fit(gt25_train, train_labels)

# Persist the fitted model under a section-specific filename.
save_model(quadratic_discriminant_analysis, "quadratic_discriminant_analysis_gt25.joblib")



In [8]:
# Predict class ids for every split with the model fitted above (train, dev, test order).
train_preds, dev_preds, test_preds = (
    quadratic_discriminant_analysis.predict(features)
    for features in (gt25_train, gt25_dev, gt25_test)
)

In [9]:
# Print accuracy, F1, recall and confusion matrices for the train/dev/test predictions.
# NOTE(review): no gold labels are passed, so computeAllScores presumably obtains
# them itself — confirm they are the same labels the model was trained on.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.7539759247869255
Accuracy Dev:  0.7544846992613436
Accuracy Test:  0.7477160927617709
Weighted F1 Train:  0.8044120054997739
Weighted F1 Dev:  0.8009118247842949
Weighted F1 Test:  0.7983234423877871
Macro F1 Train:  0.6744216725831986
Macro F1 Dev:  0.40317686012469395
Macro F1 Test:  0.3896183335950176
Micro F1 Train:  0.7539759247869257
Micro F1 Dev:  0.7544846992613436
Micro F1 Test:  0.7477160927617709
Weighted Recall Train:  0.7539759247869255
Weighted Recall Dev:  0.7544846992613436
Weighted Recall Test:  0.7477160927617709
Macro Recall Train:  0.7744438140684701
Macro Recall Dev:  0.49417880231111516
Macro Recall Test:  0.47957475253888676
Micro Recall Train:  0.7539759247869255
Micro Recall Dev:  0.7544846992613436
Micro Recall Test:  0.7477160927617709
Confusion Matrix Train: 
[[ 1375   587     0]
 [ 5006 15768     4]
 [    0     3    19]]
Confusion Matrix Dev: 
[[ 197   75    0]
 [ 621 1948    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 171   79    0]

### FastText 300 

In [10]:
# Load the FastText 300 features for the train, dev and test splits (in that order).
# NOTE(review): the loader is not imported explicitly — presumably provided by the
# star import from embeddings_loader; the "300" presumably is the vector size — confirm.
ft300_train, ft300_dev, ft300_test = load_fasttext_300()

In [11]:
# Sanitise all three splits: np.nan_to_num turns NaN into 0 (and, with its default
# arguments, also replaces +/-inf with very large finite values).
# `np` is not imported explicitly in this notebook — presumably re-exported by one
# of the star imports.
ft300_train, ft300_dev, ft300_test = (
    np.nan_to_num(split) for split in (ft300_train, ft300_dev, ft300_test)
)

In [13]:
# Fit a QDA classifier with default hyper-parameters on the FastText 300 training
# features; `fit` mutates and returns the same estimator object.
# NOTE(review): the 'not-English' class has only 22 training rows (see the train
# confusion matrices), presumably far fewer than the feature dimension, so its
# covariance estimate is likely rank-deficient — consider `reg_param`; confirm.
quadratic_discriminant_analysis = QuadraticDiscriminantAnalysis()
quadratic_discriminant_analysis.fit(ft300_train, train_labels)

# Persist the fitted model under a section-specific filename.
save_model(quadratic_discriminant_analysis, "quadratic_discriminant_analysis_ft300.joblib")



In [14]:
# Predict class ids for every split with the model fitted above (train, dev, test order).
train_preds, dev_preds, test_preds = (
    quadratic_discriminant_analysis.predict(features)
    for features in (ft300_train, ft300_dev, ft300_test)
)

In [15]:
# Print accuracy, F1, recall and confusion matrices for the train/dev/test predictions.
# NOTE(review): no gold labels are passed, so computeAllScores presumably obtains
# them itself — confirm they are the same labels the model was trained on.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.7192250241630788
Accuracy Dev:  0.7073513893774183
Accuracy Test:  0.7041461700632466
Weighted F1 Train:  0.7801558429961895
Weighted F1 Dev:  0.7662375529960141
Weighted F1 Test:  0.7669719949449605
Macro F1 Train:  0.5135054305552197
Macro F1 Dev:  0.3831116903688154
Macro F1 Test:  0.37618064195981277
Micro F1 Train:  0.7192250241630788
Micro F1 Dev:  0.7073513893774183
Micro F1 Test:  0.7041461700632466
Weighted Recall Train:  0.7192250241630788
Weighted Recall Dev:  0.7073513893774183
Weighted Recall Test:  0.7041461700632466
Macro Recall Train:  0.6183216596034141
Macro Recall Dev:  0.4921323434005755
Macro Recall Test:  0.49254917084458155
Micro Recall Train:  0.7192250241630788
Micro Recall Dev:  0.7073513893774183
Micro Recall Test:  0.7041461700632466
Confusion Matrix Train: 
[[ 1820   142     0]
 [ 6231 14546     1]
 [    3    14     5]]
Confusion Matrix Dev: 
[[ 211   61    0]
 [ 769 1800    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 195   55    0]


### Word2Vec 300

In [16]:
# Load the Word2Vec 300 features for the train, dev and test splits (in that order).
# NOTE(review): the loader is not imported explicitly — presumably provided by the
# star import from embeddings_loader; the "300" presumably is the vector size — confirm.
w2v300_train, w2v300_dev, w2v300_test = load_word2vec_300()

In [17]:
# Sanitise all three splits: np.nan_to_num turns NaN into 0 (and, with its default
# arguments, also replaces +/-inf with very large finite values).
# `np` is not imported explicitly in this notebook — presumably re-exported by one
# of the star imports.
w2v300_train, w2v300_dev, w2v300_test = (
    np.nan_to_num(split) for split in (w2v300_train, w2v300_dev, w2v300_test)
)

In [18]:
# Fit a QDA classifier with default hyper-parameters on the Word2Vec 300 training
# features; `fit` mutates and returns the same estimator object.
# NOTE(review): the 'not-English' class has only 22 training rows (see the train
# confusion matrices), presumably far fewer than the feature dimension, so its
# covariance estimate is likely rank-deficient — consider `reg_param`; confirm.
quadratic_discriminant_analysis = QuadraticDiscriminantAnalysis()
quadratic_discriminant_analysis.fit(w2v300_train, train_labels)

# Persist the fitted model under a section-specific filename.
save_model(quadratic_discriminant_analysis, "quadratic_discriminant_analysis_w2v300.joblib")



In [19]:
# Predict class ids for every split with the model fitted above (train, dev, test order).
train_preds, dev_preds, test_preds = (
    quadratic_discriminant_analysis.predict(features)
    for features in (w2v300_train, w2v300_dev, w2v300_test)
)

In [20]:
# Print accuracy, F1, recall and confusion matrices for the train/dev/test predictions.
# NOTE(review): no gold labels are passed, so computeAllScores presumably obtains
# them itself — confirm they are the same labels the model was trained on.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.7210702047271769
Accuracy Dev:  0.7059444249032711
Accuracy Test:  0.7097680955727337
Weighted F1 Train:  0.7815394573906098
Weighted F1 Dev:  0.7652563146261453
Weighted F1 Test:  0.7710907602758333
Macro F1 Train:  0.5140102376131676
Macro F1 Dev:  0.3817181050891236
Macro F1 Test:  0.3772495249220289
Micro F1 Train:  0.7210702047271769
Micro F1 Dev:  0.7059444249032711
Micro F1 Test:  0.7097680955727337
Weighted Recall Train:  0.7210702047271769
Weighted Recall Dev:  0.7059444249032711
Weighted Recall Test:  0.7097680955727337
Macro Recall Train:  0.6171492245149015
Macro Recall Dev:  0.4894218586617208
Macro Recall Test:  0.4885820799588636
Micro Recall Train:  0.7210702047271769
Micro Recall Dev:  0.7059444249032711
Micro Recall Test:  0.7097680955727337
Confusion Matrix Train: 
[[ 1808   154     0]
 [ 6177 14600     1]
 [    3    14     5]]
Confusion Matrix Dev: 
[[ 209   63    0]
 [ 771 1798    0]
 [   1    1    0]]
Confusion Matrix Test: 
[[ 190   60    0]
 [

### TF-IDF PCA (1000 Dims)

In [21]:
# Load the PCA-reduced TF-IDF features for the train, dev and test splits (in that order).
# No NaN clean-up is applied to these features, unlike the word-embedding sections.
# NOTE(review): the loader is not imported explicitly — presumably provided by the
# star import from embeddings_loader; confirm.
tfidf_pca_train, tfidf_pca_dev, tfidf_pca_test = load_tfidf_pca()

In [22]:
# Fit a QDA classifier with default hyper-parameters on the TF-IDF PCA training
# features; `fit` mutates and returns the same estimator object.
# NOTE(review): the 'not-English' class has only 22 training rows (see the train
# confusion matrices), presumably far fewer than the feature dimension, so its
# covariance estimate is likely rank-deficient — consider `reg_param`; confirm.
quadratic_discriminant_analysis = QuadraticDiscriminantAnalysis()
quadratic_discriminant_analysis.fit(tfidf_pca_train, train_labels)

# Persist the fitted model under a section-specific filename.
save_model(quadratic_discriminant_analysis, "quadratic_discriminant_analysis_tfidf_pca.joblib")



In [23]:
# Predict class ids for every split with the model fitted above (train, dev, test order).
train_preds, dev_preds, test_preds = (
    quadratic_discriminant_analysis.predict(features)
    for features in (tfidf_pca_train, tfidf_pca_dev, tfidf_pca_test)
)

In [24]:
# Print accuracy, F1, recall and confusion matrices for the train/dev/test predictions.
# NOTE(review): no gold labels are passed, so computeAllScores presumably obtains
# them itself — confirm they are the same labels the model was trained on.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.842808189087075
Accuracy Dev:  0.798804080196975
Accuracy Test:  0.780744905130007
Weighted F1 Train:  0.8720393310806308
Weighted F1 Dev:  0.8286355443674975
Weighted F1 Test:  0.8184920217557787
Macro F1 Train:  0.475424310117472
Macro F1 Dev:  0.40571322585412933
Macro F1 Test:  0.3852103279623214
Micro F1 Train:  0.8428081890870749
Micro F1 Dev:  0.798804080196975
Micro F1 Test:  0.780744905130007
Weighted Recall Train:  0.842808189087075
Weighted Recall Dev:  0.798804080196975
Weighted Recall Test:  0.780744905130007
Macro Recall Train:  0.6046958826619035
Macro Recall Dev:  0.4524534609484121
Macro Recall Test:  0.43503380897287575
Micro Recall Train:  0.842808189087075
Micro Recall Dev:  0.798804080196975
Micro Recall Test:  0.780744905130007
Confusion Matrix Train: 
[[ 1930    32     0]
 [ 3524 17254     0]
 [    6    16     0]]
Confusion Matrix Dev: 
[[ 144  128    0]
 [ 442 2127    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 124  126    0]
 [ 495 2098 

### Sentence Transformer Faster No PCA

In [25]:
# Load the "faster" sentence-transformer features (no PCA) for the train, dev and
# test splits (in that order).
# The generic names train/dev/test are reassigned by every sentence-transformer
# section, so the fit/predict cells below must run right after this one.
# NOTE(review): the loader is presumably provided by the star import from
# embeddings_loader; confirm.
train, dev, test = load_sent_trans_fast_no_pca()

In [26]:
# Fit a QDA classifier with default hyper-parameters on the currently loaded
# `train` features; `fit` mutates and returns the same estimator object.
quadratic_discriminant_analysis = QuadraticDiscriminantAnalysis()
quadratic_discriminant_analysis.fit(train, train_labels)

# Persist the fitted model.
# NOTE(review): this filename has no ".joblib" suffix, unlike the save_model calls
# in the earlier sections of this notebook — confirm whether that is intentional.
save_model(quadratic_discriminant_analysis, "quadratic_discriminant_analysis_sent_trans_fast_no_pca")



In [27]:
# Predict class ids for every split with the model fitted above (train, dev, test order).
train_preds, dev_preds, test_preds = (
    quadratic_discriminant_analysis.predict(features)
    for features in (train, dev, test)
)

In [28]:
# Print accuracy, F1, recall and confusion matrices for the train/dev/test predictions.
# NOTE(review): no gold labels are passed, so computeAllScores presumably obtains
# them itself — confirm they are the same labels the model was trained on.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9198664440734557
Accuracy Dev:  0.9078438269433696
Accuracy Test:  0.9068868587491216
Weighted F1 Train:  0.9098520799275851
Weighted F1 Dev:  0.8934551973639238
Weighted F1 Test:  0.8916687491400068
Macro F1 Train:  0.4592698309444445
Macro F1 Dev:  0.4376954218634636
Macro F1 Test:  0.4152862631054917
Micro F1 Train:  0.9198664440734557
Micro F1 Dev:  0.9078438269433696
Micro F1 Test:  0.9068868587491216
Weighted Recall Train:  0.9198664440734557
Weighted Recall Dev:  0.9078438269433696
Weighted Recall Test:  0.9068868587491216
Macro Recall Train:  0.43667324757062403
Macro Recall Dev:  0.4159749730955052
Macro Recall Test:  0.39805373441316366
Micro Recall Train:  0.9198664440734557
Micro Recall Dev:  0.9078438269433696
Micro Recall Test:  0.9068868587491216
Confusion Matrix Train: 
[[  655  1307     0]
 [  495 20283     0]
 [    1    21     0]]
Confusion Matrix Dev: 
[[  74  198    0]
 [  62 2507    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[  55  195    0]


### Sentence Transformer Faster PCA

In [29]:
# Load the "faster" sentence-transformer features (PCA variant) for the train, dev
# and test splits (in that order).
# This overwrites the train/dev/test variables left by the previous section, so the
# fit/predict cells below must run right after this one.
# NOTE(review): the loader is presumably provided by the star import from
# embeddings_loader; confirm.
train, dev, test = load_sent_trans_fast_pca()

In [30]:
# Fit a QDA classifier with default hyper-parameters on the currently loaded
# `train` features; `fit` mutates and returns the same estimator object.
quadratic_discriminant_analysis = QuadraticDiscriminantAnalysis()
quadratic_discriminant_analysis.fit(train, train_labels)

# Persist the fitted model.
# NOTE(review): this filename has no ".joblib" suffix, unlike the save_model calls
# in the earlier sections of this notebook — confirm whether that is intentional.
save_model(quadratic_discriminant_analysis, "quadratic_discriminant_analysis_sent_trans_fast_pca")



In [31]:
# Predict class ids for every split with the model fitted above (train, dev, test order).
train_preds, dev_preds, test_preds = (
    quadratic_discriminant_analysis.predict(features)
    for features in (train, dev, test)
)

In [32]:
# Print accuracy, F1, recall and confusion matrices for the train/dev/test predictions.
# NOTE(review): no gold labels are passed, so computeAllScores presumably obtains
# them itself — confirm they are the same labels the model was trained on.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.8861259994728056
Accuracy Dev:  0.8607105170594442
Accuracy Test:  0.865425158116655
Weighted F1 Train:  0.9025300606685084
Weighted F1 Dev:  0.877619209390376
Weighted F1 Test:  0.8823586104229608
Macro F1 Train:  0.503509673166497
Macro F1 Dev:  0.46967280439711284
Macro F1 Test:  0.46504811311613486
Micro F1 Train:  0.8861259994728056
Micro F1 Dev:  0.8607105170594441
Micro F1 Test:  0.865425158116655
Weighted Recall Train:  0.8861259994728056
Weighted Recall Dev:  0.8607105170594442
Weighted Recall Test:  0.865425158116655
Macro Recall Train:  0.5925128210864448
Macro Recall Dev:  0.5245980544806478
Macro Recall Test:  0.5214346317007328
Micro Recall Train:  0.8861259994728056
Micro Recall Dev:  0.8607105170594442
Micro Recall Test:  0.865425158116655
Confusion Matrix Train: 
[[ 1748   214     0]
 [ 2356 18422     0]
 [    1    21     0]]
Confusion Matrix Dev: 
[[ 189   83    0]
 [ 311 2258    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 170   80    0]
 [ 300

### Sentence Transformer Better No PCA

In [33]:
# Load the "better" sentence-transformer features (no PCA) for the train, dev and
# test splits (in that order).
# This overwrites the train/dev/test variables left by the previous section, so the
# fit/predict cells below must run right after this one.
# NOTE(review): the loader is presumably provided by the star import from
# embeddings_loader; confirm.
train, dev, test = load_sent_trans_better_no_pca()

In [34]:
# Fit a QDA classifier with default hyper-parameters on the currently loaded
# `train` features; `fit` mutates and returns the same estimator object.
quadratic_discriminant_analysis = QuadraticDiscriminantAnalysis()
quadratic_discriminant_analysis.fit(train, train_labels)

# Persist the fitted model.
# NOTE(review): this filename has no ".joblib" suffix, unlike the save_model calls
# in the earlier sections of this notebook — confirm whether that is intentional.
save_model(quadratic_discriminant_analysis, "quadratic_discriminant_analysis_sent_trans_better_no_pca")



In [35]:
# Predict class ids for every split with the model fitted above (train, dev, test order).
train_preds, dev_preds, test_preds = (
    quadratic_discriminant_analysis.predict(features)
    for features in (train, dev, test)
)

In [36]:
# Print accuracy, F1, recall and confusion matrices for the train/dev/test predictions.
# NOTE(review): no gold labels are passed, so computeAllScores presumably obtains
# them itself — confirm they are the same labels the model was trained on.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.8807661892628065
Accuracy Dev:  0.8445304256067534
Accuracy Test:  0.8489107519325368
Weighted F1 Train:  0.8899776238011531
Weighted F1 Dev:  0.8476942825620342
Weighted F1 Test:  0.8567079061351852
Macro F1 Train:  0.4581842209848508
Macro F1 Dev:  0.38136066923376094
Macro F1 Test:  0.3893906492133005
Micro F1 Train:  0.8807661892628065
Micro F1 Dev:  0.8445304256067534
Micro F1 Test:  0.8489107519325368
Weighted Recall Train:  0.8807661892628065
Weighted Recall Dev:  0.8445304256067534
Weighted Recall Test:  0.8489107519325368
Macro Recall Train:  0.4851669643134857
Macro Recall Dev:  0.38385367771086637
Macro Recall Test:  0.39732407764494154
Micro Recall Train:  0.8807661892628065
Micro Recall Dev:  0.8445304256067534
Micro Recall Test:  0.8489107519325368
Confusion Matrix Train: 
[[ 1063   899     0]
 [ 1793 18985     0]
 [    0    22     0]]
Confusion Matrix Dev: 
[[  66  206    0]
 [ 234 2335    0]
 [   1    1    0]]
Confusion Matrix Test: 
[[  72  178    0]

### Sentence Transformer Better PCA

In [37]:
# Load the "better" sentence-transformer features (PCA variant) for the train, dev
# and test splits (in that order).
# This overwrites the train/dev/test variables left by the previous section, so the
# fit/predict cells below must run right after this one.
# NOTE(review): the loader is presumably provided by the star import from
# embeddings_loader; confirm.
train, dev, test = load_sent_trans_better_pca()

In [38]:
# Fit a QDA classifier with default hyper-parameters on the currently loaded
# `train` features; `fit` mutates and returns the same estimator object.
quadratic_discriminant_analysis = QuadraticDiscriminantAnalysis()
quadratic_discriminant_analysis.fit(train, train_labels)

# Persist the fitted model.
# NOTE(review): this filename has no ".joblib" suffix, unlike the save_model calls
# in the earlier sections of this notebook — confirm whether that is intentional.
save_model(quadratic_discriminant_analysis, "quadratic_discriminant_analysis_sent_trans_better_pca")



In [39]:
# Predict class ids for every split with the model fitted above (train, dev, test order).
train_preds, dev_preds, test_preds = (
    quadratic_discriminant_analysis.predict(features)
    for features in (train, dev, test)
)

In [40]:
# Print accuracy, F1, recall and confusion matrices for the train/dev/test predictions.
# NOTE(review): no gold labels are passed, so computeAllScores presumably obtains
# them itself — confirm they are the same labels the model was trained on.
computeAllScores(train_preds, dev_preds, test_preds)

Accuracy Train:  0.9039188120551797
Accuracy Dev:  0.8797045374604291
Accuracy Test:  0.8805340829234013
Weighted F1 Train:  0.9166933425690814
Weighted F1 Dev:  0.892257253587789
Weighted F1 Test:  0.894419624085436
Macro F1 Train:  0.5243740172499892
Macro F1 Dev:  0.48798658968150493
Macro F1 Test:  0.4827967694269703
Micro F1 Train:  0.9039188120551798
Micro F1 Dev:  0.8797045374604291
Micro F1 Test:  0.8805340829234013
Weighted Recall Train:  0.9039188120551797
Weighted Recall Dev:  0.8797045374604291
Weighted Recall Test:  0.8805340829234013
Macro Recall Train:  0.6116259464035562
Macro Recall Dev:  0.5370833619551363
Macro Recall Test:  0.5390101555469854
Micro Recall Train:  0.9039188120551797
Micro Recall Dev:  0.8797045374604291
Micro Recall Test:  0.8805340829234013
Confusion Matrix Train: 
[[ 1830   132     0]
 [ 2033 18745     0]
 [    1    21     0]]
Confusion Matrix Dev: 
[[ 194   78    0]
 [ 262 2307    0]
 [   0    2    0]]
Confusion Matrix Test: 
[[ 180   70    0]
 [ 