# Sanity check: Linear SVC on embeddings


In [2]:
from sklearn.pipeline import Pipeline
from sklearn import preprocessing, feature_extraction, linear_model
from sklearn import svm
import evaluation

## BERTweet Embeddings with Linear SVC

In [3]:
# Single-step pipeline: a balanced-class-weight linear SVM is the only stage.
# The seed is pinned so repeated runs of this cell build the same estimator.
model = Pipeline(steps=[
    (
        'clf',
        svm.LinearSVC(
            random_state=42,
            class_weight='balanced',
            C=0.01,
        ),
    ),
])

# Evaluate on the 'bertweet' embeddings and persist both the fitted model and
# the submission file. NOTE(review): data loading and cross-validation appear
# to happen inside evaluation.evaluate (see the log output) - confirm there.
evaluation.evaluate(
    model,
    embeddings='bertweet',
    store_model=True,
    store_submission=True,
)

INFO:root:Loading training data from ../data/external/kaggle/train.csv...
INFO:root:-> Number of samples: 7613
INFO:root:-> Number of features: 768
INFO:root:Evaluating model with 1 experiment(s) of 10-fold Cross Validation...
INFO:root:Run 1/10 finished
INFO:root:Run 2/10 finished
INFO:root:Run 3/10 finished
INFO:root:Run 4/10 finished
INFO:root:Run 5/10 finished
INFO:root:Run 6/10 finished
INFO:root:Run 7/10 finished
INFO:root:Run 8/10 finished
INFO:root:Run 9/10 finished
INFO:root:Run 10/10 finished
INFO:root:---
INFO:root:Expected submission results (F1-Score): around 0.75
INFO:root:F1-Score: 0.76 (training); 0.75 (test)
INFO:root:Accuracy: 80.07% (training); 79.35% (test)
INFO:root:Recall: 74.40% (training); 73.56% (test)
INFO:root:Precision: 78.16% (training); 77.29% (test)
INFO:root:---
INFO:root:Retraining model on the complete data set...
INFO:root:-> F1-Score on complete training set: 0.77
INFO:root:-> Stored model to ../models/model_2021-01-13_200919_Pipeline_1x10cv_0.75_ber

Submission results: 0.79834

## USE 4 Embeddings with Linear SVC

In [4]:
# Single-step pipeline: a balanced-class-weight linear SVM is the only stage.
# The seed is pinned so repeated runs of this cell build the same estimator.
model = Pipeline(steps=[
    (
        'clf',
        svm.LinearSVC(
            random_state=42,
            class_weight='balanced',
            C=0.01,
        ),
    ),
])

# Evaluate on the 'use4' embeddings and persist both the fitted model and the
# submission file. NOTE(review): data loading and cross-validation appear to
# happen inside evaluation.evaluate (see the log output) - confirm there.
evaluation.evaluate(
    model,
    embeddings='use4',
    store_model=True,
    store_submission=True,
)

INFO:root:Loading training data from ../data/external/kaggle/train.csv...
INFO:root:-> Number of samples: 7613
INFO:root:-> Number of features: 512
INFO:root:Evaluating model with 1 experiment(s) of 10-fold Cross Validation...
INFO:root:Run 1/10 finished
INFO:root:Run 2/10 finished
INFO:root:Run 3/10 finished
INFO:root:Run 4/10 finished
INFO:root:Run 5/10 finished
INFO:root:Run 6/10 finished
INFO:root:Run 7/10 finished
INFO:root:Run 8/10 finished
INFO:root:Run 9/10 finished
INFO:root:Run 10/10 finished
INFO:root:---
INFO:root:Expected submission results (F1-Score): around 0.77
INFO:root:F1-Score: 0.78 (training); 0.77 (test)
INFO:root:Accuracy: 80.92% (training); 80.35% (test)
INFO:root:Recall: 78.36% (training); 77.77% (test)
INFO:root:Precision: 77.49% (training); 76.79% (test)
INFO:root:---
INFO:root:Retraining model on the complete data set...
INFO:root:-> F1-Score on complete training set: 0.78
INFO:root:-> Stored model to ../models/model_2021-01-13_201242_Pipeline_1x10cv_0.77_use

Submission results: 0.79405

## nnlm-en-dim128-with-normalization Embeddings with Linear SVC


In [5]:
# Single-step pipeline: a balanced-class-weight linear SVM is the only stage.
# The seed is pinned so repeated runs of this cell build the same estimator.
model = Pipeline(steps=[
    (
        'clf',
        svm.LinearSVC(
            random_state=42,
            class_weight='balanced',
            C=0.01,
        ),
    ),
])

# Evaluate on the 'nnlm_en_128_norm' embeddings and persist both the fitted
# model and the submission file. NOTE(review): data loading and
# cross-validation appear to happen inside evaluation.evaluate (see the log
# output) - confirm there.
evaluation.evaluate(
    model,
    embeddings='nnlm_en_128_norm',
    store_model=True,
    store_submission=True,
)

INFO:root:Loading training data from ../data/external/kaggle/train.csv...
INFO:root:-> Number of samples: 7613
INFO:root:-> Number of features: 128
INFO:root:Evaluating model with 1 experiment(s) of 10-fold Cross Validation...
INFO:root:Run 1/10 finished
INFO:root:Run 2/10 finished
INFO:root:Run 3/10 finished
INFO:root:Run 4/10 finished
INFO:root:Run 5/10 finished
INFO:root:Run 6/10 finished
INFO:root:Run 7/10 finished
INFO:root:Run 8/10 finished
INFO:root:Run 9/10 finished
INFO:root:Run 10/10 finished
INFO:root:---
INFO:root:Expected submission results (F1-Score): around 0.75
INFO:root:F1-Score: 0.76 (training); 0.75 (test)
INFO:root:Accuracy: 79.51% (training); 79.01% (test)
INFO:root:Recall: 75.50% (training); 74.96% (test)
INFO:root:Precision: 76.50% (training); 75.89% (test)
INFO:root:---
INFO:root:Retraining model on the complete data set...
INFO:root:-> F1-Score on complete training set: 0.76
INFO:root:-> Stored model to ../models/model_2021-01-13_201437_Pipeline_1x10cv_0.75_nnl

Submission results: 0.77903

## BERT Large Cased (Whole Word Masking) Embeddings with Linear SVC

In [6]:
# Single-step pipeline: a balanced-class-weight linear SVM is the only stage.
# The seed is pinned so repeated runs of this cell build the same estimator.
model = Pipeline(steps=[
    (
        'clf',
        svm.LinearSVC(
            random_state=42,
            class_weight='balanced',
            C=0.01,
        ),
    ),
])

# Evaluate on the 'bert_large_cased_whole' embeddings and persist both the
# fitted model and the submission file. NOTE(review): data loading and
# cross-validation appear to happen inside evaluation.evaluate (see the log
# output) - confirm there.
evaluation.evaluate(
    model,
    embeddings='bert_large_cased_whole',
    store_model=True,
    store_submission=True,
)

INFO:root:Loading training data from ../data/external/kaggle/train.csv...
INFO:root:-> Number of samples: 7613
INFO:root:-> Number of features: 1024
INFO:root:Evaluating model with 1 experiment(s) of 10-fold Cross Validation...
INFO:root:Run 1/10 finished
INFO:root:Run 2/10 finished
INFO:root:Run 3/10 finished
INFO:root:Run 4/10 finished
INFO:root:Run 5/10 finished
INFO:root:Run 6/10 finished
INFO:root:Run 7/10 finished
INFO:root:Run 8/10 finished
INFO:root:Run 9/10 finished
INFO:root:Run 10/10 finished
INFO:root:---
INFO:root:Expected submission results (F1-Score): around 0.72
INFO:root:F1-Score: 0.76 (training); 0.72 (test)
INFO:root:Accuracy: 79.87% (training); 76.40% (test)
INFO:root:Recall: 73.61% (training); 70.04% (test)
INFO:root:Precision: 78.24% (training); 73.71% (test)
INFO:root:---
INFO:root:Retraining model on the complete data set...
INFO:root:-> F1-Score on complete training set: 0.76
INFO:root:-> Stored model to ../models/model_2021-01-13_201659_Pipeline_1x10cv_0.72_be

Submission results: 0.75789