# Sklearn models for sarcasm on Reddit data

In [1]:
import os
import time

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import (KFold, GridSearchCV, 
                                     train_test_split, cross_val_score)
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tabulate import tabulate

from sarcsdet.configs.sklearn_models_config import *
from sarcsdet.utils.count_model_metrics import get_best_model_metrics

### Get data

In [2]:
data_path = '../data/Sarcasm_on_Reddit'

In [3]:
df = pd.read_pickle(os.path.join(data_path, 'train-balanced-sarcasm-ling_feat.pkl'))

## Процент датасета

In [4]:
dft = df.sample(frac=0.01)
dft.shape

(10108, 25)

In [5]:
del df

## Выбор параметров модели

In [6]:
tfidf = TfidfVectorizer(
    ngram_range=(1, 3), 
    max_features=50000,
    min_df=2
)

### Данные

In [7]:
X = tfidf.fit_transform(dft.comment_tokenized)
y = dft.label

### Метрики

In [8]:
scoring = {
    'AUC': 'roc_auc', 
    'Accuracy': 'accuracy', 
    'F1': 'f1', 
    'F1_micro': 'f1_micro', 
    'F1_macro': 'f1_macro'
}

In [9]:
cv = KFold(n_splits=5, shuffle=True)

### Подбор параметров для SVM

In [10]:
grid = {
    'C': np.logspace(-5, 5, 5),
    'gamma': ['scale', 'auto'],
    'degree': [3, 5, 9, 12],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

In [11]:
clf = SVC()

In [12]:
gs = GridSearchCV(
    clf, grid, scoring=scoring, 
    refit='Accuracy', cv=cv, 
    verbose=10, n_jobs=6
)

In [32]:
gs.fit(X, y)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  2.1min
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:  9.0min
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed: 21.1min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed: 44.7min
[Parallel(n_jobs=3)]: Done 800 out of 800 | elapsed: 45.1min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=3,
             param_...rray([1.00000000e-05, 3.16227766e-03, 1.00000000e+00, 3.16227766e+02,
       1.00000000e+05]),
                         'degree': [3, 5, 9, 12], 'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit='Accuracy',
             return_train_score=False,
             scoring={'AUC': 'roc_auc', 'Accuracy': 'accuracy', 'F1': 'f1',
                      'F1_macro': 'f1_macro', 'F1

In [33]:
best_estimator = gs.best_estimator_
best_estimator

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [34]:
get_best_model_metrics(X, y, cv, best_estimator)

0.59423
0.63608
0.63504


### NN Sklearn

In [13]:
grid_clf = MLPClassifier(random_state=8, early_stopping=True)

In [14]:
nn_grid = {
    'hidden_layer_sizes': [
        [64, 64], 
        [128, 128], [128, 128, 128], 
        [256, 256], [256, 256, 256]
    ],
    'activation': ['identity', 'logistic', 'relu'],
    'solver': ['adam', 'lbfgs']
}

In [15]:
nn_gs = GridSearchCV(
    grid_clf, nn_grid, scoring=scoring, 
    refit='Accuracy', cv=cv, verbose=1, 
    n_jobs=6
)

In [45]:
nn_gs.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed: 42.7min
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed: 217.5min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score='raise-deprecating',
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=True,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_iter=200,
                                     momentum=0.9, n_iter_no_change=10,...
             iid='warn', n_jobs=6,
             param_grid={'activation': ['identity', 'logistic', 'relu'],
                         'hidden_layer_sizes': [[64, 64], [128, 128],
                                                [128, 128, 128], [256, 256],
                                                [256, 256, 256]],
                         'solver': ['adam', 'lbfgs']},
      

In [46]:
best_estimator = nn_gs.best_estimator_
best_estimator

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=[256, 256, 256], learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=8, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [47]:
get_best_model_metrics(X, y, cv, best_estimator)

0.58986
0.62928
0.62772


## Сравнение моделей из Sklearn

In [16]:
svm = SVC(**svm_params)

In [17]:
nn_auto_sklearn = MLPClassifier(**nn_params)

In [18]:
logit = LogisticRegression(**logit_params)

## Import Embeddings

In [19]:
# from sarcsdet.embeddings.mean_ft_v import MeanFastTextEmbeddingVectorizer
from sarcsdet.embeddings.mean_glove_v import MeanGloVeEmbeddingVectorizer
from sarcsdet.embeddings.mean_w2v_v import MeanW2VEmbeddingVectorizer
from sarcsdet.embeddings.tfidf_v import TfidfEmbeddingVectorizer

## ColumnTransformer

In [20]:
embeddings = [
    ('tfidf', tfidf), 
    ('word2vec vectorizer', MeanW2VEmbeddingVectorizer()), 
    ('glove vectorizer', MeanGloVeEmbeddingVectorizer()),
    ('tfidf embedding vectorizer', TfidfEmbeddingVectorizer()),
]

In [21]:
preprocessors = {}

for model in embeddings:
    preprocessors[(model[0], 'comment')] = ColumnTransformer([
        ('c', model[1], 'comment_tokenized'),
    ])
    
    preprocessors[(model[0], 'comment + funny_mark')] = ColumnTransformer([
            ('c', model[1], 'comment_tokenized'),
            ('funny_mark', StandardScaler(), ['funny_mark', ]),
    ])
    
    preprocessors[(model[0], 'comment + interjections')] = ColumnTransformer([
            ('c', model[1], 'comment_tokenized'),
            ('interjections', StandardScaler(), ['interjections', ]),
    ])
    
    preprocessors[(model[0], 'comment + c_punctuation')] = ColumnTransformer([
            ('c', model[1], 'comment_tokenized'),
            (
                'c_punct', 
                StandardScaler(), 
                [
                    'c_exclamation_mark', 'c_question_mark', 'c_quotes', 
                    'c_three_dots', 'c_openeing_bracket', 'c_closing_bracket',
                ]
            ),
    ])
    
    preprocessors[(model[0], 'comment + all_c_feats')] = ColumnTransformer([
            ('c', model[1], 'comment_tokenized'),
            (
                'all_c_feats', 
                StandardScaler(), 
                [
                    'funny_mark', 'interjections', 'c_exclamation_mark', 
                    'c_question_mark', 'c_quotes', 'c_three_dots', 
                    'c_openeing_bracket', 'c_closing_bracket',
                ]
            ),
    ])
    
    preprocessors[(model[0], 'comment + par_comment')] = ColumnTransformer([
            ('c', model[1], 'comment_tokenized'),
            ('pc', TfidfVectorizer(), 'parent_comment_tokenized'),
    ])
    
    preprocessors[(model[0], 'comment + par_comment + c_pc_punctuation')] = ColumnTransformer([
            ('c', model[1], 'comment_tokenized'),
            ('pc', model[1], 'parent_comment_tokenized'),
            (
                'category', 
                StandardScaler(), 
                [
                    'c_exclamation_mark', 'c_question_mark', 'c_quotes', 
                    'c_three_dots', 'c_openeing_bracket', 'c_closing_bracket',
                    'pc_exclamation_mark', 'pc_question_mark', 'pc_quotes', 
                    'pc_three_dots', 'pc_openeing_bracket', 'pc_closing_bracket'
                ]
            ),
    ])
    
    preprocessors[(model[0], 'comment + par_comment + all_feats')] = ColumnTransformer([
            ('c', model[1], 'comment_tokenized'),
            ('pc', model[1], 'parent_comment_tokenized'),
            (
                'category', 
                StandardScaler(), 
                [
                    'funny_mark', 'interjections', 'c_exclamation_mark', 
                    'c_question_mark', 'c_quotes', 'c_three_dots', 
                    'c_openeing_bracket', 'c_closing_bracket',
                    'pc_exclamation_mark', 'pc_question_mark', 'pc_quotes', 
                    'pc_three_dots', 'pc_openeing_bracket', 'pc_closing_bracket'
                ]
            ),
    ])

In [22]:
pipelines = {}

for prep in preprocessors.keys():
    pipelines[('logit', prep[0], prep[1])] = Pipeline([
        ("preprocessor", preprocessors[prep]),
        ("LogReg", logit),
    ])
    
    pipelines[('svm', prep[0], prep[1])] = Pipeline([
        ("preprocessor", preprocessors[prep]),
        ("SVM", svm),
    ])
    
    pipelines[('sklearn_nn', prep[0], prep[1])] = Pipeline([
        ("preprocessor", preprocessors[prep]),
        ("NN", nn_auto_sklearn),
    ])

In [24]:
unsorted_scores = []

for name, pipe in pipelines.items():
    start_time = time.time()
    print("{:15} ".format("_".join(name)), end="")
    cvs = cross_val_score(pipe, dft, dft.label, cv=5, n_jobs=1, verbose=0, scoring="accuracy").mean()
    print("done: mean acc: {:.3}; spent time: {:.3}".format(cvs, time.time() - start_time))
    unsorted_scores.append((name, cvs))

logit_tfidf_comment done: mean acc: 0.627; spent time: 1.25
svm_tfidf_comment done: mean acc: 0.628; spent time: 19.6
sklearn_nn_tfidf_comment done: mean acc: 0.623; spent time: 2.8e+02
logit_tfidf_comment + funny_mark done: mean acc: 0.627; spent time: 1.55
svm_tfidf_comment + funny_mark done: mean acc: 0.628; spent time: 21.8
sklearn_nn_tfidf_comment + funny_mark done: mean acc: 0.631; spent time: 2.89e+02
logit_tfidf_comment + interjections done: mean acc: 0.627; spent time: 1.55
svm_tfidf_comment + interjections done: mean acc: 0.626; spent time: 21.8
sklearn_nn_tfidf_comment + interjections done: mean acc: 0.627; spent time: 2.87e+02
logit_tfidf_comment + c_punctuation done: mean acc: 0.642; spent time: 1.77
svm_tfidf_comment + c_punctuation done: mean acc: 0.619; spent time: 23.7
sklearn_nn_tfidf_comment + c_punctuation done: mean acc: 0.634; spent time: 2.81e+02
logit_tfidf_comment + all_c_feats done: mean acc: 0.643; spent time: 1.76
svm_tfidf_comment + all_c_feats done: mean a

In [25]:
scores = sorted(unsorted_scores, key=lambda x: -x[1])

print(tabulate(
    [[item[0][0], item[0][1], item[0][2], item[1]] for item in scores], 
    floatfmt=".4f", 
    headers=("model", "embedding", "data", "score"))
)

model       embedding                   data                                        score
----------  --------------------------  ----------------------------------------  -------
logit       tfidf                       comment + all_c_feats                      0.6430
logit       tfidf                       comment + c_punctuation                    0.6418
logit       tfidf                       comment + par_comment + all_feats          0.6376
logit       tfidf                       comment + par_comment + c_pc_punctuation   0.6359
sklearn_nn  tfidf                       comment + par_comment + c_pc_punctuation   0.6353
sklearn_nn  tfidf                       comment + all_c_feats                      0.6339
sklearn_nn  tfidf                       comment + c_punctuation                    0.6336
sklearn_nn  tfidf                       comment + funny_mark                       0.6312
svm         tfidf                       comment + par_comment                      0.6311
svm       

In [23]:
del dft, X, y

# Обучение лучшей модели из Sklearn 

In [24]:
data_path = '../data/Sarcasm_on_Reddit'

In [25]:
df = pd.read_pickle(os.path.join(data_path, 'train-balanced-sarcasm-ling_feat.pkl'))

In [26]:
train_texts, valid_texts, y_train, y_valid = train_test_split(
    df[[
        'comment_tokenized', 'funny_mark', 'interjections', 
        'c_exclamation_mark', 'c_question_mark', 'c_quotes', 
        'c_three_dots', 'c_openeing_bracket', 'c_closing_bracket'
    ]], 
    df['label'], 
)

In [27]:
tfidf = TfidfVectorizer(
    ngram_range=(1, 3), 
    max_features=50000,
    min_df=2
)

In [28]:
prepr = ColumnTransformer([
        ('c', tfidf, 'comment_tokenized'),
        (
            'all_c_feats', 
            StandardScaler(), 
            [
                'funny_mark', 'interjections', 'c_exclamation_mark', 
                'c_question_mark', 'c_quotes', 'c_three_dots', 
                'c_openeing_bracket', 'c_closing_bracket',
            ]
        ),
])

In [29]:
logit = LogisticRegression(
    C=1, solver='lbfgs',
    dual=False, max_iter=2000
)

In [30]:
tfidf_logit_pipeline = Pipeline([("preprocessor", prepr), ('logit', logit)])

In [31]:
%%time
tfidf_logit_pipeline.fit(train_texts, y_train)

Wall time: 1min 38s


Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('c',
                                                  TfidfVectorizer(analyzer='word',
                                                                  binary=False,
                                                                  decode_error='strict',
                                                                  dtype=<class 'numpy.float64'>,
                                                                  encoding='utf-8',
                                                                  input='content',
                                                                  lowercase=True,
                                                                  max_df=1.0,
                              

In [32]:
%%time
valid_pred = tfidf_logit_pipeline.predict(valid_texts)

Wall time: 3.27 s


In [34]:
accuracy_score(y_valid, valid_pred)

0.7003169678718832