In [None]:
import os, re
import random
import numpy as np
import pandas as pd

# Show three decimals in pandas and NumPy output.
# The fully-qualified pandas key is used because the abbreviated 'precision'
# pattern is not reliably resolvable across pandas versions.
pd.set_option('display.precision', 3)
# `precision` must be given by keyword: positionally, the string 'precision'
# would itself be taken as the precision value and 3 as `threshold`.
np.set_printoptions(precision=3)

from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers.neptune import NeptuneLogger

from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint

from bpemb import BPEmb

import torch
import torch.nn as nn
from itertools import product
from tqdm import tqdm

import neptune

import nn_tokenize
import nn_models
import nn_train_eval

In [5]:
# Fix every source of randomness so that runs are repeatable.
SEED = 42

# cuDNN: turn off autotuning and request deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed the stdlib, NumPy and PyTorch generators, then call Lightning's
# seeding helper with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
seed_everything(SEED)

# Prefer the GPU when PyTorch can see one.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [6]:
# Dataset file and its format; the format is the file extension without the
# leading separator.
DATAPATH = 'train-balanced-sarcasm.csv'
FMT = os.path.splitext(DATAPATH)[-1].lstrip(os.extsep)

In [7]:
# Neptune credentials and experiment identifiers.
# A token already exported in the environment is kept as is; only when none is
# set do we fall back to Neptune's anonymous-access alias (spelled 'ANONYMOUS').
# Never paste a personal API token into the notebook.
os.environ.setdefault("NEPTUNE_API_TOKEN", 'ANONYMOUS')
PROJECT = 'ayeffkay/sarcasm-on-reddit'
EXPERIMENT_NAME = 'nn_classifiers'

# Simple NN classifier

In [None]:
# Baseline data: build the vectorized dataset from DATAPATH with the project helper.
simple_dataset = nn_tokenize.DatasetsFromVectorized(DATAPATH)
# Network input width = second dimension of the first training item's `comment`
# feature (presumably the TF-IDF vocabulary size -- TODO confirm in nn_tokenize).
input_size = simple_dataset.train_data[0].comment.shape[1]
# Baseline classifier sized to that input width.
simple_nn = nn_models.SimpleNN(input_size)

In [None]:
# Neptune run for the baseline; it is kept open after `fit` so that test
# metrics can still be logged, and is closed explicitly after testing.
neptune_logger = NeptuneLogger(
    api_key=os.environ['NEPTUNE_API_TOKEN'],
    project_name=PROJECT,
    experiment_name=EXPERIMENT_NAME,
    params=dict(max_epochs=5, input_size=input_size),
    tags=['simple_nn', 'bpe', 'tf-idf'],
    close_after_fit=False,
)

In [10]:
# Wrap the baseline network together with its data splits and optimisation settings.
model1 = nn_train_eval.ClassificationModel(
    simple_nn,
    train_data=simple_dataset.train_data,
    valid_data=simple_dataset.valid_data,
    test_data=simple_dataset.test_data,
    batch_size=128,
    lr=1e-3,
)

In [None]:
# Early stopping watches validation accuracy; checkpointing keeps the weights
# with the lowest validation loss.
# NOTE(review): min_delta=0.1 looks very strict for an accuracy-like metric --
# confirm it is intended.
early_stopping = EarlyStopping(
    monitor='valid_accuracy',
    mode='max',
    patience=2,
    min_delta=0.1,
    verbose=False,
)
checkpoint_callback = ModelCheckpoint(mode='min', monitor='valid_loss')

trainer = Trainer(
    logger=neptune_logger,
    callbacks=[early_stopping, checkpoint_callback],
    max_epochs=5,
    gpus=-1,
    auto_select_gpus=True,
    deterministic=True,
    gradient_clip_val=0.5,
    num_sanity_val_steps=0,
    progress_bar_refresh_rate=30,
    automatic_optimization=True,
)
trainer.fit(model1)

# Restore the best checkpoint's weights before evaluating on the test split.
model1.load_state_dict(torch.load(checkpoint_callback.best_model_path)['state_dict'])
trainer.test(model1, verbose=False)
neptune_logger.close()

# Tokenization for deep NNs with embeddings

In [9]:
# Pretrained English BPE model: `encode_ids` serves as the tokenizer and
# `vectors` is kept as the pretrained embedding matrix for the models below.
bpemb_en = BPEmb(lang='en')
weights = bpemb_en.vectors
emb_datasets = nn_tokenize.DatasetsTokenizeSimple(
    bpemb_en.encode_ids, DATAPATH, FMT,
    make_iters=False, use_vocab=False,
)

# RNNs

In [13]:
# Hyperparameter grid for the recurrent models; the next cell runs every combination.
from_pretrained = [True, False]   # pretrained embeddings vs. trained from scratch
n_layers = [1, 2]                 # number of stacked recurrent layers
bidirectional = [True, False]
use_hidden = [True, False]        # classify from hidden states or from output states
model_type = [nn.GRU, nn.LSTM]

In [None]:
# Grid search over the recurrent classifiers: every combination of cell type,
# hidden/output-state usage, directionality, depth and embedding initialisation
# is trained, evaluated on the test split and logged as its own Neptune run.
num_emb = bpemb_en.vocab_size
all_types = list(product(model_type, use_hidden, bidirectional, n_layers, from_pretrained))
with tqdm(total=len(all_types)) as pbar:
    # The depth is unpacked as `n_l` so that the loop never overwrites the
    # global `n_layers` list consumed by `product` above; shadowing it would
    # make this cell fail when it is run a second time.
    for mtype, use_hid, bi, n_l, from_pr in all_types:
        tags = [mtype.__name__, 'use_hid_true' if use_hid else 'use_hid_false', 
                    'bi_true' if bi else 'bi_false', 'n_layers' + str(n_l), 
                    'from_pretrained' if from_pr else 'from_scratch']
        print(f"Running config: {'.'.join(tags)}")
        if from_pr:
            # Embedding layer initialised from the BPEmb vectors.
            rnn = nn_models.SimpleRNN(rnn_class=mtype, num_embeddings=num_emb, hidden_size=64, num_layers=n_l, 
                                    bidirectional=bi, use_hidden=use_hid, from_pretrained=True, vectors=weights)
        else:
            # Embedding layer of width 64 learned from scratch.
            rnn = nn_models.SimpleRNN(rnn_class=mtype, num_embeddings=num_emb, input_size=64, hidden_size=64, num_layers=n_l, 
                                    bidirectional=bi, use_hidden=use_hid)
            
        # One Neptune run per configuration; kept open after `fit` so that the
        # test metrics are logged into the same run.
        neptune_logger = NeptuneLogger(
            api_key=os.environ['NEPTUNE_API_TOKEN'],
            project_name=PROJECT,
            experiment_name='RNN', 
            params={'max_epochs': 5, 
                    'mtype': mtype, 
                    'use_hid': use_hid, 
                    'bi': bi, 
                    'n_layers': n_l, 
                    'from_pretrained': from_pr},
            tags = tags,
            close_after_fit=False
        )
        model2 = nn_train_eval.ClassificationModel(rnn, train_data=emb_datasets.train_data, batch_size=128, lr=1e-3,
                                    test_data=emb_datasets.test_data, valid_data=emb_datasets.valid_data)
        # NOTE(review): min_delta=0.1 looks very strict for an accuracy-like
        # metric -- confirm it is intended.
        early_stopping = EarlyStopping(monitor='valid_accuracy', min_delta=0.1, patience=2, verbose=False, mode='max')
        checkpoint_callback = ModelCheckpoint(monitor='valid_loss', mode='min')
        trainer = Trainer(gpus=-1, auto_select_gpus=True,
                        max_epochs=5, logger=neptune_logger, 
                        deterministic=True, progress_bar_refresh_rate=30, 
                        gradient_clip_val=0.5, num_sanity_val_steps=0,
                        callbacks=[early_stopping, checkpoint_callback], automatic_optimization=True)
        trainer.fit(model2)
        # Evaluate the checkpoint with the lowest validation loss, not the last epoch.
        model_checkpoint = torch.load(checkpoint_callback.best_model_path)['state_dict']
        model2.load_state_dict(model_checkpoint)
        trainer.test(model2, verbose=False)
        neptune_logger.close()
        
        pbar.update(1)

# CNNs

In [10]:
# Search space for the convolutional models.
kernels = [3, 5, 7]          # kernel sizes passed to every CNN variant
pretrained = [True, False]   # pretrained BPEmb vectors vs. embeddings learned from scratch
models = [nn_models.SimpleCNN, nn_models.CombinedCNN]

In [None]:
# Train and evaluate each CNN architecture with pretrained and freshly
# initialised embeddings; every configuration is logged as its own Neptune run.
num_emb = bpemb_en.vocab_size
all_types = list(product(models, pretrained))
with tqdm(total=len(all_types)) as pbar:
    for mtype, from_pr in all_types:
        tags = [mtype.__name__, 'from_pretrained' if from_pr else 'from_scratch']
        print(f"Running config: {'.'.join(tags)}")

        # Only the embedding-related arguments differ between the two variants.
        if from_pr:
            emb_kwargs = dict(from_pretrained=True, vectors=weights)
        else:
            emb_kwargs = dict(input_size=64)
        cnn = mtype(num_embeddings=num_emb, hidden_size=64, kernels=kernels, **emb_kwargs)

        neptune_logger = NeptuneLogger(
            api_key=os.environ['NEPTUNE_API_TOKEN'],
            project_name=PROJECT,
            experiment_name='CNN',
            params=dict(max_epochs=5, mtype=mtype, from_pretrained=from_pr),
            tags=tags,
            close_after_fit=False,
        )
        model3 = nn_train_eval.ClassificationModel(
            cnn,
            train_data=emb_datasets.train_data,
            valid_data=emb_datasets.valid_data,
            test_data=emb_datasets.test_data,
            batch_size=128,
            lr=1e-3,
        )

        # Early stopping follows validation accuracy; the checkpoint follows validation loss.
        early_stopping = EarlyStopping(
            monitor='valid_accuracy', mode='max', patience=2, min_delta=0.1, verbose=False,
        )
        checkpoint_callback = ModelCheckpoint(mode='min', monitor='valid_loss')
        trainer = Trainer(
            logger=neptune_logger,
            callbacks=[early_stopping, checkpoint_callback],
            max_epochs=5,
            gpus=-1,
            auto_select_gpus=True,
            deterministic=True,
            gradient_clip_val=0.5,
            num_sanity_val_steps=0,
            progress_bar_refresh_rate=30,
            automatic_optimization=True,
        )
        trainer.fit(model3)

        # Test with the best checkpoint's weights rather than the last epoch's.
        model3.load_state_dict(torch.load(checkpoint_callback.best_model_path)['state_dict'])
        trainer.test(model3, verbose=False)
        neptune_logger.close()

        pbar.update(1)

# biRNN + CNN (RCNN)

In [17]:
# Hyperparameter grid for the RNN + CNN models; the next cell runs every combination.
from_pretrained = [True, False]   # pretrained embeddings vs. trained from scratch
kernel = [3, 5]                   # convolution kernel sizes, tried one at a time
num_layers = [1, 2]               # number of stacked recurrent layers
model_type = [nn.GRU, nn.LSTM]

In [None]:
# Grid search over the RNN + CNN classifiers: every combination of cell type,
# depth, kernel size and embedding initialisation is trained, evaluated on the
# test split and logged as its own Neptune run.
num_emb = bpemb_en.vocab_size
all_types = list(product(model_type, num_layers, kernel, from_pretrained))
with tqdm(total=len(all_types)) as pbar:
    for mtype, n_l, kern, from_pr in all_types:
        tags = [mtype.__name__ + 'cnn',
                'n_layers' + str(n_l), 
                'kernels' + str(kern),
                'from_pretrained' if from_pr else 'from_scratch']
        print(f"Running config: {'.'.join(tags)}")
        if from_pr:
            # NOTE(review): input_size=64 is passed together with the pretrained
            # vectors here, unlike the RNN and CNN cells -- confirm that
            # nn_models.RCNN handles this combination as intended.
            rcnn = nn_models.RCNN(rnn_class=mtype, num_embeddings=num_emb, input_size=64, hidden_size=64, num_layers=n_l, 
                                    from_pretrained=True, vectors=weights, kernel=kern)
        else:
            rcnn = nn_models.RCNN(rnn_class=mtype, num_embeddings=num_emb, input_size=64, hidden_size=64, 
                                 num_layers=n_l, kernel=kern)
            
        # NOTE(review): these runs are logged under the experiment name 'RNN',
        # so a filter on name == 'RNN' returns them together with the plain
        # RNN runs -- confirm this grouping is intended.
        neptune_logger = NeptuneLogger(
            api_key=os.environ['NEPTUNE_API_TOKEN'],
            project_name=PROJECT,
            experiment_name='RNN', 
            params={'max_epochs': 5, 
                    'mtype': mtype,  
                    'kernel': kern,
                    'n_layers': n_l, 
                    'from_pretrained': from_pr},
            tags = tags,
            close_after_fit=False
        )
        model4 = nn_train_eval.ClassificationModel(rcnn, train_data=emb_datasets.train_data, batch_size=128, lr=1e-3,
                                    test_data=emb_datasets.test_data, valid_data=emb_datasets.valid_data)
        # NOTE(review): min_delta=0.1 looks very strict for an accuracy-like
        # metric -- confirm it is intended.
        early_stopping = EarlyStopping(monitor='valid_accuracy', min_delta=0.1, patience=2, verbose=False, mode='max')
        checkpoint_callback = ModelCheckpoint(monitor='valid_loss', mode='min')
        trainer = Trainer(gpus=-1, auto_select_gpus=True,
                        max_epochs=5, logger=neptune_logger, 
                        deterministic=True, progress_bar_refresh_rate=30, 
                        gradient_clip_val=0.5, num_sanity_val_steps=0,
                        callbacks=[early_stopping, checkpoint_callback], automatic_optimization=True)
        trainer.fit(model4)
        # Evaluate the checkpoint with the lowest validation loss, not the last epoch.
        model_checkpoint = torch.load(checkpoint_callback.best_model_path)['state_dict']
        model4.load_state_dict(model_checkpoint)
        trainer.test(model4, verbose=False)
        neptune_logger.close()
        
        pbar.update(1)

# BERT

In [15]:
# Name of the pretrained BERT checkpoint; used both for building the datasets
# here and for constructing the classifier in the next cell.
pretrained_model_name = 'google/bert_uncased_L-2_H-128_A-2'
# Build the datasets from DATAPATH with the project's BERT tokenization helper
# (the progress bar in this cell's output shows it took about 47 minutes).
datasets = nn_tokenize.DatasetsTokenizeBert(DATAPATH, pretrained_model_name)

100%|██████████| 1010773/1010773 [47:04<00:00, 357.91it/s]


In [None]:
# Single source of truth for the BERT run settings: the values logged to
# Neptune and the values handed to the model/trainer come from the same
# constants, so the two cannot drift apart (the epoch limit previously did).
BERT_MAX_EPOCHS = 10
# NOTE(review): the sequence length is only logged in this cell; the truncation
# itself is presumably applied inside nn_tokenize.DatasetsTokenizeBert -- confirm.
BERT_SEQ_LEN = 256
BERT_BATCH_SIZE = 32
BERT_LR = 2e-5

# Two runs: fr_head = 0 and fr_head = 1, passed to the model as `freeze_head`.
for fr_head in range(2):
    tags = [pretrained_model_name, 'fr_head_true' if fr_head else 'fr_head_false', 
            'seq_len' + str(BERT_SEQ_LEN)]
    print(f"Running config: {'.'.join(tags)}")

    bert = nn_models.BertForSeqClf(pretrained_model_name, freeze_head=fr_head)

    # One Neptune run per configuration; kept open after `fit` so that the
    # test metrics are logged into the same run.
    neptune_logger = NeptuneLogger(
        api_key=os.environ['NEPTUNE_API_TOKEN'],
        project_name=PROJECT,
        experiment_name='BERT', 
        params={'max_epochs': BERT_MAX_EPOCHS, 
                'frozen_head': fr_head,
                'seqlen': BERT_SEQ_LEN, 
                'lr': BERT_LR, 
                'batch_size': BERT_BATCH_SIZE},
        tags = tags,
        close_after_fit=False
    )
    model5 = nn_train_eval.ClassificationModel(bert, train_data=datasets.train_data, batch_size=BERT_BATCH_SIZE, lr=BERT_LR,
                                test_data=datasets.test_data, valid_data=datasets.valid_data)
    # NOTE(review): min_delta=0.1 looks very strict for an accuracy-like
    # metric -- confirm it is intended.
    early_stopping = EarlyStopping(monitor='valid_accuracy', min_delta=0.1, patience=2, verbose=False, mode='max')
    checkpoint_callback = ModelCheckpoint(monitor='valid_loss', mode='min')
    trainer = Trainer(gpus=-1, auto_select_gpus=True,
                    max_epochs=BERT_MAX_EPOCHS, logger=neptune_logger, 
                    deterministic=True, progress_bar_refresh_rate=30, 
                    gradient_clip_val=0.5, num_sanity_val_steps=0,
                    callbacks=[early_stopping, checkpoint_callback], automatic_optimization=True)
    trainer.fit(model5)
    # Evaluate the checkpoint with the lowest validation loss, not the last epoch.
    model_checkpoint = torch.load(checkpoint_callback.best_model_path)['state_dict']
    model5.load_state_dict(model_checkpoint)
    trainer.test(model5, verbose=False)
    neptune_logger.close()

# Report

## Device specifications

In [46]:
# Record the CPU specification of the machine the experiments ran on.
!lscpu

Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              4
On-line CPU(s) list: 0-3
Thread(s) per core:  2
Core(s) per socket:  2
Socket(s):           1
NUMA node(s):        1
Vendor ID:           GenuineIntel
CPU family:          6
Model:               85
Model name:          Intel(R) Xeon(R) CPU @ 2.00GHz
Stepping:            3
CPU MHz:             2000.166
BogoMIPS:            4000.33
Hypervisor vendor:   KVM
Virtualization type: full
L1d cache:           32K
L1i cache:           32K
L2 cache:            1024K
L3 cache:            39424K
NUMA node0 CPU(s):   0-3
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_si

In [47]:
# Record the GPU model, driver and CUDA versions of the machine the experiments ran on.
!nvidia-smi

Thu Dec 10 10:32:13 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.45.01    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    41W / 300W |   1545MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Logs

All experiments are available here: https://ui.neptune.ai/ayeffkay/sarcasm-on-reddit/experiments?viewId=standard-view
(you can select columns of interest via 'Manage columns')

In [29]:
# Pull the project's experiment leaderboard from Neptune and normalise it for
# the report tables below.
session = neptune.init(PROJECT, os.environ['NEPTUNE_API_TOKEN'])
results = session.get_leaderboard().rename(
    columns=lambda name: re.sub('channel_', '', name)  # strip the 'channel_' marker from column names
)

# Rescale `size` by 1024**2 (presumably bytes -> MiB; TODO confirm the logged unit).
results['size'] = results['size'] / 1024**2
# Human-readable model identifier: the run's tags joined with dots.
results['model_name'] = results['tags'].map('.'.join)

# Columns shown in the report, cast to float.
num_cols = ['size', 'test_loss', 'test_f1', 'test_accuracy', 'test_precision', 'test_recall', 'test_auc']
results = results.astype({column: 'float' for column in num_cols})

result_columns = ['model_name', *num_cols]

## Experiments description

Here we applied NN models for classification based on text features (i.e., 'comment' and 'parent comment'). 

1.   First, 'comment' was tokenized into BPE tokens (with youtokentome) and vectorized using TF-IDF. Then we applied SimpleNN -- this was our baseline.
2.   Second, 'comment' was tokenized into BPE tokens using a BPE tokenizer pretrained on Wikipedia (with vocabulary size=10000 and embedding size=100). Then we trained RNN, CNN and their combination (a bidirectional RNN whose output states are then convolved) with an input embedding layer (with and without pretrained embeddings). In the summary tables these options are referred to as *from_pretrained* and *from_scratch*.
3. Third, 'comment' and 'parent comment' were tokenized with the pretrained Google BERT tokenizer and classified with a BERT model. Two configurations were applied: *fr_head_true* (i.e., the classification head was initialized randomly and wasn't trained at all) and *fr_head_false* (the model had all weights trainable).

For all models (except BERT) same hyperparameters were used:


*   batch_size = 128
*   learning_rate = 1e-3, reduced on plateau based on validation loss
* optimizer = Adam
* early-stopping patience = 2 epochs without improvement in validation accuracy
* dropout=0.3 (if needed)
* max number of epochs = 5

After each epoch, classification metrics were evaluated; after training, the model with the minimum validation loss was loaded from the checkpoints.

Also, we don't truncate sequences; instead, we group them by length using BucketIterator (this minimizes padding).


For BERT the following hyperparameters were used:
* batch_size = 32
* learning_rate = 2e-5, reduced on plateau based on validation loss
* optimizer = AdamW
* parent comment truncated to length = 256
* max number of epochs = 10



## SimpleNN results

In [39]:
# Baseline table: rows logged under the 'nn_classifiers' experiment name.
is_baseline = results['name'] == 'nn_classifiers'
results.loc[is_baseline, result_columns]

Unnamed: 0,model_name,size,test_loss,test_f1,test_accuracy,test_precision,test_recall,test_auc
0,simple_nn.bpe.tf-idf,1.471,0.535,0.714,0.728,0.754,0.678,0.805


## RNN results

LSTM and GRU were used with combinations of the following hyperparameters:


1.   Bidirectional or not (*'bi_true'*, *'bi_false'*)
2.   Stacked or not (*'n_layers1'*, *'n_layers2'*)
3.  Use hidden states or output states for classification (*use_hid_true*, *use_hid_false*)
4. Pretrained embeddings or not (*from_pretrained*, *from_scratch*)



In [42]:
# Rows logged under the 'RNN' experiment name.
rnn_logs = results.loc[results.name=='RNN', result_columns]
# Highlight the best value per column: the maximum for the quality metrics and
# the minimum for the loss. `size` and `model_name` have no "best" value and
# are left unstyled.
higher_is_better = ['test_f1', 'test_accuracy', 'test_precision', 'test_recall', 'test_auc']
(rnn_logs.style
    .highlight_max(subset=higher_is_better, color='lightgreen', axis=0)
    .highlight_min(subset=['test_loss'], color='lightgreen', axis=0))

Unnamed: 0,model_name,size,test_loss,test_f1,test_accuracy,test_precision,test_recall,test_auc
1,GRU.use_hid_true.bi_true.n_layers1.from_pretrained,1.445,0.544,0.701,0.719,0.751,0.657,0.794
2,GRU.use_hid_true.bi_true.n_layers1.from_scratch,1.42,0.53,0.724,0.732,0.747,0.703,0.809
3,GRU.use_hid_true.bi_true.n_layers2.from_pretrained,1.395,0.534,0.701,0.727,0.774,0.641,0.804
4,GRU.use_hid_true.bi_true.n_layers2.from_scratch,1.352,0.527,0.728,0.736,0.751,0.706,0.813
5,GRU.use_hid_true.bi_false.n_layers1.from_pretrained,1.31,0.543,0.702,0.719,0.747,0.663,0.795
6,GRU.use_hid_true.bi_false.n_layers1.from_scratch,1.291,0.535,0.709,0.728,0.764,0.661,0.804
7,GRU.use_hid_true.bi_false.n_layers2.from_pretrained,1.271,0.54,0.699,0.722,0.762,0.646,0.798
8,GRU.use_hid_true.bi_false.n_layers2.from_scratch,1.243,0.534,0.726,0.732,0.743,0.709,0.81
9,GRU.use_hid_false.bi_true.n_layers1.from_pretrained,1.216,0.542,0.701,0.721,0.754,0.655,0.796
10,GRU.use_hid_false.bi_true.n_layers1.from_scratch,1.191,0.531,0.726,0.732,0.743,0.71,0.809


From the results obtained it can be seen that:

1.   The RNN + CNN combination performs best
2.   Sometimes it is worth stacking more layers and using a longer convolutional kernel :)
3. In this task, pretrained embeddings don't improve quality



## CNN results

Here CombinedCNN applies several convolutional kernels (sizes 3, 5 and 7) and then combines their results.

In [40]:
# Rows logged under the 'CNN' experiment name.
cnn_logs = results.loc[results.name=='CNN', result_columns]
# Highlight the best value per column: the maximum for the quality metrics and
# the minimum for the loss. `size` and `model_name` have no "best" value and
# are left unstyled.
higher_is_better = ['test_f1', 'test_accuracy', 'test_precision', 'test_recall', 'test_auc']
(cnn_logs.style
    .highlight_max(subset=higher_is_better, color='lightgreen', axis=0)
    .highlight_min(subset=['test_loss'], color='lightgreen', axis=0))

Unnamed: 0,model_name,size,test_loss,test_f1,test_accuracy,test_precision,test_recall,test_auc
33,SimpleCNN.from_pretrained,1.885,0.55,0.702,0.716,0.738,0.67,0.788
34,SimpleCNN.from_scratch,1.869,0.544,0.712,0.722,0.738,0.688,0.795
35,CombinedCNN.from_pretrained,1.852,0.545,0.711,0.72,0.736,0.688,0.792
36,CombinedCNN.from_scratch,1.831,0.541,0.713,0.72,0.73,0.698,0.795


We can see that the CNNs perform worse than the RNN or RNN+CNN models here, and that the more complicated architecture doesn't improve quality.

## BERT results

In [43]:
# Rows logged under the 'BERT' experiment name.
bert_logs = results.loc[results.name=='BERT', result_columns]
# Highlight the best value per column: the maximum for the quality metrics and
# the minimum for the loss. `size` and `model_name` have no "best" value and
# are left unstyled.
higher_is_better = ['test_f1', 'test_accuracy', 'test_precision', 'test_recall', 'test_auc']
(bert_logs.style
    .highlight_max(subset=higher_is_better, color='lightgreen', axis=0)
    .highlight_min(subset=['test_loss'], color='lightgreen', axis=0))

Unnamed: 0,model_name,size,test_loss,test_f1,test_accuracy,test_precision,test_recall,test_auc
53,google/bert_uncased_L-2_H-128_A-2.fr_head_false.seq_len256,0.545,0.52,0.738,0.739,0.743,0.732,0.819
54,google/bert_uncased_L-2_H-128_A-2.fr_head_true.seq_len256,0.428,0.521,0.734,0.74,0.752,0.717,0.819


So:
1. Predictably, BERT beat the other models on all metrics
2. The BERT model with a frozen head performs no worse than the one with a trainable head
3. Using the parent comment can improve classification quality
4. Unfortunately, BERT took more than twice as long to train as the other models (13 min. per epoch vs. 5 min. per epoch for RNNs)