[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cozek/OffensEval2020-code/blob/master/notebooks/Eng%20Task%20A%20-%20Ensemble%20DistilGPT2.ipynb)

# Import Libraries

At the time of our work, we used the following library versions
- numpy 1.18.1
- pandas 1.0.1
- torch 1.2.0
- Cuda 10.0
- python 3.7.0
- sklearn 0.22.1
- tqdm 4.42.1
- nltk 3.4.5

In [None]:
!git clone https://github.com/cozek/OffensEval2020-code/

In [None]:
!git clone https://github.com/huggingface/transformers
!pip install /content/transformers/

In [211]:
import sys
sys.path.append('/content/OffensEval2020-code/src/')
import collections
from typing import Callable
import numpy as np
np.random.seed(42)
import pandas as pd
from tqdm import notebook
import importlib
import pprint
import nltk
import datetime
import os
from argparse import Namespace

from collections import Counter

In [3]:
import utils.data as data_utils
import utils.trac2020 as trac_utils
import utils.general as general_utils
import utils.transformer.data as transformer_data_utils
import utils.transformer.general as transformer_general_utils
general_utils.set_seed_everywhere()

In [4]:
import logging
logging.basicConfig(level=logging.INFO) 

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
torch.__version__ # we used version 1.2.0


'1.2.0'

In [6]:
# Import RAdam and Lookahead
from radam.radam import RAdam
from lookahead.optimizer import Lookahead


In [7]:
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification

INFO:transformers.file_utils:PyTorch version 1.2.0 available.


In [8]:
 args = Namespace(
        #use cuda by default
        device = 'cuda' if torch.cuda.is_available() else 'cpu',
    
        #set batch size and number of epochs
        batch_size = 32,
        num_epochs = 20,
    
        #set the learning rate
        learning_rate = 0.0001,

        #location of the train, dev and test csv
        train_val_csv = '/content/OffensEval2020-code/data/eng/task_c_tiny.zip',
        test_csv = '/content/OffensEval2020-code/data/test_data/test_a_tweets.tsv',
    
        #directory to save our models at
        directory = './models/', 
        model_name = 'roberta_attn_trac_task_a.pt',
     
        date = datetime.datetime.now().strftime("%a_%d_%b_%Y/"),
)

## Model save location

In [9]:
directory = args.directory + args.date
if not os.path.exists(directory):
    os.makedirs(directory)
args.directory = directory
print(args.directory)

/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/


## Load the dataset

In [10]:
data_df_task_c = pd.read_csv(args.data.offeval_data['en_task_c_presplit_final'])
print(data_df_task_c.label.value_counts())
print(data_df_task_c.split.value_counts())

0    152562
1     24917
2     11494
Name: label, dtype: int64
train    179524
val        9449
Name: split, dtype: int64


In [11]:
with pd.option_context('display.max_colwidth', -1): 
    print(data_df_task_c.sample(5))

                         id  \
5244    1186108554765312002   
128115  1159291683755962368   
140602  1159917247755800577   
125749  1162469290085355520   
84623   1188059686748033024   

                                                                         text  \
5244    I be wanting a lil boo but then I remember everyone ain't shit          
128115  What i gotta be some ice cream to get licked around dis bitch           
140602  Fool really had his height in his twitter bio... lame ass nigga         
125749  Bitch I’m the baby 🤷🏽‍♀️                                                
84623   the fuck my country didn't even experience ly and syt im ugly sobbing   

        split  label  
5244    train  0      
128115  train  0      
140602  train  0      
125749  train  0      
84623   train  0      


## Importing the Roberta Tokeniker and Punkt sentence tokenizer

In [12]:
class RobertaPreprocessor():
    def __init__(self,transformer_tokenizer,sentence_detector):
        self.transformer_tokenizer = transformer_tokenizer
        self.sentence_detector = sentence_detector
        self.bos_token = transformer_tokenizer.bos_token
        self.sep_token = ' ' + transformer_tokenizer.sep_token + ' '
    def add_special_tokens(self, text):
        sentences = self.sentence_detector.tokenize(text)
        eos_added_text  = self.sep_token.join(sentences) 
        return self.bos_token +' '+ eos_added_text + ' ' + self.transformer_tokenizer.sep_token

In [None]:
!python -c 'import nltk; nltk.download("punkt")'

In [13]:
roberta_tokenizer = tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
punkt_sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json from cache at /home/kaushik.das/.cache/torch/transformers/5f11352d3c3e932888f3ba75bc24579eacb5d1596d39ce56166aeae8fd363df8.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt from cache at /home/kaushik.das/.cache/torch/transformers/01f63a14ad93494c050af2090c59930fb787bdfb347c4cad7ce9063e1a5fe140.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda


In [14]:
roberta_preproc = RobertaPreprocessor(roberta_tokenizer, punkt_sentence_detector)

In [15]:
#apply the preprocessor on the exploded dataframe
data_df_task_c['text'] = data_df_task_c['text'].map(roberta_preproc.add_special_tokens)


In [16]:
with pd.option_context('display.max_colwidth', -1): 
    print(data_df_task_c.sample(5))

                         id  \
49079   1188333271303753729   
99777   1188514382696927233   
135957  1159549527013109760   
13851   1160664798293450752   
35525   1187184205504991232   

                                                                                                        text  \
49079   <s> I don't want shit that ain't mine </s>                                                             
99777   <s> Good news! </s> That’s the end of Xhaka at Arsenal! </s> Useless piece of shit! </s>               
135957  <s> Yo dick sucking game prolly dryer than lil Caesar's pizza crust, shut up talkin to me bitch </s>   
13851   <s> bro fuck it ima make the thread i need this </s>                                                   
35525   <s> @USER Omfg I woulda snatched her phone and robbed her ass too lmao </s>                            

        split  label  
49079   train  0      
99777   train  0      
135957  train  0      
13851   train  0      
35525   train  0      


### Here we create the dataset

In [68]:
class SimpleVectorizer():
    def __init__(self,tokenizer: Callable, max_seq_len: int):
        """
        Args:
            tokenizer (Callable): transformer tokenizer
            max_seq_len (int): Maximum sequence lenght 
        """
        self.tokenizer = tokenizer
        self._max_seq_len = max_seq_len

    def vectorize(self,text :str):
        
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=False, #already added by preproc
            max_length = self._max_seq_len,
            pad_to_max_length = True,
        )
        ids =  np.array(encoded['input_ids'], dtype=np.int64)
        attn = np.array(encoded['attention_mask'], dtype=np.int64)
        
        return ids, attn

class Vectorizer():
    def __init__(self,tokenizer: Callable, max_seq_len: int ):
        """
        Args:
            tokenizer (Callable): transformer tokenizer
            max_seq_len (int): Maximum sequence lenght 
        """
        self.tokenizer = tokenizer
        self._max_seq_len = max_seq_len

    def vectorize(self,text :str, mask_prob=0.50, mask_amount:int=0.30):
        
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=False, #already added by preproc
            max_length = self._max_seq_len,
            pad_to_max_length = True,
        )
        ids =  np.array(encoded['input_ids'], dtype=np.int64)
        attn = np.array(encoded['attention_mask'], dtype=np.int64)
        prob = np.random.rand(1)[0]
#         print(prob)
        if  prob <= mask_prob:
#             print(attn)
            len_of_sent = np.where(ids==tokenizer.pad_token_id)[0][0]
            amount_to_mask = max(int(len_of_sent * mask_amount ) , 1)
            ids_to_not_attend = [np.random.randint(low=0, high=len_of_sent ) for i in range(amount_to_mask)]
#             print(amount_to_mask)
#             print(len_of_sent)
#             print(ids_to_not_attend)
            attn[ids_to_not_attend]=0
            ids[ids_to_not_attend] = tokenizer.mask_token_id
#             print(attn)
        return ids, attn

In [18]:
v = Vectorizer(roberta_tokenizer, 15)

In [19]:
sent = "I am alright bro, dont worry about me"
v.vectorize(sent)

(array([  100,   524, 30103, 11051,     6, 33976,  4022,    59,   162,
            1,     1,     1,     1,     1,     1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]))

In [80]:
class HateDataset(Dataset):
    def __init__(
        self,
        data_df: pd.DataFrame,
        tokenizer: Callable,
        max_seq_length:int = None,
    ):
        """
        Args:
            data_df (pandas.DataFrame): df containing the labels and text
            tokenizer (tokenizer module for the transformer)
        """
        self.data_df = data_df
        self.tokenizer = tokenizer

        if max_seq_length is None:
            self._max_seq_length = self._get_max_len(data_df,tokenizer)
        else:
            self._max_seq_length = max_seq_length

        self.train_df = self.data_df[self.data_df.split == 'train']
        self.train_size = len(self.train_df)

        self.val_df = self.data_df[self.data_df.split == 'val']
        self.val_size = len(self.val_df)

        self.test_df = self.data_df[self.data_df.split == 'test']
        self.test_size = len(self.test_df)
        
        self.simple_vectorize = False,
        self._simple_vectorizer = SimpleVectorizer(tokenizer, self._max_seq_length)
        self._vectorizer = Vectorizer(tokenizer, self._max_seq_length)
        
        self._lookup_dict = {
            'train': (self.train_df, self.train_size),
            'val': (self.val_df, self.val_size),
            'test': (self.test_df, self.test_size)
        }

        self.set_split('train')

        class_counts = data_df.label.value_counts().to_dict()
         #sorted on the basis of class label,eg, 0,1,2..
        cts = sorted([(lbl,cts) for lbl,cts in class_counts.items()], key=lambda x: x[0])
        freq = [ x[1] for x in cts ]
        # print(freq,cts)
        self.class_weights = 1.0/ torch.tensor(freq, dtype=torch.float32)
    
    def flip_simple_vectorizer(self) :
        if self.simple_vectorize:
            self.simple_vectorize=False
        else:
            self.simple_vectorize= True
    
    def _get_max_len(self,data_df: pd.DataFrame, tokenizer: Callable):
        len_func = lambda x: len(self.tokenizer.encode_plus(x)['input_ids'])
        max_len = data_df.text.map(len_func).max() 
        return max_len

    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
    
    def __len__(self):
        return self._target_size
    
    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]
        
        if self.simple_vectorize:
            indices, attention_masks = self._simple_vectorizer.vectorize(row.text)
        else:
            indices, attention_masks = self._vectorizer.vectorize(row.text)
            
        label = row.label
        return {'x_data': indices,
                'x_attn_mask': attention_masks,
                'x_index': index,
                'y_target': label}
    
    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size

In [23]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=False, device="cpu", pinned_memory = False, n_workers = 0): 
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last,
                            pin_memory= pinned_memory,
                            num_workers = n_workers,
                            )
    
    for data_dict in dataloader:
        out_data_dict = {}
        out_data_dict['x_data'] = data_dict['x_data'].to(
            device, non_blocking= (True if pinned_memory else False) 
        )
        out_data_dict['x_attn_mask'] = data_dict['x_attn_mask'].to(
            device, non_blocking= (True if pinned_memory else False) 
        )
        out_data_dict['x_index'] = data_dict['x_index']
        out_data_dict['y_target'] = data_dict['y_target'].to(
            device, non_blocking= (True if pinned_memory else False) 
        )
        yield out_data_dict

In [24]:
dataset = HateDataset(
    data_df = data_df_task_c,
    tokenizer = roberta_tokenizer
)

In [25]:
dataset._max_seq_length # make sure its safe enough for our model

245

# Initialize the Roberta+ Attention model

In [27]:
model = RobertaForSequenceClassification.from_pretrained(
    'distilroberta-base',
    num_labels=len(set(data_df_task_c.label)),
)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json from cache at /home/kaushik.das/.cache/torch/transformers/d52ced8fd31ba6aa311b6eeeae65178cca00ddd6333c087be4601dc46c20bd96.deeb956b92b63ef883f183df980353df2c982e37e78a00189cf9146088df28dd
INFO:transformers.configuration_utils:Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "

In [29]:
# for name,param in model.roberta.named_parameters():
# #     print(name)
#     param.required_grad = False

model.to(args.device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [30]:
early_stopping = transformer_general_utils.EarlyStopping(patience=4)

In [31]:
args.num_epochs = 50
args.batch_size = 120

In [32]:
loss_func = nn.CrossEntropyLoss()

print(f'Using LR:{args.learning_rate}')
base_optimizer = RAdam(model.parameters(), lr = args.learning_rate)
optimizer = Lookahead(optimizer = base_optimizer, k = 5, alpha=0.5 )
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer.optimizer, factor =0.1 ,mode='max',
)

Using LR:0.0001


# Begin Training

In [33]:
train_state = general_utils.make_train_state()
train_state.keys()


dict_keys(['train_preds', 'train_indexes', 'train_targets', 'train_accuracies', 'train_f1s', 'train_losses', 'val_preds', 'val_indexes', 'val_targets', 'val_accuracies', 'val_f1s', 'val_losses', 'test_preds', 'test_indexes', 'test_targets', 'test_accuracies', 'test_f1s', 'test_losses', 'batch_preds', 'batch_targets', 'batch_indexes', 'epoch_index'])

In [34]:
epoch_bar = notebook.tqdm(
    desc = 'training_routine',
    total = args.num_epochs,
    position=0,
    leave = True,
)
dataset.set_split('train')
train_bar = notebook.tqdm(
    desc = 'split=train ',
    total=dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)
dataset.set_split('val')
eval_bar = notebook.tqdm(
    desc = 'split=eval',
    total=dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)

old_val_acc = 0
old_f1 = 0
model_state = None
for epoch_index in range(args.num_epochs):
    train_state['epoch_in'] = epoch_index

    dataset.set_split('train')

    batch_generator = generate_batches(
        dataset= dataset, batch_size= args.batch_size, shuffle=True,
        device = args.device, drop_last=False,
        pinned_memory = True, n_workers = 3, 
    )

    running_loss = 0.0
    running_acc = 0.0
    running_f1 = 0.0
    model.train()

    train_bar.reset(
        total=dataset.get_num_batches(args.batch_size),
    )

    for batch_index, batch_dict in enumerate(batch_generator):
        optimizer.zero_grad()
        
        loss,y_pred = model(
            input_ids = batch_dict['x_data'],
            attention_mask =  batch_dict['x_attn_mask'],
            labels= batch_dict['y_target'].unsqueeze(1),
        )[:2]
        
        y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))
                             
#         scheduler.step()
        loss.backward()
        optimizer.step()
                             
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)
                             
        y_pred = y_pred.detach().cpu()
        batch_dict['y_target'] = batch_dict['y_target'].cpu()
        
        acc_t = transformer_general_utils \
            .compute_accuracy(y_pred, batch_dict['y_target'])
        
        f1_t = transformer_general_utils \
            .compute_macro_f1(y_pred, batch_dict['y_target'])

        train_state['batch_preds'].append(y_pred)
        train_state['batch_targets'].append(batch_dict['y_target'])
        train_state['batch_indexes'].append(batch_dict['x_index'])

        running_acc += (acc_t - running_acc) / (batch_index + 1)
        running_f1 += (f1_t - running_f1) / (batch_index + 1)

        train_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,
                             epoch=epoch_index)

        train_bar.update()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    train_state['train_accuracies'].append(running_acc)
    train_state['train_losses'].append(running_loss)
    
    train_state['train_preds'].append(
        torch.cat(train_state['batch_preds']).cpu()
    )
    train_state['train_targets'].append(
        torch.cat(train_state['batch_targets']).cpu()
    )
    train_state['train_indexes'].append(
        torch.cat(train_state['batch_indexes']).cpu()
    )
    train_f1 = transformer_general_utils \
                .compute_macro_f1(train_state['train_preds'][-1],
                                  train_state['train_targets'][-1],
                                 )
                                 
    train_state['train_f1s'].append(train_f1)
    
    train_state['batch_preds'] = []
    train_state['batch_targets'] = []
    train_state['batch_indexes'] = []
    
    
    dataset.set_split('val')
    batch_generator = generate_batches(
        dataset= dataset, batch_size= args.batch_size, shuffle=True,
        device = args.device, drop_last=False,
        pinned_memory = True, n_workers = 2, 
    )
    eval_bar.reset(
        total=dataset.get_num_batches(args.batch_size),
    )
    running_loss = 0.0
    running_acc = 0.0
    running_f1 = 0.0
    
    model.eval()
    with torch.no_grad():
        optimizer._backup_and_load_cache()
        for batch_index, batch_dict in enumerate(batch_generator):
            loss, y_pred = model(
                input_ids = batch_dict['x_data'],
                attention_mask =  batch_dict['x_attn_mask'],
                labels= batch_dict['y_target'].unsqueeze(1),
            )[:2]
            y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))
            
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            y_pred = y_pred.detach()
            batch_dict['y_target'] = batch_dict['y_target'].cpu()
            
            acc_t = transformer_general_utils\
                .compute_accuracy(y_pred, batch_dict['y_target'])
            f1_t = transformer_general_utils \
                .compute_macro_f1(y_pred, batch_dict['y_target'])

            train_state['batch_preds'].append(y_pred.cpu())
            train_state['batch_targets'].append(batch_dict['y_target'].cpu())
            train_state['batch_indexes'].append(batch_dict['x_index'].cpu())

            running_acc += (acc_t - running_acc) / (batch_index + 1)
            running_f1 += (f1_t - running_f1) / (batch_index + 1)
            

            eval_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,
                                 epoch=epoch_index)
            eval_bar.update()
            
    train_state['val_accuracies'].append(running_acc)
    train_state['val_losses'].append(running_loss)
    
        
    train_state['val_preds'].append(
        torch.cat(train_state['batch_preds']).cpu()
    )

    train_state['val_targets'].append(
        torch.cat(train_state['batch_targets']).cpu()
    )
    train_state['val_indexes'].append(
        torch.cat(train_state['batch_indexes']).cpu()
    )
    val_f1 = transformer_general_utils \
                .compute_macro_f1(train_state['val_preds'][-1],
                                  train_state['val_targets'][-1],
                                 )
                                 
    train_state['val_f1s'].append(val_f1)
    
    train_state['batch_preds'] = []
    train_state['batch_targets'] = []
    train_state['batch_indexes'] = []
    
    torch.save(
        {
            'model':model.state_dict(),
        },
        args.directory + f'_epoc_{epoch_index}_' + args.model_name,
    )
    
    scheduler.step(val_f1)
    early_stopping(val_f1, model)
    optimizer._clear_and_load_backup()
    epoch_bar.set_postfix( best_f1 = early_stopping.best_score, current = val_f1)
    epoch_bar.update()    
    
    if early_stopping.early_stop:
        print("Early stopping")
        break


HBox(children=(FloatProgress(value=0.0, description='training_routine', max=50.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='split=train ', max=1496.0, style=ProgressStyle(descriptio…

HBox(children=(FloatProgress(value=0.0, description='split=eval', max=78.0, style=ProgressStyle(description_wi…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



EarlyStopping counter: 2 out of 4
EarlyStopping counter: 3 out of 4
EarlyStopping counter: 4 out of 4
Early stopping


In [35]:
epoch_index

9

In [46]:
print(train_state['val_f1s'])

[0.7434965204141823, 0.7505651120587791, 0.7692071080698577, 0.755785881435914, 0.7698018627331286, 0.7778388042128456, 0.7727479807203751, 0.7666622259214374, 0.7730406310762795, 0.759795332840237]


In [37]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [39]:

print('Train:',classification_report(
    y_pred=(torch.argmax(train_state['train_preds'][-1],dim=1) ).cpu().long().numpy(),
    y_true= train_state['train_targets'][-1].cpu().numpy(), 
    digits=4)
)
print('Dev:',classification_report(
    y_pred=(torch.argmax(train_state['val_preds'][-1],dim=1) ).cpu().long().numpy(),
    y_true= train_state['val_targets'][-1].cpu().numpy(), 
    digits=4)
)


Train:               precision    recall  f1-score   support

           0     0.9695    0.9809    0.9751    144934
           1     0.9074    0.8733    0.8900     23671
           2     0.8436    0.7802    0.8107     10919

    accuracy                         0.9545    179524
   macro avg     0.9068    0.8781    0.8920    179524
weighted avg     0.9536    0.9545    0.9539    179524

Dev:               precision    recall  f1-score   support

           0     0.9496    0.9457    0.9477      7628
           1     0.7584    0.8010    0.7791      1246
           2     0.5728    0.5339    0.5527       575

    accuracy                         0.9016      9449
   macro avg     0.7602    0.7602    0.7598      9449
weighted avg     0.9014    0.9016    0.9014      9449



In [41]:
best_run_index = train_state['val_f1s'].index(max(train_state['val_f1s']))
print('Train:',classification_report(
    y_pred=(torch.argmax(train_state['train_preds'][best_run_index],dim=1) ).cpu().long().numpy(),
    y_true= train_state['train_targets'][best_run_index].cpu().numpy(), 
    digits=4)
)
print('Dev:',classification_report(
    y_pred=(torch.argmax(train_state['val_preds'][best_run_index],dim=1) ).cpu().long().numpy(),
    y_true= train_state['val_targets'][best_run_index].cpu().numpy(), 
    digits=4)
)

Train:               precision    recall  f1-score   support

           0     0.9574    0.9739    0.9655    144934
           1     0.8650    0.8298    0.8471     23671
           2     0.7689    0.6608    0.7107     10919

    accuracy                         0.9358    179524
   macro avg     0.8638    0.8215    0.8411    179524
weighted avg     0.9337    0.9358    0.9344    179524

Dev:               precision    recall  f1-score   support

           0     0.9488    0.9567    0.9527      7628
           1     0.8029    0.7648    0.7834      1246
           2     0.6000    0.5948    0.5974       575

    accuracy                         0.9094      9449
   macro avg     0.7839    0.7721    0.7778      9449
weighted avg     0.9083    0.9094    0.9088      9449



In [42]:
def sort_preds(indexes, preds):
    """Sorts the predictions in order, to reverse the effects of shuffle
    done by dataloader"""
    indexes = indexes.cpu().numpy().reshape(-1,1)
    preds = preds.cpu().numpy()
    arr_concat = np.hstack((indexes,preds)) #concat the preds and their indexes
    sort_arr = arr_concat[ arr_concat[:,0].argsort()] #sort based on the indexes
    sorted_preds = np.delete(sort_arr,0,axis=1)
    return sorted_preds

In [96]:
def get_optimal_models(train_state, split, reverse=False ):
    trgts= sort_preds(train_state[f'{split}_indexes'][-1],train_state[f'{split}_targets'][-1].reshape(-1,1))
    total_preds = len(train_state[f'{split}_indexes'])
    init = np.zeros(train_state[f'{split}_preds'][-1].shape)
    max_f1 = 0
    idxes = []
    rng = range(0,total_preds)
    if reverse:
        rng = reversed(rng)
    for i in rng:
        temp = sort_preds(train_state[f'{split}_indexes'][i],train_state[f'{split}_preds'][i])
        temp2 = init+temp
        f1 = f1_score(
            y_pred=temp2.argmax(axis=1),
            y_true= trgts, average ='macro'
        )
        if f1 > max_f1:
            max_f1 = f1
            init = init+temp
            idxes.append(i)
    print(f'Taking preds from {idxes} | Dev f1:{f1}')
    return idxes

In [54]:
optimal_emsemble = get_optimal_models(train_state,'val')

Taking preds from [9, 8, 7, 6, 5, 4, 2, 1] | Dev f1:0.8180580972703804


In [55]:
optimal_emsemble

[9, 8, 7, 6, 5, 4, 2, 1]

# Making sure AttenMask Dropout works

In [134]:
def evaluate(model, state, dataset, split,args):
    """Returns the final layer output of our transformer model
    Puts them in the '{split}_*' keys in the state dict
    Args:
        model: A pytorch transformers model
        state: dict to store outputs
        dataset: A pytorch Dataset
        split: The split on which to evaluate the model on
        args: Arguments from namespace, etc
    Returns:
        state: all evaluated output stored in the "test" key
    """
    eval_bar = notebook.tqdm(
        desc = 'evaluation progress: ',
        total=dataset.get_num_batches(args.batch_size),
        position=0,
        leave=False,
    )
    dataset.set_split(split)
    batch_generator = generate_batches(
        dataset= dataset, batch_size= args.batch_size, shuffle=False,
        device = args.device, drop_last=False,
        pinned_memory = True, n_workers = 2, 
    )
    eval_bar.reset(
        total=dataset.get_num_batches(args.batch_size),
    )
    model.eval()
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(batch_generator):
            _, y_pred = model(
                input_ids = batch_dict['x_data'],
                attention_mask =  batch_dict['x_attn_mask'],
                labels= batch_dict['y_target'].unsqueeze(1),
            )[:2]
            y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))
            

            y_pred = y_pred.detach()
            
            batch_dict['y_target'] = batch_dict['y_target'].cpu()
            
            acc_t = transformer_general_utils\
                .compute_accuracy(y_pred, batch_dict['y_target'])
            f1_t = transformer_general_utils \
                .compute_macro_f1(y_pred, batch_dict['y_target'])

            state['batch_preds'].append(y_pred.cpu())
            state['batch_targets'].append(batch_dict['y_target'].cpu())
            state['batch_indexes'].append(batch_dict['x_index'].cpu())
            
            eval_bar.update()
            
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    state[f'{split}_preds'].append(
        torch.cat(state['batch_preds']).cpu()
    )
    state[f'{split}_targets'].append(
        torch.cat(state['batch_targets']).cpu()
    )
    state[f'{split}_indexes'].append(
        torch.cat(state['batch_indexes']).cpu()
    )
    _f1 = transformer_general_utils \
                .compute_macro_f1(state[f'{split}_preds'][-1],
                                  state[f'{split}_targets'][-1],
                                 )
                                 
    state[f'{split}_f1s'].append(_f1)
#     print(state[f'{split}_f1s'][-1])
    
    state['batch_preds'] = []
    state['batch_targets'] = []
    state['batch_indexes'] = []
    
    eval_bar.close()
    return state

In [87]:
args.directory

'/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/'

In [88]:
os.listdir(args.directory)

['_epoc_0_roberta_attn_trac_task_a.pt',
 '_epoc_1_roberta_attn_trac_task_a.pt',
 '_epoc_2_roberta_attn_trac_task_a.pt',
 '_epoc_3_roberta_attn_trac_task_a.pt',
 '_epoc_4_roberta_attn_trac_task_a.pt',
 '_epoc_5_roberta_attn_trac_task_a.pt',
 '_epoc_6_roberta_attn_trac_task_a.pt',
 '_epoc_7_roberta_attn_trac_task_a.pt',
 '_epoc_8_roberta_attn_trac_task_a.pt',
 '_epoc_9_roberta_attn_trac_task_a.pt']

In [89]:
selected_models = [os.listdir(args.directory)[i] for i in optimal_emsemble]
selected_models

['_epoc_9_roberta_attn_trac_task_a.pt',
 '_epoc_8_roberta_attn_trac_task_a.pt',
 '_epoc_7_roberta_attn_trac_task_a.pt',
 '_epoc_6_roberta_attn_trac_task_a.pt',
 '_epoc_5_roberta_attn_trac_task_a.pt',
 '_epoc_4_roberta_attn_trac_task_a.pt',
 '_epoc_2_roberta_attn_trac_task_a.pt',
 '_epoc_1_roberta_attn_trac_task_a.pt']

In [90]:
selected_models_paths = [os.path.join(args.directory,i ) for i in selected_models]
selected_models_paths

['/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_9_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_8_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_7_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_6_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_5_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_4_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_2_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_1_roberta_attn_trac_task_a.pt']

In [102]:
all_model_paths = [os.path.join(args.directory,i ) for i in os.listdir(args.directory)]
all_model_paths

['/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_0_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_1_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_2_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_3_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_4_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_5_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_6_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_7_roberta_attn_trac_task_a.pt',
 '/home/kaushik.

In [81]:
val_dataset = HateDataset(
    data_df = data_df_task_c[data_df_task_c.split == 'val'],
    tokenizer = roberta_tokenizer
)

In [120]:
val_dataset.simple_vectorize = True
assert val_dataset.simple_vectorize == True

In [135]:
val_state = general_utils.make_train_state()

In [136]:
val_state_with_f1 = general_utils.make_train_state()

In [137]:
for path in notebook.tqdm(selected_models_paths, total = len(selected_models_paths)):
    model.load_state_dict(torch.load(path)['model'])
    val_state = evaluate(model, val_state, val_dataset, 'val',args)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…




In [138]:
val_state['val_f1s']

[0.7993532557548972,
 0.8037430562303842,
 0.8052549980779805,
 0.8148503858889021,
 0.8130260230239558,
 0.8142116805229819,
 0.8053864980648223,
 0.7879739969055087]

In [139]:
for path in notebook.tqdm(all_model_paths, total = len(all_model_paths)):
    model.load_state_dict(torch.load(path)['model'])
    val_state_with_f1 = evaluate(model, val_state_with_f1, val_dataset, 'val',args)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=78.0, style=ProgressStyle(des…




In [140]:
val_state_with_f1['val_f1s']

[0.76904619801307,
 0.7879739969055087,
 0.8053864980648223,
 0.8066677119594804,
 0.8142116805229819,
 0.8130260230239558,
 0.8148503858889021,
 0.8052549980779805,
 0.8037430562303842,
 0.7993532557548972]

In [115]:
get_optimal_models(val_state,'val',reverse=True)

Taking preds from [7, 6, 5, 4] | Dev f1:0.8202963455196183


[7, 6, 5, 4]

In [118]:
get_optimal_models(val_state_with_f1,'val',reverse=False)

Taking preds from [0, 1, 2, 3, 4, 5] | Dev f1:0.8225497357779874


[0, 1, 2, 3, 4, 5]

In [119]:
val_state["val_f1s"]

[]

In [141]:
val_state_best = val_state.copy()
val_state_all = val_state_with_f1.copy()

In [154]:
l = zip(val_state_all['val_f1s'], range(len(val_state_all['val_f1s'])))
sorted_vals = sorted(l, key = lambda x:x[0], reverse=True)
model_idxes = [i[1] for i in sorted_vals]
print(sorted_vals)
print(model_idxes)


[(0.8148503858889021, 6), (0.8142116805229819, 4), (0.8130260230239558, 5), (0.8066677119594804, 3), (0.8053864980648223, 2), (0.8052549980779805, 7), (0.8037430562303842, 8), (0.7993532557548972, 9), (0.7879739969055087, 1), (0.76904619801307, 0)]
[6, 4, 5, 3, 2, 7, 8, 9, 1, 0]


In [155]:
def get_optimal_models_v2(train_state, split):
    l = zip(train_state[f'{split}_f1s'], range(len(train_state[f'{split}_f1s'])))
    sorted_vals = sorted(l, key = lambda x:x[0], reverse=True)
    model_idxes = [i[1] for i in sorted_vals]
    
    trgts= sort_preds(train_state[f'{split}_indexes'][-1],train_state[f'{split}_targets'][-1].reshape(-1,1))
    total_preds = len(train_state[f'{split}_indexes'])
    init = np.zeros(train_state[f'{split}_preds'][-1].shape)
    max_f1 = 0
    idxes = []
    for i in model_idxes:
        temp = sort_preds(train_state[f'{split}_indexes'][i],train_state[f'{split}_preds'][i])
        temp2 = init+temp
        f1 = f1_score(
            y_pred=temp2.argmax(axis=1),
            y_true= trgts, average ='macro'
        )
        if f1 > max_f1:
            max_f1 = f1
            init = init+temp
            idxes.append(i)
    print(f'Taking preds from {idxes} | Dev f1:{f1}')
    return idxes

In [157]:
final_optimal_models = get_optimal_models_v2(val_state_all, 'val')
final_optimal_models

Taking preds from [6, 4, 5, 3, 1, 0] | Dev f1:0.8281869746692267


[6, 4, 5, 3, 1, 0]

In [158]:
data_df_task_c.split.value_counts()

train    179524
val        9449
Name: split, dtype: int64

In [1]:
179524 + 9449

188973

In [189]:
[all_model_paths[i] for i in final_optimal_models]

['/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_6_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_4_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_5_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_3_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_1_roberta_attn_trac_task_a.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Wed_11_Mar_2020//task_c/roberta_attn/_epoc_0_roberta_attn_trac_task_a.pt']

# Making preds on the given test set

In [162]:
!ls /home/kaushik.das/OffensEval2020/data/OffenseEval2020Data/English/public_data_task_C/

readme-offenseval-testsetC-english.txt	test_c_tweets.tsv


In [None]:
test_set

In [170]:

test_set_loc = \
'/home/kaushik.das/OffensEval2020/data/OffenseEval2020Data/English/public_data_task_C/test_c_tweets.tsv'


In [171]:
test_df = pd.read_csv(test_set_loc,sep='\t')

In [172]:
test_df.sample(5)

Unnamed: 0,id,tweet
276,BC603,@USER i hate how people are acting like he did...
761,BC1688,@USER @USER @USER @USER @USER --says every lib...
34,BC87,"You can be sleep on a nigga chest, whole time ..."
406,BC904,niggas be 45 talking about ima get my life tog...
495,BC1112,@USER Why waste your sweet cum I'll suck your ...


In [174]:
test_df['text'] = test_df['tweet'].map(roberta_preproc.add_special_tokens)
test_df['split'] = 'test'  #dummy label
test_df['label'] = -1  #dummy label
test_df.sample(5)

Unnamed: 0,id,tweet,text,split,label
143,BC345,@USER i should block you for disrespecting the...,<s> @USER i should block you for disrespecting...,test,-1
528,BC1199,@USER Because Trumpets would totally be listen...,<s> @USER Because Trumpets would totally be li...,test,-1
751,BC1666,@USER I fuckin hate this just have a fuckin sh...,<s> @USER I fuckin hate this just have a fucki...,test,-1
283,BC624,@USER said the angry bitch. #PartTimePM,<s> @USER said the angry bitch. </s> #PartTime...,test,-1
171,BC397,Suck mine and I will suck yours,<s> Suck mine and I will suck yours </s>,test,-1


In [175]:
test_dataset = HateDataset(
    data_df = test_df,
    tokenizer = roberta_tokenizer
)
test_dataset.set_split('test')


In [190]:
test_dataset._target_df.sample(5)

Unnamed: 0,id,tweet,text,split,label
345,BC756,what the fuck there was a loud ass noise comin...,<s> what the fuck there was a loud ass noise c...,test,-1
511,BC1156,@USER I knew some human are stupid but this gu...,<s> @USER I knew some human are stupid but thi...,test,-1
216,BC483,@USER Well how about taking some money from yo...,<s> @USER Well how about taking some money fro...,test,-1
420,BC940,@USER bitch needs to clean her glasses and ste...,<s> @USER bitch needs to clean her glasses and...,test,-1
789,BC1747,@USER Dudes is too aggressive smh what if shor...,<s> @USER Dudes is too aggressive smh what if ...,test,-1


In [176]:
test_dataset.simple_vectorize = True
assert test_dataset.simple_vectorize == True

In [183]:
print(len(test_df))
print(test_dataset._target_df.split.value_counts())

850
test    850
Name: split, dtype: int64


In [197]:
def evaluate_testset(model, state, dataset, split,args):
    """Returns the final layer output of our transformer model
    Puts them in the '{split}_*' keys in the state dict
    Args:
        model: A pytorch transformers model
        state: dict to store outputs
        dataset: A pytorch Dataset
        split: The split on which to evaluate the model on
        args: Arguments from namespace, etc
    Returns:
        state: all evaluated output stored in the "test" key
    """
    eval_bar = notebook.tqdm(
        desc = 'evaluation progress: ',
        total=dataset.get_num_batches(args.batch_size),
        position=0,
        leave=False,
    )
    dataset.set_split(split)
    batch_generator = generate_batches(
        dataset= dataset, batch_size= args.batch_size, shuffle=False,
        device = args.device, drop_last=False,
        pinned_memory = True, n_workers = 2, 
    )
    eval_bar.reset(
        total=dataset.get_num_batches(args.batch_size),
    )
    model.eval()
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(batch_generator):
            y_pred = model(
                input_ids = batch_dict['x_data'],
                attention_mask =  batch_dict['x_attn_mask'],
            )[0]
            y_pred = y_pred.view(-1, 3)

            y_pred = y_pred.detach()
            
            state['batch_preds'].append(y_pred.cpu())
            state['batch_indexes'].append(batch_dict['x_index'].cpu())
            
            eval_bar.update()
            
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    state[f'{split}_preds'].append(
        torch.cat(state['batch_preds']).cpu()
    )
    state[f'{split}_indexes'].append(
        torch.cat(state['batch_indexes']).cpu()
    )
    
    state['batch_preds'] = []
    state['batch_indexes'] = []
    
    eval_bar.close()
    return state

In [198]:
chosen_models = [all_model_paths[i] for i in final_optimal_models]

In [199]:
test_state = general_utils.make_train_state()
for model_path in notebook.tqdm(chosen_models, total=len(chosen_models)):
    model.load_state_dict(torch.load(model_path)['model'])
    test_state = evaluate_testset(model, test_state, test_dataset, 'test',args)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=7.0, style=ProgressStyle(desc…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=7.0, style=ProgressStyle(desc…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=7.0, style=ProgressStyle(desc…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=7.0, style=ProgressStyle(desc…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=7.0, style=ProgressStyle(desc…



HBox(children=(FloatProgress(value=0.0, description='evaluation progress: ', max=7.0, style=ProgressStyle(desc…




In [201]:
test_state['test_preds'][-1].shape

torch.Size([850, 3])

In [202]:
[test_state['test_preds'][i].size() for i in range(len(test_state['test_preds']))]

[torch.Size([850, 3]),
 torch.Size([850, 3]),
 torch.Size([850, 3]),
 torch.Size([850, 3]),
 torch.Size([850, 3]),
 torch.Size([850, 3])]

In [203]:
len(test_dataset._target_df)

850

In [206]:
torch.zeros_like(test_state['test_preds'][0]).size()

torch.Size([850, 3])

In [207]:
ensemble_pred = torch.zeros_like(test_state['test_preds'][0])
for i in test_state['test_preds']:
    ensemble_pred += i

In [None]:
# label_dict["IND"] = 0
# label_dict["GRP"] = 1
# label_dict["OTH"] = 2
#ref utils/offeval2020.py

In [209]:
int_to_label = { 0: 'IND', 1:'GRP', 2:'OTH'}

In [212]:
t = []
for i in torch.argmax(ensemble_pred, dim=1):
    t.append(int_to_label[i.item()])

collections.Counter(t)

Counter({'IND': 653, 'GRP': 171, 'OTH': 26})

In [213]:
assert len(t) == len(test_df)

In [214]:
offeval_task_c_run_2_pred_analysis_df = pd.DataFrame(
    data={
        'id':test_df.id,
        'text':test_df.tweet,
        'label':t,
    }
)

In [215]:
offeval_task_c_run_2_pred_label_df = pd.DataFrame(
    data={
        'id':test_df.id,
        'label':t,
    }
)

In [216]:
offeval_task_c_run_2_pred_analysis_df.to_csv(
    'offeval_task_c_run_2_pred_analysis.csv',index=False,
)

In [217]:
offeval_task_c_run_2_pred_label_df.to_csv(
    'offeval_task_c_run_2_pred_label.csv', index=False, header=False,
)

In [218]:
offeval_task_c_run_2_pred_label_df.label.value_counts()


IND    653
GRP    171
OTH     26
Name: label, dtype: int64

In [220]:
offeval_task_c_run_2_pred_analysis_df.label.value_counts()

IND    653
GRP    171
OTH     26
Name: label, dtype: int64