# Data Challenge - Kaggle - Summary Source Prediction


In [1]:
! pip install transformers
! pip install imbalanced-learn
! pip install timebudget



In [2]:
import pandas as pd
import numpy as np
import json, re
import uuid

from tqdm import tqdm_notebook

try:
    from collections import OrderedDict
except ImportError:
    from ordereddict import OrderedDict

# Torch, Sklearn imports
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW



## NLP libs
import nltk
nltk.download('stopwords')

from nltk import download
import gensim

## PyTorch Transformer
import transformers

## Roberta
from transformers import RobertaModel, RobertaTokenizer,  TFRobertaModel
from transformers import RobertaForSequenceClassification, RobertaConfig

from timebudget import timebudget
timebudget.report_atexit()  # Generate report when the program exits

import warnings
warnings.filterwarnings("ignore")

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# stopwords = {"ourselves", "hers", "between", "yourself", "but", "again", "there", "about", "once", "during", "out", "very", "having", "with", "they", "own", "an", "be", "some", "for", "do", "its", "yours", "such", "into", "of", "most", "itself", "other", "off", "is", "s", "am", "or", "who", "as", "from", "him", "each", "the", "themselves", "until", "below", "are", "we", "these", "your", "his", "through", "don", "nor", "me", "were", "her", "more", "himself", "this", "down", "should", "our", "their", "while", "above", "both", "up", "to", "ours", "had", "she", "all", "no", "when", "at", "any", "before", "them", "same", "and", "been", "have", "in", "will", "on", "does", "yourselves", "then", "that", "because", "what", "over", "why", "so", "can", "did", "not", "now", "under", "he", "you", "herself", "has", "just", "where", "too", "only", "myself", "which", "those", "i", "after", "few", "whom", "t", "being", "if", "theirs", "my", "against", "a", "by", "doing", "it", "how", "further", "was", "here", "than"}
print(torch.__version__)
print(transformers.__version__)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
1.10.0+cu111
4.17.0


In [3]:
dataset = pd.read_json('./train_set.json')
print(dataset.head(5))


                                            document  \
0  Two GOP presidential hopefuls - Ted Cruz and B...   
1  The Tesla Model S P85D's 'insane mode' may be ...   
2  MI5 has issued an alert over the threat posed ...   
3  A new video that shows homeless people reading...   
4  Aston Villa may be gearing up for an FA Cup se...   

                                             summary  label  
0  Ted Cruz and Ben Carson want the charity to re...      1  
1  latvia-based drive eo has created a vehicle, n...      0  
2  Alert issued over rogue workers in nuclear , t...      1  
3  A short film highlights the nasty things peopl...      1  
4  tim sherwood replied to a letter from charlie ...      0  


In [4]:
from sklearn.model_selection import train_test_split
train_dataset, validation_dataset = train_test_split(dataset, test_size=0.2, random_state=42, stratify=dataset['label'])



In [3]:
# Model with classifier layers on top of RoBERTa
class ROBERTAClassifier(torch.nn.Module):
    def __init__(self, dropout_rate=0.3):
        super(ROBERTAClassifier, self).__init__()
        
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.d1 = torch.nn.Dropout(dropout_rate)
        self.l1 = torch.nn.Linear(768, 64)
        self.bn1 = torch.nn.LayerNorm(64)
        self.d2 = torch.nn.Dropout(dropout_rate)
        self.l2 = torch.nn.Linear(64, 1)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, input_ids, attention_mask):

        _, x = self.roberta(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)

        x = self.d1(x)
        x = self.l1(x)
        x = self.bn1(x)
        x = torch.nn.Tanh()(x)
        x = self.d2(x)
        x = self.l2(x)
        x = self.sigmoid(x)
        return x

In [4]:
model = ROBERTAClassifier()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [6]:
def prepare_features(summary, doc, zero_pad = False, max_seq_length = 512):
    
    doc_tokens = tokenizer.encode_plus(text=doc, add_special_tokens=True)
    summary_tokens = tokenizer.encode_plus(text=summary, add_special_tokens=True)

    #Get the words in the summary that appear in the document
    words_in_doc = [t for t in summary_tokens['input_ids'][1:-1] if t in doc_tokens['input_ids'][1:-1]]

    if words_in_doc:
      enc_text = tokenizer.encode_plus(text=summary, text_pair=words_in_doc, add_special_tokens=True, max_length=512, padding='max_length', truncation=True)    
    else:
      enc_text = tokenizer.encode_plus(text=summary, add_special_tokens=True, max_length=512, padding='max_length', truncation=True)

    return enc_text

In [9]:
doc = "Father-of-four Gavin Thorman , 36 , was kingpin of a violent drugs gang . Drugs worth £ 200,000 seized by police following five year investigation . Planned to spend ill-gotten money on new teeth , liposuction and a facelift . He has been jailed for 12 years after admitting conspiracy to supply drugs along with 25 other defendants involved in the north Wales-based group . Gavin Thorman , 36 , of no fixed abode but formerly of Caernarfon , pleaded guilty to conspiring to supply cocaine and cannabis - 12 years . James Dylan Davies , 41 , of Cae Mur , Caernarfon , guilty to supplying cocaine - jailed eight years and six months . Richard Broadley , 34 , formerly of Caernarfon and now of Tarporley Close , Stockport , guilty to supplying cocaine and cannabis - jailed six years and eight months . Adam Roberts , 33 , of Lon Eilian , Caernarfon , guilty to supplying cocaine and cannabis - jailed for eight years . Christopher Taylor , 29 , of Pool Street , Caernarfon , guilty to supplying cocaine and cannabis - jailed for eight years and three months . Dylan Rees Hughes , 30 , of Glan Peris , Caernarfon , guilty to supplying cocaine and cannabis - jailed for nine years . Jonathan White , 32 , of Caernarfon , pleaded guilty to supplying cannabis and having an imitation gun , found guilty of supplying cocaine after a trial - 11 years . Gavin Rees Hughes , 29 , of Ty 'n Lon , Llandwrog , Caernarfon , guilty to supplying cocaine - six years and eight months . Martin Taylor , 26 , of Pool Street , Caernarfon , guilty to supplying cannabis - 40 months . Gethin Ellis , 23 , of Cae Bold , Caernarfon , guilty to supplying cocaine and cannabis - four years . Paul Hughes , 36 , of Lon Nant , Caernarfon , guilty to supplying cocaine and cannabis - four years and eight months . Martin Shaw , 32 , of Llanberis Road , Caernarfon , guilty to supplying cannabis - 20 months . Dawn Williams , 47 , of Lon Eilian , Caernarfon , allowing premises to be used for supply of cocaine and cannabis - 14 months . Julian Williams , 40 , of Lon Eilian , Caernarfon , guilty to allowing premises to be used for supply of cocaine and cannabis - 40 weeks . Yasmin Owen , 25 , of Church Drive , Caernarfon , guilty to money laundering - 12 months . Ryan Williams , 34 , of Caer Saint , Caernarfon , entering arrangement concerning criminal property - three and a half years . Nicole Herbert , 30 , of Llanddeiniolen , Caernarfon , guilty to money laundering - 10 months suspended for 18 months . Rizwan Hussain , 28 , of Rochdale and formerly of Caernarfon , found guilty of supplying cannabis after trial - six years . James Whitworth , 30 , of Manchester , pleaded guilty to cannabis , found guilty of supplying cocaine after trial - 12 years . Anthony Ferguson , 20 , of Tweedle Hill Road , Blackley , Manchester , guilty of supplying cocaine and cannabis - six years and eight months . Gregory Appleby , 20 , of Bromfield Paark , Middleton , Manchester , guilty of supplying cannabis - two years . Ian Ogden , 26 , of Hesford Avenue , Moston , Manchester , guilty to supplying cannabis - 16 months . Samuel Hughes , 34 , of White Moss Road , Blackley , Manchester , guilty to supplying cannabis - 18 months . Jake Crookes , 23 , of Selston Road , Blackley , Manchester , guilty to supplying cannabis - 16 months . Patrick Tynan , 23 , of Alconbury Walk , Blackley , Manchester , guilty to supplying cocaine and cannabis - four years . Anthony Hunt , 30 , of Rudston Avenue , Manchester , guilty to supplying cannabis - 16 months ."
summary = "Father-of-four, 36 , violent drugs gang."
example = prepare_features(summary, doc)

print(len(example['input_ids']))

Token indices sequence length is longer than the specified maximum sequence length for this model (807 > 512). Running this sequence through the model will result in indexing errors


512


In [7]:
#Class to prepare X and Y data
class Intents(Dataset):
    def __init__(self, dataframe, testing=False):
        self.len = len(dataframe)
        self.data = dataframe
        self.testing = testing
        
    def __getitem__(self, index):

        summary = self.data['summary'].iloc[index]
        doc = self.data['document'].iloc[index]

        X = prepare_features(summary, doc)

        if self.testing:
          return np.array(X['input_ids']), np.array(X['attention_mask'])

        y = self.data['label'].iloc[index]
        
        return np.array(X['input_ids']), np.array(X['attention_mask']), np.array(y)
    
    def __len__(self):
        return self.len

In [11]:
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALIDATION Dataset: {}".format(validation_dataset.shape))

TRAIN Dataset: (6400, 3)
VALIDATION Dataset: (1600, 3)


In [10]:
training_set = Intents(train_dataset)
print(len(training_set))


6400


In [11]:
validation_set = Intents(validation_dataset)

x, m, y = training_set.__getitem__(820)
print(x.shape)
print(m.shape)
print(y.shape)

Token indices sequence length is longer than the specified maximum sequence length for this model (1012 > 512). Running this sequence through the model will result in indexing errors


(512,)
(512,)
()


In [12]:

### Dataloaders Parameters
params = {'batch_size': 8}

training_loader = DataLoader(training_set, **params)


In [13]:
validation_loader = DataLoader(validation_set, **params)


In [9]:
loss_function = nn.BCEWithLogitsLoss()
learning_rate = 1e-5
optimizer = optim.AdamW(params=model.parameters(), lr=learning_rate)
if torch.cuda.is_available():
    print("GPU is AVAILABLE!🤘🙌💪")
    model = model.cuda()

GPU is AVAILABLE!🤘🙌💪


In [17]:
@timebudget
def train(model, epochs):
  max_epochs = epochs
  model = model.train()
  for epoch in tqdm_notebook(range(max_epochs)):
      print("EPOCH -- {}".format(epoch))
      correct = 0
      total = 0
      for i, (ids, attention_mask, labels) in enumerate(training_loader):


          optimizer.zero_grad()

          if torch.cuda.is_available():
              ids = ids.cuda()
              attention_mask = attention_mask.cuda()
              labels = labels.cuda()


          output = model.forward(ids, attention_mask=attention_mask)

          loss = loss_function(output, labels.float().unsqueeze(1))
          loss.backward()
          optimizer.step()

          predicted = torch.round(output).squeeze(1).int()
          total += labels.size(0)
          correct += (predicted.cpu() == labels.cpu()).sum()
          accuracy = 100.00 * correct.numpy() / total

          if i%100 == 0:
              print('Iteration: {}. Loss: {}. Accuracy: {}.%'.format(i, loss.item(), accuracy))

      print('Finished batch with: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))
  
  return "Training finished!"

In [18]:
train(model, 3)

  0%|          | 0/3 [00:00<?, ?it/s]

EPOCH -- 0
Iteration: 0. Loss: 0.7434123158454895. Accuracy: 50.0.%
Iteration: 100. Loss: 0.650188684463501. Accuracy: 54.33168316831683.%
Iteration: 200. Loss: 0.5718940496444702. Accuracy: 68.28358208955224.%


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Iteration: 300. Loss: 0.6606975793838501. Accuracy: 65.19933554817275.%
Iteration: 400. Loss: 0.7080792188644409. Accuracy: 61.97007481296758.%
Iteration: 500. Loss: 0.5730767250061035. Accuracy: 64.74550898203593.%


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Iteration: 600. Loss: 0.5772373676300049. Accuracy: 67.42928452579035.%
Iteration: 700. Loss: 0.5752332210540771. Accuracy: 69.56134094151213.%
Finished batch with: 799. Loss: 0.5772653818130493. Accuracy: 72.109375%
EPOCH -- 1
Iteration: 0. Loss: 0.561012864112854. Accuracy: 87.5.%
Iteration: 100. Loss: 0.5740878582000732. Accuracy: 80.56930693069307.%
Iteration: 200. Loss: 0.4730437397956848. Accuracy: 83.76865671641791.%


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Iteration: 300. Loss: 0.48512861132621765. Accuracy: 85.00830564784053.%
Iteration: 400. Loss: 0.5678195953369141. Accuracy: 85.81670822942644.%
Iteration: 500. Loss: 0.5707317590713501. Accuracy: 86.60179640718563.%


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Iteration: 600. Loss: 0.5725660920143127. Accuracy: 87.14642262895175.%
Iteration: 700. Loss: 0.5665661692619324. Accuracy: 87.55349500713267.%
Finished batch with: 799. Loss: 0.5224918127059937. Accuracy: 88.15625%
EPOCH -- 2
Iteration: 0. Loss: 0.5193344354629517. Accuracy: 100.0.%
Iteration: 100. Loss: 0.566765308380127. Accuracy: 90.47029702970298.%
Iteration: 200. Loss: 0.47073253989219666. Accuracy: 91.41791044776119.%


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Iteration: 300. Loss: 0.6271938681602478. Accuracy: 91.15448504983388.%
Iteration: 400. Loss: 0.5630619525909424. Accuracy: 90.74189526184539.%
Iteration: 500. Loss: 0.6392901539802551. Accuracy: 90.91816367265469.%


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Iteration: 600. Loss: 0.6424785256385803. Accuracy: 90.91098169717138.%
Iteration: 700. Loss: 0.5631157755851746. Accuracy: 90.5848787446505.%
Finished batch with: 799. Loss: 0.5389032363891602. Accuracy: 91.015625%
train took 1227.982sec


'Training finished!'

In [19]:
torch.save(model.state_dict(), './mymodel.pt')

In [8]:
model.load_state_dict(torch.load('./mymodel.pt'))
model.eval()

ROBERTAClassifier(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [16]:
#Validation
correct = 0
total = 0
for i, (ids, attention_mask, labels) in enumerate(validation_loader):
    if torch.cuda.is_available():
        ids = ids.cuda()
        attention_mask = attention_mask.cuda()
        labels = labels.cuda()
    
    output = model.forward(ids,attention_mask=attention_mask)

    predicted = torch.round(output).squeeze(1).int()
    total += labels.size(0)
    correct += (predicted.cpu() == labels.cpu()).sum()
    accuracy = 100.00 * correct.numpy() / total

    if i%100 == 0:      
        print('Iteration: {}. Accuracy: {}%'.format(i, accuracy))


accuracy = 100.00 * correct.numpy() / total
print('Final Accuracy: ', accuracy)

Iteration: 0. Accuracy: 87.5%
Iteration: 100. Accuracy: 93.1930693069307%
Final Accuracy:  93.3125


In [17]:
print('total', total)
print('correct: ', correct)

total 1600
correct:  tensor(1493)


In [10]:
#Testing
test_dataset = pd.read_json('./test_set.json')
print(len(test_dataset))
testing_set = Intents(test_dataset, testing=True)



3200


In [11]:
params = {'batch_size': 8}
testing_loader = DataLoader(testing_set, **params)

In [12]:
print(len(testing_loader))

400


In [13]:
correct = 0
total = 0
all_predicted = []
for i, (ids, attention_mask) in enumerate(testing_loader):
    if torch.cuda.is_available():
        ids = ids.cuda()
        attention_mask = attention_mask.cuda()

    output = model.forward(ids,attention_mask=attention_mask)

    predicted = torch.round(output).squeeze(1).int()
    all_predicted.extend(predicted.cpu().tolist())

    if i%100 == 0:      
        print('Iteration: {}.'.format(i))

Token indices sequence length is longer than the specified maximum sequence length for this model (1918 > 512). Running this sequence through the model will result in indexing errors


Iteration: 0.
Iteration: 100.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Iteration: 200.
Iteration: 300.


In [14]:
print(len(all_predicted))

3200


In [None]:
import csv
# Write predictions to a file
with open("submission.csv", "w") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id','label'])
    for i, row in enumerate(all_predicted):
        csv_out.writerow([i, row])