In [1]:
import random
import re
import string
from collections import defaultdict

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import transformers
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
# from ignite.metrics import Accuracy, Precision, Recall, Fbeta
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, AdamW, get_linear_schedule_with_warmup

%matplotlib inline

# Seed every random number generator the notebook imports so a fresh
# "Restart & Run All" reproduces the same results.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)        # Python's stdlib RNG (imported above but previously unseeded)
np.random.seed(RANDOM_SEED)     # NumPy's legacy global RNG
torch.manual_seed(RANDOM_SEED)  # kept last: its return value is the cell's displayed output

<torch._C.Generator at 0x1ca463b0e70>

In [2]:
# Load the labelled tweets; later cells read the `clean_text` and `category` columns.
df = pd.read_csv("Twitter_Dataset.csv")
# Bare last expression so the notebook renders a preview of the frame.
df

Unnamed: 0,clean_text,category
0,when modi promised â€œminimum government maxim...,-1
1,talk all the nonsense and continue all the dra...,0
2,what did just say vote for modi welcome bjp t...,1
3,asking his supporters prefix chowkidar their n...,1
4,answer who among these the most powerful world...,1
...,...,...
995,there are two reasons for atmosphere hatred cr...,0
996,modi has wiped out the small micro industries ...,-1
997,bjp struggles find candidates west bengal graf...,-1
998,modis opposition trying defame him they not wa...,-1


In [3]:
# Class balance of the raw labels (-1, 0, 1), checked before they are re-encoded.
df['category'].value_counts()

 1    442
 0    333
-1    225
Name: category, dtype: int64

In [4]:
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once.

    The result is memoised on the function object so that repeated calls
    (one per tweet) do not re-evaluate `stopwords.words('english')`.
    """
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def text_transformation(text):
    """Normalise one piece of text for modelling.

    Steps: coerce to ``str``, replace every non-ASCII-letter character with a
    space, lowercase, split on whitespace, drop English stop words, and join
    the remaining words with single spaces.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned string (no leading/trailing or repeated whitespace).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', str(text)).lower()
    # Look the stop words up once per call instead of once per word; set
    # membership is also O(1) versus scanning a list.
    stop_words = _english_stopwords()
    kept = [word for word in letters_only.split() if word not in stop_words]
    # split() + join already collapses whitespace, so no extra regex pass is needed.
    return " ".join(kept)

In [5]:
# Re-encode the labels as non-negative class ids for the 3-class classifier:
#   -1 -> 2, 0 -> 0, 1 -> 1
# (presumably negative / neutral / positive -- confirm against the dataset description).
# Only -1 has to change, and replace() leaves 0/1/2 untouched, so re-running this
# cell is a no-op; a full .map() would turn the already-encoded 2s into NaN.
df['category'] = df['category'].replace({-1: 2})

In [6]:
# Positional feature/label arrays: every column but the last, and the last column
# (the `-1:` slice keeps `y` two-dimensional).
x = df.iloc[:, :-1].values
y = df.iloc[:, -1:].values

# 80/20 hold-out split with a fixed seed.
# NOTE(review): `y_train` is reassigned by the 70/15/15 split further down.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.20, random_state = 1)

In [7]:
# Sanity-check the cleaner on one row. `x[1]` is a one-element array, so
# text_transformation() receives its str() form; the surrounding brackets and
# quotes are then stripped by the non-letter regex.
print('Original: ', x[1])
print('Processed: ', text_transformation(x[1]))

Original:  ['talk all the nonsense and continue all the drama will vote for modi ']
Processed:  talk nonsense continue drama vote modi


In [8]:
# Pick the compute device once: CUDA when present, otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

if not use_gpu:
    print('No GPU available, using the CPU instead.')
else:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


In [9]:
# Clean every tweet with text_transformation() and keep the result in a new
# `processed` column next to the original `clean_text`.
df['processed'] = df['clean_text'].apply(text_transformation)

# Bare last expression so the notebook renders the updated frame.
df

Unnamed: 0,clean_text,category,processed
0,when modi promised â€œminimum government maxim...,2,modi promised minimum government maximum gover...
1,talk all the nonsense and continue all the dra...,0,talk nonsense continue drama vote modi
2,what did just say vote for modi welcome bjp t...,1,say vote modi welcome bjp told rahul main camp...
3,asking his supporters prefix chowkidar their n...,1,asking supporters prefix chowkidar names modi ...
4,answer who among these the most powerful world...,1,answer among powerful world leader today trump...
...,...,...,...
995,there are two reasons for atmosphere hatred cr...,0,two reasons atmosphere hatred created modi rul...
996,modi has wiped out the small micro industries ...,2,modi wiped small micro industries india demoni...
997,bjp struggles find candidates west bengal graf...,2,bjp struggles find candidates west bengal graf...
998,modis opposition trying defame him they not wa...,2,modis opposition trying defame want succeed in...


In [10]:
# Checkpoint name used for both the tokenizer here and the classifier loaded later.
PRE_TRAINED_MODEL_NAME = 'distilbert-base-cased'

# Load the pre-trained DistilBERT tokenizer for that checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [11]:
# Walk through the tokenization process on a single tweet.
# NOTE(review): the `processed` column already holds text_transformation()
# output, so this second pass looks redundant.
sample_text = text_transformation(df['processed'][1])

# Apply the DistilBERT tokenizer to the sample text
tokens = tokenizer.tokenize(sample_text)    # split the sentence into word-piece tokens
token_ids = tokenizer.convert_tokens_to_ids(tokens) # map each token to its vocabulary id

print(f'Sentence: {sample_text}')
print(f'Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

Sentence: talk nonsense continue drama vote modi
Tokens: ['talk', 'nonsense', 'continue', 'drama', 'vote', 'm', '##od', '##i']
Token IDs: [2037, 17466, 2760, 3362, 2992, 182, 5412, 1182]


In [12]:
# Encode the sample end to end: special tokens, truncation, padding and attention mask.
encoding = tokenizer.encode_plus(
  sample_text,
  max_length=32,  # small length used only for this demonstration
  truncation = True,  # cut sequences longer than max_length
  add_special_tokens=True, # wrap the sequence in the start/end special tokens (ids 101 and 102 in the output)
  return_token_type_ids=False,  # segment ids are not needed: each example is a single sentence, not a sentence pair (e.g. question answering)
  padding='max_length',   # pad every sequence up to max_length
  return_attention_mask=True,  # also return the mask telling the model which positions are real tokens and which are padding
  return_tensors='pt',  # return PyTorch tensors
)

print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

32


tensor([  101,  2037, 17466,  2760,  3362,  2992,   182,  5412,  1182,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0])

In [13]:
# The attention mask has the same length as input_ids; zeros mark padding positions.
print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

32


tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])

In [14]:
# Sequence length (in tokens) every example is padded/truncated to.
max_len = 512


class IMDBDataset(Dataset):
  """Map-style dataset that tokenizes one text per lookup.

  Each item is a dict with the raw text (`review_text`), the flattened
  `input_ids` and `attention_mask` tensors produced by the tokenizer, and the
  label as a long tensor (`targets`).
  """

  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews, self.targets = reviews, targets
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    # One example per stored text.
    return len(self.reviews)

  def __getitem__(self, item):
    """Return the encoded example stored at position `item`."""
    text = str(self.reviews[item])
    label = self.targets[item]

    # Fixed-length encoding: special tokens added, truncated and padded to max_len.
    encode_options = dict(
      add_special_tokens=True,
      max_length=self.max_len,
      truncation=True,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
    )
    encoded = self.tokenizer.encode_plus(text, **encode_options)

    sample = {'review_text': text}
    # The tokenizer output is flattened to one dimension per example.
    for key in ('input_ids', 'attention_mask'):
      sample[key] = encoded[key].flatten()
    sample['targets'] = torch.tensor(label, dtype=torch.long)
    return sample

In [15]:
# Split the data 70% / 15% / 15% into training, validation and test frames.

def _split_frame(texts, raw_texts, labels):
  """Assemble one split as a fresh 0-indexed frame: clean_text, review_old, category."""
  text_part = pd.DataFrame({'clean_text': texts.values, 'review_old': raw_texts.values})
  label_part = pd.DataFrame({'category': labels.values})
  return pd.concat([text_part, label_part], axis = 1)


# Stage 1: hold out 30% of the rows; the remaining 70% is the training set.
X_train, X_valid, y_train, y_valid = train_test_split(
    df[['processed', 'clean_text']],
    df['category'],
    test_size=0.30,
    random_state = 0,
)
df_train = _split_frame(X_train['processed'], X_train['clean_text'], y_train)
df_valid = _split_frame(X_valid['processed'], X_valid['clean_text'], y_valid)

# Stage 2: cut the 30% hold-out in half to get validation and test sets.
X_valid, X_test, y_valid, y_test = train_test_split(
    df_valid[['clean_text','review_old']],
    df_valid['category'],
    test_size=0.5,
    random_state = 0,
)
df_valid = _split_frame(X_valid['clean_text'], X_valid['review_old'], y_valid)
df_test = _split_frame(X_test['clean_text'], X_test['review_old'], y_test)

print(df_train.shape, df_valid.shape, df_test.shape)

(700, 3) (150, 3) (150, 3)


In [16]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=0):
  """Wrap a frame of texts and labels in a batched DataLoader.

  Args:
    df: Frame with a `clean_text` column (texts) and a `category` column (labels).
    tokenizer: Tokenizer handed to IMDBDataset for per-item encoding.
    max_len: Token length every example is padded/truncated to.
    batch_size: Number of examples per batch.
    shuffle: Reshuffle the examples every epoch. Defaults to False, which keeps
      the original fixed iteration order.
    num_workers: Number of loader sub-processes. Defaults to 0 (load in the main
      process): worker processes must be able to re-import the dataset class,
      which is a known failure/hang for classes defined inside a notebook on
      spawn-based platforms (Windows, macOS). Raise it only when the dataset
      class lives in an importable module.

  Returns:
    A torch DataLoader yielding the dict batches produced by IMDBDataset.
  """
  ds = IMDBDataset(
    reviews=df.clean_text.to_numpy(),
    targets=df.category.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )


# Examples per batch, shared by all three loaders.
batch_size = 16      # Bert recommendation

# One loader per split, all using the same tokenizer and sequence length.
train_data_loader = create_data_loader(df_train, tokenizer, max_len, batch_size)
valid_data_loader = create_data_loader(df_valid, tokenizer, max_len, batch_size)
test_data_loader = create_data_loader(df_test, tokenizer, max_len, batch_size)

In [17]:
# Build the classifier: pre-trained DistilBERT with a sequence-classification head.
# num_labels is 3 because the `category` column was encoded into three classes (0, 1, 2).

model = DistilBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels = 3)
# Move the weights to the device selected earlier (GPU when available, otherwise CPU).
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.wei

In [18]:
# Number of full passes over the training set.
EPOCHS = 5

optimizer = AdamW(model.parameters(), lr = 5e-5)
# The scheduler is stepped once per batch, so it needs the total number of batches.
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule over total_steps with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)



In [19]:
# Train the model for one epoch

def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
  """Run one training pass over `data_loader` and update the model in place.

  Args:
    model: Classifier called with `input_ids`, `attention_mask` and `labels`;
      its output must expose the loss at index 0 and the logits at index 1.
    data_loader: Iterable of dict batches with `input_ids`, `attention_mask`
      and `targets` tensors.
    optimizer: Optimizer stepped once per batch.
    device: Device the batch tensors are moved to.
    scheduler: Learning-rate scheduler stepped once per batch.
    n_examples: Number of examples in the split, used as the accuracy denominator.

  Returns:
    (accuracy, mean_loss) for the epoch.
  """
  model = model.train()    # switch to training mode
  losses = []
  correct_predictions = 0

  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask,
      labels = targets
    )
    # Index the output instead of tuple-unpacking it: unpacking a dict-style
    # model output yields its keys rather than the tensors, while integer
    # indexing works for both plain tuples and dict-style outputs.
    loss = outputs[0]      # classification loss
    logits = outputs[1]    # classification scores before softmax

    logits = logits.detach().cpu().numpy()
    label_ids = targets.to('cpu').numpy()

    preds = np.argmax(logits, axis=1).flatten()   # index of the largest logit per example
    targ = label_ids.flatten()

    correct_predictions += np.sum(preds == targ)

    losses.append(loss.item())
    loss.backward()   # backpropagation: gradients of the loss w.r.t. the parameters
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # clip gradients so they do not explode
    optimizer.step()       # apply the parameter update from the stored gradients
    scheduler.step()       # advance the learning-rate schedule
    optimizer.zero_grad()  # clear gradients before the next batch

  return correct_predictions / n_examples, np.mean(losses)

In [20]:
# Evaluate the model on one pass over a data loader

def eval_model(model, data_loader, device, n_examples):
  """Score the model on `data_loader` without updating any weights.

  Args:
    model: Classifier called with `input_ids`, `attention_mask` and `labels`;
      its output must expose the loss at index 0 and the logits at index 1.
    data_loader: Iterable of dict batches with `input_ids`, `attention_mask`
      and `targets` tensors.
    device: Device the batch tensors are moved to.
    n_examples: Number of examples in the split, used as the accuracy denominator.

  Returns:
    (accuracy, mean_loss) over the loader.
  """
  model = model.eval()   # switch to evaluation mode
  losses = []
  correct_predictions = 0

  with torch.no_grad():   # no gradients are needed for scoring
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels = targets
      )
      # Index the output instead of tuple-unpacking it: unpacking a dict-style
      # model output yields its keys rather than the tensors, while integer
      # indexing works for both plain tuples and dict-style outputs.
      loss = outputs[0]
      logits = outputs[1]

      logits = logits.detach().cpu().numpy()
      label_ids = targets.to('cpu').numpy()

      preds = np.argmax(logits, axis=1).flatten()   # index of the largest logit per example
      targ = label_ids.flatten()

      correct_predictions += np.sum(preds == targ)
      losses.append(loss.item())

  return correct_predictions / n_examples, np.mean(losses)


In [21]:
# Main training loop: train for EPOCHS epochs, validating after each one.
# Accuracy is the metric used to pick the best checkpoint.

# Best validation accuracy seen so far.
best_acc = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler, len(df_train))

  print(f'Train loss {train_loss} Accuracy {train_acc}')

  val_acc, val_loss = eval_model(model, valid_data_loader, device, len(df_valid))

  print(f'Val   loss {val_loss} Accuracy {val_acc}')
  print()

  # Save the weights only when validation accuracy strictly improves.
  if val_acc > best_acc:
    torch.save(model.state_dict(), 'best_model_state_a5.bin')
    best_acc = val_acc

# The saved file therefore holds the state of the model with the highest validation accuracy.

Epoch 1/5
----------
