In [2]:
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import nltk
import string
import re
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
from nltk.tokenize.treebank import TreebankWordDetokenizer

from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup


import torch
from torch import nn, optim

from torch.utils.data import Dataset, DataLoader


## Load data

In [3]:
df=pd.read_csv("IMDB Dataset.csv")

**Check the head of df**

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [6]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

## Data Split


#### Train Test Validation Split

In [7]:
def to_sentiment(rating):
  if rating == 'positive':
    return 1
  else:
    return 0
df['sentiment'] = df.sentiment.apply(to_sentiment)

class_names = ['negative', 'positive']

In [8]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=123, stratify= df.sentiment)
df_train, df_val = train_test_split(df_train, test_size=0.125, random_state=123, stratify= df_train.sentiment)


print(df_train.sentiment.value_counts())
print(df_val.sentiment.value_counts())
print(df_test.sentiment.value_counts())


1    17500
0    17500
Name: sentiment, dtype: int64
0    2500
1    2500
Name: sentiment, dtype: int64
0    5000
1    5000
Name: sentiment, dtype: int64


In [9]:
# X = df.drop(columns='sentiment')
# y = df['sentiment']


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify= y)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=123, stratify= y_train)


# print(y_train.value_counts())
# print(y_val.value_counts())
# print(y_test.value_counts())

## Pre-processing

#### Original Reviews

In [10]:
# print(df_train['review'])

#### Lowercasing

In [11]:
# reviews_lowerCase = df_train.review.str.lower()
# reviews_lowerCase

#### Tokenizing

In [12]:
# [nltk.sent_tokenize(item) for item in reviews_lowerCase]
# tokens = [nltk.word_tokenize(item) for item in reviews_lowerCase]
# tokens

#### Removing Punctuation

In [13]:
# regex = re.compile(f'[{re.escape(string.punctuation)}]')
# res=[regex.sub('', word) for words in tokens for word in words if not regex.sub('', word) == '']
# res

#### Removing Stop Words

In [14]:
# stop_words = stopwords.words('english')
# stop_words.append('via')
# words = [token for token in res if token not in stop_words]
# words

#### Removing Links

In [15]:
# regex = re.compile('http\S+')
# tokens_without_links = [regex.sub('', word) for word in words if not regex.sub('', word) == '' and not word.startswith('tc')]
# tokens_without_links

#### Stemming

In [16]:
# stemmer = PorterStemmer()
# stemmed_words = [stemmer.stem(word) for word in tokens_without_links]
# stemmed_words

#### Lemmatizing

In [17]:
# lemmatizer = WordNetLemmatizer()
# lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]
# lemmatized_words

#### Function

In [18]:
sample_str = 'When was I last outside? I am stuck at home for 2 weeks.'

# set all as lowercase
lowerCase = sample_str.lower()

# tokenize
tokens = nltk.word_tokenize(lowerCase)

# remove punctuation
regex = re.compile(f'[{re.escape(string.punctuation)}]')
tokens_NoPunct = [regex.sub('', word) for word in tokens if not regex.sub('', word) == '']

# removing stopwords
stop_words = stopwords.words('english')
stop_words.append('via')
tokens_NoStopWords = [token for token in tokens_NoPunct if token not in stop_words]

# removing links
regex = re.compile('http\S+')
tokens_NoLinks = [regex.sub('', word) for word in tokens_NoStopWords if not regex.sub('', word) == '' and not word.startswith('tc')]

# Lemmatizing
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens_NoLinks]

# Detokenizing
untokenized = TreebankWordDetokenizer().detokenize(lemmatized_words)


print(lowerCase)
print(tokens)
print(tokens_NoPunct)
print(tokens_NoStopWords)
print(tokens_NoLinks)
print(lemmatized_words)
print(untokenized)

when was i last outside? i am stuck at home for 2 weeks.
['when', 'was', 'i', 'last', 'outside', '?', 'i', 'am', 'stuck', 'at', 'home', 'for', '2', 'weeks', '.']
['when', 'was', 'i', 'last', 'outside', 'i', 'am', 'stuck', 'at', 'home', 'for', '2', 'weeks']
['last', 'outside', 'stuck', 'home', '2', 'weeks']
['last', 'outside', 'stuck', 'home', '2', 'weeks']
['last', 'outside', 'stuck', 'home', '2', 'week']
last outside stuck home 2 week


In [23]:
def processText(df):
    processed_Reviews = []
    for review in df.review:
        # original
        # print(review)

        # set all as lowercase
        lowerCase = review.lower()

        # tokenize
        tokens = nltk.word_tokenize(lowerCase)

        # remove punctuation
        regex = re.compile(f'[{re.escape(string.punctuation)}]')
        tokens_NoPunct = [regex.sub('', word) for word in tokens if not regex.sub('', word) == '']

        # removing stopwords
        stop_words = stopwords.words('english')
        stop_words.append('via')
        tokens_NoStopWords = [token for token in tokens_NoPunct if token not in stop_words]

        # removing links
        regex = re.compile('http\S+')
        tokens_NoLinks = [regex.sub('', word) for word in tokens_NoStopWords if not regex.sub('', word) == '' and not word.startswith('tc')]

        # Lemmatizing
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens_NoLinks]

        # Detokenizing
        untokenized = TreebankWordDetokenizer().detokenize(lemmatized_words)

        processed_Reviews.append(untokenized)
    return processed_Reviews

In [24]:
processed_Reviews = processText(df)
df['processedReview'] = processed_Reviews


In [25]:
df.head()
# df.tail()

Unnamed: 0,review,sentiment,processedReviews
0,One of the other reviewers has mentioned that ...,1,one reviewer mentioned watching 1 oz episode h...
1,A wonderful little production. <br /><br />The...,1,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei love time money visually stunnin...


# Classification Using BERT

#### Testing stuff

In [48]:
# PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
# tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'

# encoding = tokenizer.encode_plus(
#   sample_txt,
#   add_special_tokens = True, # Add '[CLS]' and '[SEP]'
#   max_length = 32,
#   padding = 'max_length',
#   # truncation = True,
#   return_attention_mask = True,
#   return_token_type_ids = False,
#   return_tensors = 'pt',  # Return PyTorch tensors
# )

# # encoding.keys()
# # encoding['input_ids'][0], encoding['attention_mask']
# # tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

# bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# bm = bert_model(
#   input_ids=encoding['input_ids'],
#   attention_mask=encoding['attention_mask'],
# )

# print(bm['last_hidden_state'].shape)
# print(bm['pooler_output'].shape)

#### Custom Dataset Class & Dataloader Function

In [49]:

class IMDBReviewsDataset(Dataset):

  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.reviews)

  def __getitem__(self, item):
    review = self.reviews[item]
    target = self.targets[item]
    encoding = self.tokenizer.encode_plus(
        review,
        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
        max_length = self.max_len,
        padding = 'max_length',
        truncation = True,
        return_attention_mask = True,
        return_token_type_ids = False,
        return_tensors = 'pt',  # Return PyTorch tensors
    )

    return {
      'review_text': review,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }


def create_data_loader(features, classification, tokenizer, max_len, batch_size):
    dataset = IMDBReviewsDataset(
        reviews = features.review.to_numpy(),
        targets = classification.to_numpy(),
        tokenizer = tokenizer,
        max_len = max_len
    )
    return DataLoader(
        dataset,
        batch_size = batch_size,
        num_workers = 0
    )

#### Creating Dataloaders

In [50]:
# Testing dataset class
# max_len = 500
# dataset = IMDBReviewsDataset(reviews = X_train.review.to_numpy(), targets = y_train.to_numpy(), tokenizer = tokenizer, max_len = max_len)
# dataset.__getitem__(5)

In [51]:
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

train_data_loader = create_data_loader(X_train, y_train, tokenizer, 250, 16)
val_data_loader = create_data_loader(X_val, y_val, tokenizer, 250, 16)
test_data_loader = create_data_loader(X_test, y_test, tokenizer, 250, 16)

In [52]:
# Checking one batch from dataloader
data = next(iter(train_data_loader))
data.keys()

print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

torch.Size([16, 250])
torch.Size([16, 250])
torch.Size([16])


### Classifier

##### Unused bits (might be useful later)

In [53]:

# hidden_sizes = [768, 512, 256, 128, 64]
# print(hidden_sizes[-1])

# self.layers = nn.ModuleList()
    # for h in range(len(hidden_sizes)-1):
    #   self.layers.append(nn.Linear(hidden_sizes[h], hidden_sizes[h+1]))
    
    # self.out = nn.Linear(hidden_sizes[-1], n_classes)

# self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

##### Classifier NN Module

In [54]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()


    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    
    self.out = nn.Sequential(
      nn.Linear(768, 512),
      nn.ReLU(),
      nn.Linear(512, 256),
      nn.ReLU(),
      nn.Linear(256, 128),
      nn.ReLU(),
      nn.Linear(128, 64),
      nn.ReLU(),
      nn.Linear(64, n_classes),
    )

  def forward(self, input_ids, attention_mask):
    bm = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      
    )
    output = self.drop(bm['pooler_output'])
    return self.out(output)

##### Testing classifier using single batch

In [55]:
model = SentimentClassifier(len(class_names))
model = model.to(device)

input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

# Output tensors
out_tensors = nn.functional.softmax(model(input_ids, attention_mask), dim=1)
print(out_tensors)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([16, 250])
torch.Size([16, 250])
tensor([[0.4955, 0.5045],
        [0.4986, 0.5014],
        [0.5006, 0.4994],
        [0.5001, 0.4999],
        [0.4963, 0.5037],
        [0.4984, 0.5016],
        [0.4897, 0.5103],
        [0.4999, 0.5001],
        [0.5040, 0.4960],
        [0.4962, 0.5038],
        [0.4981, 0.5019],
        [0.4952, 0.5048],
        [0.4955, 0.5045],
        [0.4973, 0.5027],
        [0.4982, 0.5018],
        [0.4986, 0.5014]], grad_fn=<SoftmaxBackward0>)


### Training the Model

In [56]:
EPOCHS = 1
# optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
optimizer = optim.Adam(model.parameters(), lr=2e-5)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [61]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  model = model.train()
  losses = []
  correct_predictions = 0
  
  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)
    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # print("outputs: ", outputs)
    _, preds = torch.max(outputs, dim=1)
    # print("max: ", preds)
    loss = loss_fn(outputs, targets)
    correct_predictions += torch.sum(preds == targets)
    print("correct_predictions: ", correct_predictions)
    losses.append(loss.item())
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  return correct_predictions.double() / n_examples, np.mean(losses)

In [58]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()
  losses = []
  correct_predictions = 0
  
  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, targets)
      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())
  return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
# %%time
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(X_train)
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')
  # val_acc, val_loss = eval_model(
  #   model,
  #   val_data_loader,
  #   loss_fn,
  #   device,
  #   len(X_val)
  # )
  # print(f'Val   loss {val_loss} accuracy {val_acc}')
  # print()
  # history['train_acc'].append(train_acc)
  # history['train_loss'].append(train_loss)
  # history['val_acc'].append(val_acc)
  # history['val_loss'].append(val_loss)
  # if val_acc > best_accuracy:
  #   torch.save(model.state_dict(), 'best_model_state.bin')
  #   best_accuracy = val_acc