In [20]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import model_selection

%load_ext tensorboard
%tensorflow_version 2.x
import tensorflow as tf

import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

df = pd.read_csv('./input/imdb_dataset.csv')

print(df.shape)
df.head()

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [21]:
target_mapping = {'negative': 0, 'positive': 1}
df['sentiment'] = df['sentiment'].map(target_mapping)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [22]:
df['kfold'] =  -1
df = df.sample(frac=1).reset_index(drop=True)
y = df['sentiment'].values
kf = model_selection.StratifiedKFold(n_splits=5)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X=df, y=y)):
  df.loc[valid_idx, 'kfold'] = fold

df.head()

Unnamed: 0,review,sentiment,kfold
0,"I saw a trailer for this on Afro Promo, the co...",1,0
1,"I, like many die-hard Trekkers (or Trekkies, i...",1,0
2,When a BBC murder thriller is this rife with h...,0,0
3,"I think I truly love this film . ""Prix de Beau...",1,0
4,`The United States of Kiss My Ass'<br /><br />...,1,0


In [23]:
class IMDBDataset:
  def __init__(self, reviews, targets):
    self.reviews = reviews
    self.targets = targets
  
  def __len__(self):
    return len(self.reviews)
  
  def __getitem__(self, idx):
    review = torch.tensor(self.reviews[idx,:], dtype=torch.long)
    target = torch.tensor(self.targets[idx], dtype=torch.float)

    return {'review': review, 'target': target}

In [24]:
class LSTM(nn.Module):
  def __init__(self, embedding_dim, vocab_size, lstm_hidden_dim, padding_idx=0):
    super(LSTM, self).__init__()

    self.embeddings = nn.Embedding(embedding_dim=embedding_dim, num_embeddings=vocab_size, padding_idx=padding_idx)

    self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, 
                        batch_first=True, bidirectional=True)
    self.lstm2 = nn.LSTM(lstm_hidden_dim*2, lstm_hidden_dim, 
                         batch_first=True, bidirectional=True)

    self.dropout = nn.Dropout(0.2)
    self.fc1 = nn.Linear(lstm_hidden_dim*4, 512)
    self.fc2 = nn.Linear(512, 1)

  def forward(self, x):
    x = self.embeddings(x)
    out, _ = self.lstm(x)
    out, _ = self.lstm2(out)

    avg_pool= torch.mean(out, 1)
    max_pool, index_max_pool = torch.max(out, 1)

    z = torch.cat((avg_pool, max_pool), 1)

    z = self.fc1(z)
    z = self.dropout(z)
    y_pred = self.fc2(z)

    return y_pred

In [25]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df.review.values.tolist())
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

124253


In [26]:
def train_fn(model, loader, optimizer, loss_fn, device):
  running_train_loss = 0.0
  running_train_acc = 0.0

  train_preds = []
  train_targets = []

  model.train()
  for i, data in enumerate(loader):
    reviews = data['review'].to(device, dtype=torch.long)
    targets = data['target'].to(device, dtype=torch.float)

    optimizer.zero_grad()
    outputs = model(reviews)

    loss = loss_fn(outputs, targets.view(-1, 1))
    loss.backward()
    optimizer.step()

    preds = torch.sigmoid(outputs)
    preds = np.asarray(preds.detach().cpu().numpy()) >= 0.5

    train_preds.extend(preds.tolist())
    train_targets.extend(targets.cpu().numpy().tolist())

    accuracy = metrics.accuracy_score(train_targets, train_preds)

    running_train_loss = loss.item() 
    running_train_acc = accuracy

  return running_train_loss, running_train_acc

def val_fn(model, loader, loss_fn, scheduler, epoch, device):
  running_val_loss = 0.0
  running_val_acc = 0.0

  val_preds = []
  val_targets = []

  model.eval()
  for i, data in enumerate(loader):
    reviews = data['review'].to(device, dtype=torch.long)
    targets = data['target'].to(device, dtype=torch.float)

    outputs = model(reviews)

    loss = loss_fn(outputs, targets.view(-1, 1))
    preds = torch.sigmoid(outputs)
    preds = np.asarray(preds.detach().cpu().numpy()) >= 0.5
  
    val_preds.extend(preds.tolist())
    val_targets.extend(targets.cpu().numpy().tolist())

    accuracy = metrics.accuracy_score(val_targets, val_preds)

    running_val_loss = loss.item()
    running_val_acc = accuracy

  scheduler.step(epoch-1)
    
  return running_val_loss, running_val_acc, accuracy

In [27]:
NUM_EPOCHS = 5
EMBEDDING_DIM = 100
RNN_HIDDEN_DIM = 128
LSTM_HIDDEN_DIM = 128
HIDDEN_DIM = 128

loss_history={
    "train": [],
    "val": [],
}

metric_history={
    "train": [],
    "val": [],
}

for fold in range(5):
  print(f"FOLD: {fold+1}")

  train_df = df[df.kfold != fold].reset_index(drop=True)
  valid_df = df[df.kfold == fold].reset_index(drop=True)

  X_train = tokenizer.texts_to_sequences(train_df.review.values)
  X_valid = tokenizer.texts_to_sequences(valid_df.review.values)

  X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=128)
  X_valid = tf.keras.preprocessing.sequence.pad_sequences(X_valid, maxlen=128)

  train_ds = IMDBDataset(X_train, train_df.sentiment.values)
  valid_ds = IMDBDataset(X_valid, valid_df.sentiment.values)

  train_loader = torch.utils.data.DataLoader(train_ds, batch_size=64, num_workers=4)
  valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=64, num_workers=4)

  device = torch.device('cuda')
  model = LSTM(embedding_dim=EMBEDDING_DIM, 
               vocab_size=vocab_size, 
               lstm_hidden_dim=LSTM_HIDDEN_DIM).to(device)

  loss_fn = nn.BCEWithLogitsLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
  scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)

  for epoch in range(NUM_EPOCHS):
    running_train_loss, running_train_acc = train_fn(model, train_loader, optimizer, loss_fn, device)

    loss_history['train'].append(running_train_loss)
    metric_history['train'].append(running_train_acc)

    running_val_loss, running_val_acc, accuracy = val_fn(model, valid_loader, loss_fn, scheduler, epoch, device)

    loss_history['val'].append(running_val_loss)
    metric_history['val'].append(running_val_acc)

    print(f'Epoch: {epoch+1} | train_loss: {running_train_loss:.2f} | val_loss: {running_val_loss:.2f} | accuracy_score: {accuracy}')

FOLD: 1
Epoch: 1 | train_loss: 0.25 | val_loss: 0.16 | accuracy_score: 0.8651
Epoch: 2 | train_loss: 0.17 | val_loss: 0.12 | accuracy_score: 0.881
Epoch: 3 | train_loss: 0.09 | val_loss: 0.08 | accuracy_score: 0.8778
Epoch: 4 | train_loss: 0.06 | val_loss: 0.00 | accuracy_score: 0.8658
Epoch: 5 | train_loss: 0.01 | val_loss: 0.00 | accuracy_score: 0.8685
FOLD: 2
Epoch: 1 | train_loss: 0.29 | val_loss: 0.22 | accuracy_score: 0.8626
Epoch: 2 | train_loss: 0.27 | val_loss: 0.33 | accuracy_score: 0.871
Epoch: 3 | train_loss: 0.22 | val_loss: 0.48 | accuracy_score: 0.8685
Epoch: 4 | train_loss: 0.16 | val_loss: 0.36 | accuracy_score: 0.8804
Epoch: 5 | train_loss: 0.02 | val_loss: 0.08 | accuracy_score: 0.8576
FOLD: 3
Epoch: 1 | train_loss: 0.26 | val_loss: 0.55 | accuracy_score: 0.8585
Epoch: 2 | train_loss: 0.13 | val_loss: 0.54 | accuracy_score: 0.8747
Epoch: 3 | train_loss: 0.06 | val_loss: 0.43 | accuracy_score: 0.8802
Epoch: 4 | train_loss: 0.02 | val_loss: 0.29 | accuracy_score: 0.852