# Sentiment Analysis with IMDB dataset using Pytorch

In [1]:
import numpy as np
import keras
from keras.datasets import imdb
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from torch import nn
from keras.preprocessing.text import Tokenizer
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
%matplotlib inline

np.random.seed(42)

Using TensorFlow backend.


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


# Dataset Prepration

In [3]:
mean, std = (0.5,), (0.5,)

# save np.load
np_load_old = np.load

# modify the default parameters of np.load
np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

# call load_data with allow_pickle implicitly set to true
# Loading the data (it's preloaded in Keras)
(x_train, y_train), (x_test, y_test), = imdb.load_data(num_words=1000)
# restore np.load for future normal usage
np.load = np_load_old

print(x_train.shape)
print(x_test.shape)

(25000,)
(25000,)


In [4]:
print(x_train[0])
print(y_train[0])

[1, 14, 22, 16, 43, 530, 973, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 2, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 2, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 2, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 2, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 2, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 2, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
1


In [5]:
num_features = 1000
tokenizer = Tokenizer(num_words=num_features)
x_train  = torch.from_numpy(tokenizer.sequences_to_matrix(x_train, mode='binary')).float().to(device)
x_test = torch.from_numpy(tokenizer.sequences_to_matrix(x_test, mode='binary')).float().to(device)

In [6]:
print(x_train[0].shape)

torch.Size([1000])


In [7]:
num_classes = 2
y_train = torch.from_numpy(np.eye(num_classes)[y_train.flatten()]).float().to(device)
y_test = torch.from_numpy(np.eye(num_classes)[y_test.flatten()]).float().to(device)

In [8]:
print(y_train.shape)
print(y_test.shape)

torch.Size([25000, 2])
torch.Size([25000, 2])


In [9]:
x_train_batches, y_train_batches = torch.split(x_train,500), torch.split(y_train,500)

In [10]:
class Sentiment(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(num_features, 256)
        self.fc2 = nn.Linear(256, 64)
        self.fc2_ = nn.Dropout(p=0.7)
        self.fc3 = nn.Linear(64, 32)
        self.fc3_ = nn.Dropout(p=0.4)
        self.fc4 = nn.Linear(32, 4)
        self.fc4_ = nn.Dropout(p=0.2)
        self.fc5 = nn.Linear(4, num_classes)

    def forward(self, x):
        x = F.selu(self.fc1(x))
        x = F.selu(self.fc2(x))
        x = self.fc2_(x)
        x = F.selu(self.fc3(x))
        x = self.fc3_(x)
        x = F.selu(self.fc4(x))
        x = self.fc4_(x)
        x = self.fc5(x)

        return x

model = Sentiment()

In [11]:
from torch import optim
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.01)
model.to(device)
num_epochs = 30
train_tracker, test_tracker, accuracy_tracker = [], [], []

for i in range(num_epochs):
    cum_loss = 0

    for x_, y_ in zip(x_train_batches, y_train_batches):
        x_.to(device)
        y_.to(device)
        optimizer.zero_grad()
        output = model(x_)
        loss = criterion(output, y_)
        loss.backward()
        optimizer.step()

        cum_loss += loss.item()

    train_tracker.append(cum_loss/x_train.shape[0])
    print(f"Epoch({i+1}/{num_epochs}) | Training loss: {cum_loss/x_train.shape[0]} | ",end='')

    test_loss = 0
    num_correct = 0
    total = 0
    x_ = x_test
    y_ = y_test
    x_.to(device)
    y_.to(device)
    logps = model(x_)
    batch_loss = criterion(logps, y_)
    test_loss += batch_loss.item()

    output = torch.sigmoid(logps)
    pred = torch.argmax(output, 1)
    total += y_.size(0)
    num_correct += (pred == torch.argmax(y_,1)).sum().item()

    test_tracker.append(test_loss/y_test.shape[0])
    print(f"Test loss: {test_loss/y_test.shape[0]} | ", end='')
    accuracy_tracker.append(num_correct/total)
    print(f'Accuracy : {num_correct/total}')
print(f'\nNumber correct : {num_correct}, Total : {total}')
print(f'Accuracy of the model after 30 epochs on the 10000 test images: {num_correct * 100 / total}% ')


Epoch(1/30) | Training loss: 0.0013920266342163086 | Test loss: 2.776472568511963e-05 | Accuracy : 0.5128
Epoch(2/30) | Training loss: 0.0013828126788139343 | Test loss: 2.7608561515808105e-05 | Accuracy : 0.5278
Epoch(3/30) | Training loss: 0.0013781810474395753 | Test loss: 2.7481937408447266e-05 | Accuracy : 0.54948
Epoch(4/30) | Training loss: 0.001370486545562744 | Test loss: 2.7346854209899902e-05 | Accuracy : 0.56344
Epoch(5/30) | Training loss: 0.0013635927677154542 | Test loss: 2.7186362743377684e-05 | Accuracy : 0.58328
Epoch(6/30) | Training loss: 0.0013555804109573364 | Test loss: 2.6996347904205323e-05 | Accuracy : 0.60496
Epoch(7/30) | Training loss: 0.0013468788051605224 | Test loss: 2.6791725158691407e-05 | Accuracy : 0.61872
Epoch(8/30) | Training loss: 0.0013342578101158142 | Test loss: 2.6572155952453614e-05 | Accuracy : 0.6402
Epoch(9/30) | Training loss: 0.001323445336818695 | Test loss: 2.6297993659973143e-05 | Accuracy : 0.65628
Epoch(10/30) | Training loss: 0.00