<a href="https://colab.research.google.com/github/Yusuf-xx/Sentiment-Analysis-on-Movie-Reviews/blob/main/CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import sys
import random
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import OrderedDict
import re, string
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Exports ``PYTHONHASHSEED``, seeds Python's ``random`` module, NumPy's
    global generator and PyTorch (CPU generator, current CUDA device, all
    CUDA devices), then switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed handed to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Apply the same seed to each library in turn.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    
# Create the checkpoint directory. exist_ok makes re-running this cell a no-op,
# and os.makedirs works without a shell (unlike the `!mkdir` magic).
os.makedirs('./model_bakup/', exist_ok=True)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class CFG:
    """Hyper-parameters and run switches for the notebook, kept as class attributes."""

    # --- reproducibility / output location ---
    seed = 2032
    model_output_dir = './model_bakup/'

    # --- optimisation ---
    batch_size = 20
    # NOTE(review): the optimizer in the training cell is built with its own
    # learning rate and does not read this value.
    lr = 0.02

    # --- evaluation inside an epoch ---
    # train() runs a validation pass every `eval_step_num` steps when `mid_eval` is true.
    mid_eval = False
    eval_step_num = 50
    # NOTE(review): not read by any code visible in this notebook.
    best_eval_acc = 0.0

    # --- optional extras ---
    # NOTE(review): neither switch is read by any code visible in this notebook.
    use_ema = False
    use_adversial_training = False
    
# When True, the split cell subsamples the train/valid/test frames for a quick smoke run.
QUICK_CHECK = False

# Wall-clock start of the whole run; the final cell prints the time elapsed since this point.
global_start_t = time.time()
print('ok')

ok


In [None]:
# Seed the RNGs first: the .sample() calls below pass no random_state, so the
# draw presumably depends on this global seeding (confirm against the pandas docs).
seed_everything(seed=42)

# Load the raw reviews and turn the text label into 1 (positive) / 0 (negative).
imdb_data = pd.read_csv('IMDB Dataset.csv')
imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})
print('before drop_duplicates, imdb_data.shape: ', imdb_data.shape)

# Remove rows that are exact duplicates of an earlier row.
imdb_data = imdb_data.drop_duplicates()

print('after drop_duplicates, imdb_data.shape: ', imdb_data.shape)
# Work on a random 30000-row subset.
imdb_data = imdb_data.sample(30000)

print('after sample, imdb_data.shape: ', imdb_data.shape)
# Shuffle once more and renumber the index from 0 so later cells can split by position.
imdb_data = imdb_data.sample(len(imdb_data)).reset_index(drop=True)

imdb_data.head(5)

before drop_duplicates, imdb_data.shape:  (50000, 2)
after drop_duplicates, imdb_data.shape:  (49582, 2)
after sample, imdb_data.shape:  (30000, 2)


Unnamed: 0,review,sentiment
0,I just saw Adam Had Four Sons for the first ti...,0
1,I was fascinated as to how truly bad this movi...,0
2,I came to Nancy Drew expecting the worst...bec...,1
3,I found this to be a surprisingly light-handed...,1
4,"Well, it's all been said about this movie and ...",0


In [None]:
# Vocabulary size, counting the two reserved ids (<unknown>=0, <padding>=1).
MAX_WORDS = 10000
# Number of token ids every review is padded or truncated to.
MAX_LEN = 200
# token -> number of occurrences across all reviews; filled by the loop further down.
word_count_dict = {}

def clean_text(text):
    """Normalise a raw review into lower-case words separated by single spaces.

    Steps, in order: lower-case the text, replace ``<br />`` tags with a
    space, delete every character in ``string.punctuation``, then collapse
    each run of whitespace into one space and trim both ends.

    The final whitespace normalisation matters to callers that tokenise with
    ``split(' ')``: without it, consecutive spaces (for example where a
    ``<br />`` or a stand-alone ``-`` was removed) yield empty-string tokens
    that end up counted as a vocabulary word.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string; empty when the input held nothing but markup,
        punctuation and whitespace.
    """
    lowered = text.lower()
    without_breaks = lowered.replace('<br />', ' ')
    without_punctuation = re.sub('[%s]' % re.escape(string.punctuation), '', without_breaks)
    # split() with no argument splits on any whitespace run (spaces, newlines,
    # tabs) and drops leading/trailing blanks, so the join has no empty pieces.
    return ' '.join(without_punctuation.split())

# Count how often each cleaned token occurs across the corpus.
# NOTE(review): this runs over all of imdb_data, i.e. before the train/valid/test
# split made in a later cell, so validation and test reviews also shape the vocabulary.
for review in imdb_data['review'].values:
    for word in clean_text(review).split(' '):
        word_count_dict[word] = word_count_dict.get(word, 0) + 1

# One row per token (the token is the index), most frequent first.
df_word_dict = pd.DataFrame(pd.Series(word_count_dict, name='count'))
df_word_dict = df_word_dict.sort_values(by='count', ascending=False)

# Keep at most MAX_WORDS-2 tokens. .copy() gives an independent frame, so the
# column added next is not written onto a slice of the sorted frame.
df_word_dict = df_word_dict.iloc[:MAX_WORDS-2].copy()
# Ids start at 2 because 0 and 1 are reserved. The range is sized from the rows
# actually kept, so a corpus with fewer than MAX_WORDS-2 distinct tokens does
# not fail with a length mismatch.
df_word_dict['word_id'] = range(2, 2 + len(df_word_dict))

# token -> id lookup, plus the two reserved entries.
word_id_dict = df_word_dict['word_id'].to_dict()
word_id_dict['<unknown>'] = 0
word_id_dict['<padding>'] = 1

df_word_dict.head(15)

Unnamed: 0,count,word_id
the,395803,2
a,192182,3
and,192101,4
of,171897,5
to,159402,6
is,125603,7
in,110351,8
,94493,9
it,92472,10
i,90868,11


In [None]:
def pad(data_list, pad_length, pad_value=1):
    """Force a token-id sequence to exactly ``pad_length`` items.

    Sequences that are too long keep their LAST ``pad_length`` items;
    sequences that are too short are padded on the LEFT with ``pad_value``.
    The input is never modified: a new list is always returned.

    Args:
        data_list: Sequence of token ids.
        pad_length: Target length, must be >= 0.
        pad_value: Id used for padding. Defaults to 1.

    Returns:
        A new list of length ``pad_length``.

    Raises:
        ValueError: If ``pad_length`` is negative.
    """
    if pad_length < 0:
        raise ValueError(f'pad_length must be non-negative, got {pad_length}')

    overflow = len(data_list) - pad_length
    if overflow > 0:
        # Slice from the front offset rather than with a negative index:
        # data_list[-0:] would return the whole list when pad_length is 0.
        return list(data_list[overflow:])

    # overflow <= 0 here, so -overflow is the number of padding items needed.
    return [pad_value] * (-overflow) + list(data_list)

def text_to_token(text):
    """Encode one raw review as a space-separated string of ``MAX_LEN`` token ids.

    The text is cleaned with ``clean_text``, split on single spaces, and each
    word is looked up in ``word_id_dict``; words missing from the dictionary
    become id 0. The id list is then passed through ``pad`` to reach exactly
    ``MAX_LEN`` entries.

    Args:
        text: Raw review string.

    Returns:
        The ids joined by single spaces, e.g. ``'1 1 1 57 3 0'``.
    """
    words = clean_text(text).split(' ')
    ids = [word_id_dict.get(w, 0) for w in words]
    fixed_length_ids = pad(ids, MAX_LEN)
    return ' '.join(map(str, fixed_length_ids))
            
# Tokenise every review once up front and time it; each row stores its ids as
# one space-separated string in the new 'review_tokens' column.
process_start_t = time.time()
print('start processing...')
imdb_data['review_tokens'] = imdb_data['review'].map(text_to_token)
print('ok, cost time: ', time.time()-process_start_t)
imdb_data.head(5)

start processing...
ok, cost time:  3.784602642059326


Unnamed: 0,review,sentiment,review_tokens
0,I just saw Adam Had Four Sons for the first ti...,0,115 203 3 118 642 3412 7 1666 6 172 15 3 0 258...
1,I was fascinated as to how truly bad this movi...,0,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...
2,I came to Nancy Drew expecting the worst...bec...,1,273 3 0 928 5 2 1140 19 52 22 96 3 18 45 5 139...
3,I found this to be a surprisingly light-handed...,1,3045 2367 4 4326 2 282 1331 67 1304 4 3629 3 1...
4,"Well, it's all been said about this movie and ...",0,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...


In [None]:
# Print the first raw review in full to see the kind of text (HTML breaks,
# punctuation) that clean_text() has to deal with.
print(imdb_data['review'].values[0])

I just saw Adam Had Four Sons for the first time and the thing that struck me was that I believe that the model used was Theodore Roosevelt and his four sons. They were approximately the same ages as the four boys in this film. Warner Baxter in his portrayal of Adam Stoddard talked about the same values and family tradition that you would have heard from our 26th president without some of the more boisterous aspects of TR's character. <br /><br />Like TR all of the Stoddard sons serve in World War I, in this case though the youngest only loses an eye instead of being killed. <br /><br />But what if a female minx gets into this all male household and disrupts things? That's Susan Hayward's job here. In one of her earliest prominent roles, Hayward is a flirtatious amoral girl who marries one son, has an affair with another, and starts making a play for the third. It's an early forerunner of the kind of a part that later brought her an Oscar in I Want to Live.<br /><br />I suppose that wi

In [None]:
# imdb_data was shuffled when it was loaded, so contiguous slices give disjoint splits:
# rows [0, 5000) -> test, [5000, 10000) -> valid, the next TRAIN_NUM rows -> train.
TRAIN_NUM = 15000
imdb_data_test = imdb_data.iloc[:5000]
imdb_data_valid = imdb_data.iloc[5000:10000]
imdb_data_train = imdb_data.iloc[10000:TRAIN_NUM+10000]

if QUICK_CHECK:
    # Smoke-test mode: shrink every split. Each split is sampled from its OWN
    # slice so the three sets stay disjoint. The test line used to be broken
    # across two statements, which assigned the whole frame to the test set
    # and then raised NameError on `_test`.
    SAMPLE_NUM = 3000
    imdb_data_test = imdb_data_test.sample(SAMPLE_NUM)
    imdb_data_valid = imdb_data_valid.sample(SAMPLE_NUM)
    imdb_data_train = imdb_data_train.sample(2*SAMPLE_NUM)

print(f'imdb_data_train.shape: {imdb_data_train.shape}, imdb_data_valid.shape: {imdb_data_valid.shape}, '
      f'imdb_data_test.shape: {imdb_data_test.shape}')

imdb_data_test.head(5)

imdb_data_train.shape: (15000, 3), imdb_data_valid.shape: (5000, 3), imdb_data_test.shape: (5000, 3)


Unnamed: 0,review,sentiment,review_tokens
0,I just saw Adam Had Four Sons for the first ti...,0,115 203 3 118 642 3412 7 1666 6 172 15 3 0 258...
1,I was fascinated as to how truly bad this movi...,0,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...
2,I came to Nancy Drew expecting the worst...bec...,1,273 3 0 928 5 2 1140 19 52 22 96 3 18 45 5 139...
3,I found this to be a surprisingly light-handed...,1,3045 2367 4 4326 2 282 1331 67 1304 4 3629 3 1...
4,"Well, it's all been said about this movie and ...",0,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...


In [None]:
# Instantiate the run configuration and re-seed all RNGs with cfg.seed before
# the data loaders and the model are created.
cfg = CFG()
seed_everything(seed=cfg.seed)

print('ok')

ok


In [None]:
class imdbDataset(Dataset):
    """Map-style dataset over a frame of tokenised reviews.

    Each item is a ``(feature, label)`` pair built from one row of ``data_df``:
    ``feature`` is a 1-D ``torch.long`` tensor parsed from the whitespace-
    separated ids in the ``review_tokens`` column, and ``label`` is a
    one-element ``torch.float`` tensor holding the ``sentiment`` value.

    Tensors are created on the CPU unless ``device`` is given. Keeping samples
    on the CPU leaves the host-to-GPU copy to the training/evaluation loop
    (which already calls ``.to(device)`` on every batch) and avoids creating
    CUDA tensors inside ``__getitem__``, which PyTorch advises against when
    DataLoader worker processes are used.

    Args:
        data_df: DataFrame with ``sentiment`` and ``review_tokens`` columns.
        device: Optional device on which to create the item tensors.
    """

    def __init__(self, data_df, device=None):
        self.data_df = data_df
        self.device = device

    def __len__(self):
        return len(self.data_df)

    def __getitem__(self, index):
        # Fetch the row once; both fields are read from the same lookup.
        row = self.data_df.iloc[index]

        label = torch.tensor([float(row['sentiment'])], dtype=torch.float, device=self.device)
        feature = torch.tensor(
            [int(tok) for tok in row['review_tokens'].split()],
            dtype=torch.long,
            device=self.device,
        )
        return feature, label
    
def generate_data_iter(cfg):
    """Wrap the three module-level split frames in datasets and data loaders.

    Reads ``imdb_data_train``, ``imdb_data_valid`` and ``imdb_data_test`` from
    module scope and prints the size of each dataset.

    Args:
        cfg: Configuration object; only ``cfg.batch_size`` is read.

    Returns:
        ``(dl_train, dl_valid, dl_test)``. Only the training loader shuffles;
        all three load in the main process (``num_workers=0``).
    """
    ds_train, ds_valid, ds_test = (
        imdbDataset(frame)
        for frame in (imdb_data_train, imdb_data_valid, imdb_data_test)
    )
    print('len of ds_train: ', len(ds_train), 'len of ds_valid: ', len(ds_valid),
          'len of ds_test: ', len(ds_test))

    def make_loader(dataset, shuffle):
        # One place for the settings shared by all three loaders.
        return DataLoader(dataset, batch_size=cfg.batch_size, shuffle=shuffle, num_workers=0)

    return (
        make_loader(ds_train, True),
        make_loader(ds_valid, False),
        make_loader(ds_test, False),
    )

# Build the train/valid/test loaders used by every later cell.
dl_train, dl_valid, dl_test = generate_data_iter(cfg)
print('ok')

len of ds_train:  15000 len of ds_valid:  5000 len of ds_test:  5000
ok


In [None]:
# Width of each word-embedding vector; also the input channel count of the first Conv1d.
EMBEDDING_DIM = 100

class CNN_Net_old(nn.Module):
    """Sentiment CNN assembled layer by layer with ``add_module``.

    Pipeline: token ids -> embedding -> two Conv1d / MaxPool1d / ReLU stages
    -> flatten -> Linear(6144, 1) -> sigmoid.

    NOTE(review): nothing visible in the notebook instantiates this class;
    the model that gets trained is ``CNN_Net``.
    """

    def __init__(self):
        super().__init__()

        # Id 1 is the padding id, so its embedding row is excluded from gradient updates.
        self.embedding = nn.Embedding(MAX_WORDS, EMBEDDING_DIM, padding_idx=1)

        conv_stack = (
            ('conv_1', nn.Conv1d(EMBEDDING_DIM, 16, kernel_size=5)),
            ('pool_1', nn.MaxPool1d(kernel_size=2)),
            ('relu_1', nn.ReLU()),
            ('conv_2', nn.Conv1d(16, 128, kernel_size=2)),
            ('pool_2', nn.MaxPool1d(kernel_size=2)),
            ('relu_2', nn.ReLU()),
        )
        self.conv = nn.Sequential()
        for layer_name, layer in conv_stack:
            self.conv.add_module(layer_name, layer)

        # 6144 = 128 channels x 48 positions, which is what the conv stack
        # yields for 200-token inputs.
        head_stack = (
            ('flatten', nn.Flatten()),
            ('linear', nn.Linear(6144, 1)),
            ('sigmoid', nn.Sigmoid()),
        )
        self.dense = nn.Sequential()
        for layer_name, layer in head_stack:
            self.dense.add_module(layer_name, layer)

    def forward(self, x):
        embedded = self.embedding(x)
        # Conv1d expects channels on axis 1, so swap the last two axes.
        features = self.conv(embedded.transpose(1, 2))
        return self.dense(features)
    
class CNN_Net(nn.Module):
    """1-D convolutional sentiment classifier over token-id sequences.

    Pipeline: token ids -> embedding -> two Conv1d / MaxPool1d / ReLU stages
    -> flatten -> Linear(..., 1) -> sigmoid, giving one probability per review.

    The width of the final linear layer is derived from the conv stack's
    actual output for ``seq_len`` tokens instead of being hard-coded, so the
    model still builds when the sequence length or embedding size changes.
    With the notebook defaults (200 tokens) it comes out as 6144, and the
    sub-module names are fixed, so the ``state_dict`` layout does not depend
    on how the sizes were supplied.

    Args:
        num_embeddings: Vocabulary size. Defaults to ``MAX_WORDS``.
        embedding_dim: Embedding width. Defaults to ``EMBEDDING_DIM``.
        seq_len: Length of the input sequences. Defaults to ``MAX_LEN``.
    """

    def __init__(self, num_embeddings=None, embedding_dim=None, seq_len=None):
        super().__init__()

        # Fall back to the notebook-level constants for any size not given.
        if num_embeddings is None:
            num_embeddings = MAX_WORDS
        if embedding_dim is None:
            embedding_dim = EMBEDDING_DIM
        if seq_len is None:
            seq_len = MAX_LEN

        # Id 1 is the padding id, so its embedding row is excluded from gradient updates.
        self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, padding_idx=1)

        self.conv = nn.Sequential(OrderedDict([
            ('conv_1', nn.Conv1d(in_channels=embedding_dim, out_channels=16, kernel_size=5)),
            ('pool_1', nn.MaxPool1d(kernel_size=2)),
            ('relu_1', nn.ReLU()),
            ('conv_2', nn.Conv1d(in_channels=16, out_channels=128, kernel_size=2)),
            ('pool_2', nn.MaxPool1d(kernel_size=2)),
            ('relu_2', nn.ReLU()),
        ]))

        # Push one all-zero dummy sample through the conv stack to measure how
        # many features reach the classifier. This uses no random numbers and
        # records no gradients.
        with torch.no_grad():
            flat_features = self.conv(torch.zeros(1, embedding_dim, seq_len)).numel()

        self.dense = nn.Sequential(OrderedDict([
            ('flatten', nn.Flatten()),
            ('linear', nn.Linear(flat_features, 1)),
            ('sigmoid', nn.Sigmoid()),
        ]))

    def forward(self, x):
        # Conv1d expects channels on axis 1, so swap the last two axes of the embedding output.
        x = self.embedding(x).transpose(1, 2)
        x = self.conv(x)
        y = self.dense(x)
        return y
    
# Build the network, show its layer structure, and move it to the selected device.
model = CNN_Net()
print(model)
model.to(device)

# Count all parameters and, separately, the ones that will receive gradient updates.
model_param_num = sum(p.numel() for p in model.parameters())
model_trainable_param_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('model_param_num: ', model_param_num, 'model_trainable_param_num: ',
      model_trainable_param_num)

print('ok')

CNN_Net(
  (embedding): Embedding(10000, 100, padding_idx=1)
  (conv): Sequential(
    (conv_1): Conv1d(100, 16, kernel_size=(5,), stride=(1,))
    (pool_1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (relu_1): ReLU()
    (conv_2): Conv1d(16, 128, kernel_size=(2,), stride=(1,))
    (pool_2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (relu_2): ReLU()
  )
  (dense): Sequential(
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (linear): Linear(in_features=6144, out_features=1, bias=True)
    (sigmoid): Sigmoid()
  )
)
model_param_num:  1018385 model_trainable_param_num:  1018385
ok


In [None]:
def accuracy(y_pred, y_true):
    """Fraction of predictions that match the labels after thresholding at 0.5.

    Both inputs are converted to NumPy arrays and flattened before they are
    compared. Flattening matters: comparing a ``(N, 1)`` array with a ``(N,)``
    array would broadcast to ``(N, N)`` and silently report a wrong accuracy.

    Args:
        y_pred: Predicted probabilities; a list, a list of arrays, or an array.
            A value counts as the positive class only when it is strictly
            greater than 0.5.
        y_true: Ground-truth labels (0/1) with the same number of elements.

    Returns:
        The mean of the element-wise matches as a NumPy float.

    Raises:
        ValueError: If the two inputs hold a different number of elements.
    """
    pred_labels = np.asarray(y_pred).reshape(-1) > 0.5
    true_labels = np.asarray(y_true).reshape(-1)

    if pred_labels.shape != true_labels.shape:
        raise ValueError(
            f'y_pred has {pred_labels.size} elements but y_true has {true_labels.size}'
        )

    return (pred_labels == true_labels).mean()

def evaluate(model, dl_test, device):
    """Compute the accuracy of ``model`` over every batch of ``dl_test``.

    The model is put into evaluation mode for the pass and gradients are
    disabled. Afterwards the model is returned to whatever mode it was in on
    entry (train or eval), even if the pass raises, so calling this function
    does not change the caller's training state.

    Args:
        model: Module mapping a batch of features to predicted probabilities.
        dl_test: Iterable yielding ``(feature, label)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Accuracy over all samples, as computed by ``accuracy``.
    """
    was_training = model.training
    model.eval()  # switch to evaluation mode

    y_true_lst, y_pred_lst = [], []
    try:
        with torch.no_grad():
            for feature, label in dl_test:
                feature, label = feature.to(device), label.to(device)
                y_pred = model(feature)
                # Collect per-sample rows on the CPU for the final accuracy computation.
                y_pred_lst.extend(y_pred.detach().cpu().numpy())
                y_true_lst.extend(label.detach().cpu().numpy())
    finally:
        # Restore the mode the caller had, instead of always forcing train mode.
        model.train(was_training)

    return accuracy(y_pred_lst, y_true_lst)
    
def train(model, dl_train, optimizer, loss_func, device):
    """Run one pass over ``dl_train`` and return the accuracy seen during it.

    For every batch: forward pass, loss, backward pass, optimizer step, then
    the gradients are cleared. The module-level ``global_step_num`` is
    incremented once per batch.

    When ``cfg.mid_eval`` is true, every ``cfg.eval_step_num`` steps the model
    is scored on the module-level ``dl_valid``. If that score beats
    ``global_best_valid_acc``, the record is updated and the weights are saved
    as ``best_step_model.pth`` under ``cfg.model_output_dir``.

    Args:
        model: Module to train.
        dl_train: Iterable yielding ``(feature, label)`` batches.
        optimizer: Optimizer stepping ``model``'s parameters.
        loss_func: Callable ``loss_func(prediction, label)`` returning the loss.
        device: Device the batches are moved to.

    Returns:
        Accuracy over all samples of the pass, computed from the predictions
        made before each update.
    """
    global global_step_num, global_best_valid_acc
    model.train()  # make sure the model is in training mode

    epoch_preds, epoch_labels = [], []
    for feature, label in dl_train:
        global_step_num += 1
        feature = feature.to(device)
        label = label.to(device)

        y_pred = model(feature)
        train_loss = loss_func(y_pred, label)
        epoch_preds.extend(y_pred.detach().cpu().numpy())
        epoch_labels.extend(label.detach().cpu().numpy())

        train_loss.backward()
        optimizer.step()
        model.zero_grad()

        # Optional mid-epoch validation that checkpoints the best model so far.
        due_for_eval = cfg.mid_eval and (global_step_num % cfg.eval_step_num == 0)
        if not due_for_eval:
            continue

        valid_acc = evaluate(model, dl_valid, device)
        print(f'step_num: {global_step_num}, valid_acc: {valid_acc:.5f}')
        if valid_acc > global_best_valid_acc:
            global_best_valid_acc = valid_acc
            print(f'step_num: {global_step_num}, get new best val_acc: {valid_acc:.5f}, save the model now!')
            checkpoint_path = os.path.join(cfg.model_output_dir, 'best_step_model.pth')
            torch.save(model.state_dict(), checkpoint_path)

    return accuracy(epoch_preds, epoch_labels)

# Marker that the helper definitions in this cell ran without error.
print('ok')

ok


In [None]:
# Run-wide bookkeeping. train() also updates global_step_num and, when
# mid-epoch evaluation is on, global_best_valid_acc.
global_best_train_acc, global_best_valid_acc = 0.0, 0.0
# Training accuracy of the epoch that produced the best validation accuracy.
global_train_acc = 0.0
global_step_num = 0

epochs = 10
# NOTE(review): the learning rate is set here; cfg.lr is not read.
optimizer=torch.optim.Adam(model.parameters(), lr=0.003, weight_decay=1e-5)
# Binary cross-entropy on probabilities; the model's last layer is a sigmoid.
loss_func = nn.BCELoss()

for epoch in range(epochs):
    # One training pass, then score the current weights on the validation and test splits.
    train_acc = train(model, dl_train, optimizer, loss_func, device)
    valid_acc = evaluate(model, dl_valid, device)
    test_acc = evaluate(model, dl_test, device)
    print(f'in epoch: {epoch}, train_acc: {train_acc:.5f}, valid_acc: {valid_acc:.5f}, test_acc: {test_acc:.5f}')
    if train_acc > global_best_train_acc:
        global_best_train_acc = train_acc
    # Checkpoint only when validation accuracy improves; the file name is the
    # same one train() uses for its mid-epoch checkpoint.
    if valid_acc > global_best_valid_acc:
        global_best_valid_acc = valid_acc
        global_train_acc = train_acc
        print(f'at the end of epoch, global_step_num: {global_step_num} get new best_valid_acc: {valid_acc:.5f}, save the model now!')
        torch.save(model.state_dict(), os.path.join(cfg.model_output_dir, 'best_step_model.pth'))

in epoch: 0, train_acc: 0.73140, valid_acc: 0.83800, test_acc: 0.83540
at the end of epoch, global_step_num: 750 get new best_valid_acc: 0.83800, save the model now!
in epoch: 1, train_acc: 0.88280, valid_acc: 0.86400, test_acc: 0.86200
at the end of epoch, global_step_num: 1500 get new best_valid_acc: 0.86400, save the model now!
in epoch: 2, train_acc: 0.92233, valid_acc: 0.82740, test_acc: 0.82120
in epoch: 3, train_acc: 0.94947, valid_acc: 0.85720, test_acc: 0.84520
in epoch: 4, train_acc: 0.96353, valid_acc: 0.84740, test_acc: 0.84400
in epoch: 5, train_acc: 0.97073, valid_acc: 0.83400, test_acc: 0.83620
in epoch: 6, train_acc: 0.97233, valid_acc: 0.85100, test_acc: 0.84600
in epoch: 7, train_acc: 0.97693, valid_acc: 0.84720, test_acc: 0.84560
in epoch: 8, train_acc: 0.98527, valid_acc: 0.85340, test_acc: 0.84640
in epoch: 9, train_acc: 0.97900, valid_acc: 0.84600, test_acc: 0.84500


In [None]:
# Rebuild the architecture and restore the best checkpoint written during training.
model = CNN_Net()
model.to(device)

checkpoint_path = os.path.join(cfg.model_output_dir, 'best_step_model.pth')
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved from GPU tensors also loads on a CPU-only runtime.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
test_acc = evaluate(model, dl_test, device)
print(f'final test_acc: {test_acc:.5f}, best_val_acc: {global_best_valid_acc:.5f}, '
      f'train_acc: {global_train_acc:.5f}, best_train_acc: {global_best_train_acc:.5f}')

print('total finished, cost time: ', time.time() - global_start_t)

final test_acc: 0.86200, best_val_acc: 0.86400, train_acc: 0.88280, best_train_acc: 0.98527
total finished, cost time:  327.3876988887787
