<a href="https://colab.research.google.com/github/YoussefKhafaga/Movies-Reviews-Classification/blob/main/MovieReviewsIMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
! pip install nltk
! pip install transformers



In [32]:
import numpy as np
import torch

class EarlyStopping:
    """Signal when validation loss has stopped improving and checkpoint the best model.

    Call the instance once per epoch with the current validation loss and the
    model. Whenever the loss improves by more than ``delta`` the model's
    ``state_dict`` is written to ``path``; after ``patience`` consecutive
    calls without such an improvement, ``early_stop`` becomes ``True``.

    Args:
        patience: Number of consecutive non-improving calls tolerated before
            ``early_stop`` is set.
        verbose: If true, report every checkpoint save through ``trace_func``.
        delta: Minimum decrease in validation loss that counts as an improvement.
        path: File the best ``state_dict`` is saved to.
        trace_func: Callable used for progress messages (defaults to ``print``).
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf rather than the np.Inf alias: the alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss; checkpoint on improvement."""
        # Scores are negated losses so that "higher is better".
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss


In [None]:
# Fetch the NLTK resources requested below (WordNet data, stop words, POS tagger)
# before the corpus readers are used.
import nltk
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
import pandas as pd
import torch
import nltk
from sklearn import model_selection
from nltk.corpus import stopwords
# NOTE(review): `stop` holds the same list that is later bound to `stop_words`.
stop = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from bs4 import BeautifulSoup
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
import re
import string
# NOTE(review): `final` is not referenced in the visible code; confirm before removing.
from typing_extensions import final
from google.colab import drive
# Mount Google Drive so files under /content/drive can be read.
drive.mount('/content/drive')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Shared resources for data_preprocessing().
# Non-greedy match for anything that looks like an HTML/XML tag.
htmlRGX = re.compile(r'<.*?>')

# English stop-word list from NLTK and a WordNet lemmatizer instance.
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def data_preprocessing(text):
  """Normalize a raw review string.

  Steps, in order: replace everything matched by ``htmlRGX`` with a space,
  delete all ``string.punctuation`` characters, lower-case, drop tokens found
  in ``stop_words``, lemmatize each remaining token as a verb, and re-join
  with single spaces.

  Args:
    text: The raw review text.

  Returns:
    The cleaned review as a single space-separated string.
  """
  sentence = re.sub(htmlRGX, ' ', text)
  # One C-level pass instead of a per-character Python loop.
  sentence = sentence.translate(str.maketrans('', '', string.punctuation))
  sentence = sentence.lower()
  # split() with no argument splits on any whitespace run, so tabs/newlines act
  # as separators and the doubled spaces left by tag removal yield no empty tokens.
  stop_set = set(stop_words)  # O(1) membership instead of scanning the list per word
  tokens = [word for word in sentence.split() if word not in stop_set]
  tokens = [lemmatizer.lemmatize(word=word, pos='v') for word in tokens]
  return ' '.join(tokens)

def split(dataFrame, random_state=None):
    """Split a frame into stratified train / validation / test partitions.

    The last column is treated as the label and all preceding columns as
    features. 70% of the rows go to training; the remaining 30% is divided
    1/3 : 2/3, giving 10% validation and 20% test overall. Both splits are
    stratified on the label.

    Args:
        dataFrame: pandas DataFrame whose last column holds the label.
        random_state: Optional seed forwarded to both ``train_test_split``
            calls so the partition is reproducible. ``None`` (the default)
            keeps the previous behavior of a different split on every call.

    Returns:
        ``(X_train, X_validate, X_test, y_train, y_validate, y_test)``.
    """
    x, y = dataFrame.iloc[:, :-1], dataFrame.iloc[:, [-1]]  # split feature and label
    X_train, X_rem, y_train, y_rem = train_test_split(
        x, y, train_size=0.7, test_size=0.3, stratify=y, random_state=random_state)
    X_validate, X_test, y_validate, y_test = train_test_split(
        X_rem, y_rem, train_size=1/3, test_size=2/3, stratify=y_rem, random_state=random_state)
    return X_train, X_validate, X_test, y_train, y_validate, y_test

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized reviews with float sentiment labels.

    Every review in ``x['review']`` is encoded up front with the module-level
    ``tokenizer`` (padded/truncated to 512 tokens, returned as PyTorch
    tensors); ``y['sentiment']`` is stored as a ``FloatTensor``.
    """

    def __init__(self, x, y):
        sentiments = y['sentiment'].values
        self.labels = torch.FloatTensor(sentiments)

        encoded_reviews = []
        for review in x['review']:
            encoding = tokenizer(
                review,
                padding='max_length',
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            encoded_reviews.append(encoding)
        self.texts = encoded_reviews

    def classes(self):
        """Return the full label tensor."""
        return self.labels

    def __len__(self):
        """Number of samples (one label per sample)."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) stored at ``idx``."""
        return self.labels[idx]

    def get_batch_texts(self, idx):
        """Return the tokenizer encoding stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(encoding, label)`` for position ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [None]:

# Cased BERT tokenizer; Dataset reads this module-level name when encoding reviews.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Sentiment string -> integer class id.
labels = {'negative': 0, 'positive': 1}

class BertClassifier(nn.Module):
    """BERT encoder followed by a four-stage MLP head with a sigmoid output.

    Head layout: 768 -> 512 -> 256 -> 128 -> 64 -> 1. Each hidden stage is
    Linear -> BatchNorm1d -> ReLU -> Dropout; the final Linear is followed by
    a sigmoid, so the output is one value in (0, 1) per input row.

    Args:
        dropout: Dropout probability shared by every dropout application.
    """

    def __init__(self, dropout=0.1):
        super().__init__()

        # Pre-trained encoder.
        self.bert = BertModel.from_pretrained('bert-base-cased')

        # Stateless layers reused at every stage of the head.
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

        # Hidden dense layers, then the single-unit output layer.
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc = nn.Linear(64, 1)

        # One batch-norm per hidden dense layer.
        self.bn1 = nn.BatchNorm1d(512)
        self.bn2 = nn.BatchNorm1d(256)
        self.bn3 = nn.BatchNorm1d(128)
        self.bn4 = nn.BatchNorm1d(64)

        # Squashes the output logit into (0, 1).
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):
        """Return the sigmoid score for each row of ``input_id``.

        Args:
            input_id: Token ids passed to BERT as ``input_ids``.
            mask: Attention mask passed to BERT as ``attention_mask``.
        """
        # With return_dict=False the encoder returns a tuple; only the second
        # element is used as the feature vector for the head.
        _, pooled = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        hidden = self.dropout(pooled)

        hidden_stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
            (self.fc4, self.bn4),
        )
        for dense, norm in hidden_stages:
            hidden = dense(hidden)
            hidden = norm(hidden)
            hidden = self.relu(hidden)
            hidden = self.dropout(hidden)

        # Output layer followed by the sigmoid activation.
        logit = self.fc(hidden)
        return self.sigmoid(logit)


In [None]:
def evaluate(model, test_data):
    """Print the classification accuracy of ``model`` on a held-out set.

    Args:
        model: Classifier called as ``model(input_id, mask)`` that returns one
            score in [0, 1] per sample; scores above 0.5 count as class 1.
        test_data: Two-item sequence ``(features, labels)`` handed to
            ``Dataset`` (features need a ``'review'`` column, labels a
            ``'sentiment'`` column).
    """
    test = Dataset(test_data[0], test_data[1])
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=128)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    # Inference mode: disables dropout and makes BatchNorm use its running
    # statistics. The caller's mode is restored afterwards.
    was_training = model.training
    model.eval()

    total_acc_test = 0
    try:
        with torch.no_grad():
            for test_input, test_label in test_dataloader:
                test_label = test_label.to(device).unsqueeze(1)
                mask = test_input['attention_mask'].to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)
                output = model(input_id, mask)
                predicted = (output > 0.5).float()
                total_acc_test += (predicted == test_label).sum().item()
    finally:
        model.train(was_training)

    # Divide by the number of samples; len(test_data) is the length of the
    # (features, labels) pair, i.e. always 2.
    print(f'Test Accuracy: {total_acc_test / len(test): .4f}')

In [None]:
# Per-epoch metric histories; train() appends one entry to each list per epoch.
train_accuracy, val_accuarcy = [], []
train_loss, val_loss = [], []

In [None]:
def _run_epoch(model, dataloader, criterion, device, optimizer=None):
    """Run one pass over ``dataloader``; train if ``optimizer`` is given, else evaluate.

    Returns ``(sum of per-sample losses, number of correct predictions)``.
    """
    training = optimizer is not None
    # Dropout/BatchNorm must be in train mode for updates and eval mode otherwise.
    model.train(training)

    loss_sum = 0.0
    correct = 0
    batches = tqdm(dataloader) if training else dataloader
    with torch.set_grad_enabled(training):
        for inputs, targets in batches:
            targets = targets.to(device).unsqueeze(1)
            mask = inputs['attention_mask'].to(device)
            input_id = inputs['input_ids'].squeeze(1).to(device)

            if training:
                optimizer.zero_grad()

            output = model(input_id, mask)
            batch_loss = criterion(output, targets)

            if training:
                batch_loss.backward()
                optimizer.step()

            # criterion returns the batch mean; weight it by the batch size so
            # that dividing by the dataset size later gives a per-sample mean.
            loss_sum += batch_loss.item() * targets.size(0)
            correct += ((output > 0.5).float() == targets).sum().item()

    return loss_sum, correct


def train(model, path, train_dataloader, val_dataloader, learning_rate, epochs, patience=1):
    """Fine-tune ``model`` with BCE loss, Adam and early stopping.

    After every epoch the mean per-sample loss and the accuracy for both
    loaders are printed and appended to the module-level lists
    ``train_loss``, ``train_accuracy``, ``val_loss`` and ``val_accuarcy``.
    The best model (lowest validation loss) is checkpointed to ``path`` by
    ``EarlyStopping``.

    Args:
        model: Module called as ``model(input_id, mask)`` returning scores in [0, 1].
        path: Checkpoint file path handed to ``EarlyStopping``.
        train_dataloader: Loader yielding ``(encoding, label)`` batches for training.
        val_dataloader: Loader yielding ``(encoding, label)`` batches for validation.
        learning_rate: Adam learning rate.
        epochs: Maximum number of epochs.
        patience: Epochs without validation-loss improvement tolerated before
            stopping (default 1, the previously hard-coded value).
    """
    global train_accuracy, val_accuarcy, train_loss, val_loss

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.BCELoss()
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    optimizer = Adam(model.parameters(), lr=learning_rate)
    early_stopping = EarlyStopping(path=path, patience=patience, verbose=True)

    n_train = len(train_dataloader.dataset)
    n_val = len(val_dataloader.dataset)

    for epoch_num in range(epochs):
        total_loss_train, total_acc_train = _run_epoch(
            model, train_dataloader, criterion, device, optimizer)
        total_loss_val, total_acc_val = _run_epoch(
            model, val_dataloader, criterion, device)

        epoch_train_loss = total_loss_train / n_train
        epoch_train_acc = total_acc_train / n_train
        epoch_val_loss = total_loss_val / n_val
        epoch_val_acc = total_acc_val / n_val

        # The continuation lines keep their original leading whitespace because
        # it is part of the printed string.
        print(f'Epochs: {epoch_num + 1} | Train Loss: {epoch_train_loss: .4f} \
            | Train Accuracy: {epoch_train_acc: .4f} | \
            Val Loss: {epoch_val_loss: .4f} | \
            Val Accuracy: {epoch_val_acc: .4f}')

        train_accuracy.append(epoch_train_acc)
        val_accuarcy.append(epoch_val_acc)
        train_loss.append(epoch_train_loss)
        val_loss.append(epoch_val_loss)

        early_stopping(epoch_val_loss, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break
                

In [None]:
# Load the IMDB reviews CSV from Google Drive and build a cleaned copy.
from google.colab import drive
# NOTE(review): Drive is also mounted in the imports cell; this repeats that call.
drive.mount('/content/drive')
# NOTE(review): lemmatizer and tokenizer are re-created here with the same
# arguments used in earlier cells.
lemmatizer = WordNetLemmatizer()
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
df = pd.read_csv('/content/drive/MyDrive/IMDB_Dataset.csv')
# Encode the sentiment strings as integers. replace() applies to every column,
# so any cell whose entire value is 'positive' or 'negative' is rewritten.
df = df.replace({'positive':1,'negative':0})

# Copy of the frame with the 'review' column passed through data_preprocessing().
preprocess= df.copy()
preprocess['review'] = preprocess['review'].apply(data_preprocessing)

In [None]:
# Maximum number of epochs passed to train() for each learning rate in the sweep.
EPOCHS = 5

In [None]:
# NOTE(review): the split uses the raw `df`; the cleaned `preprocess` frame built
# earlier is not consumed here. Confirm which one is intended.
X_train, X_validate, X_test, y_train, y_validate, y_test = split(df)

# Tokenize the training and validation partitions up front.
train_df = Dataset(X_train, y_train)
val_df = Dataset(X_validate, y_validate)

# Only the training loader shuffles.
train_dataloader = torch.utils.data.DataLoader(train_df, batch_size=128, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_df, batch_size=128)

In [None]:
import gc
import os
import pickle

import matplotlib.pyplot as plt

# Learning rates to sweep; one fresh model is trained per value.
LR = [0.005,0.001,0.0005]

# Checkpoints, metric pickles and plots are all written below ./models, so make
# sure the directory exists before the first save.
os.makedirs('./models', exist_ok=True)

model = None
for lr in LR:
    # Drop the reference to the previous run's model BEFORE collecting, otherwise
    # its memory is still reachable and cannot be released.
    model = None
    gc.collect()
    torch.cuda.empty_cache()

    # Rebind the module-level histories so train() appends to fresh lists.
    train_accuracy = []
    val_accuarcy = []
    train_loss = []
    val_loss = []

    model = BertClassifier()
    PATH = './models/model'+str(lr)
    train(model, PATH,train_dataloader, val_dataloader, lr, EPOCHS)

    # Persist each metric history as a pickled NumPy array.
    histories = (
        ('train_accuracy', train_accuracy),
        ('val_accuarcy', val_accuarcy),
        ('train_loss', train_loss),
        ('val_loss', val_loss),
    )
    for metric_name, history in histories:
        with open(f'./models/{metric_name}_{lr}','wb') as f:
            pickle.dump(np.array(history), f)

    # Early stopping can end training before EPOCHS epochs, so size the x-axis
    # from the recorded history; a fixed range(EPOCHS) raises a shape mismatch.
    epochs = list(range(len(train_loss)))
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.suptitle(f'Model {lr} plot')
    ax1.plot(epochs, val_loss,'g',epochs,train_loss,'b')
    ax2.plot(epochs,val_accuarcy,'g',epochs,train_accuracy,'b')
    fig.savefig('./models/model'+str(lr)+"graph_.png")
    # Release the figure so repeated sweeps do not accumulate open figures.
    plt.close(fig)

In [None]:
m1 = BertClassifier()
# load_state_dict() fills the module in place and returns a report of
# missing/unexpected keys, so its result must not be assigned back to m1.
# map_location='cpu' lets a checkpoint saved on GPU load on a CPU-only runtime.
# NOTE(review): the sweep saves checkpoints as './models/model<lr>'; confirm that
# './models/model1' exists.
m1.load_state_dict(torch.load("./models/model1", map_location="cpu"))

In [None]:
# Report accuracy of the restored model on the held-out test partition.
evaluate(m1, [X_test,y_test])