<a href="https://colab.research.google.com/github/radwaahmed20112000/IMDB_Sentiment_Prediction/blob/main/IMDB_Sentiment_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Read Dataframe**

In [None]:
import zipfile
import io
import pandas as pd
from collections import Counter

# Unpack the dataset archive into the current working directory. The context
# manager closes the archive handle even if extraction raises.
with zipfile.ZipFile('archive.zip') as zf:
    zf.extractall()

# Raw IMDB reviews; later cells read the 'review' and 'sentiment' columns.
df = pd.read_csv('IMDB Dataset.csv')

In [None]:
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from textblob import Word

# Make sure the NLTK resources are present locally before they are read.
nltk.download('stopwords')
nltk.download('wordnet')

# English stop-word list and Porter stemmer shared by preprocessing().
stop = stopwords.words('english')
porter = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## **Data Preprocessing**

In [None]:
def preprocessing():
  """Normalise the global ``df['review']`` column in place.

  Steps, applied in this order:
    1. lowercase the text,
    2. delete every character that is neither a word character nor whitespace,
    3. drop words found in the module-level ``stop`` list,
    4. replace each remaining word with its Porter stem (``porter.stem``).

  Lowercasing runs first so that capitalised stop words ("The", "And") are
  matched against the stop list, which NLTK ships in lowercase.
  """
  global df
  # Set membership is O(1); the stop list itself is left untouched.
  stop_words = set(stop)

  reviews = df['review'].str.lower()
  # regex=True is explicit: newer pandas treats the pattern as a literal
  # string by default, which would leave all punctuation in place.
  reviews = reviews.str.replace(r'[^\w\s]', '', regex=True)
  # remove stop words + stem the remaining words
  df['review'] = reviews.apply(
      lambda text: ' '.join(porter.stem(word) for word in text.split()
                            if word not in stop_words))

## **Data Split**

In [None]:
# Clean the review text in place before splitting.
preprocessing()
# Seeds NumPy's global RNG, which the unseeded .sample(frac=1) calls below use.
np.random.seed(112)


def _shuffle_and_split(frame):
  """Shuffle ``frame`` with a fixed seed and cut it into train/val/test parts.

  The cut points sit at 70% and 90% of the rows, i.e. a 70/20/10 split.
  Positional ``.iloc`` slicing is used instead of ``np.split`` so the result
  does not depend on NumPy's handling of DataFrame objects.
  """
  shuffled = frame.sample(frac=1, random_state=42)
  train_end = int(.7*len(frame))
  val_end = int(.9*len(frame))
  return (shuffled.iloc[:train_end],
          shuffled.iloc[train_end:val_end],
          shuffled.iloc[val_end:])


# Split each class separately so every subset keeps the same class balance.
df1 = df[df['sentiment'] == 'positive']
df2 = df[df['sentiment'] == 'negative']

df1_train, df1_val, df1_test = _shuffle_and_split(df1)
df2_train, df2_val, df2_test = _shuffle_and_split(df2)

# Recombine the two classes and shuffle so they are interleaved.
df_train = pd.concat([df1_train, df2_train]).sample(frac = 1)
df_val = pd.concat([df1_val, df2_val]).sample(frac = 1)
df_test = pd.concat([df1_test, df2_test]).sample(frac = 1)

print(len(df_train),len(df_val), len(df_test))
print(len(df1_train),len(df1_val), len(df1_test))
print(len(df2_train),len(df2_val), len(df2_test))

  after removing the cwd from sys.path.


35000 10000 5000
17500 5000 2500
17500 5000 2500


## **Dataframe Conversion**

In [None]:
!pip install transformers
import torch
from transformers import BertTokenizer

# Tokenizer matching the 'bert-base-cased' checkpoint; used by Dataset below.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Sentiment string -> integer class id.
labels = {'positive': 0, 'negative': 1}

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenised reviews with integer sentiment ids.

    All reviews are tokenised once, at construction time, with the
    module-level ``tokenizer``; sentiments are mapped to ids through the
    module-level ``labels`` dict.
    """

    def __init__(self, df):
        """Build the label and encoding lists from ``df['sentiment']`` / ``df['review']``."""
        label_ids = []
        for sentiment in df['sentiment']:
            label_ids.append(labels[sentiment])
        self.labels = label_ids

        # Every review is padded/truncated to exactly 512 tokens and returned
        # as PyTorch tensors.
        encodings = []
        for review in df['review']:
            encodings.append(
                tokenizer(review,
                          padding='max_length', max_length = 512, truncation=True,
                          return_tensors="pt"))
        self.texts = encodings

    def classes(self):
        """Return the full list of integer class ids."""
        return self.labels

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) at ``idx`` wrapped in a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output(s) stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(encoding, label)`` for position ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [None]:
!pip install optuna
import optuna
# direction="maximize": the study looks for the largest objective value
# (accuracy, F1 score, etc.).
# TPESampler: Bayesian hyperparameter optimization method.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(),
)

# Module-level handle; objective() rebinds it to the model it builds.
model = None

## **Model Build**

In [None]:
from torch import nn
from transformers import BertModel
class BertClassifier(nn.Module):
    """BERT encoder followed by an MLP head that emits two class logits.

    The head maps the 768-wide pooled BERT output through
    768 -> 512 -> 256 -> 128 -> 64 -> 2 linear layers with ReLU activations,
    applying dropout to the pooled output and after the first three hidden
    layers.

    Args:
        dropout: dropout probability shared by every dropout application.
    """

    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)

        self.layer_1 = nn.Linear(768, 512)
        self.layer_2 = nn.Linear(512, 256)
        self.layer_3 = nn.Linear(256, 128)
        self.layer_4 = nn.Linear(128, 64)
        self.layer_out = nn.Linear(64, 2)
        # Kept as an attribute for backward compatibility only; forward() does
        # not apply it (see the note there).
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return raw (unnormalised) class logits of shape ``(batch, 2)``.

        Args:
            input_id: token ids passed to BERT as ``input_ids``.
            mask: attention mask passed to BERT as ``attention_mask``.
        """
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)

        hidden = self.dropout(pooled_output)
        hidden = self.dropout(self.relu(self.layer_1(hidden)))
        hidden = self.dropout(self.relu(self.layer_2(hidden)))
        hidden = self.dropout(self.relu(self.layer_3(hidden)))
        hidden = self.relu(self.layer_4(hidden))

        # No sigmoid here: nn.CrossEntropyLoss applies log-softmax itself and
        # expects raw logits. Squashing them into (0, 1) first caps how
        # confident the model can become and flattens the gradients. argmax
        # predictions are unaffected because sigmoid is monotonic.
        return self.layer_out(hidden)
        

## **Model Train**

In [None]:
from torch.optim import Adam
from tqdm import tqdm
import matplotlib.pyplot as plt 

def train(model, train_data, val_data, params):
    """Fine-tune ``model`` on ``train_data`` and validate on ``val_data`` every epoch.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns one score per class for each sample.
        train_data: DataFrame with 'review' and 'sentiment' columns.
        val_data: DataFrame with 'review' and 'sentiment' columns.
        params: dict with 'learning_rate' and 'epochs_num'; an optional
            'batch_size' entry overrides the default of 5.

    Returns:
        Validation accuracy of the last epoch (0 if no epoch was run).

    Side effects:
        Prints per-epoch metrics, shows two accuracy-per-epoch scatter plots,
        moves ``model`` to the GPU when one is available and leaves it in
        evaluation mode.
    """
    batch_size = params.get('batch_size', 5)
    train_set, val_set = Dataset(train_data), Dataset(val_data)
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=params['learning_rate'])
    print("Learning rate = " + str(params['learning_rate']))

    acc_train_plot = []
    acc_val_plot = []
    accuracy = 0
    for epoch_num in range(params['epochs_num']):
        print("Epoch" + str(epoch_num))

        # Training mode enables dropout.
        model.train()
        total_loss_train = 0
        total_acc_train = 0
        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            # Each sample was tokenised on its own with return_tensors="pt",
            # so the collated tensors carry a singleton dim 1; drop it from
            # both the mask and the ids so BERT receives (batch, seq) inputs.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # Evaluation mode disables dropout so validation metrics are deterministic.
        model.eval()
        total_acc_val = 0
        total_loss_val = 0
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

        train_accuracy = total_acc_train / len(train_data)
        accuracy = total_acc_val / len(val_data)
        # The criterion already averages within a batch, so the epoch loss is
        # the mean over batches, not over samples.
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataloader): .3f} \
                | Train Accuracy: {train_accuracy: .3f} \
                | Val Loss: {total_loss_val / len(val_dataloader): .3f} \
                | Val Accuracy: {accuracy: .3f}')
        acc_train_plot.append(train_accuracy)
        acc_val_plot.append(accuracy)

    # One x position per completed epoch, whatever 'epochs_num' was.
    epoch_plot = list(range(1, len(acc_val_plot) + 1))

    plt.figure(1)
    plt.scatter(x=epoch_plot, y=acc_train_plot)
    plt.title('Training accuracy per epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')

    plt.figure(2)
    plt.scatter(x=epoch_plot, y=acc_val_plot)
    plt.title('Validation accuracy per epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')

    plt.show()
    return accuracy
              


In [None]:
def objective(trial):
    """Optuna objective: build a fresh BertClassifier, train it, return its validation accuracy.

    ``trial`` is currently unused: the hyper-parameters below are fixed values
    rather than values sampled from the trial. The freshly built model is
    stored in the module-level ``model`` so later cells can reach it.
    """
    global model
    #parameters we want to tune:
    params = dict(learning_rate=1e-5, epochs_num=2)

    model = BertClassifier()
    print("Model is built")
    return train(model, df_train, df_val, params)

In [None]:
# Run a single Optuna trial; objective() trains a model and returns its validation accuracy.
study.optimize(objective, n_trials=1)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model is built
Learning rate = 1e-05
Epoch0


 53%|█████▎    | 3708/7000 [58:46<52:09,  1.05it/s]

## **Model Evaluation**

In [None]:
MODEL_PATH = 'model.pth'
# Restore the whole pickled model object, mapping its tensors to the GPU when
# one is available and to the CPU otherwise (a hard-coded 'cuda' target fails
# on CPU-only machines).
# NOTE(review): torch.load unpickles arbitrary objects, so only load files you
# trust. Recent PyTorch releases may also need weights_only=False to load a
# full model object; confirm against the installed version.
model = torch.load(
    MODEL_PATH,
    map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

In [None]:
def evaluating_model(predicted , y_test):
    """Print accuracy, weighted F1, precision, recall and the confusion matrix.

    Args:
        predicted: predicted class ids, one per sample.
        y_test: ground-truth class ids, same length and order as ``predicted``.

    Precision and recall are the binary scores for scikit-learn's default
    positive label (class id 1). In the confusion matrix, rows are true
    classes and columns are predicted classes.
    """
    from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                                 precision_score, recall_score)

    # scikit-learn metrics take (y_true, y_pred), in that order. Passing them
    # the other way round swaps precision with recall, transposes the
    # confusion matrix and weights the F1 by predicted instead of true support.
    accuracy = accuracy_score(y_test, predicted)
    f1 = f1_score(y_test, predicted, average='weighted')
    print ("Accuracy: " ,accuracy)
    print("F1_score: ",f1)

    # calculating precision and recall
    precision = precision_score(y_test, predicted)
    recall = recall_score(y_test, predicted)
    print('Precision: ',precision)
    print('Recall: ',recall)

    confusion = confusion_matrix(y_test, predicted)
    print("Confusion Matrix:")
    print(confusion)

In [None]:
def evaluate(model, test_data):
    """Run ``model`` over ``test_data`` and print test metrics.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns one score per class for each sample.
        test_data: DataFrame with 'review' and 'sentiment' columns.

    Side effects:
        Prints the test accuracy, then hands predictions and labels to
        ``evaluating_model`` for the remaining metrics. Moves ``model`` to the
        GPU when one is available and leaves it in evaluation mode.
    """
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=10)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    # Evaluation mode disables dropout; without it the predictions are
    # randomly perturbed and the reported metrics vary from run to run.
    model.eval()

    total_acc_test = 0
    output_list = []
    label_list = []
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            # Drop the singleton dim 1 left by per-sample tokenisation so
            # BERT receives (batch, seq) tensors for both inputs.
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            predictions = model(input_id, mask).argmax(dim=1)
            output_list.extend(predictions.tolist())
            label_list.extend(test_label.tolist())
            total_acc_test += (predictions == test_label).sum().item()

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    #computing requirements
    evaluating_model(output_list, label_list)
    
# Score the current global `model` on the held-out test split.
evaluate(model, df_test)

In [None]:
MODEL_PATH = 'model.pth'
# Pickle the entire model object (not just its state_dict) to MODEL_PATH.
torch.save(model, MODEL_PATH)

In [None]:
# MODEL_PATH = 'model.pth'
# model = torch.load(MODEL_PATH, map_location=torch.device('cuda'))