In [None]:
import numpy as np 
import pandas as pd 

In [None]:
# Load the IMDB movie-review CSV from the Kaggle input directory into `df`
# and preview the first rows.
df = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head()

In [None]:
# Encode the string labels as integers: "positive" -> 1, "negative" -> 0.
# The dtype guard makes this cell safe to re-run: mapping an already-numeric
# column through the dict would replace every label with NaN.
if not pd.api.types.is_numeric_dtype(df["sentiment"]):
    df["sentiment"] = df["sentiment"].map({"positive":1,"negative":0})
df.head()

In [None]:
# Shuffle all rows once and renumber the index from 0.
# A fixed random_state keeps the resulting row order the same on every
# kernel restart instead of changing from run to run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
import torch

class IMDBDataset:
    """Indexable dataset pairing tokenised reviews with sentiment labels.

    :param reviews: 2-D array indexed as ``reviews[i, :]``; each row holds the
        token ids of one review
    :param sentiments: sequence of labels aligned row-for-row with ``reviews``
    """

    def __init__(self, reviews, sentiments):
        self.reviews = reviews
        self.sentiments = sentiments

    def __len__(self):
        # one sample per review row
        return len(self.reviews)

    def __getitem__(self, item):
        """Return sample ``item`` as a dict of two ``torch.long`` tensors."""
        token_ids = self.reviews[item, :]
        label = self.sentiments[item]

        sample = {}
        sample["review"] = torch.tensor(token_ids, dtype=torch.long)
        sample["target"] = torch.tensor(label, dtype=torch.long)
        return sample

In [None]:
import torch.nn as nn 
import torch.nn.functional as F

class LSTM(nn.Module):
    """Bidirectional LSTM classifier on top of frozen pretrained embeddings.

    The model embeds token ids, runs them through one bidirectional LSTM,
    mean- and max-pools the LSTM outputs over dimension 1 and feeds the
    concatenation to a single-unit linear layer. No sigmoid is applied; the
    forward pass returns the raw output of that linear layer.

    :param embedding_matrix: 2-D array of shape (num_words, embed_dim) used
        as the (non-trainable) weights of the embedding layer
    :param hidden_dim: hidden size of the LSTM per direction; defaults to
        128, which yields the original 512-wide pooled feature vector
    """

    def __init__(self,embedding_matrix,hidden_dim=128):
        super(LSTM,self).__init__()
        self.embedding_matrix = embedding_matrix
        self.hidden_dim = hidden_dim
        # number of words = number of rows of the embedding matrix
        num_words = self.embedding_matrix.shape[0]
        # embedding dimension = number of columns of the embedding matrix
        embed_dim = self.embedding_matrix.shape[1]
        # input embedding layer, sized to match the pretrained matrix
        self.embedding = nn.Embedding(num_embeddings=num_words,embedding_dim=embed_dim)
        # the pretrained matrix is used as the weights of the embedding layer;
        # requires_grad=False keeps the pretrained embeddings frozen
        self.embedding.weight = nn.Parameter(
            torch.tensor(embedding_matrix,dtype=torch.float32),
            requires_grad=False,
        )

        # a simple bidirectional lstm; batch_first=True means dimension 1 of
        # its output is the sequence axis
        self.lstm = nn.LSTM(embed_dim,hidden_dim,bidirectional=True,batch_first=True)
        # single-output layer. Its input width is derived from hidden_dim so
        # the two can never drift apart:
        #   2 directions * hidden_dim      -> width of each pooled vector
        #   mean pool + max pool (concat)  -> 2 * (2 * hidden_dim)
        self.out = nn.Linear(4 * hidden_dim,1)

    def forward(self,x):
        """Return one raw (pre-sigmoid) score per row of ``x``.

        :param x: integer tensor of token ids, one row per review
        :return: tensor with a single column holding the linear-layer output
        """
        # look up embeddings for the token ids
        x = self.embedding(x)

        # run the embedded sequence through the lstm; the final hidden and
        # cell states are not needed
        x,_ = self.lstm(x)
        # mean and max pooling over dimension 1 of the lstm output
        avg_pool = torch.mean(x,1)
        max_pool,_ = torch.max(x,1)
        # concatenate both pooled vectors: 2*hidden_dim each -> 4*hidden_dim
        out = torch.cat((avg_pool,max_pool),1)
        # pass through the output layer and return the output
        out = self.out(out)

        return out

In [None]:
import torch
import torch.nn as nn

def train(data_loader,model,optimizer,device):
    """
    Run one epoch of training over every batch yielded by the loader
    :param data_loader: iterable of batches, each a dict with "review" and
        "target" tensors
    :param model: model(lstm model) returning one raw score per sample
    :param optimizer: optimizer Adam, SGD etc
    :param device: this can be "cuda" or "cpu"
    """
    # switch the model to training mode
    model.train()

    # binary cross-entropy on raw scores (sigmoid is applied inside the loss)
    criterion = nn.BCEWithLogitsLoss()

    for batch in data_loader:
        # token ids stay integer, labels become float for the loss
        token_ids = batch["review"].to(device,dtype=torch.long)
        labels = batch["target"].to(device,dtype=torch.float)

        # reset gradients left over from the previous step
        optimizer.zero_grad()

        scores = model(token_ids)
        # labels are reshaped to a single column to match the model output
        batch_loss = criterion(scores,labels.view(-1,1))

        # backpropagate and update the weights
        batch_loss.backward()
        optimizer.step()
        
def evaluate(data_loader,model,device):
    """
    Run the model over every batch of the loader and collect its outputs
    :param data_loader: iterable of batches, each a dict with "review" and
        "target" tensors
    :param model: model(lstm model)
    :param device: this can be "cuda" or "cpu"
    :return: (final_predictions, final_targets) as python lists; the
        predictions are the raw model outputs, no activation or threshold
        is applied here
    """
    final_predictions = []
    final_targets = []

    # switch the model to evaluation mode
    model.eval()
    # no gradients are needed for inference; disabling autograd avoids
    # building a graph for every batch
    with torch.no_grad():
        for data in data_loader:
            # only the inputs have to live on the model's device; the targets
            # are merely copied into the result list
            reviews = data["review"].to(device,dtype=torch.long)
            predictions = model(reviews)
            predictions = predictions.detach().cpu().numpy().tolist()
            targets = data["target"].detach().cpu().numpy().tolist()
            final_predictions.extend(predictions)
            final_targets.extend(targets)

    return  final_predictions,final_targets


In [None]:
# Hyperparameters read by run().
MAX_LEN = 128  # maxlen passed to pad_sequences for every tokenised review
TRAIN_BATCH_SIZE = 16  # batch size of the training DataLoader
VALID_BATCH_SIZE = 8  # batch size of the validation DataLoader
EPOCHS = 10  # upper bound on epochs; the training loop may break earlier

In [None]:
import io
import torch

import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn import metrics
from sklearn.model_selection import train_test_split

In [None]:
def load_vectors(fname):
    """
    Read a whitespace-separated text embedding file into a dictionary
    Each non-blank line is split on whitespace: the first token is the word
    and the remaining tokens are parsed as floats
    :param fname: path of the embedding text file
    :return: a dictionary of word: numpy array of floats
    :raises ValueError: if a value after the word cannot be parsed as float
    """
    data = {}
    # the context manager closes the handle even if parsing fails; the
    # explicit encoding keeps non-ASCII words readable regardless of the
    # platform's default locale
    with open(fname, encoding="utf-8") as fin:
        for line in fin:
            tokens = line.split()
            # a blank line has no word token; skip it instead of failing
            if not tokens:
                continue
            data[tokens[0]] = np.array([float(value) for value in tokens[1:]])

    return data

In [None]:
def create_embedding_matrix(word_index,embedding_dict,embedding_dim=None):
    """
    This function creates the embedding matrix
    :param word_index: a dictionary of word: index_value
    :param embedding_dict: a dictionary of word: embedding vector
    :param embedding_dim: number of columns of the matrix; when None it is
        taken from the length of the first vector in embedding_dict, falling
        back to 300 if embedding_dict is empty
    :return a numpy array with embedding vectors for all known words; row i
        holds the vector of the word whose index_value is i, rows of words
        missing from embedding_dict stay all-zero
    """
    if embedding_dim is None:
        # infer the width from the vectors instead of assuming 300 columns
        first_vector = next(iter(embedding_dict.values()), None)
        embedding_dim = 300 if first_vector is None else len(first_vector)

    # initialize the embedding matrix; one extra row because row 0 is never
    # assigned when the smallest index_value is 1
    embedding_matrix = np.zeros((len(word_index)+1,embedding_dim))
    for word, i in word_index.items():
        if word in embedding_dict:
            embedding_matrix[i] = embedding_dict[word]
    return embedding_matrix
            

In [None]:
def run(df):
    """
    Train and validate the LSTM sentiment classifier end to end
    Splits df 80/20 (stratified on sentiment), tokenises and pads the
    reviews, builds the embedding matrix from the GloVe file, then trains
    for at most EPOCHS epochs and prints the validation accuracy after each
    :param df: dataframe with a text column "review" and a numeric
        "sentiment" column
    """
    y = df.sentiment.values
    train_df,valid_df = train_test_split(df,test_size = 0.2, stratify = y)

    print('Fitting tokenizer')
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    # NOTE(review): the vocabulary is built from the full frame, i.e. it
    # also includes the validation reviews; confirm this is intended
    tokenizer.fit_on_texts(df.review.values.tolist())

    # convert the texts to sequences of token ids padded to MAX_LEN
    xtrain = tokenizer.texts_to_sequences(train_df.review.values)
    xvalid = tokenizer.texts_to_sequences(valid_df.review.values)
    xtrain = tf.keras.preprocessing.sequence.pad_sequences(xtrain,maxlen = MAX_LEN)
    xvalid = tf.keras.preprocessing.sequence.pad_sequences(xvalid,maxlen = MAX_LEN)

    train_dataset = IMDBDataset(reviews=xtrain,sentiments=train_df.sentiment.values)
    # reshuffle the training samples every epoch so that the batches differ
    # from one epoch to the next
    train_loader = torch.utils.data.DataLoader(
        train_dataset,batch_size=TRAIN_BATCH_SIZE,shuffle=True,num_workers=2
    )
    valid_dataset = IMDBDataset(reviews=xvalid,sentiments=valid_df.sentiment.values)
    valid_loader = torch.utils.data.DataLoader(valid_dataset,batch_size=VALID_BATCH_SIZE,num_workers=2)

    print("Load embeddings")
    embedding_dict = load_vectors('../input/glove-embeddings/glove.6B.300d.txt')
    embedding_matrix = create_embedding_matrix(tokenizer.word_index,embedding_dict)
    # use the GPU when one is available, otherwise fall back to the CPU
    # instead of failing on machines without CUDA
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LSTM(embedding_matrix)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(),lr = 1e-3)
    print("Traning model")
    best_accuracy = 0
    early_stopping_counter = 0
    for epoch in range(1,EPOCHS+1):
        train(train_loader,model,optimizer,device)
        outputs,targets = evaluate(valid_loader,model,device)
        # the model is trained with BCEWithLogitsLoss and evaluate() returns
        # its raw outputs, so these are logits: logit > 0 is equivalent to
        # sigmoid(logit) > 0.5. ravel() flattens the (n, 1) column so that
        # it lines up with the 1-D targets
        outputs = np.array(outputs).ravel() > 0
        accuracy = metrics.accuracy_score(targets,outputs)
        print(f"EPOCH:{epoch}, Accuracy Score: {accuracy}")
        # the counter is cumulative: it is never reset, so training stops
        # after the third epoch overall that fails to beat the best accuracy
        if accuracy>best_accuracy:
            best_accuracy = accuracy
        else:
            early_stopping_counter +=1
        if early_stopping_counter>2:
            break
            
            
            
            
    
    

In [None]:
# Preview the frame that the next cell passes to run().
df.head()

In [None]:
# Execute the pipeline defined in run() on df; it prints the validation
# accuracy after every epoch.
run(df)