# Preliminary: imports and loading the data

In [8]:
import os

import pandas as pd
import torch

# Every path used below is resolved against the directory the kernel runs in.
HOME = os.getcwd()
DATA_PATH = os.path.join(HOME, 'IMDB Dataset.csv')

In [9]:
# Load the whole CSV at DATA_PATH into a single in-memory DataFrame.
all_data = pd.read_csv(DATA_PATH)
# Preview the first rows to confirm the file was parsed as expected.
all_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [10]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts, targets are the sentiment labels.
X = all_data['review']
y = all_data['sentiment']

# Stratifying on the labels keeps the class distribution the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=11,
    stratify=y,
)

In [16]:
# Split the complete frame (all columns), stratified on the sentiment column,
# with a fixed seed and a 15% test fraction.
train_df, test_df = train_test_split(
    all_data,
    test_size=0.15,
    random_state=11,
    stratify=all_data['sentiment'],
)

In [23]:
# The frame-based split and the Series-based split must contain the same reviews.
for frame, reviews in ((train_df, X_train), (test_df, X_test)):
    assert (frame['review'] == reviews).all()

# Write each split to a CSV file inside HOME.
for frame, file_name in ((train_df, 'train.csv'), (test_df, 'test.csv')):
    frame.to_csv(os.path.join(HOME, file_name))

In [11]:
# time for the heavy machinery: PyTorch
# Use the GPU whenever CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
DEVICE

'cuda'

In [12]:
# Show the raw review stored under index label 0 (a label lookup, not a positional one).
print(X_train.loc[0])

GOOD_PUNCT = list("',.?!:\"-")
from string import punctuation
import re
# A run of unwanted symbols, then word/space characters, then another unwanted symbol
# (this is what strips most of an HTML tag such as "<br /").
unwanted_regex_comb = r'[#$%&*+\/;<=>@[\\\]\^_`{\|}~]+?[\w\s]+[#$%&()*+\/;<=>@[\\\]^_`{\|}~]+?'
# Any remaining run of unwanted symbols.
unwanted_punc_regex = r'[#$%&*+\/;<=>@[\\\]^_`{\|}~]+'

def process_sentence(sentence: str) -> str:
    """Strip unwanted symbols from a review and normalize its whitespace.

    Args:
        sentence: the raw text to clean.

    Returns:
        The text with every match of ``unwanted_regex_comb`` and then of
        ``unwanted_punc_regex`` removed, and every run of whitespace replaced
        by a single space. Leading and trailing whitespace is collapsed to one
        space, not stripped.
    """
    sentence = re.sub(unwanted_regex_comb, '', sentence)
    sentence = re.sub(unwanted_punc_regex, '', sentence)
    # Raw string: '\s' in a plain literal is an invalid escape sequence and triggers a warning.
    return re.sub(r'\s+', " ", sentence)


One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

In [20]:
import nltk
import numpy as np
import spacy
from nltk.corpus import stopwords

# spaCy pipeline loaded once and shared through the module-level NLP name.
NLP = spacy.load("en_core_web_md")

# Fetch the NLTK stop-word corpus, then keep the English list as a set.
nltk.download("stopwords")
english_sw = set(stopwords.words("english"))


def reduce_sentence(sentence: str, nlp_object=None):
    """Embed a sentence token by token, skipping English stop words.

    Args:
        sentence: the text to embed.
        nlp_object: callable turning the text into an iterable of tokens that
            expose ``.text`` and ``.vector``; the module-level ``NLP`` pipeline
            is used when this is None.

    Returns:
        A numpy array stacking the vectors of the kept tokens. When no token is
        kept the array is empty with shape ``(0,)``.
    """
    pipeline = NLP if nlp_object is None else nlp_object

    kept_vectors = []
    for token in pipeline(sentence):
        # Compare on a normalized form: trimmed, lower-cased, non-word characters removed.
        normalized = re.sub(r'\W+', '', token.text.strip().lower())
        if normalized in english_sw:
            continue
        kept_vectors.append(token.vector)

    return np.array(kept_vectors)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bouab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Push the review stored under index label 0 through cleaning, then embedding.
sentence = process_sentence(X_train[0])
embeddings = reduce_sentence(sentence)
# Shape of the first token's vector.
embeddings[0].shape

(300,)

In [32]:
from torch.utils.data import Dataset, DataLoader
from typing import Union
from pathlib import Path
from math import ceil

class ReviewDataset(Dataset):
    """Map-style dataset serving one fixed-size embedding matrix per review.

    Each item is read from the ``review`` column of a CSV file, cleaned with
    ``process_sentence``, embedded token by token with ``reduce_sentence`` and
    then truncated or zero-padded to exactly ``sequence_length`` rows.

    Args:
        file_path: CSV file with a ``review`` column and, when ``train`` is
            True, a ``sentiment`` column.
        labels_dict: mapping from a raw ``sentiment`` value to its numeric
            encoding.
        train: when True, ``__getitem__`` returns ``(embedding, label)``;
            otherwise it returns only the embedding.
        embedding_length: expected length of a single token vector.
        sequence_length: number of token rows in every returned tensor.
    """

    def __init__(self, file_path: Union[Path, str], labels_dict:dict, train:bool=True, embedding_length: int=300, sequence_length:int=300):
        # need to call the super class constructor
        super().__init__()
        self.file_path = file_path
        # read the data
        self.data = pd.read_csv(file_path)

        # a boolean flag to determine whether labels will be returned or not
        self.train = train

        # the length of the embedding of a single token
        self.embedding_length = embedding_length

        # the length of the input sequence to the model
        self.sequence_length = sequence_length
        # maps a not necessarily numeric label to the numerical encoding of the target variable
        self.labels_dict = labels_dict

    def __len__(self):
        """Return the number of rows read from the CSV file."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the padded / truncated embedding of the ``index``-th review.

        Returns:
            A float32 tensor of shape ``(sequence_length, embedding_length)``,
            together with the encoded label when ``self.train`` is True.

        Raises:
            AssertionError: if the review is not a string or the final tensor
                does not have the expected shape.
            KeyError: if the label is missing from ``labels_dict``.
        """
        # get the index-th row from the data
        row = self.data.iloc[index, :]
        review_raw = row['review']

        if self.train:
            label = row['sentiment']

        assert isinstance(review_raw, str)
        # clean the text, then embed it; float32 matches the default dtype of torch modules
        embedded_review = np.asarray(reduce_sentence(process_sentence(review_raw)), dtype=np.float32)

        # a review without any surviving token comes back as an empty 1-D array:
        # give it the expected 2-D layout so that it can be padded like any other
        if embedded_review.size == 0:
            embedded_review = embedded_review.reshape(0, self.embedding_length)

        # enforce the length constraint: drop the tokens past sequence_length ...
        embedded_review = embedded_review[:self.sequence_length]

        # ... and zero-pad the shorter reviews. The padding is only built when it is
        # needed, with the dtype and width of the embeddings, so that the concatenation
        # neither fails on mismatching dimensions nor promotes the result to float64.
        missing_rows = self.sequence_length - len(embedded_review)
        if missing_rows > 0:
            padding = np.zeros((missing_rows, embedded_review.shape[1]), dtype=embedded_review.dtype)
            embedded_review = np.concatenate((embedded_review, padding), axis=0)

        # convert the embedding to a tensor
        embedding = torch.from_numpy(embedded_review)

        # make sure the type and shape are as expected
        assert torch.is_tensor(embedding) and embedding.size()[0] == self.sequence_length and embedding.size()[1] == self.embedding_length

        if self.train:
            return embedding, self.labels_dict[label]

        return embedding

# Building LSTM

In [33]:
TRAIN_FILE = os.path.join(HOME, 'train.csv')
TEST_FILE = os.path.join(HOME, 'test.csv')

# The train dataset is built with train=True and the test dataset with train=False;
# both use 400-token sequences and the same label encoding.
train_dataset = ReviewDataset(
    file_path=TRAIN_FILE,
    labels_dict={"positive": 1, "negative": 0},
    train=True,
    sequence_length=400,
)
test_dataset = ReviewDataset(
    file_path=TEST_FILE,
    labels_dict={"positive": 1, "negative": 0},
    train=False,
    sequence_length=400,
)

# Fetch the training sample at index 1.
x, y = train_dataset[1]

In [31]:
# time to iterate through the dataset
batch_size = 50

# make sure to SHUFFLE your data
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

In [None]:
from torch import nn

class ReviewLSTM(nn.Module):
    """LSTM classifier: stacked LSTM -> last time step -> dropout -> linear head.

    Args:
        input_dim: length of a single token embedding (the LSTM input size).
        hidden_state_dim: dimension of the LSTM hidden state.
        num_layers: number of LSTM layers stacked on one another.
        dropout_prob: dropout probability applied before the linear head;
            negative values are clamped to 0.
        num_classes: number of target classes. The head has a single output
            unit when ``num_classes`` is 2 or less, ``num_classes`` units
            otherwise.
    """

    def __init__(self, input_dim: int, hidden_state_dim: int, num_layers: int, dropout_prob: float=0.2, num_classes:int=2):
        # `super()` must be called: `super.__init__()` raises a TypeError
        super().__init__()
        self.input_dim = input_dim # the length of the embedding (pretty much determined by the embedding method)
        self.hidden_state_dim = hidden_state_dim # the dimension of the hidden state inside of the LSTM
        self.num_layers = num_layers # the number of layers stacked on one another in the LSTM

        # batch_first=True: the input is (batch, sequence length, input_dim)
        # and the output is (batch, sequence length, hidden_state_dim)
        self.lstm = nn.LSTM(input_size=self.input_dim, hidden_size=self.hidden_state_dim, num_layers=self.num_layers, batch_first=True)
        # determine the number of output units
        self.num_outputs = (num_classes if num_classes > 2 else 1)
        self.dropout = nn.Dropout(p=max(dropout_prob, 0)) # in case the input is negative
        self.output = nn.Linear(in_features=hidden_state_dim, out_features=self.num_outputs)

    def forward(self, x: torch.Tensor):
        """Return the head's output for a batch of embedded sequences.

        Args:
            x: tensor of shape ``(batch, sequence length, input_dim)``.

        Returns:
            Tensor of shape ``(batch, num_outputs)``.
        """
        # `lstm_out` holds the hidden state of the last layer at every step of the sequence
        lstm_out, _ = self.lstm(x)

        # the sequence length (dimension 1) is free; only the rank and the feature size are fixed
        assert lstm_out.dim() == 3 and lstm_out.shape[2] == self.hidden_state_dim, \
        "MAKE SURE THE ASSUMPTIONS ABOUT THE LSTM OUTPUT ARE CORRECT !!"

        # for simplicity we will solely use the last step
        lstm_out = lstm_out[:, -1, :] # the sequence length is the 2nd dimension

        return self.output(self.dropout(lstm_out))

            