In [57]:
import os  # when loading file paths
import pandas as pd  # for lookup in annotation file
import spacy  # for tokenizer
import torch
from torch.nn.utils.rnn import pad_sequence  # pad batch
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

# We want to convert text -> numerical values
# 1. We need a Vocabulary mapping each word to a index
# 2. We need to setup a Pytorch dataset to load the data
# 3. Setup padding of every batch (all examples should be
#    of same seq_len and setup dataloader)
# Note that loading the image is very easy compared to the text!

# Download with: python -m spacy download en_core_web_sm
spacy_eng = spacy.load("en_core_web_sm")


class Vocabulary:
    def __init__(self, freq_threshold):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 4

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1

                else:
                    frequencies[word] += 1

                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)
        #print(tokenized_text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]


class NewDelhiDataset(Dataset):
    def __init__(self, file, freq_threshold=5):
        self.df = pd.read_csv(file)

        self.reviews = self.df["review_full"]
        self.ratings = self.df["rating_review"]
        
        # Initialize vocabulary and build vocab
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.reviews.tolist())

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        rating = self.ratings[index]
        review = self.reviews[index]
        
        numericalized_rating = [0,0,0,0,0]
        numericalized_rating[rating-1] = 1
        
        numericalized_review = [self.vocab.stoi["<SOS>"]]
        numericalized_review += self.vocab.numericalize(review)
        numericalized_review.append(self.vocab.stoi["<EOS>"])

        return numericalized_rating, torch.tensor(numericalized_review)


class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        ratings = [item[0] for item in batch]
        # ratings_encoded = [0,0,0,0,0]
        # ratings_encoded[ratings-1] = 1
        
        targets = [item[1] for item in batch]
        targets = pad_sequence(targets, batch_first=True, padding_value=self.pad_idx)

        return ratings, targets


def get_loader(
    annotation_file,
    batch_size=32,
    num_workers=8,
    shuffle=False,
    pin_memory=True,
):
    dataset = NewDelhiDataset(annotation_file)

    pad_idx = dataset.vocab.stoi["<PAD>"]

    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=MyCollate(pad_idx=pad_idx),
    )

    return loader, dataset

loader, dataset = get_loader("../data/New_Delhi_reviews_rnn.csv")    

from torch.nn.utils.rnn import pack_padded_sequence


In [58]:
for i, (ratings, reviews) in enumerate(loader):
    print(ratings)
    print(reviews.shape)
    break



[[0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]]
torch.Size([32, 511])


In [53]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_classes = 5
num_epochs = 2
batch_size = 32
learning_rate = 0.001

input_size = 1
hidden_size = 1
num_layers = 2

In [54]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        # -> x needs to be: (batch_size, seq, input_size)
        
        # or:
        #self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        #self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        # Set initial hidden states (and cell states for LSTM)
        h0 = torch.zeros(num_layers, x.size(0), hidden_size).to(device) 
        #c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) 
        
        # x: (32, L, 1), h0: (32, L, 1)
        
        # Forward propagate RNN
        out, _ = self.rnn(x, h0)  
        # or:
        #out, _ = self.lstm(x, (h0,c0))  
        
        # out: tensor of shape (batch_size, seq_length, hidden_size)
        # out: (n, 28, 128)
        
        # Decode the hidden state of the last time step
        out = out[:, :, :]
        # out: (n, 1)
         
        out = self.fc(out)
        # out: (n, 10)
        return out

model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)


In [65]:
# Loss and optimizer
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

# Train the model
n_total_steps = len(loader)
for epoch in range(num_epochs):
    for i, (ratings, reviews) in enumerate(loader):  
        # ratings_tensor = torch.from_numpy(ratings)
        # ratings = ratings.to(device)
        ratings_tensor = torch.tensor(ratings)
        ratings_tensor = ratings_tensor.to(device)
        N = reviews.shape[0]
        L = reviews.shape[1]
        reviews_tensor = torch.reshape(reviews, (N, L, input_size))
        reviews_tensor = reviews_tensor.float()
        reviews_tensor = reviews_tensor.to(device)        
        
        # print(reviews_tensor.dtype)
        # Forward pass
        outputs = model(reviews_tensor).to(device)
        loss = criterion(outputs, ratings_tensor)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

Epoch [1/5], Step [100/4612], Loss: -5.8727
Epoch [1/5], Step [200/4612], Loss: -6.0727
Epoch [1/5], Step [300/4612], Loss: -6.2727
Epoch [1/5], Step [400/4612], Loss: -6.4727
Epoch [1/5], Step [500/4612], Loss: -6.6727
Epoch [1/5], Step [600/4612], Loss: -6.8727
Epoch [1/5], Step [700/4612], Loss: -7.0727
Epoch [1/5], Step [800/4612], Loss: -7.2726
Epoch [1/5], Step [900/4612], Loss: -7.4726
Epoch [1/5], Step [1000/4612], Loss: -7.6726
Epoch [1/5], Step [1100/4612], Loss: -7.8726
Epoch [1/5], Step [1200/4612], Loss: -8.0726
Epoch [1/5], Step [1300/4612], Loss: -8.2726
Epoch [1/5], Step [1400/4612], Loss: -8.4726
Epoch [1/5], Step [1500/4612], Loss: -8.6726
Epoch [1/5], Step [1600/4612], Loss: -8.8725
Epoch [1/5], Step [1700/4612], Loss: -9.0725
Epoch [1/5], Step [1800/4612], Loss: -9.2725
Epoch [1/5], Step [1900/4612], Loss: -9.4725
Epoch [1/5], Step [2000/4612], Loss: -9.6725
Epoch [1/5], Step [2100/4612], Loss: -9.8725
Epoch [1/5], Step [2200/4612], Loss: -10.0725
Epoch [1/5], Step 