In [1]:
import os
import pickle
import re
import string
import tarfile
import time
import urllib
from collections import Counter

import en_core_web_sm
import numpy as np
import pandas as pd
import requests
import torch
import torch.nn as nn
import torch.nn.functional as functional
from google.colab import drive
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Dataset, DataLoader

In [2]:
DATASET_URL = 'http://hidra.lbd.dcc.ufmg.br/datasets/yelp_2015/original/yelp_review_full_csv.tar.gz'
DATASET_ARCHIVE_NAME = 'dataset.tar.gz'
DATASET_FOLDER_NAME = 'dataset'

ENGLISH = en_core_web_sm.load()

In [3]:
# Save/load data to/from a file to speed up reruns of this notebook
def save(data, filename: str = 'data.pkl'):
    """Pickle ``data`` into ``filename``, replacing the file if it exists.

    Args:
        data: Any picklable object.
        filename: Path of the file to write (opened in binary write mode).
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(data, sink)


def load(filename: str = 'data.pkl'):
    """Return the object unpickled from ``filename``.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so only files
    produced by this notebook (or another trusted source) should be loaded.

    Args:
        filename: Path of the pickle file to read (opened in binary read mode).
    """
    with open(filename, mode='rb') as source:
        restored = pickle.load(source)
    return restored


drive.mount('/content/gdrive')


def gload(filename: str = 'data.pkl'):
    """Return the object unpickled from a file stored in the mounted Google Drive.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so only trusted
    files should be loaded.

    Args:
        filename: Path relative to ``/content/gdrive/My Drive/``.
    """
    with open(f'/content/gdrive/My Drive/{filename}', mode='rb') as source:
        restored = pickle.load(source)
    return restored

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
%%time
# Download dataset
def download_file(url: str, filename: str, chunk_size: int = 2 ** 15, timeout: float = 60.0):
    """Stream the resource at ``url`` into the local file ``filename``.

    Args:
        url: Address of the resource to download.
        filename: Path of the file to write (overwritten if it exists).
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds to wait for the server to connect or to send data
            before giving up; without it a stalled server blocks the cell forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    with requests.get(url, stream=True, timeout=timeout) as request:
        request.raise_for_status()
        with open(filename, 'wb') as file:
            for chunk in request.iter_content(chunk_size=chunk_size):
                if chunk:  # skip empty keep-alive chunks
                    file.write(chunk)


def extract_archive(archive_name: str, folder_name: str):
    """Extract the tar archive ``archive_name`` into the directory ``folder_name``.

    The archive comes from the network, so every member path is validated
    before anything is written: a member that would land outside
    ``folder_name`` (e.g. ``../x`` or an absolute path) aborts the extraction.

    Args:
        archive_name: Path of the tar archive (compression is auto-detected).
        folder_name: Destination directory.

    Raises:
        ValueError: If a member would be extracted outside ``folder_name``.
    """
    destination = os.path.realpath(folder_name)
    with tarfile.open(archive_name) as tar:
        for member in tar.getmembers():
            member_path = os.path.realpath(os.path.join(destination, member.name))
            if os.path.commonpath([destination, member_path]) != destination:
                raise ValueError(
                    f'Refusing to extract {member.name!r}: path escapes {folder_name!r}'
                )
        if hasattr(tarfile, 'data_filter'):
            # Interpreters with extraction filters also reject unsafe links/devices.
            tar.extractall(path=folder_name, filter='data')
        else:
            tar.extractall(path=folder_name)


# Fetch the archive first, then unpack it next to the notebook
download_file(url=DATASET_URL, filename=DATASET_ARCHIVE_NAME)
extract_archive(archive_name=DATASET_ARCHIVE_NAME, folder_name=DATASET_FOLDER_NAME)

CPU times: user 5.83 s, sys: 1.52 s, total: 7.35 s
Wall time: 36.8 s


In [5]:
%%time
# Read dataset
# The CSV files have no header row: column 0 is the label, column 1 the review text
column_names = {0: 'label', 1: 'review'}
test = (
    pd.read_csv(f'{DATASET_FOLDER_NAME}/yelp_review_full_csv/test.csv', header=None)
    .rename(columns=column_names)
)
train = (
    pd.read_csv(f'{DATASET_FOLDER_NAME}/yelp_review_full_csv/train.csv', header=None)
    .rename(columns=column_names)
)

print(len(test), len(train))
train.head()

50000 650000
CPU times: user 3.79 s, sys: 420 ms, total: 4.21 s
Wall time: 4.2 s


In [6]:
%%time
# Tokenize in order to clean text
def tokenize(text: str) -> list:
    """Clean ``text`` and split it into a list of token strings.

    Steps: runs of non-ASCII characters become a space, the text is
    lower-cased, punctuation / digits / CR / tab / LF become spaces, whitespace
    runs are collapsed to one space, and the result is split with
    ``ENGLISH.tokenizer``.
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', text)
    noise = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    letters_only = noise.sub(' ', ascii_only.lower())
    collapsed = re.sub(r'\s+', ' ', letters_only.strip(), flags=re.UNICODE)
    tokens = []
    for token in ENGLISH.tokenizer(collapsed):
        tokens.append(token.text)
    return tokens


# The token counts were computed once with the code below and cached in
# 'counts.pkl'; uncomment it to rebuild the cache.
# counts = Counter()
# for index, row in train.iterrows():
#     if index % 100 == 0:
#         percent = 100 * index // len(train)
#         print(f'{percent}%')
#     counts.update(tokenize(row['review']))
# save(counts, 'counts.pkl')
counts = load(filename='counts.pkl')

CPU times: user 70.9 ms, sys: 7.59 ms, total: 78.5 ms
Wall time: 83 ms


In [7]:
# Check words with spaces (must be empty)
# Any token printed here contains a space, which the tokenizer should never produce
for word, occurrences in list(counts.items()):
    if ' ' not in word:
        continue
    print(word)
    print(occurrences)

In [8]:
# Create vocabulary
# Index 0 is the padding token '' and index 1 the unknown-word token 'UNK';
# the 2000 most frequent words follow in order of decreasing frequency.
counts_most = counts.most_common(2000)
words = ['', 'UNK'] + [word for word, _ in counts_most]
word2vec = {word: position for position, word in enumerate(words)}

In [9]:
%%time
# Encode reviews to array of int
def encode_sentence(text: str, word2vec: dict, size: int = 150):
    """Encode ``text`` as a fixed-size array of vocabulary ids.

    Args:
        text: Raw text; it is split with ``tokenize``.
        word2vec: Mapping from token to integer id; must contain ``'UNK'``,
            whose id is used for tokens missing from the mapping.
        size: Length of the returned array; longer texts are truncated and
            shorter ones are right-padded with zeros.

    Returns:
        Tuple ``(encoded, length)`` where ``encoded`` is the integer array of
        ``size`` ids and ``length`` is the number of ids actually written.
    """
    token_ids = np.array([word2vec.get(token, word2vec['UNK']) for token in tokenize(text)])
    length = min(size, len(token_ids))
    encoded = np.zeros(size, dtype=int)
    encoded[:length] = token_ids[:length]
    return encoded, length


# The encoded column was computed once with the code below and cached as a
# pickle on Google Drive; uncomment it to rebuild the cache.
# train['encoded'] = train['review'].apply(lambda x: np.array(encode_sentence(x, word2vec), dtype=object))
# save(train, 'train.pkl')
train = gload(filename='ReviewAnalyze/pickles/train.pkl')

CPU times: user 5.81 s, sys: 1.51 s, total: 7.31 s
Wall time: 11.7 s


In [10]:
%%time
# Find and drop reviews with zero word length
# A review with no tokens is all padding, so the first id of its encoded array is 0.
# Collect index *labels* and drop by label: this stays correct when the frame
# does not have a default 0..n-1 index (e.g. when the cell is re-run).
indexes = [index for index, encoded in train['encoded'].items() if encoded[0][0] == 0]
train = train.drop(index=indexes)

CPU times: user 42.1 s, sys: 0 ns, total: 42.1 s
Wall time: 42.1 s


In [11]:
# Transform labels from 1..5 to 0..4
# Star ratings 1..5 become class ids 0..4
zero_numbering = {stars: stars - 1 for stars in range(1, 6)}
train['label'] = train['label'].apply(lambda stars: zero_numbering[stars])
Counter(train['label'])

Counter({0: 129987, 1: 129998, 2: 129998, 3: 129998, 4: 129992})

In [12]:
# Split into train and validation subsets
x = list(train['encoded'])
y = list(train['label'])
# A fixed random_state makes the split, and therefore the reported metrics, reproducible
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

In [13]:
# Dataset 
class ReviewsDataset(Dataset):
    """Dataset yielding ``(token_ids, label, length)`` triples.

    Each element of ``x`` is indexed as ``sample[0]`` (a NumPy array of token
    ids) and ``sample[1]`` (the number of ids in use); ``y`` holds the labels
    at matching positions.
    """

    def __init__(self, x: list, y: list):
        self.x, self.y = x, y

    def __len__(self):
        """Number of samples, taken from the label list."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ids as an int32 tensor, the label and the stored length."""
        sample = self.x[idx]
        token_ids = torch.from_numpy(sample[0].astype(np.int32))
        return token_ids, self.y[idx], sample[1]


# Wrap both splits so a DataLoader can batch them
train_ds, valid_ds = ReviewsDataset(x=x_train, y=y_train), ReviewsDataset(x=x_valid, y=y_valid)

In [14]:
# Check cuda device (GPU)
cuda_available = torch.cuda.is_available()
# Use the first GPU when one is present, otherwise fall back to the CPU
device = torch.device('cuda:0') if cuda_available else torch.device('cpu')
cuda_available

True

In [15]:
# Create train and validation functions
def train_model(model: torch.nn.Module, epochs: int = 10, lr: float = 0.001):
    """Train ``model`` with Adam and cross-entropy loss, printing metrics per epoch.

    Batches come from the module-level ``train_dataloader``; after every epoch
    the model is evaluated on the module-level ``valid_dataloader`` through
    ``validation_metrics``. Inputs and labels are moved to the module-level
    ``device``.

    Args:
        model: Network called as ``model(x, lengths)`` and returning class logits.
        epochs: Number of passes over ``train_dataloader``.
        lr: Learning rate passed to Adam.
    """
    # Only parameters that require gradients are handed to the optimizer
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for _ in range(epochs):
        # validation_metrics switches to eval mode, so re-enable train mode every epoch
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dataloader:
            x = x.long().to(device)
            y = y.long().to(device)
            optimizer.zero_grad()
            y_pred = model(x, l)
            loss = functional.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            # Weight the batch-mean loss by the batch size so the epoch average is per sample
            batch_items = y.shape[0]
            sum_loss += loss.item() * batch_items
            total += batch_items
        val_loss, val_acc, val_rmse = validation_metrics(model, valid_dataloader)
        print('train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f' % (sum_loss / total, val_loss, val_acc, val_rmse))


def validation_metrics(model: torch.nn.Module, dataloader: DataLoader):
    """Evaluate ``model`` on ``dataloader``.

    Gradients are disabled during evaluation. The RMSE is computed over all
    samples (square root of the mean squared difference between predicted and
    true class ids), not as an average of per-batch RMSE values.

    Args:
        model: Network called as ``model(x, lengths)`` and returning class logits.
        dataloader: Yields ``(x, y, lengths)`` batches.

    Returns:
        Tuple ``(mean cross-entropy loss, accuracy, rmse)`` of Python floats.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_squared_error = 0.0
    with torch.no_grad():  # no autograd graph is needed for evaluation
        for x, y, l in dataloader:
            x = x.long().to(device)
            y = y.long().to(device)
            y_hat = model(x, l)
            loss = functional.cross_entropy(y_hat, y)
            pred = torch.max(y_hat, 1)[1]
            batch_items = y.shape[0]
            correct += (pred == y).sum().item()
            total += batch_items
            sum_loss += loss.item() * batch_items
            sum_squared_error += ((pred - y).float() ** 2).sum().item()
    return sum_loss / total, correct / total, (sum_squared_error / total) ** 0.5

In [16]:
# Create LSTM net class
class CustomLSTM(torch.nn.Module):
    """Embedding -> dropout -> single-layer LSTM -> linear classifier over 5 classes.

    Args:
        vocab_size: Number of rows of the embedding table; id 0 is the padding id.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, hidden_dim: int):
        super().__init__()
        # Kept as a plain int; it must not be overwritten with the state tensors
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.dropout = nn.Dropout(0.2)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)

    def init_hidden(self, batch_size: int = 1):
        """Return an all-zero ``(h0, c0)`` pair for the LSTM.

        Each tensor has shape ``(n_layers=1, batch_size, hidden_dim)`` and lives
        on the same device as the model parameters. ``forward`` does not use
        it: the LSTM is called without an initial state there.
        """
        param_device = self.linear.weight.device
        shape = (1, batch_size, self.hidden_dim)
        return torch.zeros(shape, device=param_device), \
               torch.zeros(shape, device=param_device)

    def forward(self, x, s):
        """Return class logits of shape ``(batch, 5)``.

        Args:
            x: Integer tensor of padded token ids, shape ``(batch, seq_len)``.
            s: Number of real (non-padding) ids per row; every length must be >= 1.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        # pack_padded_sequence needs the lengths on the CPU when given as a tensor
        lengths = s.cpu() if torch.is_tensor(s) else s
        x_pack = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        out_pack, (ht, ct) = self.lstm(x_pack)
        # ht[-1] is the hidden state at the last real token of each sequence
        out = self.linear(ht[-1])
        return out

In [17]:
# Create dataloaders
batch_size = 2000
vocab_size = len(words)
# Only the training batches are shuffled; validation order is kept fixed
train_dataloader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=False)

In [18]:
# Create model
model = CustomLSTM(vocab_size=vocab_size, embedding_dim=100, hidden_dim=100)
# Move the weights to the selected device; the returned module is displayed
model.to(device)

CustomLSTM(
  (embeddings): Embedding(2002, 100, padding_idx=0)
  (dropout): Dropout(p=0.2, inplace=False)
  (lstm): LSTM(100, 100, batch_first=True)
  (linear): Linear(in_features=100, out_features=5, bias=True)
)

In [19]:
# Train model
train_model(model=model, epochs=20, lr=0.005)

train loss 1.240, val loss 1.025, val accuracy 0.555, and val rmse 0.924
train loss 0.991, val loss 0.956, val accuracy 0.583, and val rmse 0.896
train loss 0.940, val loss 0.932, val accuracy 0.594, and val rmse 0.869
train loss 0.916, val loss 0.912, val accuracy 0.602, and val rmse 0.853
train loss 0.899, val loss 0.911, val accuracy 0.603, and val rmse 0.851
train loss 0.886, val loss 0.898, val accuracy 0.609, and val rmse 0.833
train loss 0.876, val loss 0.892, val accuracy 0.610, and val rmse 0.830
train loss 0.867, val loss 0.895, val accuracy 0.610, and val rmse 0.801
train loss 0.859, val loss 0.887, val accuracy 0.613, and val rmse 0.828
train loss 0.853, val loss 0.890, val accuracy 0.613, and val rmse 0.824
train loss 0.847, val loss 0.891, val accuracy 0.613, and val rmse 0.827
train loss 0.843, val loss 0.888, val accuracy 0.616, and val rmse 0.813
train loss 0.837, val loss 0.885, val accuracy 0.616, and val rmse 0.811
train loss 0.833, val loss 0.887, val accuracy 0.61

In [20]:
# Persist the trained weights so they can be restored without retraining
model_weights = model.state_dict()
torch.save(model_weights, 'model.pt')

In [21]:
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only runtime
model.load_state_dict(torch.load('model.pt', map_location=device))

<All keys matched successfully>

In [22]:
%%time
# Encode test dataframe
# The encoded column was computed once with the code below and cached as a
# pickle on Google Drive; uncomment it to rebuild the cache.
# test['encoded'] = test['review'].apply(lambda x: np.array(encode_sentence(x, word2vec), dtype=object))
# save(test, 'test.pkl')
test = gload(filename='ReviewAnalyze/pickles/test.pkl')

CPU times: user 606 ms, sys: 62 ms, total: 668 ms
Wall time: 725 ms


In [23]:
# A review with no tokens is all padding, so the first id of its encoded array is 0.
# Collect index *labels* and drop by label: this stays correct when the frame
# does not have a default 0..n-1 index (e.g. when the cell is re-run).
indexes = [index for index, encoded in test['encoded'].items() if encoded[0][0] == 0]
test = test.drop(index=indexes)

In [24]:
# Star ratings 1..5 become class ids 0..4, as for the training labels
test['label'] = test['label'].apply(lambda stars: zero_numbering[stars])

In [25]:
# Create dataloader for test dataset
x_test, y_test = list(test['encoded']), list(test['label'])
test_dataset = ReviewsDataset(x=x_test, y=y_test)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size)

In [27]:
# Evaluate model on test dataset
test_metrics = validation_metrics(model=model, dataloader=test_dataloader)
test_loss, test_acc, test_rmse = test_metrics
print('test loss %.3f, test accuracy %.3f, and test rmse %.3f' % (test_loss, test_acc, test_rmse))

test loss 0.885, test accuracy 0.615, and test rmse 0.811
