In [None]:
# pytorch
import torch
import torch.nn as nn
from RNN import SentimentRNN
import torchtext.data as data
from torchtext.data import get_tokenizer

# gensim
import gensim.downloader

# word processing
import nltk
from nltk.corpus import stopwords

# data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# utils
from tqdm import tqdm
import re
from collections import Counter
import time

In [None]:
# Load the 'glove-twitter-25' word vectors through gensim's downloader.
# A later cell looks words up in this object and treats each vector as 25-dimensional.
glove_embeddings = gensim.downloader.load('glove-twitter-25')

In [None]:
df = pd.read_csv('./datasets/IMDB-Dataset.csv')  # could also download this from torchtext.datasets

# Work on a 25% random subset of the reviews. The lower-casing step below must
# be chained onto this subset (not onto `df`), otherwise the sample is silently
# discarded and the whole dataset is processed.
# - random_state makes the subset reproducible across kernel restarts.
# - reset_index(drop=True) restores a 0..n-1 index so label-based lookups such
#   as `cleaned_df.head()['tokens'][0]` keep working after sampling.
cleaned_df = df.sample(frac=0.25, random_state=42).reset_index(drop=True)

# make everything lower case (every column is cast to str first)
cleaned_df = cleaned_df.apply(lambda x: x.astype(str).str.lower())

# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
# Non-greedy pattern matching a single '<...>' tag.
CLEANR = re.compile('<.*?>')


def cleanhtml(raw_html):
    """Return `raw_html` with every substring matched by CLEANR removed.

    Tags are deleted outright (replaced by the empty string), so text on
    either side of a tag ends up adjacent.
    """
    return CLEANR.sub('', raw_html)

# Strip HTML tags from every cell: each column is cast to str, then cleanhtml
# is applied element-wise.
cleaned_df = cleaned_df.apply(lambda column: column.astype(str).apply(cleanhtml))
cleaned_df.head()

In [None]:
# spaCy-backed tokenizer obtained from torchtext
tokenizer = get_tokenizer(tokenizer='spacy', language='en_core_web_sm')

# registers `progress_apply` on pandas objects so the tokenisation shows a progress bar
tqdm.pandas(desc="progress-bar")

# one token list per review
cleaned_df['tokens'] = cleaned_df['review'].progress_apply(tokenizer)

# remove stop words (currently disabled)
# cleaned_df['tokens'] = cleaned_df['tokens'].progress_apply(lambda x: [word for word in x if word not in stopwords.words('english')])

In [None]:
# show the token list of the row labelled 0
print(cleaned_df['tokens'].head()[0])

In [None]:
# build vocab
cleaned_reviews = cleaned_df['tokens'].tolist()

# Flatten the per-review token lists in plain Python. Going through
# np.concatenate would materialise one fixed-width unicode array in which every
# slot is as wide as the longest token in the corpus, which is needlessly
# memory hungry and turns the tokens into numpy string scalars.
tokens = [token for review in cleaned_reviews for token in review]

# Most frequent tokens first; sorted() is stable, so tokens with equal counts
# keep their first-seen order.
counter = Counter(tokens)
vocab = sorted(counter, key=counter.get, reverse=True)

# convert words to integers: real tokens get ids starting at 1,
# id 0 is reserved for the padding token
vocab_to_int = {word: word_int for word_int, word in enumerate(vocab, 1)}
vocab_to_int['<PAD>'] = 0

In [None]:
# convert reviews to integers
review_tokens = cleaned_df['tokens'].tolist()

# look every token up in vocab_to_int, one id list per review
reviews_encoded = [
    list(map(vocab_to_int.__getitem__, review))
    for review in tqdm(review_tokens)
]

# eyeball the first five tokens / ids of the first five reviews
for sample_idx in range(5):
    print(review_tokens[sample_idx][:5])
    print(reviews_encoded[sample_idx][:5])

In [None]:
# pad reviews

def pad_features(reviews, pad_id, seq_length=200):
    """Build a fixed-width integer matrix from variable-length id sequences.

    Args:
        reviews: sequence of integer-id sequences, one per review.
        pad_id: value used to fill positions past the end of a short review.
        seq_length: number of columns in the result (default 200).

    Returns:
        An int array of shape (len(reviews), seq_length). Reviews shorter than
        seq_length are right-padded with pad_id; longer ones keep only their
        first seq_length ids.
    """
    padded = np.full((len(reviews), seq_length), pad_id, dtype=int)

    for row_idx, review in enumerate(reviews):
        kept = np.array(review)[:seq_length]  # truncate long reviews
        padded[row_idx, :len(kept)] = kept

    return padded

# every review becomes exactly seq_length ids, padded with the <PAD> id
seq_length = 256
features = pad_features(reviews_encoded, vocab_to_int['<PAD>'], seq_length=seq_length)

# sanity checks on the resulting matrix
assert features.shape[0] == len(reviews_encoded), "Your features should have as many rows as reviews."
assert features.shape[1] == seq_length, "Each feature row should contain seq_length values."

# top-left corner of the feature matrix
print(features[:10, :10])

In [None]:
# get labels as numpy array: 1 for 'positive', 0 for anything else
labels = np.array([int(sentiment == 'positive') for sentiment in cleaned_df['sentiment'].tolist()])
labels

In [None]:
# create training, validation, and test data
train_frac = 0.8  # 80% of data will be used for training
validation_frac = 0.5  # half of the held-out remainder becomes validation (10% of total data)

# first cut: training rows vs. everything else
split_id = int(train_frac * len(features))
train_x, train_y = features[:split_id], labels[:split_id]
remaining_x, remaining_y = features[split_id:], labels[split_id:]

# second cut: remainder into validation and test
split_cal_id = int(validation_frac * len(remaining_x))
val_x, val_y = remaining_x[:split_cal_id], remaining_y[:split_cal_id]
test_x, test_y = remaining_x[split_cal_id:], remaining_y[split_cal_id:]

print(
    "Train set: \t\t\t{}".format(train_x.shape),
    "\nValidation set: \t{}".format(val_x.shape),
    "\nTest set: \t\t\t{}".format(test_x.shape),
)

In [None]:
# negative / positive counts for the train, validation and test labels (one line each)
for split_labels in (train_y, val_y, test_y):
    print(len(split_labels[split_labels == 0]), len(split_labels[split_labels == 1]))

In [None]:
# create dataloaders
batch_size = 128

# wrap each (features, labels) pair of numpy arrays in a TensorDataset
train_set, val_set, test_set = (
    torch.utils.data.TensorDataset(torch.from_numpy(split_x), torch.from_numpy(split_y))
    for split_x, split_y in ((train_x, train_y), (val_x, val_y), (test_x, test_y))
)

# one shuffled loader per dataset, all with the same batch size
train_loader, val_loader, test_loader = (
    torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    for dataset in (train_set, val_set, test_set)
)

In [None]:
# check our batches are correct
# expecting the shape to be (batch_size, seq_length) and the labels to be (batch_size)
data_iter = iter(train_loader)
x, y = next(data_iter)  # pull a single batch

print('Sample input size: ', x.shape)  # batch_size, seq_length
print('Sample input: \n', x)
print()
print('Sample label size: ', y.shape)  # batch_size
print('Sample label: \n', y)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# model hyper-parameters
output_size = 1
embedding_dim = 25  # width of the vectors copied from glove_embeddings below
hidden_dim = 256
num_layers = 2
dropout = 0.25

vocab_size = len(vocab_to_int)

# Pre-trained embedding matrix, one row per vocabulary entry.
# Row i holds the vector of the word whose integer id is i, so the rows line up
# with the ids stored in the encoded reviews. (Iterating the dict and appending
# in insertion order would put <PAD>, inserted last but carrying id 0, in the
# final row and shift every other word by one.)
# Words unknown to glove_embeddings, including <PAD>, keep an all-zero row.
# NOTE(review): assumes SentimentRNN accepts a 2-D float32 numpy array for
# `word_embeddings` - confirm against RNN.py.
word_embeddings = np.zeros((vocab_size, embedding_dim), dtype=np.float32)
for word, word_int in vocab_to_int.items():
    if word in glove_embeddings:
        word_embeddings[word_int] = glove_embeddings[word]
print(len(word_embeddings), len(word_embeddings[0]))

model = SentimentRNN(word_embeddings=word_embeddings, output_dim=output_size, embedding_dim=embedding_dim, hidden_dim=hidden_dim, num_layers=num_layers, dropout=dropout)
# The training loop moves every batch to `device`; the model has to live there
# too, otherwise the forward pass fails with a device mismatch on CUDA.
model.to(device)
print(model)

In [None]:
# training config
epochs = 8
lr = 0.001
gradient_clip = 5  # max gradient norm passed to clip_grad_norm_
print_every = 1
es_limit = 5  # early stopping limit

criterion = nn.BCELoss() # binary cross entropy loss
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# per-epoch metric lists plus the configured number of epochs
history = {metric: [] for metric in ('train_loss', 'train_acc', 'val_loss', 'val_acc')}
history['epochs'] = epochs

In [None]:
# training loop

epoch_loop = tqdm(range(epochs), position=0, desc='Training', leave=True)

# early stopping trigger
# NOTE(review): es_trigger / min_val_loss are initialised here, but nothing in
# this cell runs a validation pass or updates them, so early stopping and the
# 'val_*' history entries are not populated by this loop - confirm whether a
# validation step is missing.
es_trigger = 0
min_val_loss = torch.inf

for epoch in epoch_loop:
    model.train()

    # running sums over the batches of this epoch
    train_loss = 0
    train_acc = 0

    for idx, (feature, target) in enumerate(train_loader):
        # add epoch meta info
        epoch_loop.set_postfix_str(f'Training batch {idx}/{len(train_loader)}')

        # move to device
        feature, target = feature.to(device), target.to(device)

        # reset optimizer
        optimizer.zero_grad()

        # forward pass
        # Flatten to one score per sample. reshape(-1) keeps the batch axis even
        # when the batch holds a single sample, whereas squeeze() would collapse
        # it to a 0-dim tensor that no longer matches `target`.
        # (assumes the model emits exactly one value per sample - TODO confirm)
        out = model(feature).reshape(-1)

        # accuracy: threshold at 0.5 in one vectorised op, staying on `device`
        pred = (out > 0.5).long()
        acc = (pred == target).float().mean()
        train_acc += acc.item()

        # loss
        loss = criterion(out, target.float())
        train_loss += loss.item()
        loss.backward()

        # clip gradient
        nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)

        # update optimizer
        optimizer.step()

        # free some memory
        del feature, target, pred

    # epoch averages over the number of batches
    history['train_loss'].append(train_loss / len(train_loader))
    history['train_acc'].append(train_acc / len(train_loader))


### References
https://www.kaggle.com/code/affand20/imdb-with-pytorch