In [200]:
import torch
import torch.nn.functional as F
import torchtext
import time
import random
import pandas as pd
from tqdm import tqdm

torch.backends.cudnn.deterministic = True #???

import numpy as np
import matplotlib as mpl

# Sentiment Classifier

Code based on https://github.com/andrei-radulescu-banu/stat453-deep-learning-ss21/blob/main/L15/packed_lstm.ipynb rewrite of raschkas lectures

### General settings

In [72]:
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)

VOCABULARY_SIZE = 20000 #to prevent overfitting
LEARNING_RATE = 0.005
BATCH_SIZE = 128
NUM_EPOCHS = 15
DEVICE = torch.device("cpu") #"cuda:1" if avaialble 

EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_CLASSES = 2

### Download dataset

In [5]:
# import urllib.request
# url = "https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz"
# filename = "movie_data.csv.gz"
# urllib.request.urlretrieve(url,filename)

# #!python -m wget https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz wget doesnt work (natively works on mac) need to instal module which conda doesnt do right now

# #https://ai.standord.edu/~amaas/data/sentiment/ another source of hte dataset


In [4]:
# import gzip
# import shutil

# with gzip.open("movie_data.csv.gz","rb") as f_in: #"rb" = read bianry mode
#         with open("movie_data.csv", "wb") as f_out: #write binary
#                 shutil.copyfileobj(f_in,f_out)

In [73]:
df = pd.read_csv("movie_data.csv")
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [74]:
# df.columns = ["TEXT_COLUMN_NAME","LABEL_COLUMN_NAME"]
# df.to_csv("movie_data.csv", index = None)

# df = pd.read_csv("movie_data.csv")
# df.head()

# Prepare Dataset with Torchdata and new DataPipes API



In [75]:
from torchdata.datapipes.iter import IterableWrapper, FileOpener

datapipe = IterableWrapper(["movie_data.csv"])
datapipe = FileOpener(datapipe, mode = "b")
datapipe = datapipe.parse_csv(skip_lines = 1)


# Split dataset into train/val/test

In [76]:
# Get the number of rows in dataset
N_ROWS = len(list(datapipe))

train_dp, valid_dp, test_dp = datapipe.random_split(total_length = N_ROWS, weights = {"train": 0.8, "valid":0.1, "test":0.1}, seed = 0)

print(f'Num Train: {len(train_dp)}')
print(f'Num Validate: {len(valid_dp)}')
print(f'Num Test: {len(test_dp)}')



Num Train: 40000
Num Validate: 5000
Num Test: 5000


# Build the vocabulary class

build hte vocabulary based on the top VOCAB_SIZE words. Build_vocab_from_iterator collects the most Hz tokens from the iterator yield_tokens(train_dp and adds special toeksn at the begginign w/o chainign the order of hte supplied tokens)

In [77]:
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")

#chops up a data iterator into its tokens
def yield_tokens(data_iter):
    for text, _ in data_iter:
        yield tokenizer(text)

def get_vocab(train_datapipe):
    vocab = build_vocab_from_iterator(yield_tokens(train_datapipe),
        specials = ["<UNK>","<PAD>"], #?unknown and pad
        max_tokens = VOCABULARY_SIZE) #set max vocab to prevent overfitting
    vocab.set_default_index(vocab["<UNK>"])
    return vocab



In [78]:
vocab = get_vocab(train_dp)
print(len(vocab))
print(vocab.get_itos()[:10]) #integet to sting


20000
['<UNK>', '<PAD>', 'the', '.', ',', 'and', 'a', 'of', 'to', "'"]


In [79]:
#handling unkown tokens
print(f"Default index: {vocab.get_default_index()}")
vocab["124202"]

Default index: 0


0

tet_transform and label_transform are callable ojects such as a lambda function here to process the raw test adn label data from the datset iterator

In [81]:
text_transform = lambda x: [vocab[token] for token in tokenizer(x)]
label_transform = lambda x: 1 if x == "1" else 0 if x == "0" else 1/0 #better to make it throw errors

#crazy solution that doesnt actually work
# type(lambda: 0)(type((lambda: 0).__code__)(
#     1,0,1,1,67,b'|\0\202\1\0',(),(),('x',),'','',1,b''),{}
# )(Exception()) https://stackoverflow.com/questions/8294618/define-a-lambda-expression-that-raises-an-exception

print(text_transform("Hello my name is Bruno Batinica"))
print(label_transform("0"))

[4968, 64, 401, 10, 3847, 0]
0


In [82]:
#converting a string to an integer
# A direct way
print(f"the: {vocab['the']}")

# And an indirect way, using get_stoi() to get a dictionary of tokens and values
print(f"the: {vocab.get_stoi()['the']}") # stoi = string-to-integer

# What is the padding value?
print(f"<PAD>: {vocab['<PAD>']}")
PADDING_VALUE=vocab['<PAD>']

the: 2
the: 2
<PAD>: 1


In [116]:
vocab_itos = vocab.get_itos()
vec_vocab_itos = np.vectorize(lambda x: vocab_itos[x])
vec_vocab_itos([1, 2, 3, 4, 5])

array(['<PAD>', 'the', '.', ',', 'and'], dtype='<U5')

# Define data laoders

torch.utils.data.DataLoader is used to getnerate data batches. Users should customize the databatch by defining a function with teh collate_fn argument in theDataLoader

Collate_batch func we process the raw text data adna dd pagging to dynamically match the longest setnece in a batch

In [137]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

def collate_batch(batch):
    text_list, label_list = [] , []
    for (_text,_label) in batch:
        processed_text = torch.tensor(text_transform(_text)) #cuts into tokens and converts into a list of integers - see above function
        text_list.append(processed_text) #?appends in 1st dimension
        label_list.append(label_transform(_label))
    return pad_sequence(text_list, padding_value = PADDING_VALUE, batch_first = False).to(DEVICE), torch.tensor(label_list).to(DEVICE)

to group the texts with simliar length together like the legacy BucketIterator class, first of all, we randomly create multiple POOLS each with size batch_size * 100. Then we sort the samples within the individual pools by length. 

This idea can be implemented succintly through batch_sampler argument of PyTorch DataLoader

batch_sampler accepts "sampler" or iterable object that yields indices of the next batch

In the code below we implement a generator that yields batch of indexes for which the coresponding batch of data is of similiar length

Reduces amount of padding required


In [138]:
from torch.utils.data import Sampler, Dataset

#create out own sampler to ensure we function with multiple worker threads
#see https://discuss.pytorch.org/t/using-distributedsampler-in-combination-with-batch-sampler-to-make-sure-batches-have-sentences-of-similar-length/119824/3

class BatchSamplerSimilarLength(Sampler):
    def __init__(self, dataset, batch_size, indices = None, shuffle = True):
        self.batch_size = batch_size
        self.shuffle = shuffle

        # get indices and length
        self.indices = [(i, len(tokenizer(s[0]))) for i, s in enumerate(dataset)] #extracts index and length of token
        # if indices are passed, then use only the one passed (for ddp)
        if indices is not None:
            self.indices = torch.tensor(self.indices)[indices].tolist()

    def __iter__(self):
        if self.shuffle:
            random.shuffle(self.indices)

        pooled_indices = []
        #create pool of indices with similiar lengths
        for i in range(0, len(self.indices), self.batch_size * 100): #take steps of batch_size * 100
            pooled_indices.extend(sorted(self.indices[i:i+self.batch_size * 100], key = lambda x: x[1])) #sorts them by length of the "sentence" (x[1]) Only sorted within the pool - but pool is exactly 100 tiems larger than 
        self.pooled_indices = [x[0] for x in pooled_indices] # returns the indices in order

        batches = [self.pooled_indices[i:i+self.batch_size] for i in range(0, len(self.pooled_indices),self.batch_size)] #creates batches of data which have similiar length each batch is size batchsiz

        if self.shuffle:
            random.shuffle(batches)

        # thisis a generator so make it yield the batch
        for batch in batches:
            yield batch

    def __len__(self):
        return len(self.pooled_indices) // self.batch_size #



In [139]:

train_dp_list = list(train_dp)
valid_dp_list = list(valid_dp)
test_dp_list = list(test_dp)

train_loader = DataLoader(train_dp_list, 
                          batch_sampler=BatchSamplerSimilarLength(dataset = train_dp_list, 
                                                                  batch_size=BATCH_SIZE),
                          collate_fn=collate_batch)
valid_loader = DataLoader(train_dp_list, 
                          batch_sampler=BatchSamplerSimilarLength(dataset = valid_dp_list, 
                                                                  batch_size=BATCH_SIZE,
                                                                  shuffle=False),
                          collate_fn=collate_batch)
test_loader = DataLoader(train_dp_list, 
                          batch_sampler=BatchSamplerSimilarLength(dataset = test_dp_list, 
                                                                  batch_size=BATCH_SIZE,
                                                                  shuffle=False),
                          collate_fn=collate_batch)

text_batch, label_batch = next(iter(train_loader))
print(text_batch.size())
print(label_batch.size())

torch.Size([1015, 128])
torch.Size([128])


In [145]:
# testing the iterators

#shuffle so not in order
i = 0
for text_batch, label_batch in train_loader:
    print(f'T Text matrix size: {text_batch.size()}')
    print(f'Target vector size: {label_batch.size()}')
    i += 1
    if i == 4:
        break


#not shuffled so in order
i = 0
for text_batch, label_batch in valid_loader:
    print(f'V Text matrix size: {text_batch.size()}')
    print(f'Target vector size: {label_batch.size()}')
    i += 1
    if i == 4:
        break

#number of rows depends on longest document in respective batch

T Text matrix size: torch.Size([141, 128])
Target vector size: torch.Size([128])
T Text matrix size: torch.Size([2623, 128])
Target vector size: torch.Size([128])
T Text matrix size: torch.Size([214, 128])
Target vector size: torch.Size([128])
T Text matrix size: torch.Size([140, 128])
Target vector size: torch.Size([128])
V Text matrix size: torch.Size([763, 128])
Target vector size: torch.Size([128])
V Text matrix size: torch.Size([1151, 128])
Target vector size: torch.Size([128])
V Text matrix size: torch.Size([1049, 128])
Target vector size: torch.Size([128])
V Text matrix size: torch.Size([2623, 128])
Target vector size: torch.Size([128])


# INTERESTING THAT ORDER IS DIFFERNT - BATCH SIZE = NUMBER OF COLUMNS, SENTENCE LENGTH = NUMBER OF ROWS - HMMMMM

# MAYBE THIS MAKES SENSE - WHERE EACH WORD IS FED INTO A DIFFERENT NEURON SO EACH INPUT IS ACTUALLY THE EMBEDDING * BATCH SIZE AND THE WORD (SENTENCE LENGTH) IS THE 0th dimensions seperating hte different 2D matrices

In [159]:
print(text_batch.shape)

print(*vec_vocab_itos(text_batch.to("cpu"))[:,99]) #need to actually look down the rows!!!

torch.Size([2623, 128])


# utility functions

In [None]:
# train mode - datalaoder and model independent

In [205]:
import time

def train_model(model, num_epochs, train_loader, valid_loader, test_loader, optimizer, device, logging_interval = 50, schedule = None, scheduler_on = "valid_acc"):

    start_time = time.time()
    minibatch_loss_list, train_acc_list, valid_acc_list = [], [] ,[]

    for epoch in range(num_epochs):
        
        model.train() #set model in training mode

        for batch_idx, (features,targets) in tqdm(enumerate(train_loader)):

            #print(features)
            
            features = features.to(device)
            targets = targets.to(device)

            #forward and backprop
            logits = model(features)

            #print(logits)
            loss = torch.nn.functional.cross_entropy(logits, targets)
            
            loss.backward()

            #update paraemeters
            optimizer.zero_grad()
            optimizer.step()

            #logging
            minibatch_loss_list.append(loss.item())
            if not batch_idx % logging_interval:
                print(f'Epoch: {epoch+1:03d}/{num_epochs:03d} '
                      f'| Batch {batch_idx:04d}/{len(train_loader):04d} '
                      f'| Loss: {loss:.4f}')

        #at the end of each epoch

        model.eval() #set to evaluation mode
        with torch.no_grad():  # save memory during inference
                train_acc = compute_accuracy(model, train_loader, device=device)
                valid_acc = compute_accuracy(model, valid_loader, device=device)
                print(f'Epoch: {epoch+1:03d}/{num_epochs:03d} '
                    f'| Train: {train_acc :.2f}% '
                    f'| Validation: {valid_acc :.2f}%')
                train_acc_list.append(train_acc)
                valid_acc_list.append(valid_acc)


        elapsed = (time.time() - start_time)/60
        print(f'Time elapsed: {elapsed:.2f} min')

        if scheduler is not None:

            if scheduler_on == 'valid_acc':
                scheduler.step(valid_acc_list[-1])
            elif scheduler_on == 'minibatch_loss':
                scheduler.step(minibatch_loss_list[-1])
            else:
                raise ValueError(f'Invalid `scheduler_on` choice.')

    elapsed = (time.time() - start_time)/60
    print(f'Total Training Time: {elapsed:.2f} min')

    test_acc = compute_accuracy(model, test_loader, device=device)
    print(f'Test accuracy {test_acc :.2f}%')

    return minibatch_loss_list, train_acc_list, valid_acc_list



# **Actually writing the RNN model**

actually an lstm

In [206]:

class RNN(torch.nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):

        super().__init__()

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        # #poor performance when just using RNN
        # self.rnn = torch.nn.RNN(embedding_dim,  
        #                         hidden_dim,
        #                         nonlinearity = "relu")
        self.rnn = torch.nn.LSTM(embedding_dim,
                                hidden_dim) 

        self.fc = torch.nn.Linear(hidden_dim, output_dim) #fully connected - hidden dim to class labels

    def forward(self, text):
        #text dim; [sentence lentgh, batch size]

        embedded = self.embedding(text)
        #embedded dim: [sentence length, batchsize, embedding dim]

        output, (hidden, cell) = self.rnn(embedded)
        #output dim: [sentence length, batch size, hidden dim]
        # this is the y matrix - one for each word

        # hidden dim: [1, batchsize, hidden dim] #one for the network

        #hidden = the hidden state of the RNN - this would usually be fed into the next neuron

        hidden.squeeze_(0) #squeezes out the 0th dimension
        #hidden dim: [batchsize, hidden dim]

        output = self.fc(hidden)


        #Many to one network so we ignore most of hte outputs
        #in this one we actually ignore all the outputs and just use the hidden state

        return output



In [207]:
torch.manual_seed(RANDOM_SEED)
model = RNN(input_dim = len(vocab),
            embedding_dim = EMBEDDING_DIM,
            hidden_dim = HIDDEN_DIM,
            output_dim = NUM_CLASSES) #could use 1 for bianry classification)

model = model.to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
schedule = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                      factor = 0.1,
                                                      mode = "max",
                                                      verbose = True)

minibatch_loss_list, train_acc_list, valid_acc_list = train_model(
    model = model,
    num_epochs = NUM_EPOCHS,
    train_loader = train_loader,
    valid_loader = valid_loader,
    test_loader = test_loader,
    optimizer = optimizer,
    device = DEVICE,
    logging_interval = 100
)


0it [00:00, ?it/s]

1it [00:00,  1.94it/s]

Epoch: 001/015 | Batch 0000/0312 | Loss: 0.6985


101it [01:01,  1.47it/s]

Epoch: 001/015 | Batch 0100/0312 | Loss: 0.6922


201it [02:06,  1.43s/it]

Epoch: 001/015 | Batch 0200/0312 | Loss: 0.6930


301it [03:01,  1.02s/it]

Epoch: 001/015 | Batch 0300/0312 | Loss: 0.6941


313it [03:07,  1.67it/s]


Epoch: 001/015 | Train: 50.05% | Validation: 50.86%
Time elapsed: 4.89 min


NameError: name 'scheduler' is not defined