In [17]:
# Necessary packages

import sys, os, random, math, sys
import torch, spacy
import numpy as np
from torch import nn
from tqdm.notebook import trange, tqdm

from matplotlib import pyplot as plt

from torch import optim
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_ as clip_grad_norm
from torch.nn import functional as F
from tqdm import tqdm 

## Random seeds, to make the results reproducible
seed = 42

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
print(torch.randn(5))

tensor([ 0.3367,  0.1288,  0.2345,  0.2303, -1.1229])


In [18]:
torch.cuda.is_available()

True

In [19]:
#Read in data...

!rm -rf processed_data processed_data.zip
import urllib.request
url = "https://github.com/benartuso/bias-detection/blob/main/data/processed_data.zip?raw=true"
filename, headers = urllib.request.urlretrieve(url, filename="processed_data.zip")
!unzip processed_data.zip
print("Done")

Archive:  processed_data.zip
   creating: processed_data/
  inflating: processed_data/dev_lstm.csv  
  inflating: processed_data/biased.word.dev  
  inflating: processed_data/dev_ann.csv  
  inflating: processed_data/train_ann.csv  
  inflating: processed_data/biased.word.test  
  inflating: processed_data/test_ann.csv  
  inflating: processed_data/biased.word.train  
  inflating: processed_data/train_lstm.csv  
  inflating: processed_data/test_lstm.csv  
Done


In [49]:
#This code is adapted from neural_classifier() lecture example code.
from torchtext.data import Field, ReversibleField, Dataset, TabularDataset, BucketIterator, Iterator
spacy_en = spacy.load('en')
def tokenize_fn(text):
    """ Tokenization function - split apart on spaces.
        This is sufficient for tokenization, since puctuation etc. 
        have already been handled by the prof's preprocessing of wiki2
    """
    # return [tok.text for tok in spacy_en.tokenizer(text)]
    return text.strip().split()


def reader(suffix=".tsv", rpath="sst", batch_size=8, min_freq=2):
    """
    - suffix: data file suffix
    - rpath: path to the data files
    - batch_size: mini-batch size
    - min_freq: word frequency cutoff, frequency less than min_freq will be removed when building the vocab
    """
    # Utterance Field: text
    TXT = Field(sequential=True, tokenize=tokenize_fn, init_token=None, eos_token=None, lower=True)
    LABEL = Field(sequential=False, unk_token=None, dtype=torch.long, use_vocab=False)
    #Treat label as a sequential field, pad it as well!


    # Create a Dataset instance
    fields = [("text", TXT), ("label", LABEL)]
    trn_data = TabularDataset(os.path.join(rpath,'train_lstm'+suffix), format="CSV", fields=fields, skip_header=True)
    val_data = TabularDataset(os.path.join(rpath,'dev_lstm'+suffix), format="CSV", fields=fields, skip_header=True)
    tst_data = TabularDataset(os.path.join(rpath, 'test_lstm'+suffix), format="CSV", fields=fields, skip_header=True)
    #No test data

    
    # Split
    # Build vocab using training data
    TXT.build_vocab(trn_data, min_freq=min_freq) # or max_size=10000
    LABEL.build_vocab(trn_data, min_freq=min_freq)
    # 
    train_iter, val_iter, test_iter = BucketIterator.splits((trn_data, val_data, tst_data), # data
                                                             batch_size=batch_size, # 
                                                             sort=True, # sort_key not specified
                                                             sort_key = lambda x : len(x.text),
                                                             shuffle=False, # shuffle between epochs
                                                             repeat=False)
    return train_iter, val_iter, test_iter, TXT

In [98]:
train_iter, val_iter, test_iter, txtfield = reader(suffix='.csv', rpath="processed_data", min_freq=5)

In [99]:
vocab_size = len(txtfield.vocab)
print("Vocab size = {}".format(vocab_size))
pad = txtfield.vocab.stoi[txtfield.pad_token]

print("[TRAIN]:%d (dataset:%d)\t[VAL]:%d (dataset:%d)\t"
    % (len(train_iter), len(train_iter.dataset),
    len(val_iter), len(val_iter.dataset)))
print("[vocab]:%d" % (vocab_size))

Vocab size = 13321
[TRAIN]:13451 (dataset:107606)	[VAL]:175 (dataset:1400)	
[vocab]:13321


In [100]:
class LSTM(nn.Module):
  def __init__(self, vocab_size, pad_token, n_input=32, n_hidden=32, n_layers=1, drop_prob=0, lr=0.1):
    super().__init__()

    self.vocab_size = vocab_size
    self.n_input = n_input
    self.n_hidden = n_hidden
    self.n_layers = n_layers
    self.pad = pad_token
    self.emb_layer = nn.Embedding(vocab_size, n_input, padding_idx = self.pad)
    self.dropout = nn.Dropout(drop_prob)
    self.lstm = nn.LSTM(n_input, n_hidden, n_layers, dropout=drop_prob, batch_first=False)
    self.fc = nn.Linear(n_hidden, 2)
    self.lr = lr
  def forward(self, batch, hidden):
    input, label = batch.text.cuda(), batch.label.cuda()

    #Create word embeddings
    embedded = self.emb_layer(input.long())
    lstm_output, hidden = self.lstm(embedded, hidden)

    out = self.dropout(lstm_output)

    out = self.fc(out)
    out = out[-1]
    out = F.log_softmax(out, dim=-1) #Is this right? 
    #label = label.permute(1,0)

    #out = out.permute(1, 2, 0)
    loss = F.cross_entropy(out, label, reduction="mean", ignore_index=self.pad)

    return loss, hidden

  def init_hidden(self, batch_size):
    weight = next(self.parameters()).data

    hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(), 
                 weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        
    return hidden

In [101]:
def batch_train(batch, hidden, model, optimizer):

  hidden = model.init_hidden(batch_size=batch.text.shape[1])

  model.train()

  optimizer.zero_grad()

  loss, hidden = model(batch, hidden)
  loss.backward()
  clip_grad_norm(model.parameters(), grad_clip)
  optimizer.step()

  return model, loss.item(), hidden

In [103]:
#Initialize model with params specified in HW description
model = LSTM(vocab_size, pad, n_input=32, n_hidden=32, n_layers=1, drop_prob=0.9, lr=0.1)
#Move to colab GPU for speed updates
model.cuda()
#Initialize hidden state
hidden = model.init_hidden(batch_size=8)
#Initialize optimizer
optimizer = optim.SGD(model.parameters(), lr=0.1, weight_decay=0)
# the norm of grad clipping
grad_clip = 1.0

# ------------------------------------
# 3. Define the numbers of training epochs and validation steps (val_steps not used here)
epoch, val_step = 5, 50

# ------------------------------------
# 4. Training iterations
TrnLoss, ValLoss, ValAcc = [], [], []
total_batch = 0
for e in range(epoch):
    epoch_loss = []
    print("Beginning epoch {}".format(e+1))
    for b, batch in (enumerate(train_iter)):
        total_batch += 1
        # Update parameters with one batch
        model, loss, hidden = batch_train(batch, hidden, model, optimizer)
        epoch_loss.append(loss)
        if b % 500 == 0:
          print(loss)
    print("Epoch {} average loss: {}".format((e+1), np.mean(epoch_loss)))    

  "num_layers={}".format(dropout, num_layers))


Beginning epoch 1
0.7209597826004028
0.008460501208901405
0.0
0.0072160386480391026


KeyboardInterrupt: ignored