In [170]:
# Necessary packages

import sys, os, random, math, sys
import torch, spacy
import numpy as np
from torch import nn
from tqdm.notebook import trange, tqdm

from matplotlib import pyplot as plt

from torch import optim
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_ as clip_grad_norm
from torch.nn import functional as F
# from tqdm import tqdm 

## Random seeds, to make the results reproducible
seed = 42

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
print(torch.randn(5))

tensor([ 0.3367,  0.1288,  0.2345,  0.2303, -1.1229])


In [2]:
torch.cuda.is_available()

True

In [3]:
torch.cuda.set_device("cuda:0")


In [4]:
#Read in data...

!rm -rf processed_data processed_data.zip
import urllib.request
url = "https://github.com/benartuso/bias-detection/blob/main/data/processed_data.zip?raw=true"
filename, headers = urllib.request.urlretrieve(url, filename="processed_data.zip")
!unzip processed_data.zip
print("Done")

Archive:  processed_data.zip
   creating: processed_data/
  inflating: processed_data/dev_lstm.csv  
  inflating: processed_data/biased.word.dev  
  inflating: processed_data/full_dev.csv  
  inflating: processed_data/dev_ann.csv  
  inflating: processed_data/train_ann.csv  
  inflating: processed_data/biased.word.test  
  inflating: processed_data/test_ann.csv  
  inflating: processed_data/full_test.csv  
  inflating: processed_data/biased.word.train  
  inflating: processed_data/full_train.csv  
  inflating: processed_data/train_lstm.csv  
  inflating: processed_data/test_lstm.csv  
Done


In [9]:
#This code is adapted from neural_classifier() lecture example code.
from torchtext.data import Field, ReversibleField, Dataset, TabularDataset, BucketIterator, Iterator
spacy_en = spacy.load('en')
# def tokenize_fn(text):
#     """ Tokenization function - split apart on spaces.
#         This is sufficient for tokenization, since puctuation etc. 
#         have already been handled by the prof's preprocessing of wiki2
#     """
#     # return [tok.text for tok in spacy_en.tokenizer(text)]
#     return text.strip().split()


def reader(train_name, dev_name, test_name, suffix=".tsv", rpath="processed_data", batch_size=8, min_freq=2):
    """
    - suffix: data file suffix
    - rpath: path to the data files
    - batch_size: mini-batch size
    - min_freq: word frequency cutoff, frequency less than min_freq will be removed when building the vocab
    """
    # Utterance Field: text
    TXT = Field(sequential=True, tokenize="spacy", init_token="<start>", eos_token="<stop>", lower=True)
    LABEL = Field(sequential=False, unk_token=None, dtype=torch.long, use_vocab=False)
    #Treat label as a sequential field, pad it as well!


    # Create a Dataset instance
    fields = [("text", TXT), ("label", LABEL)]
    trn_data = TabularDataset(os.path.join(rpath, train_name+suffix), format="CSV", fields=fields, skip_header=False)
    val_data = TabularDataset(os.path.join(rpath, dev_name+suffix), format="CSV", fields=fields, skip_header=False)
    tst_data = TabularDataset(os.path.join(rpath, test_name+suffix), format="CSV", fields=fields, skip_header=False)
    #No test data

    
    # Split
    # Build vocab using training data
    TXT.build_vocab(trn_data, min_freq=min_freq) # or max_size=10000
    LABEL.build_vocab(trn_data, min_freq=min_freq)
    # 
    train_iter, val_iter, test_iter = BucketIterator.splits((trn_data, val_data, tst_data), # data
                                                             batch_size=batch_size, # 
                                                             sort=True, # sort_key not specified
                                                             sort_key = lambda x : len(x.text),
                                                             shuffle=False, # shuffle between epochs
                                                             repeat=False)
    return train_iter, val_iter, test_iter, TXT

In [10]:
train_iter, val_iter, test_iter, txtfield = reader("full_train", "full_dev", "full_test", suffix='.csv', rpath="processed_data", min_freq=5, batch_size=16)

In [11]:
vocab_size = len(txtfield.vocab)
print("Vocab size = {}".format(vocab_size))
pad = txtfield.vocab.stoi[txtfield.pad_token]

print("[TRAIN]:%d (dataset:%d)\t[VAL]:%d (dataset:%d)\t"
    % (len(train_iter), len(train_iter.dataset),
    len(val_iter), len(val_iter.dataset)))
print("[vocab]:%d" % (vocab_size))

Vocab size = 52693
[TRAIN]:19282 (dataset:308504)	[VAL]:1135 (dataset:18147)	
[vocab]:52693


In [12]:
txtfield.vocab.load_vectors('glove.6B.50d')

.vector_cache/glove.6B.zip: 862MB [06:27, 2.23MB/s]                          
100%|█████████▉| 399762/400000 [00:12<00:00, 33233.94it/s]

In [13]:
txtfield.vocab.vectors.shape

torch.Size([52693, 50])

## LSTM model

In [137]:
class LSTM(nn.Module):
  def __init__(self, vocab, vocab_size, pad_token, n_hidden=32, n_layers=1, drop_prob=0, lr=0.1):
    super().__init__()

    self.vocab_size = vocab_size
    self.n_hidden = n_hidden
    self.n_layers = n_layers
    self.pad = pad_token
    self.emb_layer = nn.Embedding.from_pretrained(vocab.vectors, padding_idx = self.pad)
    # self.emb_layer = nn.Embedding(vocab_size, n_input, padding_idx = self.pad)
    self.dropout = nn.Dropout(drop_prob)
    self.lstm = nn.LSTM(50, n_hidden, n_layers, dropout=drop_prob, batch_first=False)
    self.fc = nn.Linear(n_hidden, 2)
    self.lr = lr
  
  def forward(self, batch, hidden):
    input = batch.text.cuda()

    #Create word embeddings
    embedded = self.emb_layer(input)
    lstm_output, hidden = self.lstm(embedded, hidden)
    summed = torch.sigmoid(lstm_output.sum(axis=0))

    out = self.dropout(summed)
    out = self.fc(out)
    # out = out[-1]
    probs = F.softmax(out, dim=-1) #Is this right? 

    # #label = label.permute(1,0)

    # #out = out.permute(1, 2, 0)
    # loss = F.cross_entropy(out, label, reduction="mean")

    return probs, hidden

  def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
# 
    hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(), 
                 weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        
    return hidden

In [136]:
a.permute(1,0)

tensor([[0.0418, 0.3870, 0.7510, 0.3568, 0.9280, 0.1097, 0.8260, 0.3454, 0.3609,
         0.7454, 0.7957, 0.4409, 0.7888, 0.8633, 0.2067, 0.0989],
        [0.9582, 0.6130, 0.2490, 0.6432, 0.0720, 0.8903, 0.1740, 0.6546, 0.6391,
         0.2546, 0.2043, 0.5591, 0.2112, 0.1367, 0.7933, 0.9011]],
       device='cuda:0', grad_fn=<PermuteBackward>)

In [174]:
def batch_train(batch, model, optimizer):

  hidden = model.init_hidden(batch_size=batch.batch_size)

  model.train()

  optimizer.zero_grad()

  probs, _ = model(batch, hidden)
  logprobs = torch.log(probs)
  loss = F.cross_entropy(logprobs, batch.label.cuda())

  loss.backward()
  clip_grad_norm(model.parameters(), grad_clip)
  optimizer.step()

  return model, loss.item()

In [181]:
def eval(data_iter, model):
  model.eval()

  val_loss, val_batch = 0, 0
  total_example, correct_pred = 0, 0

  for b, batch in enumerate(data_iter):
    hidden = model.init_hidden(batch_size=batch.batch_size)
    probs, _ = model(batch, hidden)
    logprobs = torch.log(probs)
    loss = F.cross_entropy(logprobs, batch.label.cuda()).item()

    val_batch += 1
    val_loss += loss

    max_logprobs, pred_labels = torch.max(logprobs, -1)
    correct_pred += (pred_labels==batch.label.to("cuda:0")).sum()
    total_example += batch.batch_size
  
  acc = (1.0 * correct_pred) / total_example
  return (val_loss / val_batch), acc

In [None]:
#Initialize model with params specified in HW description
model = LSTM(txtfield.vocab, vocab_size, pad, n_hidden=32, n_layers=2, drop_prob=0.5, lr=0.01)
#Move to colab GPU for speed updates
model.cuda()
#Initialize hidden state
# hidden = model.init_hidden(batch_size=16)
#Initialize optimizer
optimizer = optim.SGD(model.parameters(), lr=0.1, weight_decay=0)
# the norm of grad clipping
grad_clip = 1.0

# ------------------------------------
# 3. Define the numbers of training epochs and validation steps (val_steps not used here)
epoch, val_step = 5, 2000

# ------------------------------------
# 4. Training iterations
TrnLoss, ValLoss, ValAcc = [], [], []
total_batch = 0
for e in trange(epoch):
    epoch_loss = []
    print("Beginning epoch {}".format(e+1))
    for b, batch in tqdm(enumerate(train_iter)):
        total_batch += 1
        # Update parameters with one batch
        model, loss = batch_train(batch, model, optimizer)
        if total_batch % val_step == 0:
          val_loss, val_acc = eval(val_iter, model)
          ValLoss.append(val_loss)
          ValAcc.append(val_acc)
          print(val_acc.item(), val_loss, loss)
          TrnLoss.append(loss)
        epoch_loss.append(loss)
        # if b % 500 == 0:
          # print(loss)
    print("Epoch {} average loss: {}".format((e+1), np.mean(epoch_loss)))    

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Beginning epoch 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0.49639058113098145 0.6981986803105225 0.6773391366004944
0.49699676036834717 0.6955423464333959 0.6937387585639954
0.5268639326095581 0.6932600525507318 0.7177443504333496
0.5268639326095581 0.6916792071338267 0.6613849401473999
0.5332561731338501 0.6916865199673018 0.726850688457489
0.5550228953361511 0.6840822296520687 0.7150747776031494
0.5609742403030396 0.6824279056246585 0.6951236724853516
0.5592108964920044 0.6832593782357708 0.6933128833770752
0.513914167881012 0.6938812550994268 0.7094101309776306

Epoch 1 average loss: 0.6879062445878908
Beginning epoch 2


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0.5710034966468811 0.6732364272231047 0.7287817597389221
0.5643908381462097 0.6721234105494579 0.5892385840415955
0.5778916478157043 0.6691411403569881 0.6150476932525635
0.5548024773597717 0.6706300780899199 0.6762993931770325
0.5847247242927551 0.6641097607591604 0.6780310869216919
0.5946988463401794 0.6606534318251757 0.6444752216339111
0.5943130850791931 0.6662679448526861 0.6813730597496033
0.5927701592445374 0.6629690396365615 0.5597493052482605
0.5951396822929382 0.6614272492572599 0.7470428943634033
0.6016421318054199 0.6614828247330787 0.6631057262420654

Epoch 2 average loss: 0.6679751505440986
Beginning epoch 3


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0.5579985976219177 0.6607260475862394 0.7486350536346436
0.6012012958526611 0.6546048745972469 0.6806470155715942
0.5938171744346619 0.6535347611893642 0.8063373565673828
0.5993828177452087 0.6522470233198829 0.6434406638145447
0.6103488206863403 0.6480269082579844 0.5989441275596619
0.600374698638916 0.6537859886490826 0.6902320384979248
0.6079241633415222 0.6493958027877471 0.706602156162262
0.606766939163208 0.6509491114364322 0.6345384120941162
0.6139307022094727 0.6490267737321391 0.6740504503250122

Epoch 3 average loss: 0.6565005093574919
Beginning epoch 4


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0.6132693886756897 0.6521363747014873 0.5183908939361572
0.5798203349113464 0.6552044664185478 0.4827478528022766
0.5744200348854065 0.6508212927154507 0.6802610754966736
0.6059954762458801 0.6458558550752732 0.5586807131767273


In [None]:
for e in trange(15):
    epoch_loss = []
    print("Beginning epoch {}".format(e+1))
    for b, batch in tqdm(enumerate(train_iter)):
        total_batch += 1
        # Update parameters with one batch
        model, loss = batch_train(batch, model, optimizer)
        if total_batch % val_step == 0:
          val_loss, val_acc = eval(val_iter, model)
          ValLoss.append(val_loss)
          ValAcc.append(val_acc)
          print(val_acc.item(), val_loss, loss)
          TrnLoss.append(loss)
        epoch_loss.append(loss)
        # if b % 500 == 0:
          # print(loss)
    print("Epoch {} average loss: {}".format((e+1), np.mean(epoch_loss)))