# Feed-forward neural net. 

This notebook is meant to be run **online**, in Google Colab, to take advantage of the free GPUs. If you run the data-reading cell locally, it will delete and re-create `processed_data/` and `processed_data.zip` in your current working directory. 

Simplifications I'm making FOR NOW: 
- No punctuation
- No test data

In [None]:
import torch, spacy
import random
import numpy as np
from torch import nn
from tqdm.notebook import trange, tqdm
import os


from matplotlib import pyplot as plt
from torch import optim
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_ as clip_grad_norm
from torch.nn import functional as F

# One seed shared by every random number generator the notebook touches,
# so that a fresh "Restart & Run All" reproduces the same numbers.
seed = 811

for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed)

# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Sanity check: this draw should be identical on every fresh run.
print(torch.randn(5))

tensor([-1.1484,  0.0960, -2.0328, -0.0796, -0.7443])


In [None]:
#Read in data...

# Pure-Python equivalent of `!rm -rf ...` / `!unzip ...`, so the cell is valid
# Python and does not depend on a Unix shell being available.
# NOTE: unlike `unzip`, this does not print the list of extracted files.
import shutil
import urllib.request
import zipfile

# Start from a clean slate so re-running the cell never mixes old and new files.
shutil.rmtree("processed_data", ignore_errors=True)
if os.path.exists("processed_data.zip"):
    os.remove("processed_data.zip")

url = "https://github.com/benartuso/bias-detection/blob/main/data/processed_data.zip?raw=true"
filename, headers = urllib.request.urlretrieve(url, filename="processed_data.zip")

# Extract into the current working directory (the archive carries its own
# top-level `processed_data/` folder, per the recorded unzip listing).
with zipfile.ZipFile(filename) as archive:
    archive.extractall()
print("Done")

Archive:  processed_data.zip
   creating: processed_data/
  inflating: processed_data/biased.word.dev  
  inflating: processed_data/dev_ann.csv  
  inflating: processed_data/train_ann.csv  
  inflating: processed_data/biased.word.test  
  inflating: processed_data/biased.word.train  
Done


In [None]:
from torchtext.data import Field, ReversibleField, Dataset, TabularDataset, BucketIterator, Iterator

# NOTE(review): `spacy_en` is not referenced by any cell visible here;
# tokenization below is plain whitespace splitting. Confirm it is still needed.
spacy_en = spacy.load('en')

def tokenize_fn(text):
  """Split `text` into tokens on runs of whitespace.

  Leading and trailing whitespace never produce empty tokens.
  """
  # str.split() with no separator already discards surrounding whitespace.
  tokens = text.split()
  return tokens

def reader(suffix = ".csv", rpath="processed_data", batch_size=8, min_freq=2, shuffle=False):
  """Load the annotated train/dev files and wrap them in bucketed iterators.

  - suffix: file extension appended to 'train_ann' / 'dev_ann'
  - rpath: directory that contains the two annotation files
  - batch_size: number of examples per mini-batch
  - min_freq: minimum training-set frequency for a token to enter the vocabulary
  - shuffle: forwarded unchanged to BucketIterator.splits; the default (False)
    keeps the original behavior of not shuffling

  Returns (train_iter, val_iter, TXT), where TXT is the text Field holding the
  vocabulary built from the training set.
  """
  #Text field, sequence data (whitespace-tokenized and lower-cased)
  TXT = Field(sequential=True, tokenize=tokenize_fn, init_token=None, eos_token=None, lower=True)
  #Binary labels, already numeric in the file so no vocabulary is built for them
  LABEL = Field(sequential=False, unk_token=None, dtype=torch.long, use_vocab=False)

  #Create Dataset; the header row is skipped and the fields are assigned in this order
  fields = [("text", TXT), ("label", LABEL)]
  trn_data = TabularDataset(os.path.join(rpath, 'train_ann'+suffix), format="CSV", fields=fields, skip_header=True)
  val_data = TabularDataset(os.path.join(rpath, 'dev_ann'+suffix), format="CSV", fields=fields, skip_header=True)

  #Build vocab on train set only, so validation tokens cannot leak into it
  TXT.build_vocab(trn_data, min_freq=min_freq)

  # NOTE(review): with shuffle=False the training batches follow the order of
  # the file; if train_ann is ordered by label this can hurt SGD. Confirm, and
  # pass shuffle=True if so.
  train_iter, val_iter = BucketIterator.splits((trn_data, val_data), 
                                               batch_size=batch_size, 
                                               sort_key= lambda x: len(x.text),
                                               shuffle=shuffle,
                                               repeat=False)
  
  return train_iter, val_iter, TXT

In [None]:
# Build the iterators and the vocabulary (tokens seen fewer than 5 times in
# the training set are left out of the vocabulary).
train_iter, val_iter, txtfield = reader(suffix=".csv", min_freq=5)
vocab_size = len(txtfield.vocab)

print("Vocab size = %d" % vocab_size)

# Index of the padding token in the vocabulary.
pad = txtfield.vocab.stoi[txtfield.pad_token]

# Number of mini-batches and number of examples for each split.
print("[TRAIN]:{} (dataset:{})\t[VAL]:{} (dataset:{})\t".format(
    len(train_iter), len(train_iter.dataset),
    len(val_iter), len(val_iter.dataset)))
print("[vocab]:{}".format(vocab_size))

Vocab size = 27401
[TRAIN]:13451 (dataset:107606)	[VAL]:175 (dataset:1400)	
[vocab]:27401


## Basic FFNN

In [None]:
class NeuralClassifier(nn.Module):
    """Bag-of-embeddings classifier.

    The word embeddings of a text are summed, squashed with a sigmoid and fed
    to a single linear classification layer.
    """

    def __init__(self, vocab_size, embed_size, drop_rate=0.0, class_size=2, pad=1):
        """ Initialization
        - vocab_size: number of entries in the embedding table
        - embed_size: word embedding size
        - drop_rate: dropout rate applied to the word embeddings
        - class_size: number of classes. For binary classification, class_size = 2
        - pad: index of the padding token (used as padding_idx of the embedding)
        """
        super(NeuralClassifier, self).__init__()
        # ---------------------------------
        # Configuration
        self.vocab_size = vocab_size # size of the vocab
        self.class_size = class_size # number of classes
        self.dropout = nn.Dropout(drop_rate)
        self.pad = pad
        # ---------------------------------
        # Network parameters
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad)
        self.fc = nn.Linear(embed_size, class_size, bias=True)


    def forward(self, batch):
        """ Forward function
        - batch: object with a `text` attribute (token indices, sequence
          dimension first) and a `label` attribute (class indices)

        Returns (loss, logprob): the mean negative log-likelihood over the
        batch and the per-class log-probabilities (Batch_size x Class_size).
        """
        tokens, label = batch.text, batch.label
        
        # ---------------------------------
        # === Hidden layer ===
        # Sum over all the embeddings for each input text
        #   then, pass through the nonlinear Sigmoid function
        x = self.embed(tokens) # Dim: L x B x E
        x = self.dropout(x)
        hidden = torch.sigmoid(x.sum(dim=0)) # Dim: B x E
        
        # ---------------------------------
        # === Classification layer ===
        logit = self.fc(hidden) # Dim: Batch_size x Class_size
        # Normalization
        logprob = F.log_softmax(logit, dim=1)
        
        # ---------------------------------
        # === Loss function ===
        # Compute negative log-likelihood loss. `logprob` is already
        # normalized, so use nll_loss; cross_entropy would apply
        # log_softmax a second time.
        loss = F.nll_loss(logprob, label)
        return loss, logprob


In [None]:

def batch_train(batch, model, optimizer, max_grad_norm=None):
    """ Training with one batch
    - batch: a mini-batch of the data
    - model: the defined neural network; calling it on the batch must return
      a (loss, ...) pair
    - optimizer: optimization method used to update the parameters
    - max_grad_norm: gradient-norm clipping threshold. When None, the
      module-level `grad_clip` is used (the original behavior)

    Returns (model, loss) where loss is a Python float.
    """
    if max_grad_norm is None:
        # Backward-compatible fallback to the notebook-level setting.
        max_grad_norm = grad_clip
    # set in training mode
    model.train()
    # initialize optimizer
    optimizer.zero_grad()
    # forward: prediction
    loss, _ = model(batch)
    # backward: gradient computation
    loss.backward()
    # norm clipping, in case the gradient norm is too large
    clip_grad_norm(model.parameters(), max_grad_norm)
    # gradient-based update parameter
    optimizer.step()
    return model, loss.item()

In [None]:
def eval(data_iter, model):
    """ Evaluate the model with the data
    data_iter: the data iterator; each batch must expose a `label` attribute
    model: the defined model; calling it on a batch returns (loss, logprob)

    Returns (mean loss per batch, accuracy over all examples), both as
    0-dim tensors.

    NOTE: this function shadows the builtin `eval`; the name is kept because
    other cells call it.
    """
    # set in the eval mode, which will turn off the features only used for training, such as dropout
    model.eval()
    # records
    val_loss, val_batch = 0, 0
    total_example, correct_pred = 0, 0
    # No gradients are needed for evaluation; without no_grad every batch
    # would build an autograd graph that `val_loss += loss` keeps alive.
    with torch.no_grad():
        # iterate all the mini batches for evaluation
        for batch in data_iter:
            # Forward: prediction
            loss, logprob = model(batch)
            val_batch += 1
            val_loss += loss
            # Argmax
            _, pred_label = torch.max(logprob, -1)
            correct_pred += (pred_label==batch.label).sum()
            total_example += batch.label.size()[0]
    acc = (1.0*correct_pred)/total_example
    return (val_loss/val_batch), acc

In [None]:
# ------------------------------------
# 1. Build the model. The padding index comes from the vocabulary instead of
#    relying on the constructor default.
model = NeuralClassifier(vocab_size, embed_size=64, drop_rate=0, class_size=2, pad=pad)

# ------------------------------------
# 2. Optimizer and gradient clipping
optimizer = optim.SGD(model.parameters(), lr=0.001, weight_decay=0)
# the norm of grad clipping (read by batch_train)
grad_clip = 1.0

# ------------------------------------
# 3. Define the numbers of training epochs and validation steps
epoch, val_step = 5, 50

# ------------------------------------
# 4. Training iterations
# TrnLoss / ValLoss / ValAcc get one entry per validation step.
TrnLoss, ValLoss, ValAcc = [], [], []
total_batch = 0
for e in trange(epoch):
    # Wrap the iterator itself (not enumerate(...)) so tqdm can read its
    # length and show a real progress bar with a total.
    for b, batch in enumerate(tqdm(train_iter)):
        total_batch += 1
        # Update parameters with one batch
        model, loss = batch_train(batch, model, optimizer)
        # Compute validation loss after each val_step
        if total_batch % val_step == 0:
            val_loss, val_acc = eval(val_iter, model)
            ValLoss.append(val_loss)
            ValAcc.append(val_acc)
            # validation loss, then the training loss of the current batch
            print(val_loss.item(), loss)
            TrnLoss.append(loss)
print("The best validation accuracy = {:.4}".format(max(ValAcc)))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0.7259849905967712 0.5181664824485779
0.7660607695579529 0.4068308472633362
0.8277421593666077 0.3342895209789276
0.9083356857299805 0.28421109914779663
1.0050609111785889 0.17047050595283508
1.108483910560608 0.12410543859004974
1.1995041370391846 0.1060321256518364
1.2812292575836182 0.1062275692820549
1.3528187274932861 0.09899325668811798
1.4161514043807983 0.09634970128536224
1.4704136848449707 0.05886099115014076
1.5206832885742188 0.05574488639831543
1.5685018301010132 0.04919278621673584
1.6113319396972656 0.04393501579761505
1.6540288925170898 0.03821513429284096
1.691428780555725 0.03428789600729942
1.7261455059051514 0.0298428013920784
1.7592607736587524 0.03122178465127945
1.7897720336914062 0.027517827227711678
1.8194454908370972 0.03467577323317528
1.847251534461975 0.027930444106459618
1.8744430541992188 0.027495019137859344
1.8998764753341675 0.03129817545413971
1.9237357378005981 0.024634819477796555
1.946794033050537 0.025203874334692955
1.969254732131958 0.0271742083

KeyboardInterrupt: ignored