In [None]:
# Install all the necessary packages
!pip install pytorch-crf requests regex sentencepiece sacremoses tokenizers transformers



In [None]:
import os
import time
import datetime
import random
import pandas as pd
import numpy as np
from torchcrf import CRF
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW,  BertConfig, BertForTokenClassification, get_linear_schedule_with_warmup
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import urllib.request


In [None]:
# Download the data
urllib.request.urlretrieve('https://ingredient-tagger.s3-us-west-2.amazonaws.com/train_file_cleaned.csv', 'train.csv')
urllib.request.urlretrieve('https://ingredient-tagger.s3-us-west-2.amazonaws.com/test_file_cleaned.csv', 'test.csv')

('test.csv', <http.client.HTTPMessage at 0x7f2a9fb461d0>)

In [None]:
# Read the data to train and test pandas dataframes
train_df = pd.read_csv('train.csv', delimiter='\t', header=None,
                       usecols=[0,1,5],  names=['word', 'wordnum', 'label'])
test_df = pd.read_csv('test.csv', delimiter='\t', header=None,
                      usecols=[0,1,5],  names=['word', 'wordnum', 'label'])

In [None]:
# Functions for processing data

MAX_LEN = 30

def build_word_dictionary(df):
    """
    Builds a dictionary of word:token_id from the training dataframe df
    """
    words = list(set(df.word.values))
    word_dict = {w: i+1 for i, w in enumerate(words)}
    word_dict['UKN'] = len(word_dict)
    word_dict['PAD'] = 0
    return word_dict


def data_to_sentences(df):
    """
    Extracts sentences and tokens out of the dataframe df

    Parameters:
        df  - pandas dataframe with the data

    Returns: 
        sentence_list - list of sentences, split to list of words
        labels_list - list of label lists for each sentence
    """
    sentence = []
    sentence_list = []
    labels = []
    labels_list = []
    for row in df.itertuples():   
        if row.wordnum == 'I1':
            sentence_list.append(sentence)
            labels_list.append(labels)
            sentence = []
            labels = []
        sentence.append(row.word)
        labels.append(row.label)

    sentence_list.append(sentence)
    labels_list.append(labels)
    
    sentence_list.pop(0)
    labels_list.pop(0)
    
    return sentence_list, labels_list


def tokenize_and_pad_sentences(sentence_list, tokenizer, maxlen):
    """
    Tokenizes the list of sentences, and pads them to the maxlen length
    
    Parameters:
        sentence_list (list) - list of sentences, split by words
        tokenizer (Tokenizer) - tokenizer
        maxlen (int) - lengh to pad/truncate the sentences to

    Returns:
        padded_sents (list) - tokenized and padded sentences
    """
    sentences = [" ".join([word for word in sent]) for sent in sentence_list]
    encoded_sents = [tokenizer.encode(sent,add_special_tokens = True) 
                     for sent in sentences]
    padded_sents = pad_sequences(encoded_sents, maxlen=maxlen, dtype="long", 
                          value=0, truncating="post", padding="post")
    return padded_sents


def convert_and_pad_labels(tags_list, tag_dict, maxlen):
    """
    Converts the list of tag sequences to integers and pads them to the maxlen
    length

    Parameters:
        tag_list (list) - list of tag sequences
        tag_dict (dict) - dictionary of tags to their integer ids
        maxlen (int) - length to pad/truncate the tag sequences to

    Returns:
        padded_tags (list) - padded sequences of tag ids
    """
    tag_ids = [[tag_dict[t] for t in s] for s in tags_list]
    padded_tags = pad_sequences(tag_ids, maxlen=maxlen, dtype="long", 
                          value=0, truncating="post", padding="post")
    return padded_tags

def convert_and_pad_sentences(sentence_list, word_dict, maxlen):
    """
    Converts the list of sentences to lists of word integer ids and pads 
    them to the maxlen length

    Parameters:
        sentence_list (list) - list of sentences split by words
        word_dict (dict) - dictionary of words to their integer ids
        maxlen (int) - length to pad/truncate the sentences to

    Returns:
        padded_words (list) - padded sequences of word ids
    """
    word_ids = [[word_dict.get(w, word_dict['UKN']) for w in s] 
                for s in sentence_list]
    padded_words = pad_sequences(word_ids, maxlen=maxlen, dtype="long", 
                          value=0, truncating="post", padding="post")
    return padded_words


def process_data(df, tokenizer, tag_dict, word_dict, maxlen):
    """
    Converts the data from dataframe df to the paded sequences of integer ids

    Parameters:
        df (DataFrame) - dataframe with the data
        tokenizer (Tokenizer) - tokenizer for word sequences
        tag_dict (dict) - dictionary of tags and their integer ids
        word_dict (dict) - dictionary of words and theri integer ids
        maxlen (int) - length to pad/trankate sequences to

    Returns:
        X_bert - tokenized and padded list of sentences (BertTokenizer)
        X - list of sentences, padded and converted to list of word integer ids
        y = list of tags, padded and converted to their integer ids   
    """
    sentences, labels = data_to_sentences(df)
    X_bert = tokenize_and_pad_sentences(sentences, tokenizer, maxlen)
    X = convert_and_pad_sentences(sentences, word_dict, maxlen)
    y = convert_and_pad_labels(labels, tag_dict, maxlen)

    return X_bert, X, y


In [None]:
tag_dict = {'PAD': 0, 
            'B-QTY': 1,  
            'B-RANGE_END': 2,
            'B-UNIT': 3,  
            'I-UNIT': 4,
            'B-NAME': 5,
            'I-NAME': 6,
            'OTHER': 7}

In [None]:
word_dict = build_word_dictionary(train_df)

In [None]:
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 
                           'tokenizer', 
                           'bert-base-uncased')   



Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_master


In [None]:
train_X_bert_full, train_X_full, train_y_full = process_data(train_df, tokenizer, tag_dict, word_dict, MAX_LEN)
test_X_bert, test_X, test_y = process_data(test_df, tokenizer, tag_dict, word_dict, MAX_LEN)

In [None]:
def get_attention_masks(input_ids):
    """
    Returns list of attention masks for ids greater than pad id
    """
    return [[int(token_id > 0) for token_id in sent] for sent in input_ids]


def get_data_loader(batch_size, train_x, train_y):
    """
    Builds pytorch dataloader
    """

    attention_masks = get_attention_masks(train_x)
        
    train_ds = TensorDataset(torch.tensor(train_x), 
                             torch.ByteTensor(attention_masks), 
                             torch.tensor(train_y))
    train_sampler = RandomSampler(train_ds)


    return torch.utils.data.DataLoader(train_ds,
                                       sampler=train_sampler,
                                       batch_size=batch_size)


In [None]:
train_X, val_X, _, _ = train_test_split(train_X_full, train_y_full,  random_state=1, test_size=0.1)     
train_X_bert, val_X_bert, train_y, val_y = train_test_split(train_X_bert_full, train_y_full,  random_state=1, test_size=0.1)                                     

In [None]:
batch_size = 32

train_dataloader_full = get_data_loader(batch_size, train_X_full, train_y_full)
train_dataloader_bert_full = get_data_loader(batch_size, train_X_bert_full, train_y_full)

train_dataloader = get_data_loader(batch_size, train_X, train_y)
train_dataloader_bert = get_data_loader(batch_size, train_X_bert, train_y)

val_dataloader = get_data_loader(batch_size, val_X, val_y)
val_dataloader_bert = get_data_loader(batch_size, val_X_bert, val_y)

test_dataloader = get_data_loader(batch_size, test_X, test_y)
test_dataloader_bert = get_data_loader(batch_size, test_X_bert, test_y)

In [None]:
class Identity(nn.Module):
    """
    Auxilary Identiy layer to remove the last Bert layer when we need to add
    other layers on top
    """
    def __init__(self):
        super(Identity, self).__init__()
        
    def forward(self, x):
        return x

    
class IngredientTagger(nn.Module):
    """
    Main model for ingredient tagging
    """
    def __init__(self, num_tags, use_bert, use_rnn, vocab_size, word_embed_dim, rnn_hidden_dim, dropout, use_crf):
        """
        Model initialization

        Parameters:
            num_tags (int) - total number of tags
            use_bert (bool) - use Bert layer if True
            use_rnn (bool) - use LSTM layer if True
            vocab_size (int) - total number of words in the vocabulary
            word_embed_dim (int) - dimention of the Embedding layer
            rnn_hidden_dim (int)- dimention of the LSTM hidden states
            use_crf (bool) - use CRF layer if True

        """
        super(IngredientTagger, self).__init__()
       
        self.use_bert = use_bert
        if self.use_bert:
            self.bert = BertForTokenClassification.from_pretrained(   
                "bert-base-uncased", 
                num_labels = num_tags, 
                output_attentions = False, 
                output_hidden_states = False,
            )
          
            enc_size = 768
        else:
           self.input_dim = vocab_size
           self.embeds = nn.Embedding(vocab_size, word_embed_dim, padding_idx=0)
           enc_size = word_embed_dim

        self.use_rnn=use_rnn
        if self.use_rnn:
            if self.use_bert:
                self.bert.classifier = Identity() 
            self.rnn = nn.LSTM(bidirectional=True,
                               input_size=enc_size,
                               hidden_size=rnn_hidden_dim//2,
                               batch_first=True,
                               dropout=dropout)  
            self.drop = nn.Dropout(dropout)
            enc_size = rnn_hidden_dim 

            
        
        self.fc1 = nn.Linear(enc_size, num_tags)

        self.use_crf = use_crf 
        if self.use_crf:
            self.crf = CRF(num_tags, batch_first=True)
        

        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    
    def __str__(self):
      """
      Returns the string representaiton of the model
      """
      return f'BERT: {self.use_bert}, LSTM: {self.use_rnn}, CRF: {self.use_crf}'
   
    def loss(self, x, y, attention_mask):
        """
        Return the loss after one forward pass

        Parameters:
            x (LongTensor) - tensor of tokenized and padded sentences
            y (LongTensor) - tensor of tokenized and padded tags
            attention_mask(ByteTensor) - tensor of attention_masks

        Returns:
            loss - loss after one model forward pass  

        """
           
        x = x.to(self.device)
        y = y.to(self.device)

        if self.use_bert:
            if self.use_rnn:
                encoded_layers = self.bert(x,
                                           attention_mask=attention_mask,
                                           token_type_ids=None)
            else:
                encoded_layers = self.bert(x,
                                           labels=y,
                                           attention_mask=attention_mask,
                                           token_type_ids=None)
            bert_loss  = encoded_layers.loss
            logits = encoded_layers.logits
        else:
            logits = self.embeds(x)
            
        if self.use_rnn:
            enc, _ = self.rnn(logits)  
            enc = self.drop(enc)
            logits = self.fc1(enc) 
            
        if self.use_crf:
                loss = -self.crf(logits, y, attention_mask)
        elif self.use_rnn:
                loss = nn.CrossEntropyLoss()(logits.view(-1, logits.shape[-1]),y.view(-1))
        else:
                loss = bert_loss
      
        return loss
        
        
    def forward(self, x, attention_mask):
        """
        Model's forward pass (used for prediction)
        """

        x = x.to(self.device) 
       
        with torch.no_grad():
            if self.use_bert:
                self.bert.eval()
                encoded_layers = self.bert(x,
                                          attention_mask=attention_mask,
                                          token_type_ids=None) 
                logits = encoded_layers.logits  
            else:
                logits = self.embeds(x)

            if self.use_rnn:
                enc, _ = self.rnn(logits)  
                logits = self.fc1(enc)  
 
            if self.use_crf:
                y_hat =  torch.tensor(self.crf.decode(logits))
            else:
                y_hat = logits.argmax(-1) 
    
        return y_hat
        

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Using device: cuda


In [None]:
def flat_accuracy(preds, labels):
    """
    Computes accuracy given arrays of predictions and labels

    Parameters:
        preds (np.ndarray) - array of predictions
        labels (np.ndarray) - array of ground truth labels
    """
    pred_flat = preds.flatten() 
    labels_flat = labels.flatten()
    mask = labels_flat != 0
    return np.sum(pred_flat[mask] == labels_flat[mask]) / len(labels_flat[mask])

In [None]:
def evaluate(model, dataloader):
    """
    Returns accuracy of the model on validation/test data
    """
    
    model.eval()
  
    test_preds = []
    test_y = []

 
    for batch in dataloader:
        
        batch = tuple(t.to(device) for t in batch)
        
        input_ids, input_mask, labels = batch

        with torch.no_grad():        
            outputs = model(input_ids, 
                            attention_mask=input_mask)

        preds = outputs
    
        logits =  preds.to('cpu').numpy() 
        label_ids = labels.to('cpu').numpy()
         
        test_preds.append(logits)
        test_y.append(label_ids)
        
    correct = 0
    n = 0

    for pred, label in zip(test_preds, test_y):
        mask = label != 0
        correct += (pred[mask] == label[mask]).sum()
        n += len(label[mask])
      
    accuracy = correct/n
   
    return accuracy

In [None]:
def format_time(elapsed):
    """
    Takes a time in seconds and returns a string hh:mm:ss
    """
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
def train_model(model, train_dataloader, validation_dataloader, epochs, optimizer, scheduler):
    """
    Trains and the model, using early stopping

    Parameter:
        model (IngredientTagger) - model to train
        train_dataloader (Dataloader) - dataloader for train dataset
        validation_dataloader (Dataloader) - dataloader for validation dataset
        epochs (int) - maximum number of epochs to train
        optimizer (Optimizer) - optimizer used for training
        scheduler (Scheduler) - scheduler for the learning rate 

    Returns:
       model - trained model

    """
    seed_val = 1
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
   
    model.to(device)
    t0 = time.time()

    best_acc = 0
    early_stop = 0 
    patience = 2
    model_name = 'best_model'
    best_epoch = 0

    for epoch in range(0, epochs):
       
        total_loss = 0
      
        model.train()

        for step, batch in enumerate(train_dataloader):
            
            input_ids = batch[0].to(device)
            input_mask = batch[1].to(device)
            labels = batch[2].to(device)

            model.zero_grad()        

            loss = model.loss(input_ids, labels, input_mask)

            total_loss += loss.item()
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            optimizer.step()
        scheduler.step()
        avg_train_loss = total_loss / len(train_dataloader)  

        # Validation
        acc = evaluate(model, validation_dataloader)
        if acc > best_acc:
           best_acc = acc
           early_stop = 0
           torch.save(model.state_dict(), model_name)
           best_epoch = epoch
        else:
           early_stop += 1

        if early_stop > patience:
          print(f'Epoch {best_epoch}: early stop')
          print(f'Accuracy on validation dataset: {best_acc}')
          break

    model.load_state_dict(torch.load(model_name))
 
    print(f'Training took: {format_time(time.time() - t0)}')
    return model

In [None]:
# List of parameters for Embedding, LSTM, CRF and fc layers
non_bert_params=['embeds.weight',
                'rnn.weight_ih_l0',
                'rnn.weight_hh_l0',
                'rnn.bias_ih_l0',
                'rnn.bias_hh_l0',
                'rnn.weight_ih_l0_reverse',
                'rnn.weight_hh_l0_reverse',
                'rnn.bias_ih_l0_reverse',
                'rnn.bias_hh_l0_reverse',
                'fc1.weight',
                'fc1.bias',
                'crf.start_transitions',
                'crf.end_transitions',
                'crf.transitions']

In [None]:
def freeze_bert(model):
  print('Freezing BERT layers')
  for param in model.named_parameters():
    if param[0] not in non_bert_params:
      param[1].requires_grad = False
  return model

In [None]:
def unfreeze(model):
  print('Unfreezing BERT for fine tuning')
  for param in model.parameters():
    param.requires_grad = True
  return model

In [None]:
def train_and_evaluate(model, epochs):
  """
  Trains the model, using early stopping, and evaluates it on the test dataset

  Parameters:
      model (IngredientTagger) - model to train and evaluate
      epochs (int) - maximum number of epochs to train the model

  Returns:
     accuracy - accuracy of the model on the test dataset
  """
 
  # We'll use different learning rate for BERT and subsequent layers
  params = list(map(lambda x: x[1],list(filter(lambda kv: kv[0] in non_bert_params, model.named_parameters()))))
  base_params = list(map(lambda x: x[1],list(filter(lambda kv: kv[0] not in non_bert_params, model.named_parameters()))))

  optimizer = AdamW([{'params': base_params}, {'params': params, 'lr': 1e-4}], lr=2e-5)

  if model.use_bert:
    train_data_loader = train_dataloader_bert
    test_data_loader = test_dataloader_bert
    val_data_loader = val_dataloader_bert
  else:
    train_data_loader = train_dataloader
    test_data_loader = test_dataloader
    val_data_loader = val_dataloader

  total_steps = len(train_data_loader) * epochs

  num_warmup_steps = 0
  num_training_steps = total_steps

  # Lambda for warmup scheduler for BERT learning rate
  def lr_lambda(current_step: int):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(
            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
        )

  scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, 
                                                lr_lambda=[lr_lambda,
                                                           lambda epoch: 0.98 ** epoch])
  if model.use_bert and (model.use_rnn or model.use_crf):
    model = freeze_bert(model)
    model = train_model(model, train_data_loader, val_data_loader, epochs, optimizer, scheduler)
    model = unfreeze(model)
  model = train_model(model, train_data_loader, val_data_loader, epochs, optimizer, scheduler)
  accuracy = evaluate(model, test_data_loader)
  return accuracy

In [None]:
# # Initiate the list of model configurations to test
word_embed_dim = 300
rnn_hidden_dim = 200
dropout = 0.5

models = [IngredientTagger(len(tag_dict),
                           use_bert=False, 
                           use_rnn=True, 
                           vocab_size=len(word_dict),
                           word_embed_dim=word_embed_dim,
                           rnn_hidden_dim=rnn_hidden_dim,
                           dropout = dropout,
                           use_crf=False),
          IngredientTagger(len(tag_dict),
                           use_bert=False, 
                           use_rnn=True, 
                           vocab_size=len(word_dict),
                           word_embed_dim=word_embed_dim,
                           rnn_hidden_dim=rnn_hidden_dim,
                           dropout = dropout,
                           use_crf=True),
          IngredientTagger(len(tag_dict),
                           use_bert=True, 
                           use_rnn=False, 
                           vocab_size=len(word_dict),
                           word_embed_dim=word_embed_dim,
                           rnn_hidden_dim=rnn_hidden_dim,
                           dropout = dropout, 
                           use_crf=False),
          IngredientTagger(len(tag_dict),
                           use_bert=True, 
                           use_rnn=True, 
                           vocab_size=len(word_dict),
                           word_embed_dim=word_embed_dim,
                           rnn_hidden_dim=rnn_hidden_dim,
                           dropout = dropout, 
                           use_crf=False),
          IngredientTagger(len(tag_dict),
                           use_bert=True, 
                           use_rnn=False, 
                           vocab_size=len(word_dict),
                           word_embed_dim=word_embed_dim,
                           rnn_hidden_dim=rnn_hidden_dim,
                           dropout = dropout, 
                           use_crf=True),
          IngredientTagger(len(tag_dict),
                           use_bert=True, 
                           use_rnn=True, 
                           vocab_size=len(word_dict),
                           word_embed_dim=word_embed_dim,
                           rnn_hidden_dim=rnn_hidden_dim,
                           dropout = dropout, 
                           use_crf=True)]

  "num_layers={}".format(dropout, num_layers))
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

In [24]:
%%time
# This cell takes about 9 hours to run

epochs = 20

for model in models:
  print(f'Model configuration: {model}')
  accuracy = train_and_evaluate(model, epochs)
  print(f'Accuracy on test dataset: {accuracy}')
  print("")
  print("")

Model configuration: BERT: False, LSTM: True, CRF: False
Epoch 10: early stop
Accuracy on validation dataset: 0.8930275746961611
Training took: 0:05:35
Accuracy on test dataset: 0.8961605954111564


Model configuration: BERT: False, LSTM: True, CRF: True
Epoch 5: early stop
Accuracy on validation dataset: 0.8921342767813065
Training took: 0:22:32
Accuracy on test dataset: 0.8936436471756077


Model configuration: BERT: True, LSTM: False, CRF: False
Epoch 5: early stop
Accuracy on validation dataset: 0.8976703730694187
Training took: 1:20:09
Accuracy on test dataset: 0.901307401709643


Model configuration: BERT: True, LSTM: True, CRF: False
Freezing BERT layers
Epoch 19: early stop
Accuracy on validation dataset: 0.8791697030959825
Training took: 1:11:53
Unfreezing BERT for fine tuning
Epoch 4: early stop
Accuracy on validation dataset: 0.8962363948376784
Training took: 1:10:48
Accuracy on test dataset: 0.8987151802558349


Model configuration: BERT: True, LSTM: False, CRF: True
Freezi