"""
TODO: Your code below!

This file should implement all steps described in Part 2, and can be structured however you want.

Rather than using normal BERT, you should use distilbert-base-uncased. This will train faster.

We recommend training on a GPU, either by using HPC or running the command line commands on Colab.

Hints:
    * It will probably be helpful to save intermediate outputs (preprocessed data).
    * To save your finetuned models, you can use torch.save().
"""

In [None]:
# Downloading Dependencies
!pip install transformers
!pip install datasets
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install conllu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2022-12-09 03:06:12.520190: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 5.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/

In [None]:
from google.colab import drive
drive.mount('/content/drive/')
%cd drive/My Drive/NLPAssignments/hw3

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/NLPAssignments/hw3


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from datasets import load_dataset
from src.dependency_parse import DependencyParse
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from collections import Counter

In [None]:
# Device string used for every tensor and module below: the GPU when PyTorch
# reports one as available, otherwise the CPU.
current_device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Reserved vocabulary indices for the special tokens. The order matches the
# special-token prefix of the id2token list produced by build_vocab:
# 0 = padding, 1 = [CLS], 2 = [SEP], 3 = unknown token.
PAD_IDX = 0
CLS_IDX = 1
SEP_IDX = 2
UNK_IDX = 3

In [None]:
def build_vocab(text_tokens, max_vocab_size=10000):
    """Build a frequency-ranked vocabulary from tokenized sequences.

    The four special tokens always occupy indices 0-3 in the order
    ``<PAD>``, ``<CLS>``, ``<SEP>``, ``<UNK>`` (the values of the module
    constants PAD_IDX, CLS_IDX, SEP_IDX and UNK_IDX); regular tokens follow
    from index 4 in descending frequency.

    Args:
        text_tokens: iterable of token sequences (one sequence per sentence).
        max_vocab_size: maximum number of distinct regular tokens to keep.

    Returns:
        ``(token2id, id2token)`` where ``id2token[i]`` is the token with
        index ``i`` and ``token2id`` is the exact inverse mapping.
    """
    special_tokens = ["<PAD>", "<CLS>", "<SEP>", "<UNK>"]

    token_count = Counter(token for tokens in text_tokens for token in tokens)
    # Skip literal occurrences of the special tokens so every entry of
    # id2token is unique and token2id stays a true inverse.
    vocab = [
        token
        for token, _ in token_count.most_common(max_vocab_size)
        if token not in special_tokens
    ]

    id2token = special_tokens + vocab
    # Derive the ids from the list positions so both mappings always agree.
    token2id = {token: idx for idx, token in enumerate(id2token)}

    return token2id, id2token

In [None]:
def token2index(tokens_data,token2id):
    """Convert every token sequence in ``tokens_data`` to a list of ids.

    Args:
        tokens_data: iterable of token sequences (one per sentence).
        token2id: dictionary mapping tokens to integer ids.

    Returns:
        A list with one list of ids per input sequence. Tokens missing from
        ``token2id`` map to the unknown-token id.
    """
    # Out-of-vocabulary tokens must map to <UNK>; index 1 is <CLS>.
    unk_id = token2id.get("<UNK>", UNK_IDX)
    return [
        [token2id.get(token, unk_id) for token in tokens]
        for tokens in tqdm(tokens_data)
    ]

def encode_data(token_list,token2id):
    """Map each sequence of tokens to the corresponding sequence of ids.

    Args:
        token_list: iterable of token sequences.
        token2id: dictionary mapping tokens to integer ids.

    Returns:
        A list with one list of ids per input sequence. A token missing from
        ``token2id`` is encoded as the id of ``"<UNK>"`` when the vocabulary
        has that entry; otherwise it is encoded as ``None``.
    """
    # Fall back to <UNK> so unseen labels do not turn into None, which cannot
    # be converted to a tensor downstream.
    unk_id = token2id.get("<UNK>")
    return [
        [token2id.get(token, unk_id) for token in tokens]
        for tokens in token_list
    ]

def encode_rel_pos(rel_pos_list,rel_pos_vocab):
    """Map relative head offsets to their indices in ``rel_pos_vocab``.

    Args:
        rel_pos_list: iterable of sequences of relative positions.
        rel_pos_vocab: list whose positions define the index of each entry.

    Returns:
        A list with one list of indices per input sequence.

    Raises:
        ValueError: if an offset is missing from ``rel_pos_vocab`` and the
            vocabulary has no ``"[UNK]"`` entry to fall back to.
    """
    # Build the lookup once instead of scanning the list for every element.
    # setdefault keeps the first occurrence, matching list.index.
    index_of = {}
    for idx, entry in enumerate(rel_pos_vocab):
        index_of.setdefault(entry, idx)
    unk_idx = index_of.get("[UNK]")

    rel_pos_idx_list = []
    for rel_pos in rel_pos_list:
        rel_idx = []
        for rel in rel_pos:
            idx = index_of.get(rel, unk_idx)
            if idx is None:
                raise ValueError(f"{rel!r} is not in rel_pos_vocab")
            rel_idx.append(idx)
        rel_pos_idx_list.append(rel_idx)

    return rel_pos_idx_list

In [None]:
def make_data(ud_dataset):
    """Extract texts, tokens and dependency targets from one dataset split.

    For every token the target is the offset from the token to its head
    (``head - position - 1`` with a 0-based ``position``); tokens labelled
    ``'root'`` get offset 0.

    Args:
        ud_dataset: iterable of Hugging Face example dicts accepted by
            ``DependencyParse.from_huggingface_dict``.

    Returns:
        A tuple ``(dependency_df, rel_pos_vocab, dep_pos_vocab, deprel_list,
        token_list)``:
        - dependency_df: DataFrame with columns 'text', 'rel_pos', 'dep_label'.
        - rel_pos_vocab: ``["[PAD]", "[UNK]"]`` followed by the distinct
          relative offsets seen in this split.
        - dep_pos_vocab: flat list of every dependency label occurrence
          (duplicates included).
        - deprel_list: per-sentence lists of dependency labels.
        - token_list: per-sentence lists of tokens.
    """
    text_list = []
    rel_list = []
    deprel_list = []
    rel_pos_vocab = []
    dep_pos_vocab = []
    token_list = []

    # Iterate over the split that was passed in, not the global training split.
    for example in ud_dataset:
        dependency_parse_obj = DependencyParse.from_huggingface_dict(example)
        text_list.append(dependency_parse_obj.text)
        token_list.append(dependency_parse_obj.tokens)
        rel_position_list = [
            0 if label == 'root' else int(head) - position - 1
            for position, (_, head, label) in enumerate(
                zip(
                    dependency_parse_obj.tokens,
                    dependency_parse_obj.heads,
                    dependency_parse_obj.deprel,
                )
            )
        ]
        rel_pos_vocab.extend(rel_position_list)
        rel_list.append(rel_position_list) # Adding list of relative position labels
        deprel_list.append(dependency_parse_obj.deprel) # Adding list of dependency labels
        dep_pos_vocab.extend(dependency_parse_obj.deprel)

    dependency_df = pd.DataFrame({'text':text_list,'rel_pos':rel_list,'dep_label':deprel_list})
    # Sort the distinct offsets so the label -> index mapping is reproducible.
    rel_pos_vocab = sorted(set(rel_pos_vocab))
    rel_pos_vocab.insert(0,"[UNK]")
    rel_pos_vocab.insert(0,"[PAD]")
    print ("Relative position Vocab Size is "+str(len(rel_pos_vocab)))
    return dependency_df,rel_pos_vocab,dep_pos_vocab,deprel_list,token_list

# Load the train and validation splits of the "en_gum" configuration of the
# "universal_dependencies" dataset.
ud_dataset_train = load_dataset("universal_dependencies","en_gum", split="train")
ud_dataset_valid = load_dataset("universal_dependencies","en_gum", split="validation")
# For each split: the DataFrame, the relative-position vocabulary, the flat
# list of dependency labels, the per-sentence label lists and the token lists.
train_df,rel_pos_vocab_train,dep_pos_vocab_train,deprel_list_train,token_list_train = make_data(ud_dataset_train)
valid_df,rel_pos_vocab_valid,dep_pos_vocab_valid,deprel_list_valid,token_list_valid = make_data(ud_dataset_valid)

# Split name -> DataFrame with 'text', 'rel_pos' and 'dep_label' columns.
ud_data = {"train":train_df,"valid":valid_df}




Relative position Vocab Size is 122
Relative position Vocab Size is 122


In [None]:
# Word vocabulary and the indexed training sentences.
token2id, id2token = build_vocab(token_list_train, max_vocab_size=10000)
text_token_train = token2index(token_list_train,token2id)
# Dependency-label vocabulary, built from the per-sentence label lists.
token2iddep, id2tokendep = build_vocab(deprel_list_train, max_vocab_size=10000)
# Index the per-sentence label lists. The flat dep_pos_vocab_train list holds
# bare label strings, so token2index would iterate over their characters.
text_dep_train = token2index(deprel_list_train,token2iddep)

100%|██████████| 4287/4287 [00:00<00:00, 81319.59it/s]
100%|██████████| 81861/81861 [00:00<00:00, 95346.40it/s]


In [None]:
class FineTuneDataset(Dataset):
    """Dataset pairing raw sentence text with pre-encoded label tensors.

    Dependency labels are encoded with the module-level ``token2iddep``
    mapping and relative positions with ``rel_pos_vocab_train``; both are
    converted to ``torch.long`` tensors on ``current_device`` at construction.
    """

    def __init__(self, text_list, dep_label_list ,rel_pos_list):
        """
        @param text_list: list of sentence texts
        @param dep_label_list: list of per-sentence dependency labels
        @param rel_pos_list: list of per-sentence relative positions
        """
        self.text_list = text_list
        # Each sample is wrapped in a one-element list before encoding, so
        # every stored tensor has a leading dimension of size 1.
        self.dep_tensors = [
            torch.tensor(
                encode_data([labels], token2iddep), dtype=torch.long
            ).to(current_device)
            for labels in dep_label_list
        ]
        self.rel_tensors = [
            torch.tensor(
                encode_rel_pos([offsets], rel_pos_vocab_train), dtype=torch.long
            ).to(current_device)
            for offsets in rel_pos_list
        ]

    def __len__(self):
        """Return the number of sentences."""
        return len(self.text_list)

    def __getitem__(self, key):
        """Return ``[text, dependency-label tensor, relative-position tensor]``."""
        return [self.text_list[key], self.dep_tensors[key], self.rel_tensors[key]]

In [None]:
# Wrap each split's 'text', 'dep_label' and 'rel_pos' columns in a dataset.
ud_train = FineTuneDataset(list(ud_data['train']['text']),list(ud_data['train']['dep_label']),list(ud_data['train']['rel_pos']))
ud_valid = FineTuneDataset(list(ud_data['valid']['text']),list(ud_data['valid']['dep_label']),list(ud_data['valid']['rel_pos']))

In [None]:
def pad_collate_fn(batch):
    """Regroup a batch of samples into three parallel lists.

    Despite the name, no padding is applied here: the first, second and
    third field of every sample are simply gathered into separate lists.

    Args:
        batch: list of samples, each indexable at positions 0, 1 and 2.

    Returns:
        ``[texts, dependency labels, relative positions]``.
    """
    texts, dep_labels, rel_positions = [], [], []
    for sample in batch:
        texts.append(sample[0])
        dep_labels.append(sample[1])
        rel_positions.append(sample[2])

    return [texts, dep_labels, rel_positions]

In [None]:
# Batches of 32 samples, collated by pad_collate_fn into three parallel lists.
# Note that shuffle=True is set for the validation loader as well.
ud_train_generator = DataLoader(ud_train, batch_size=32,shuffle=True,collate_fn=pad_collate_fn)
ud_valid_generator = DataLoader(ud_valid, batch_size=32,shuffle=True,collate_fn=pad_collate_fn)
# Split name -> DataLoader.
ud_generator = {"train":ud_train_generator,"valid":ud_valid_generator}

In [None]:
class DependencyParseBertModel(nn.Module):
    """DistilBERT encoder with two per-token classification heads.

    One linear head scores relative head positions and the other scores
    dependency labels; both read the encoder's final hidden states.
    """

    def __init__(self,options):
        """
        @param options: dict with 'd_hidden' (encoder hidden size),
            'rel_vocab' (number of relative-position classes) and
            'dep_vocab' (number of dependency-label classes)
        """
        super().__init__()

        self.distil_bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.projection_rel = nn.Linear(options['d_hidden'],options['rel_vocab'])
        self.projection_dep = nn.Linear(options['d_hidden'],options['dep_vocab'])

    def forward(self,encoded_input_tensor):
        """Return ``(rel_logits, dep_logits)`` for an encoded batch.

        @param encoded_input_tensor: mapping of tokenizer outputs, unpacked
            as keyword arguments to the encoder
        @return: relative-position logits [batch_size, seq_len, rel_vocab]
            and dependency-label logits [batch_size, seq_len, dep_vocab]
        """
        # Read the field by name: unpacking .values() breaks as soon as the
        # encoder returns additional outputs (hidden states, attentions).
        output = self.distil_bert_model(**encoded_input_tensor).last_hidden_state # [batch_size,seq_len,d_hidden]
        rel_dep_logits = self.projection_rel(output) # [batch_size,seq_len,rel_vocab]
        dep_label_logits = self.projection_dep(output) # [batch_size,seq_len,dep_vocab]
        return rel_dep_logits,dep_label_logits

In [None]:
# Head sizes and encoder width for DependencyParseBertModel.
# NOTE(review): 'rel_vocab' and 'dep_vocab' are hard-coded. Each must be at
# least as large as the biggest class id produced for its labels, otherwise
# CrossEntropyLoss rejects the targets - confirm 48 against the
# dependency-label vocabulary.
options = {
    'rel_vocab':122,
    'dep_vocab':48,
    'd_hidden':768
    }

model = DependencyParseBertModel(options).to(current_device)

# Class id 0 is the padding id for both label sets, so it is excluded from
# both losses.
rel_criterion = nn.CrossEntropyLoss(ignore_index=0).to(current_device)
dep_criterion = nn.CrossEntropyLoss(ignore_index=0).to(current_device)

# Optimise every parameter that still requires gradients.
params_to_update = [
    param for _, param in model.named_parameters() if param.requires_grad
]

optimizer = optim.Adam(params_to_update, lr=2e-05, eps=1e-08)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def custom_tokenize(bert_tokenized_tokens):
    """Return the positions of the word pieces that start a new word.

    A piece is dropped when it is a ``##`` continuation, when it is a
    hyphen, or when it directly follows a hyphen; every other position is
    kept, in order.

    Args:
        bert_tokenized_tokens: sequence of word-piece strings.

    Returns:
        List of kept indices into ``bert_tokenized_tokens``.
    """
    kept_positions = []
    previous_piece = None
    for position, piece in enumerate(bert_tokenized_tokens):
        is_continuation = piece.startswith("##")
        is_hyphen = piece == "-"
        follows_hyphen = position != 0 and previous_piece == "-"
        previous_piece = piece
        if not (is_continuation or is_hyphen or follows_hyphen):
            kept_positions.append(position)

    return kept_positions

def pad_list_of_tensors(list_of_tensors, pad_token,max_len):
    """Right-pad every tensor to ``max_len`` and stack them into one tensor.

    Each tensor is extended along its last dimension with ``pad_token``,
    the results are stacked along dimension 1 and the leading dimension is
    squeezed away. The callers in this file pass tensors of shape
    ``(1, length)``, which yields a ``(batch, max_len)`` result.

    Args:
        list_of_tensors: tensors to pad.
        pad_token: fill value for the added positions.
        max_len: target size of the last dimension.

    Returns:
        The stacked, padded tensor on ``current_device``.
    """
    padded_rows = []

    for tensor in list_of_tensors:
        missing = max_len - tensor.size(-1)
        filler = torch.tensor([[pad_token] * missing], dtype=torch.long).to(current_device)
        padded_rows.append(torch.cat([tensor.to(current_device), filler], dim = -1))

    stacked = torch.stack(padded_rows, dim=1)

    return stacked.squeeze(0).to(current_device)

In [None]:
import os

epochs = 3
lambdas = [0.25,0.5,0.75]
# Weight of the relative-position loss for this run; the dependency-label
# loss is weighted by (1 - rel_loss_weight).
rel_loss_weight = lambdas[0]
distil_bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Fail fast when a classification head is too small for its label ids.
# Otherwise CrossEntropyLoss only rejects the out-of-range target once the
# offending label shows up in a batch, part-way through an epoch.
if len(rel_pos_vocab_train) > options['rel_vocab']:
    raise ValueError(
        f"options['rel_vocab']={options['rel_vocab']} is smaller than the "
        f"relative-position vocabulary ({len(rel_pos_vocab_train)} entries)"
    )
if max(token2iddep.values()) >= options['dep_vocab']:
    raise ValueError(
        f"options['dep_vocab']={options['dep_vocab']} cannot represent "
        f"dependency-label id {max(token2iddep.values())}"
    )


def _first_subword_logits(text):
    """Run the model on one sentence and keep the word-initial positions.

    Returns (rel_logits, dep_logits), each of shape [1, n_kept, n_classes].
    """
    # The tokenizer adds [CLS]/[SEP] itself; adding them to the text as well
    # would duplicate them and shift every selected position by one.
    encoded = distil_bert_tokenizer(text, return_tensors='pt', truncation=True).to(current_device)
    pieces = distil_bert_tokenizer.convert_ids_to_tokens(encoded['input_ids'][0].tolist())
    # Derive the kept positions from the very ids fed to the model, and drop
    # the first/last position ([CLS]/[SEP]) because the labels only cover words.
    # NOTE(review): custom_tokenize is a heuristic; the kept word pieces are
    # not guaranteed to line up one-to-one with the treebank tokens.
    last_position = len(pieces) - 1
    keep = [idx for idx in custom_tokenize(pieces) if 0 < idx < last_position]
    keep_idx = torch.tensor(keep, dtype=torch.long, device=current_device)

    rel_logits, dep_logits = model(encoded)
    return rel_logits.index_select(1, keep_idx), dep_logits.index_select(1, keep_idx)


def _sentence_logits(text_list):
    """Return per-sentence lists of relative-position and label logits."""
    rel_per_sentence = []
    dep_per_sentence = []
    for text in text_list:
        rel_logits, dep_logits = _first_subword_logits(text)
        rel_per_sentence.append(rel_logits)
        dep_per_sentence.append(dep_logits)
    return rel_per_sentence, dep_per_sentence


def _pad_batch(rel_per_sentence, dep_per_sentence, rel_tensors, dep_tensors):
    """Pad logits and targets of one batch to a common length.

    Returns (rel_logits, dep_logits, rel_targets, dep_targets) with logits of
    shape [batch, n_classes, max_len] and targets of shape [batch, max_len].
    """
    # Cover both the logits and the labels so neither side is ever longer
    # than the padded length.
    max_len = max(
        [logits.size(1) for logits in rel_per_sentence]
        + [target.size(-1) for target in rel_tensors]
        + [target.size(-1) for target in dep_tensors]
    )

    def _pad_and_stack(per_sentence):
        padded = [
            torch.cat(
                [logits, logits.new_zeros(1, max_len - logits.size(1), logits.size(2))],
                dim=1,
            )
            for logits in per_sentence
        ]
        # CrossEntropyLoss expects the class dimension second. This needs a
        # transpose; reshape would keep the memory order and mix up the
        # position and class axes.
        return torch.cat(padded, dim=0).permute(0, 2, 1)

    return (
        _pad_and_stack(rel_per_sentence),
        _pad_and_stack(dep_per_sentence),
        pad_list_of_tensors(rel_tensors, 0, max_len),
        pad_list_of_tensors(dep_tensors, 0, max_len),
    )


for epoch_number in range(epochs):
    model.train()
    for text_list, dep_tensors, rel_tensors in tqdm(ud_generator["train"]):
        optimizer.zero_grad()

        rel_per_sentence, dep_per_sentence = _sentence_logits(text_list)
        rel_logits_pred_tensor, dep_logits_pred_tensor, rel_targets, dep_targets = _pad_batch(
            rel_per_sentence, dep_per_sentence, rel_tensors, dep_tensors
        )

        rel_loss = rel_criterion(rel_logits_pred_tensor, rel_targets)
        print ("Relative Position Loss",rel_loss.item())
        dep_loss = dep_criterion(dep_logits_pred_tensor, dep_targets)
        print ("Dependecy Label Loss",dep_loss.item())

        overall_loss = rel_loss_weight*rel_loss + (1-rel_loss_weight)*dep_loss
        print ("Overall Loss",overall_loss.item())

        overall_loss.backward()

        # Have gradients at this point.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2)
        optimizer.step()

    model.eval()
    valid_loss_total = 0.0
    valid_batches = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for text_list, dep_tensors, rel_tensors in tqdm(ud_generator["valid"]):
            rel_per_sentence, dep_per_sentence = _sentence_logits(text_list)

            for rel_logits_tensor, dep_logits_tensor in zip(rel_per_sentence, dep_per_sentence):
                print ("Validation Rel Logits Argmax:",rel_logits_tensor.argmax(-1))
                print ("Validation Rel Logits Argmax Shape:",rel_logits_tensor.argmax(-1).shape)
                print ("Validation Dep Logits Argmax",dep_logits_tensor.argmax(-1))
                print ("Validation Dep Logits Argmax Shape",dep_logits_tensor.argmax(-1).shape)

            rel_logits_pred_tensor, dep_logits_pred_tensor, rel_targets, dep_targets = _pad_batch(
                rel_per_sentence, dep_per_sentence, rel_tensors, dep_tensors
            )
            rel_loss = rel_criterion(rel_logits_pred_tensor, rel_targets)
            dep_loss = dep_criterion(dep_logits_pred_tensor, dep_targets)
            valid_loss_total += (rel_loss_weight*rel_loss + (1-rel_loss_weight)*dep_loss).item()
            valid_batches += 1

    if valid_batches:
        print ("Validation Overall Loss",valid_loss_total / valid_batches)

# Saving Model
# Create the target directory first so training is not lost to a missing path.
os.makedirs('./saved_models', exist_ok=True)
torch.save({
    'options': options,
    'model_dict': model.state_dict()
    },f'./saved_models/bert-parser-{rel_loss_weight}.pt')

0it [00:00, ?it/s]

Relative Position Loss 4.80020809173584
Dependecy Label Loss 3.876997709274292
Overall Loss 4.107800483703613


1it [00:12, 12.18s/it]

Relative Position Loss 4.814797878265381
Dependecy Label Loss 3.881988525390625
Overall Loss 4.1151909828186035


2it [00:23, 11.88s/it]

Relative Position Loss 4.802700519561768
Dependecy Label Loss 3.87044620513916
Overall Loss 4.103509902954102


3it [00:36, 12.26s/it]

Relative Position Loss 4.842522144317627
Dependecy Label Loss 3.880371570587158
Overall Loss 4.120909214019775


4it [00:48, 12.20s/it]

Relative Position Loss 4.8447771072387695
Dependecy Label Loss 3.8700318336486816
Overall Loss 4.113718032836914


5it [01:00, 12.11s/it]

Relative Position Loss 4.807460308074951
Dependecy Label Loss 3.8675732612609863
Overall Loss 4.102545261383057


6it [01:14, 12.65s/it]

Relative Position Loss 4.816173553466797
Dependecy Label Loss 3.8593785762786865
Overall Loss 4.098577499389648


7it [01:25, 12.25s/it]

Relative Position Loss 4.823151111602783
Dependecy Label Loss 3.85317063331604
Overall Loss 4.09566593170166


8it [01:37, 12.18s/it]

Relative Position Loss 4.797887802124023
Dependecy Label Loss 3.8429136276245117
Overall Loss 4.081657409667969


9it [01:48, 11.87s/it]

Relative Position Loss 4.8195481300354
Dependecy Label Loss 3.8388705253601074
Overall Loss 4.084039688110352


10it [02:00, 11.78s/it]

Relative Position Loss 4.786026477813721
Dependecy Label Loss 3.839552164077759
Overall Loss 4.076170921325684


11it [02:12, 11.72s/it]

Relative Position Loss 4.799475193023682
Dependecy Label Loss 3.845923900604248
Overall Loss 4.084311485290527


12it [02:24, 11.81s/it]

Relative Position Loss 4.827788352966309
Dependecy Label Loss 3.822166681289673
Overall Loss 4.073572158813477


13it [02:38, 12.16s/it]

Relative Position Loss 4.800118923187256





IndexError: ignored