# Imports & Installs

In [167]:
!pip install datasets transformers[torch] huggingface_hub
!pip install accelerate -U
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.


In [2]:
from transformers import DistilBertTokenizer, DataCollatorForTokenClassification, BertForSequenceClassification,TrainingArguments, Trainer
from sklearn.metrics import f1_score
import torch
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# Load Data from Google Drive

In [170]:
# Connect to Google Drive
from google.colab import drive
import os
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/NLP')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def load_data(filename):
  with open(filename, 'r', encoding="utf8") as file:
    return [line.strip() for line in file.readlines()]

train_sents = load_data("NYT29/train.sent")
dev_sents = load_data("NYT29/dev.sent")
test_sents = load_data("NYT29/test.sent")
relations = load_data("NYT29/relations.txt")
train_tups = load_data("NYT29/train.tup")
dev_tups = load_data("NYT29/dev.tup")
test_tups = load_data("NYT29/test.tup")
relation_to_id = {relation: idx for idx, relation in enumerate(relations)}

In [4]:
# Load in our data sets as pandas data frames
train_sents_df = pd.DataFrame(train_sents)
dev_sents_df = pd.DataFrame(dev_sents)
test_sents_df = pd.DataFrame(test_sents)
train_tups_df = pd.DataFrame(train_tups)
dev_tups_df = pd.DataFrame(dev_tups)
test_tups_df = pd.DataFrame(test_tups)

# 1. Preprocessing

For each set we will create a pandas DataFrame with the columns:


*   sentence
*   entity1
*   entity2
*   label (represents relation as int)


In [5]:

# THis method helps us extract entities and their relations
def get_entities(sent, tups):
  sent_list = []
  e1_list = []
  e2_list = []
  relation_list = []

  for sent, tup in zip(sent, tups):
    relations = tup.split('|')

    for ent in relations:
      entity1, entity2, relation = [rel.strip() for rel in ent.split(';')]
      sent_list.append(sent)
      e1_list.append(entity1)
      e2_list.append(entity2)
      relation_list.append(relation)

  return sent_list, e1_list, e2_list, relation_list

# convert our relations to ints
def convert_relation(relations):
  return [relation_to_id[key] for key in relations]

## Generating All Possible Permuations with Other Label

In [6]:
def permutations(sentences, e1, e2, label):

  schema={'sent': 'str', 'e1': 'str', 'e2': 'str', 'label': 'int64'}
  df = pd.DataFrame(columns=schema.keys()).astype(schema)

  local_entities = []
  index = 0
  all_entities = []
  all_entities_with_label = []
  labels = []


  while index < len(e1):
    local_entities = []

    # builds a sentence index list
    sent_index_list = [index]
    local_entities.append((e1[index], e2[index]))
    all_entities.append((e1[index], e2[index]))
    all_entities_with_label.append((e1[index], e2[index]))
    labels.append(label[index])
    # builds a counter so we can get all of the same sentences
    counter = index + 1

    # gets our current and the next sentence
    current_sent = sentences[index]
    current_e1 = e1[index]
    current_e2 = e2[index]
    if counter != len(e1):
      next_sent    = sentences[counter]
      next_e1 = e1[counter]
      next_e2 = e2[counter]
    # gets each next sentence while they are the same
      while current_sent == next_sent and current_e1 != next_e1 and current_e2 != next_e2:
        # appends to the list these indices
        sent_index_list.append(counter)
        local_entities.append((e1[counter], e2[counter]))
        all_entities.append((e1[counter], e2[counter]))
        all_entities_with_label.append((e1[counter], e2[counter]))
        labels.append(label[counter])

        counter = counter + 1
        next_sent = sentences[counter]

        if counter == len(e1):
          break



    e1_sent_list = []
    e2_sent_list = []

    for x in range(0, len(sent_index_list)):
      # we now have all of the e1 and e2 for our current sentences
      e1_sent_list.append(e1[sent_index_list[x]])
      e2_sent_list.append(e2[sent_index_list[x]])

      for y in range(0, len(e1_sent_list)):
        for z in range(0, len(e1_sent_list)):

          if e1[sent_index_list[y]] != e2[sent_index_list[z]]:
            new_tup_1 = (e1[sent_index_list[y]], e2[sent_index_list[z]])
          else:
            new_tup_1 = ""

          if e2[sent_index_list[y]] != e1[sent_index_list[z]]:
            new_tup_2 = (e2[sent_index_list[y]], e1[sent_index_list[z]])
          else:
            new_tup_2 = ""

          if e1[sent_index_list[y]] != e1[sent_index_list[z]]:
            new_tup_3 = (e1[sent_index_list[y]], e1[sent_index_list[z]])
          else:
            new_tup_3 = ""

          if e2[sent_index_list[y]] != e2[sent_index_list[z]]:
            new_tup_4 = (e2[sent_index_list[y]], e2[sent_index_list[z]])
          else:
            new_tup_4 = ""

          if new_tup_1 not in local_entities and new_tup_1 != "":
            local_entities.append(new_tup_1)
            all_entities.append(new_tup_1)

          if new_tup_2 not in local_entities and new_tup_2 != "":
            local_entities.append(new_tup_2)
            all_entities.append(new_tup_2)

          if new_tup_3 not in local_entities and new_tup_3 != "":
            local_entities.append(new_tup_3)
            all_entities.append(new_tup_3)

          if new_tup_4 not in local_entities and new_tup_4 != "":
            local_entities.append(new_tup_4)
            all_entities.append(new_tup_4)
    # we now have all possible entities for local and have updated all of the entities, we also have the same num sentences
    for i in range(0, len(local_entities)):
      new_sent = sentences[index]
      new_e1 = local_entities[i][0]
      new_e2 = local_entities[i][1]


      if local_entities[i] not in all_entities_with_label:
        new_label = 29
        all_entities_with_label.append(local_entities[i])
        labels.append(new_label)
        df = dataframe_handler(new_sent, new_e1, new_e2, new_label, df)
      else:
        if len(sent_index_list) == 1:
          if i != 1:
            new_label = label[index]
            df = dataframe_handler(new_sent, new_e1, new_e2, new_label, df)
        else:
          loc_index = all_entities_with_label.index(local_entities[i])
          new_label = labels[loc_index]
          df = dataframe_handler(new_sent, new_e1, new_e2, new_label, df)




    index += len(sent_index_list)
  return df


def dataframe_handler(new_sent, new_e1, new_e2, new_label, df):
  data = {
      "sent": [new_sent],
      "e1": [new_e1],
      "e2": [new_e2],
      "label": [new_label]
  }
  schema={'sent': 'str', 'e1': 'str', 'e2': 'str', 'label': 'int64'}
  new_df = pd.DataFrame([[new_sent, new_e1, new_e2, new_label]], columns=schema.keys()).astype(schema)
  df = pd.concat([df, new_df], ignore_index=True)
  return df


## Preprocessing Training Set


In [7]:
train_df = pd.DataFrame()

# Get the seperated entities
train_sentences, train_e1, train_e2, train_relations = get_entities(train_sents, train_tups)
# converts our relations to the necessary int labels
train_label = convert_relation(train_relations)

train_df["sent"] = train_sentences
train_df["e1"] = train_e1
train_df["e2"] = train_e2
train_df["label"] = train_label

train_df = permutations(train_sentences, train_e1, train_e2, train_label)

# export as csv so we can reload
train_set = train_df.to_csv('train_NYT29.csv', index = False)

train_df.head(10)

Unnamed: 0,sent,e1,e2,label
0,"then terrorism struck again , this time in the...",jakarta,indonesia,0
1,"then terrorism struck again , this time in the...",indonesia,jakarta,1
2,"then terrorism struck again , this time in the...",indonesia,jakarta,1
3,a12 new york\/region b1-7 enclave for middle c...,stuyvesant town,manhattan,3
4,a12 new york\/region b1-7 enclave for middle c...,manhattan,stuyvesant town,29
5,a12 new york\/region b1-7 enclave for middle c...,peter cooper village,manhattan,3
6,a12 new york\/region b1-7 enclave for middle c...,manhattan,peter cooper village,29
7,"before long , though , he 's continent-hopping...",spain,pamplona,4
8,"before long , though , he 's continent-hopping...",pamplona,spain,29
9,general casey said the iraqi forces had little...,syria,euphrates,4


## Preprocessing Testing Set

In [8]:
test_df = pd.DataFrame()

# Get the seperated entities
test_sentences, test_e1, test_e2, test_relations = get_entities(test_sents, test_tups)
# converts our relations to the necessary int labels
test_label = convert_relation(test_relations)

test_df["sent"] = test_sentences
test_df["e1"] = test_e1
test_df["e2"] = test_e2
test_df["label"] = test_label

test_df = permutations(test_sentences, test_e1, test_e2, test_label)

# export as csv so we can reload
test_set = test_df.to_csv('test_NYT29.csv', index = False)

test_df.head(10)

Unnamed: 0,sent,e1,e2,label
0,a1 zoning laws slow growth the 20-year buildin...,new york city,staten island,4
1,a1 zoning laws slow growth the 20-year buildin...,staten island,new york city,29
2,"like them , he hopes that their gathering will...",israel,west bank,4
3,"like them , he hopes that their gathering will...",west bank,israel,29
4,if we can figure out how to program thousands ...,california,berkeley,4
5,if we can figure out how to program thousands ...,berkeley,california,29
6,outsiders who have been mentioned as possible ...,carol bellamy,bear stearns,8
7,outsiders who have been mentioned as possible ...,bear stearns,carol bellamy,29
8,"anselm kiefer 's '' falling stars , '' the fir...",paris,grand palais,4
9,"anselm kiefer 's '' falling stars , '' the fir...",grand palais,paris,29


## Preprocessing Validation Set


In [11]:
dev_df = pd.DataFrame()

# Get the seperated entities
dev_sentences, dev_e1, dev_e2, dev_relations = get_entities(dev_sents, dev_tups)
# converts our relations to the necessary int labels
dev_label = convert_relation(dev_relations)

dev_df["sent"] = dev_sentences
dev_df["e1"] = dev_e1
dev_df["e2"] = dev_e2
dev_df["label"] = dev_label

dev_df = permutations(dev_sentences, dev_e1, dev_e2, dev_label)

# export as csv so we can reload
dev_set = dev_df.to_csv('dev_NYT29.csv', index = False)



## Setting up NN Settings

In [4]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 5
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

## Loading Data and Encoding it

In [5]:
train_dataset = pd.read_csv('train_NYT29.csv')
test_dataset = pd.read_csv('test_NYT29.csv')
dev_dataset = pd.read_csv('dev_NYT29.csv')

In [6]:
encode_dict = {}

def encode_cat(x):
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]

train_dataset['label'] = train_dataset['label'].apply(lambda x: encode_cat(x))
test_dataset['label'] = train_dataset['label'].apply(lambda x: encode_cat(x))
dev_dataset['label'] = train_dataset['label'].apply(lambda x: encode_cat(x))

In [7]:
class Tokenizing(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        sent = str(self.data.sent[index])
        e1 = str(self.data.e1[index])
        e2 = str(self.data.e2[index])
        relation = sent + '[SEP]' + e1 + '[SEP]' + e2
        inputs = self.tokenizer.encode_plus(
            relation,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.label[index], dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [8]:
# Creating the dataset and dataloader for the neural network

# Need to load in data here properly
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))
print("DEV Dataset: {}".format(dev_dataset.shape))

training_set = Tokenizing(train_dataset, tokenizer, MAX_LEN)
testing_set = Tokenizing(test_dataset, tokenizer, MAX_LEN)
development_set = Tokenizing(dev_dataset, tokenizer, MAX_LEN)

TRAIN Dataset: (104478, 4)
TEST Dataset: (7888, 4)
DEV Dataset: (13400, 4)


## Setting up more Parameters

In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

dev_params  = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
development_loader = DataLoader(development_set, **dev_params)

## Creating the DistillBERT Model

In [10]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 30)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [14]:
model = DistillBERTClass()
model.to(device)

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

## Defining Necessary Scoring Functions

In [13]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [14]:
# Function to calcuate the F1 score of the model
def calculate_f1_score(predictions, labels):
  f1 = f1_score(labels, predictions, average = 'micro')
  return f1

## Build our Training Loop

In [17]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    f1 = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        f1 += calculate_f1_score(big_idx.tolist(), targets.tolist())

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _%5000==0:
            f1_step = (f1*100)/nb_tr_examples
            print(f"Training F1 per 5000 steps: {f1_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    epoch_f1 = (f1*100)/nb_tr_examples
    print(f"Training F1 Epoch: {epoch_f1}")

    return

In [18]:
for epoch in range(EPOCHS):
    train(epoch)

Training F1 per 5000 steps: 0.0
Training F1 per 5000 steps: 12.71495700859828
Training F1 per 5000 steps: 15.700929907009298
Training F1 per 5000 steps: 17.208019465368974
Training F1 per 5000 steps: 18.10784460776961
Training F1 per 5000 steps: 18.724501019959202
Training F1 Epoch: 18.83434790099351
Training F1 per 5000 steps: 25.0
Training F1 per 5000 steps: 22.050589882023594
Training F1 per 5000 steps: 22.062793720627937
Training F1 per 5000 steps: 22.095610292647155
Training F1 per 5000 steps: 22.180765961701916
Training F1 per 5000 steps: 22.23086076556938
Training F1 Epoch: 22.238413828748637
Training F1 per 5000 steps: 25.0
Training F1 per 5000 steps: 22.984153169366127
Training F1 per 5000 steps: 22.957079292070794
Training F1 per 5000 steps: 22.992217185520964
Training F1 per 5000 steps: 23.024161291935403
Training F1 per 5000 steps: 23.046828126874924
Training F1 Epoch: 23.04001799421888
Training F1 per 5000 steps: 25.0
Training F1 per 5000 steps: 23.461557688462307
Training

In [19]:
torch.save(model, 'multi_class_model.pt')

## Build our Testing Loop

In [12]:
model = torch.load('multi_class_model.pt')

In [87]:
def valid(model, testing_loader):
    model.eval()
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    fin_outputs = []
    fin_targets = []
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)

            outputs = model(ids, mask)
            big_val, big_idx = torch.max(outputs.data, dim=1)

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(big_idx.cpu().detach().numpy().tolist())
            

        optimizer.zero_grad()
        # # When using GPU
        optimizer.step()


    return fin_targets, fin_outputs

In [88]:
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

f1_targets, f1_outputs = valid(model, testing_loader)
f1 = calculate_f1_score(f1_outputs, f1_targets)
print("F1 on test data = ", f1)

This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
F1 on test data =  0.08050202839756593


In [89]:
f1_targets, f1_outputs = valid(model, development_loader)
f1 = calculate_f1_score(f1_outputs, f1_targets)
print("F1 on development data = ", f1)

F1 on development data =  0.08835820895522388


In [90]:
f1_targets, f1_outputs = valid(model, training_loader)
f1 = calculate_f1_score(f1_outputs, f1_targets)
print("F1 on training data = ",  f1)

F1 on training data =  0.9710465361128658
