In [None]:
import datetime
import re
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

from preprocessing import preprocess

# NOTE(review): later cells also use names such as AutoTokenizer, AutoModel,
# get_linear_schedule_with_warmup, prepare_features, flat_accuracy, precision
# and recall that are not imported in the visible cells -- confirm where they
# come from so the notebook survives Restart & Run All.

In [None]:
class SentencePairDataset(Dataset):
  """Dataset of labelled span pairs that are tokenized on access.

  The constructor copies the ``span1``, ``span2`` and ``label`` columns of
  ``df`` into Python lists and loads the tokenizer for ``model_string``.
  Features are built by ``prepare_features`` each time an item is requested.
  """

  def __init__(self, df, maxlen = 512, model_string="", do_lower_case=True):
    """
    Args:
      df: DataFrame providing the columns ``span1``, ``span2`` and ``label``.
      maxlen: value forwarded to ``prepare_features`` for every item.
      model_string: name or path given to ``AutoTokenizer.from_pretrained``.
      do_lower_case: forwarded to ``AutoTokenizer.from_pretrained``. The
        default (True) is the value this class previously hard-coded.
        NOTE(review): cased checkpoints presumably want False -- confirm.
    """
    self.maxlen = maxlen

    # Copy whole columns instead of walking df.iterrows(): the resulting lists
    # are the same, but no Series is built per row and no progress-bar helper
    # is required.
    self.span1s = df["span1"].tolist()
    self.span2s = df["span2"].tolist()
    self.labels = df["label"].tolist()

    self.tokenizer = AutoTokenizer.from_pretrained(model_string, do_lower_case=do_lower_case)

  def __len__(self):
    """Number of span pairs held by the dataset."""
    return len(self.span1s)

  def __getitem__(self, index):
    """Return ``(tensor, mask, label)`` for the pair stored at ``index``."""
    span1 = self.span1s[index]
    span2 = self.span2s[index]
    label = self.labels[index]

    # Tokenize the pair on demand. TODO switch input
    tensor, mask = prepare_features(self.tokenizer, span1, span2, self.maxlen)

    return tensor, mask, label

In [None]:
class PairClassifier(nn.Module):
  """Pretrained encoder followed by dropout and a two-way linear head.

  The head reads the encoder output at sequence position 0 of the first
  element returned by the encoder.
  """

  def __init__(self, model_string):
    """
    Args:
      model_string: name or path handed to ``AutoModel.from_pretrained``; it
        is also kept on the instance as ``model_string``.
    """
    super().__init__()

    self.model_string = model_string

    #Pretrained
    self.bert_layer = AutoModel.from_pretrained(model_string)

    #Dropout
    self.dropout = nn.Dropout(0.15)

    #Linear
    # Size the head from the encoder's own config so checkpoints whose hidden
    # width is not 768 work without editing this class; 768 remains the
    # fallback when the encoder exposes no ``config.hidden_size``.
    encoder_config = getattr(self.bert_layer, "config", None)
    hidden_size = getattr(encoder_config, "hidden_size", 768)
    self.classifier = nn.Linear(hidden_size, 2)

  def forward(self, tensor, mask):
    """Return the logits of the 2-way head for a batch.

    Args:
      tensor: passed to the encoder as its first positional argument.
      mask: passed to the encoder as ``attention_mask``.
    """
    #Pretrained: first output of the encoder, sequence position 0
    emb = self.bert_layer(tensor, attention_mask = mask)[0][:,0]

    #Dropout
    drop = self.dropout(emb)

    #Linear
    logits = self.classifier(drop)

    return logits

In [None]:
def train_model(model, criterion, optimizer, scheduler, train_dataloader, dev_dataloader, device, epochs, max_patience,
                save_dir="drive/MyDrive/relatedworks/savedmodels/pairmodel"):
  """Train ``model``, validate after every epoch and return the best checkpoint.

  After each epoch the mean per-batch AUROC on ``dev_dataloader`` is compared
  with the best value seen so far. The state dict is saved whenever the score
  is at least as good as the best one; otherwise a patience counter grows and
  training stops once it reaches ``max_patience``.

  Args:
    model: module called as ``model(tokens, mask)``; must expose
      ``model_string``, which names the checkpoint file and is used to rebuild
      the model when the best weights are reloaded.
    criterion: loss called as ``criterion(logits, labels)``.
    optimizer: optimizer stepped once per training batch.
    scheduler: learning-rate scheduler stepped once per training batch.
    train_dataloader: sized iterable of ``(tokens, mask, labels)`` batches.
    dev_dataloader: sized iterable of ``(tokens, mask, labels)`` batches.
    device: device the batches (and the returned model) are moved to.
    epochs: maximum number of epochs.
    max_patience: number of consecutive non-improving epochs that ends training.
    save_dir: directory the checkpoint is written to; it must already exist.

  Returns:
    A ``PairClassifier`` holding the best saved weights, in eval mode on
    ``device``. If no epoch ran, ``model`` itself is returned unchanged.
  """
  #Training
  print("- - T R A I N I N G - -")

  model_name = "{}-{}.mdl".format(model.model_string, datetime.datetime.now().replace(microsecond=0).isoformat()).replace("/","_")
  model_path = "{}/{}".format(save_dir, model_name)

  patience = 0
  best_score = None   # best validation AUROC so far (None until the first epoch finishes)
  best_epoch = None   # zero-based epoch whose weights are currently on disk

  softmax = nn.Softmax(dim=1)
  last_step = len(train_dataloader) - 1

  for epoch_i in range(epochs):

    print()
    print('> Epoch {:} of {:}'.format(epoch_i + 1, epochs))

    t0 = time.time()
    total_loss = 0.0

    # Validation at the end of the previous epoch leaves the model in eval
    # mode; switch back so dropout is active for every training epoch.
    model.train()

    for step, batch in enumerate(train_dataloader):

      if step % 100 == 0 and not step == 0:
        print('├───Batch {:>3,} of {:>3,}. time={:.1f}s.'.format(step, last_step, time.time() - t0))
      if step == last_step:
        print('└───Batch {:>3,} of {:>3,}. time={:.1f}s.'.format(step, last_step, time.time() - t0))

      tokens = batch[0].to(device)
      mask = batch[1].to(device)
      labels = batch[2].to(device)

      model.zero_grad()

      outputs = model(tokens, mask)

      loss = criterion(outputs, labels)
      # Accumulate a plain float: summing the loss tensor itself keeps every
      # batch's autograd history alive for the whole epoch.
      total_loss += loss.item()

      loss.backward()

      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      optimizer.step()
      scheduler.step()

    avg_train_loss = total_loss / max(len(train_dataloader), 1)

    print("")
    print("- Training loss: {0:.2f}".format(avg_train_loss))
    print("- Training Epoch took: {:.1f}s".format(time.time() - t0))

    # Validation

    print()
    print("- - V A L I D A T I N G - -")

    t0 = time.time()

    model.eval()

    total_loss = 0.0
    eval_accuracy, eval_precision, eval_recall, eval_auroc = 0.0, 0.0, 0.0, 0.0
    nb_eval_steps, nb_auroc_steps = 0, 0

    for batch in dev_dataloader:
      tokens = batch[0].to(device)
      mask = batch[1].to(device)
      labels = batch[2].to(device)

      with torch.no_grad():
        logits = model(tokens, mask)
        loss = criterion(logits, labels)

      total_loss += loss.item()

      # Probability assigned to class 1 is the score used for AUROC.
      probs = softmax(logits).detach().cpu().numpy()[:,1]
      logits = logits.detach().cpu().numpy()
      label_ids = labels.to('cpu').numpy()

      eval_accuracy += flat_accuracy(logits, label_ids)
      eval_precision += precision(logits, label_ids)
      eval_recall += recall(logits, label_ids)
      nb_eval_steps += 1

      try:
        eval_auroc += roc_auc_score(label_ids, probs)
        nb_auroc_steps += 1
      except ValueError:
        # AUROC is undefined for a batch containing a single class; such
        # batches are left out of the average instead of aborting validation.
        pass

    eval_loss = total_loss / max(len(dev_dataloader), 1)
    metric_steps = max(nb_eval_steps, 1)

    if nb_auroc_steps:
      auroc_score = eval_auroc / nb_auroc_steps
    else:
      print("")
      print("Warning: AUROC could not be computed for any validation batch; using 0.0.")
      auroc_score = 0.0

    print("")
    print("- Validation Accuracy: {0:.4f}".format(eval_accuracy/metric_steps))
    print("- Validation Precision: {0:.4f}".format(eval_precision/metric_steps))
    print("- Validation Recall: {0:.4f}".format(eval_recall/metric_steps))
    print("- Validation AUROC: {0:.4f}".format(auroc_score))
    print("- Validation loss: {0:.4f}".format(eval_loss))
    print("- Validation took: {:.1f}s".format(time.time() - t0))

    # Save if new best score has been achieved (ties overwrite the checkpoint)
    if best_score is None or auroc_score >= best_score:
      print("")
      print("Saving model..." if best_score is None else "New best score, saving...")
      torch.save(model.state_dict(), model_path)
      best_score = auroc_score
      best_epoch = epoch_i
      patience = 0
    else:
      patience += 1

    if patience >= max_patience:
      print("Maximum patience reached.")
      break

  print("")
  print("Finished Training")
  print("")

  if best_score is None:
    # epochs <= 0: nothing was trained or saved, so there is nothing to reload.
    print("No epoch was run; returning the model unchanged.")
    return model

  print("Best seen AUROC {:.4f} at epoch {}.".format(best_score, best_epoch+1))

  print("Best model saved with filename: {}".format(model_name))
  print("Loading best model...")
  best_model = PairClassifier(model.model_string)
  best_model.load_state_dict(torch.load(model_path, map_location=device))
  # Hand the model back on the device the caller trained on.
  best_model.to(device)
  best_model.eval()
  print("Done.")

  return best_model


In [None]:
# Read the span-pair table and display its row count.
# Optional subsetting, disabled (handy for quick experiments):
#train_df = train_df.sample(frac=0.1)
#train_df = train_df[~train_df.duplicated()]
#train_df = train_df[:200]
trainfile = "drive/MyDrive/relatedworks/relative_span_pairs0-9.csv"
train_df = pd.read_csv(trainfile)
train_df.shape[0]

23080

In [None]:
# --- Run configuration -------------------------------------------------------
pretrained_model_string = "allenai/scibert_scivocab_cased" #"roberta-base" #
epochs, maxlen, max_patience = 10, 512, 3

# --- Device: use CUDA when available, otherwise the CPU ----------------------
use_cuda = torch.cuda.is_available()
DEVICE = torch.device("cuda" if use_cuda else "cpu")
if use_cuda:
  print('Using:', torch.cuda.get_device_name(0))
else:
  print('Using: CPU')

torch.manual_seed(1004)

# --- Data: 90/10 train/validation split, one dataset and loader per split ----
train, val = train_test_split(train_df, test_size = 0.1, shuffle=True, random_state=111)

train_set, dev_set = (
  SentencePairDataset(split, maxlen, pretrained_model_string) for split in (train, val)
)

loader_options = {"batch_size": 12, "num_workers": 2}
train_dataloader = DataLoader(train_set, shuffle=True, **loader_options)
dev_dataloader = DataLoader(dev_set, shuffle=False, **loader_options)

# --- Model, loss, optimizer and linear schedule with 10% warmup --------------
model = PairClassifier(pretrained_model_string).to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr = 3e-5, eps = 1e-8)

total_steps = len(train_dataloader) * epochs
warmup_steps = int(0.1*total_steps) #10% warmup
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps = warmup_steps,
  num_training_steps = total_steps,
)

# --- Train; the name `model` is rebound to the best checkpoint returned ------
model = train_model(
  model, criterion, optimizer, scheduler,
  train_dataloader, dev_dataloader,
  DEVICE, epochs, max_patience,
)

787it [00:00, 7862.55it/s]

Using: CPU


20772it [00:02, 7547.98it/s]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=222296.0, style=ProgressStyle(descripti…




2308it [00:00, 7175.60it/s]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442301670.0, style=ProgressStyle(descri…

KeyboardInterrupt: ignored

In [None]:
# type() requires an argument (a bare call raises TypeError); show the class of
# the object returned by train_model instead.
type(model)