In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 33.8 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 87.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 82.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [None]:
# import libraries
import os
import sys
import time
import copy
import glob
import math
from scipy import io
import numpy as np
import pandas as pd
from collections import OrderedDict


from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel
from transformers import BertForSequenceClassification, AdamW, BertConfig

from google.colab import drive
import tensorflow as tf

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle
import matplotlib.pyplot as plt


In [None]:
# Select the CUDA device used for every tensor and model in this notebook.

# CUDA_LAUNCH_BLOCKING is read from the process environment; assigning it to a
# plain Python variable has no effect on the CUDA runtime. It is exported here,
# before any CUDA call is made, so that kernel errors are reported synchronously.
CUDA_LAUNCH_BLOCKING = 1
os.environ["CUDA_LAUNCH_BLOCKING"] = str(CUDA_LAUNCH_BLOCKING)

# Ask PyTorch (the framework that actually runs the model) whether a GPU is
# usable, instead of probing through TensorFlow.
if torch.cuda.is_available():
    device = torch.device("cuda")
    device_name = torch.cuda.get_device_name(0)
    print('GPU:', device_name)
else:
    raise SystemError('GPU device not found')

GPU: Tesla T4


In [None]:
# Point `data_path` at the project folder on Google Drive.
# The first value is the fallback that stays in effect when mounting fails.
data_path =  "/Colab Notebooks/"

try:
    drive.mount('/content/drive')
    data_path = "/content/drive/MyDrive/Colab Notebooks/alta/"
except Exception:
    # Catch Exception instead of using a bare `except:` so that
    # KeyboardInterrupt and SystemExit still propagate.
    print("You are not working in Colab at the moment :(")

Mounted at /content/drive


In [None]:
# Hugging Face checkpoint name used for both the tokenizer and the embedder.
bert_model = "allenai/scibert_scivocab_cased" #"bert-base-multilingual-cased"

# Seed NumPy and PyTorch (CPU and all CUDA devices) with the same value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

# NOTE(review): `num_classes` and `hidden` are not referenced in the cells shown
# here; the model is constructed with a literal 6 outputs further down.
num_classes = 5
hidden = 100
batch_size = 16            # DataLoader batch size
epoch_size = 10            # maximum number of epochs in train_and_evaluate
learning_rate = 2e-5       # Adam learning rate
patience_all = 20          # epochs without F1 improvement tolerated before stopping
init_weight_decay = 0.1    # Adam weight_decay

max_length = 100           # tokenizer pads/truncates every sentence to this many tokens
dropout = 0.1              # dropout passed to TransformerTextModel

# Transformer parameters
# NOTE(review): `d_model` is not referenced in the cells shown here.
d_model = 1
n_layers = 1               # number of TransformerEncoder layers
n_hidden = 1               # feed-forward width of each encoder layer
heads = 1                  # attention heads per encoder layer

# NOTE(review): the embedder-loading cell freezes the weights unconditionally
# and does not read this flag.
freeze_bert = True

In [None]:
class CustomDataset(Dataset):
    """Row-wise dataset that tokenizes one sentence per item on access.

    Reads the "Text", "target_list", "Document" and "Sentence" columns of ``df``.
    Each item is the tuple
    ``(input_ids, token_type_ids, attention_mask, label, text, document_id, sentence_id)``
    where the first three are ``torch.long`` tensors of length ``max_length``
    and ``label`` is a float tensor built from the row's ``target_list``.
    """

    def __init__(self, df, tokenizer, max_length):
        self.text = df["Text"]
        self.targets = df["target_list"]
        self.id_list = df["Document"]
        self.sentence_id_list = df["Sentence"]
        self.tokenizer = tokenizer
        self.max_len = max_length

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        sentence = self.text.iloc[idx]
        document_id = self.id_list.iloc[idx]
        sentence_id = self.sentence_id_list.iloc[idx]

        # Single-sentence encoding (no text pair), padded/truncated to max_len.
        encoded = self.tokenizer.encode_plus(
            sentence,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
        )

        tokens, token_type_ids, attn_mask = (
            torch.tensor(encoded[field], dtype=torch.long)
            for field in ("input_ids", "token_type_ids", "attention_mask")
        )
        selected_label = torch.FloatTensor(self.targets.iloc[idx])

        return tokens, token_type_ids, attn_mask, selected_label, sentence, document_id, sentence_id

In [None]:
class TransformerTextModel(nn.Module):
    """Sentence classifier: pretrained embedder + Transformer encoder + linear head.

    The position-0 vector of the embedder's last hidden state is concatenated
    with the position-0 vector of the Transformer encoder output, and the
    resulting ``2 * ninp`` features are projected to ``nout`` logits.

    Args:
        embedding: module called as ``embedding(a, b, c)``. Its result must
            support ``result[0]`` (hidden states indexed as
            ``[batch, position, feature]``) and ``result.hidden_states``.
        ninp: feature size of the embedder output and of the encoder.
        nhead: number of attention heads per encoder layer.
        nhid: feed-forward width of each encoder layer.
        nlayers: number of encoder layers.
        nout: number of output logits.
        dropout: dropout probability used before and inside the encoder.
    """

    def __init__(self, embedding, ninp, nhead, nhid, nlayers, nout=6, dropout=0.5):
        super().__init__()
        self.nhid = nhid

        self.bert = embedding

        self.dropout = nn.Dropout(dropout)
        self.pos_encoder = PositionalEncodingText(ninp, dropout)
        # Built without batch_first, so the encoder consumes (seq, batch, ninp).
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)

        # Input is the concatenation of two ninp-sized vectors.
        self.decoder = nn.Linear(ninp * 2, nout)

    def generate_square_subsequent_mask(self, sz):
        """Return a ``(sz, sz)`` float mask: 0.0 on/below the diagonal, -inf above it."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        # `masked_fill` is the tensor method; the previous `mask_fill` does not exist.
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, input_ids, token_type_ids, attn_mask):
        """Return logits of shape ``(batch, nout)``.

        The three arguments are forwarded to the embedder positionally, in the
        order received.

        NOTE(review): the train/evaluate code in this notebook calls
        ``model(tokens, attn_mask, token_type_ids)``, i.e. the second and third
        parameter names here are swapped relative to what callers pass. The
        positional pass-through then matches Hugging Face ``BertModel``'s
        ``(input_ids, attention_mask, token_type_ids)`` order. The parameter
        names are kept unchanged for backward compatibility.
        """
        embedded = self.bert(input_ids, token_type_ids, attn_mask)
        cls_vector = embedded.hidden_states[-1][:, 0, :]   # (batch, ninp)
        src = embedded[0]                                   # (batch, seq, ninp)

        src = self.dropout(src)
        # NOTE(review): reference Transformer code usually scales by the model
        # width rather than the feed-forward width; left as-is.
        src = src * math.sqrt(self.nhid)

        # The positional encoder and the encoder are sequence-first, so swap
        # the batch and sequence axes before using them.
        src = src.transpose(0, 1)                           # (seq, batch, ninp)
        src = self.pos_encoder(src)
        encoded = self.transformer_encoder(src)             # (seq, batch, ninp)

        # Reduce the encoder output to one vector per sample so it can be
        # concatenated with `cls_vector`; position 0 is used, mirroring the
        # pooling applied to the embedder output above.
        pooled = encoded[0]                                 # (batch, ninp)

        output = torch.cat((cls_vector, pooled), dim=1)     # (batch, 2 * ninp)
        return self.decoder(output)


class PositionalEncodingText(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence-first tensor.

    Args:
        d_model: feature size of the input.
        dropout: dropout probability applied after the addition.
        max_len: number of positions precomputed in the ``pe`` buffer.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the frequencies to match.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0).transpose(0, 1)                # (max_len, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding for the first ``x.size(0)`` positions; ``x`` is (seq, batch, d_model)."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


In [None]:
def save_checkpoint(state, location):
    """Serialize ``state`` to ``<location>/best.pth.tar`` with ``torch.save``.

    input:
    state : dict (whatever should be persisted, e.g. model/optimizer state dicts)
    location : string (directory that receives the file; created if missing)
    """
    # torch.save does not create parent directories, so make sure it exists.
    os.makedirs(location, exist_ok=True)
    filepath = os.path.join(location, 'best.pth.tar')
    torch.save(state, filepath)

def load_checkpoint(location, map_location=None):
    """Load an object previously written with ``torch.save``.

    input:
    location : string (path of the checkpoint file)
    map_location : optional, forwarded to ``torch.load`` to remap tensor
        devices (e.g. "cpu"); ``None`` keeps the devices stored in the file
    output:
    the deserialized object (for files written by ``save_checkpoint`` this is
    the saved ``state`` dict, not a model instance)
    """
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    return torch.load(location, map_location=map_location)


def train(train_dl):
    """Run one optimisation pass over ``train_dl`` and return the mean batch loss.

    Operates on the notebook-level ``model``, ``optimizer``, ``criterion`` and
    ``device``; gradients are clipped to a norm of 0.5 before each step.
    """
    model.train()
    running_loss = 0.
    for tokens, token_type_ids, attn_mask, label, _text, _doc_id, _sent_id in train_dl:
        optimizer.zero_grad()

        label = label.to(device)
        tokens = tokens.to(device)
        token_type_ids = token_type_ids.to(device)
        attn_mask = attn_mask.to(device)

        logits = model(tokens, attn_mask, token_type_ids)
        batch_loss = criterion(logits, label)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(train_dl)


def evaluate(model, dl):
    """Collect true labels and thresholded predictions for every batch in ``dl``.

    Input:
      model : torch.nn.Module, called as ``model(tokens, attn_mask, token_type_ids)``
      dl : iterable of 7-tuples as produced by ``CustomDataset`` batches
    Output:
      label_list : list of per-sample int lists (true labels)
      prediction_list : list of per-sample int lists, 1 where
        ``sigmoid(logit) >= 0.5`` and 0 otherwise

    The model is switched to eval mode and left in it. Tensors are moved to the
    notebook-level ``device``. The loss that used to be accumulated here was
    never returned, so it is no longer computed.
    """
    label_list = []
    prediction_list = []

    model.eval()
    with torch.no_grad():
        for batch in dl:
            tokens, token_type_ids, attn_mask, label, _text, _doc_id, _sent_id = batch
            tokens = tokens.to(device)
            token_type_ids = token_type_ids.to(device)
            attn_mask = attn_mask.to(device)

            output = model(tokens, attn_mask, token_type_ids)

            probabilities = torch.sigmoid(output).cpu().detach().numpy()
            prediction_list.extend((probabilities >= 0.5).astype(int).tolist())
            label_list.extend(label.cpu().detach().numpy().astype(int).tolist())

    return label_list, prediction_list

def train_and_evaluate(model, optimizer, train_dl, val_dl):
    """Train for up to ``epoch_size`` epochs, checkpointing on best validation micro-F1.

    After every epoch the model is evaluated on ``val_dl``; whenever the
    micro-averaged F1 is at least as good as the best so far, a checkpoint is
    written via ``save_checkpoint`` and the epoch's labels/predictions are
    remembered. Training stops early once more than ``patience_all``
    consecutive epochs bring no improvement.

    NOTE: the per-epoch step is delegated to ``train(train_dl)``, which uses
    the notebook-level ``model`` and ``optimizer``; pass those same objects here.

    Input:
      model : torch.nn.Module
      optimizer : Optimizer (its learning rate is decayed by 0.95 every epoch)
      train_dl : DataLoader (training batches)
      val_dl : DataLoader (validation batches)
    Output:
      model : the model in its state after the last executed epoch (the best
        weights are in the saved checkpoint)
      label_best : list (true labels from the best-scoring epoch)
      prediction_best : list (predicted labels from the best-scoring epoch)
    """
    best_f = -999.9
    patience = 0  # defined up front so the early-stop check can never hit an unbound name
    label_best, prediction_best = [], []
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

    for epoch in range(1, epoch_size + 1):
        total_loss = train(train_dl)
        label_list, prediction_list = evaluate(model, val_dl)

        f = f1_score(label_list, prediction_list, average='micro')
        if f >= best_f:
            patience = 0
            best_f = f
            # Keep the outputs of the best epoch; previously the last epoch's
            # outputs were returned regardless of score.
            label_best, prediction_best = label_list, prediction_list
            print("save the model...")
            print("the current best f is %f" % (best_f))
            save_checkpoint(
                {'epoch': epoch, 'state_dict': model.state_dict(), 'optim_dict': optimizer.state_dict()},
                location=data_path + 'result',
            )
        else:
            patience += 1

        if patience > patience_all:
            break

        print("Epoch = ", epoch, " train loss = ", total_loss)

        scheduler.step()

    return model, label_best, prediction_best

In [None]:
# Load the tokenizer and the pretrained encoder; hidden states of every layer
# are requested because the classifier reads `hidden_states[-1]`.
tokenizer = BertTokenizer.from_pretrained(bert_model)
embedding = BertModel.from_pretrained(bert_model,output_hidden_states = True).to(device)

# Honour the `freeze_bert` switch from the configuration cell instead of
# freezing unconditionally (it is True there, so the default is unchanged).
if freeze_bert:
    for param in embedding.parameters():
        param.requires_grad = False

Some weights of the model checkpoint at allenai/scibert_scivocab_cased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
pretrained_model_name = "bert"
model_path = "allenai/scibert_scivocab_cased" #"bert-base-cased"

# Read the encoder width from the loaded embedder instead of hard-coding 768,
# so switching `bert_model` to a checkpoint of another size keeps working.
embedding_dim = embedding.config.hidden_size

# 6 output logits: one per label column packed into `target_list`.
model = TransformerTextModel(embedding, embedding_dim, heads, n_hidden, n_layers, 6, dropout=dropout).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=init_weight_decay)

# Multi-label objective: an independent sigmoid/BCE term per label.
criterion = torch.nn.BCEWithLogitsLoss()
# NOTE: train_and_evaluate creates its own StepLR scheduler for the optimizer it receives.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)


In [None]:
# Read the labelled training sentences from the data folder.
train_csv_path = data_path + "dataset/train_2022.csv"
train_dataset = pd.read_csv(train_csv_path)

In [None]:
# Pack the six per-label columns into a single list per row; CustomDataset
# turns that list into the label tensor.
label_columns = ['population', 'intervention', 'background', 'outcome', 'study design', 'other']
train_dataset['target_list'] = train_dataset[label_columns].values.tolist()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. Passing the notebook seed makes the
# split identical every time this cell runs, including when it is re-run on its own.
train_dataset_1, val_dataset = train_test_split(train_dataset, test_size=0.1, random_state=RANDOM_SEED)

In [None]:
# Train on the 90% split only. Building this loader from the full
# `train_dataset` would put every validation row into the training data and
# make the validation F1 used for checkpoint selection unreliable.
dl_train = DataLoader(CustomDataset(train_dataset_1, tokenizer,  max_length), shuffle=True, num_workers=3, batch_size=batch_size)
dl_val= DataLoader(CustomDataset(val_dataset, tokenizer,  max_length), shuffle=False, num_workers=3, batch_size=batch_size)

In [None]:
# Run the training loop; train_and_evaluate writes a checkpoint under
# data_path + 'result' whenever the validation micro-F1 does not get worse.
model, label_list, prediction_list = train_and_evaluate(model, optimizer, dl_train, dl_val)

save the model...
the current best f is 0.069899
Epoch =  1  train loss =  0.07159443978031786
save the model...
the current best f is 0.254842
Epoch =  2  train loss =  0.05946103439227848
save the model...
the current best f is 0.277666
Epoch =  3  train loss =  0.05571258653694554
save the model...
the current best f is 0.279397
Epoch =  4  train loss =  0.05368139466509977
save the model...
the current best f is 0.291127
Epoch =  5  train loss =  0.0526219611835271
save the model...
the current best f is 0.322266
Epoch =  6  train loss =  0.0519203371893803
Epoch =  7  train loss =  0.0516083454966893
save the model...
the current best f is 0.355301
Epoch =  8  train loss =  0.05141853038149121
save the model...
the current best f is 0.356530
Epoch =  9  train loss =  0.05133574053732106
Epoch =  10  train loss =  0.051345755501943806


In [None]:
# Rebuild the architecture and restore the weights of the best checkpoint.
filepath = os.path.join(data_path + 'result', 'best.pth.tar')
# Take the encoder width from the loaded embedder rather than hard-coding 768.
model = TransformerTextModel(embedding, embedding.config.hidden_size, heads, n_hidden, n_layers, 6, dropout=dropout).to(device)

# `state_dict` holds the whole checkpoint dict; the weights are under its "state_dict" key.
state_dict = load_checkpoint(filepath)
model.load_state_dict(state_dict["state_dict"])

<All keys matched successfully>

In [None]:
# Load the test sentences, pack their label columns, and wrap them in a loader.
# The final binding `test_dataset_new` is the DataLoader, not the DataFrame.
test_frame = pd.read_csv(data_path+"dataset/test_2022.csv")
test_label_columns = ['population', 'intervention', 'background', 'outcome', 'study design', 'other']
test_frame['target_list'] = test_frame[test_label_columns].values.tolist()
test_dataset_new = DataLoader(
    CustomDataset(test_frame, tokenizer, max_length),
    batch_size=batch_size,
    shuffle=False,
    num_workers=3,
)

In [None]:
# Score the test set with the reloaded checkpoint.
# The loader built from test_2022.csv is bound to `test_dataset_new`. Iterating
# `dl_train` here would export predictions for the (shuffled) training rows and
# leave the test loader unused.
# NOTE(review): confirm that the test set is the intended source of the exported answers.
prediction_list = []
label_list = []
selected_sentences = []
selected_id_list = []
selected_id_sentence_list = []

model.eval()
with torch.no_grad():
    for batch in test_dataset_new:
        tokens, token_type_ids, attn_mask, label, selected_text, selected_id, selected_id_text = batch

        tokens, token_type_ids, attn_mask = tokens.to(device), token_type_ids.to(device), attn_mask.to(device)
        output = model(tokens, attn_mask, token_type_ids)

        # Keep the raw sigmoid probabilities (no thresholding) for the export.
        prediction_list.extend(torch.sigmoid(output).cpu().detach().numpy().tolist())
        label_list.extend(label.cpu().detach().numpy().astype(int).tolist())
        selected_sentences.extend(selected_text)
        selected_id_list.extend(selected_id.cpu().detach().numpy().astype(int).tolist())
        selected_id_sentence_list.extend(selected_id_text.cpu().detach().numpy().astype(int).tolist())


In [None]:
# Assemble one row per sentence (ids, per-label probabilities, text) and write it out.
prediction_list = np.array(prediction_list)

answer_columns = {'Document': selected_id_list, 'Sentence': selected_id_sentence_list}
for column_index, label_name in enumerate(['population', 'intervention', 'background', 'outcome', 'study design', 'other']):
    answer_columns[label_name] = prediction_list[:, column_index]
answer_columns['Text'] = selected_sentences

train_dl = pd.DataFrame(answer_columns)
train_dl.to_csv(data_path + "answer_7518.csv",index=False)

In [None]:
# Load the separate validation file, pack its label columns, and wrap it in a loader.
val_dataset_new = pd.read_csv(data_path+"dataset/val_2022.csv")
val_label_columns = ['population', 'intervention', 'background', 'outcome', 'study design', 'other']
val_dataset_new['target_list'] = val_dataset_new[val_label_columns].values.tolist()
dl_val_new = DataLoader(
    CustomDataset(val_dataset_new, tokenizer, max_length),
    batch_size=batch_size,
    shuffle=False,
    num_workers=3,
)

In [None]:
# Collect sigmoid probabilities, labels and identifiers for every row served by dl_val_new.
prediction_list, label_list = [], []
selected_sentences, selected_id_list, selected_id_sentence_list = [], [], []
corrects = 0
with torch.no_grad():
    model.eval()
    for tokens, token_type_ids, attn_mask, label, selected_text, selected_id, selected_id_text in dl_val_new:
        label = label.to(device)
        tokens = tokens.to(device)
        token_type_ids = token_type_ids.to(device)
        attn_mask = attn_mask.to(device)

        output = model(tokens, attn_mask, token_type_ids)
        loss = criterion(output, label)

        probabilities = torch.sigmoid(output).cpu().detach().numpy()
        prediction_list.extend(probabilities.tolist())
        label_list.extend(label.data.cpu().detach().numpy().astype(int).tolist())
        selected_sentences.extend(selected_text)
        selected_id_list.extend(selected_id.cpu().detach().numpy().astype(int).tolist())
        selected_id_sentence_list.extend(selected_id_text.cpu().detach().numpy().astype(int).tolist())


In [None]:
# `prediction_list` is a plain Python list after the prediction loop, and a
# list cannot be sliced with `[:, k]`; convert it to a 2-D array first.
prediction_list = np.array(prediction_list)
train_dl = pd.DataFrame({'Document' : selected_id_list, 'Sentence' : selected_id_sentence_list, 'population' : prediction_list[:,0],'intervention' : prediction_list[:,1],'background' : prediction_list[:,2],'outcome' : prediction_list[:,3],'study design' :prediction_list[:,4],'other':prediction_list[:,5], 'Text' :selected_sentences})
train_dl.to_csv(data_path + "answer_deneme.csv",index=False)