In [2]:
from transformers import RobertaTokenizer, BertTokenizer, BertForTokenClassification, BertForSequenceClassification
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

import json 

import spacy

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [3]:
# !wget https://github.com/naver/biobert-pretrained/releases/download/v1.1-pubmed/biobert_v1.1_pubmed.tar.gz

# !tar -xvzf biobert_v1.1_pubmed.tar.gz

# !export BERT_BASE_DIR=biobert_v1.1_pubmed

# !pip install tensorflow

# !pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch biobert_v1.1_pubmed/model.ckpt-1000000 biobert_v1.1_pubmed/bert_config.json biobert_v1.1_pubmed/pytorch_model.bin



In [4]:
# !tar -cvzf biobert.gz biobert

In [5]:
# !pip install pytorch_transformers

In [6]:
from pytorch_transformers import BertModel
model = BertModel.from_pretrained('biobert_v1.1_pubmed')
tokenizer = BertTokenizer(vocab_file='biobert_v1.1_pubmed/vocab.txt', do_lower_case=False)


In [7]:
len(list(model.parameters()))


199

In [8]:
import torch
torch.cuda.is_available()

True

In [9]:
import datasets

In [37]:
train_data = pd.read_csv('pah/data/train_fs.tsv', delimiter = '\t', header = None)
test_data = pd.read_csv('pah/data/dev_fs.tsv', delimiter = '\t', header = None)

In [38]:
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Rel,"Methods In this double-blind , placebo-control...",dosage,drug,T_133115,T_13350,F_133
1,NotRel,"In a preliminary study , the orally administer...",drug,reason,T_1331,T_133113,F_133
2,Rel,"Methods In this double-blind , placebo-control...",drug,duration,T_1335,T_1331161,F_133
3,Rel,"Methods In this double-blind , placebo-control...",drug,dosage,T_1336,T_1331172,F_133
4,Rel,"Methods In this double-blind , placebo-control...",drug,duration,T_1336,T_1331183,F_133


In [39]:
train_data = train_data[[0,1]]
train_data.columns = ['label','utterance']

test_data = test_data[[0,1]]
test_data.columns = ['label','utterance']

In [13]:
def prepare_features(tokenizer, seq_1, max_seq_length = 300, 
             zero_pad = True, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # input_ids = input_ids.squeeze(0)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [14]:
class Relations(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        utterance = self.data.utterance[index]
        label = self.data.label[index]
        X, _  = prepare_features(tokenizer, utterance)
        y = label_to_ix[self.data.label[index]]
        return X,y #{k: v[index] for k, v in zip(X,y)}#
    
    def __len__(self):
        return self.len

In [15]:
class Relations_B(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        utterance = self.data.utterance[index]
        label = self.data.label[index]
        # X, _  = prepare_features(tokenizer, utterance)
        y = label_to_ix[self.data.label[index]]
        return utterance,y #{k: v[index] for k, v in zip(X,y)}#
    
    def __len__(self):
        return self.len

In [28]:
train_data = train_df[['label','tokens']]
train_data.columns = ['label','utterance']

test_data = test_df[['label','tokens']]
test_data.columns = ['label','utterance']

In [40]:
training_set = Relations(train_data)
testing_set = Relations(test_data)

In [41]:
training_set = Relations_B(train_data)
testing_set = Relations_B(test_data)

In [31]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.cuda()

In [54]:
params = {'batch_size': 1,
          'shuffle': True,
          'drop_last': False,
          'num_workers': 1}

In [55]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [34]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [35]:
label_to_ix = {}
for label in train_data.label:
    for word in label.split():
        if word not in label_to_ix:
            label_to_ix[word]=len(label_to_ix)
label_to_ix

{'Rel': 0, 'NotRel': 1}

In [17]:
train_jsons_aug = json.load(open('pah/data/relations_marked_train_aug.json'))
len(train_jsons_aug)
test_jsons_aug = json.load(open('pah/data/relations_marked_test_aug.json'))
len(test_jsons_aug)
inf_jsons_aug = json.load(open('pah/data/articles_inf_aa_aug.json'))


In [18]:
vocab = json.load(open('pah/tokenizer/vocab.json'))
len(vocab)

52000

In [19]:
len_train = 0
for i in range(len(train_jsons_aug)):
    len_train+=len(train_jsons_aug[i]['tokens'])
len_train

35404

In [20]:
len_test = 0
for i in range(len(test_jsons_aug)):
    len_test+=len(test_jsons_aug[i]['tokens'])
len_test

8395

In [21]:
5455/35404

0.15407863518246526

In [22]:
len_train_oov = 0
for i in range(len(train_jsons_aug)):
    for t in train_jsons_aug[i]['tokens']:
        if t not in vocab.keys():
            len_train_oov+=1
len_train_oov

5455

In [23]:
data_test = pd.read_excel('pah/data/relations_проверка_связей.xlsx')['ss1']
tokens_test = 0
for i in range(len(data_test)):
    for t in str(data_test[i]).split(' '):
        if t!='[s1]' and t!='[e1]':
            tokens_test+=1
tokens_test

6670

In [24]:
TAG_LIST = [".",",",":", "-LRB-","-RRB-","``","\"\"","''",",","$","#","AFX","CC","CD","DT","EX","FW","HYPH","IN","JJ","JJR","JJS","LS","MD","NIL","NN","NNP","NNPS","NNS","PDT","POS","PRP","PRP$","RB","RBR","RBS","RP","SP","SYM","TO","UH","VB","VBD","VBG","VBN","VBP","VBZ","WDT","WP","WP$","WRB","ADD","NFP","GW","XX","BES","HVS","_SP"]
POS_LIST = ["ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X", "SPACE"]
DEP_LIST = ["acl", "acomp", "advcl", "advmod", "agent", "amod", "appos", "attr", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj", "cop", "csubj", "csubjpass", "dative", "dep", "det", "dobj", "expl", "intj", "mark", "meta", "neg", "nn", "npmod", "npadvmod", "nsubj", "nsubjpass", "nummod","nmod", "oprd", "obj", "obl", "parataxis", "pcomp","predet", "pobj", "poss", "preconj", "prep", "prt", "punct",  "quantmod", "relcl", "ROOT", "xcomp"]
NER_LIST = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"]

pos_tags = {}
for i,p in enumerate(TAG_LIST):
    pos_tags[p] = i
dep_tags = {}
for i,p in enumerate(DEP_LIST):
    dep_tags[p] = i
    

In [25]:
ent_dict = {}
for i in range(len(train_jsons_aug)):
    for ent in  train_jsons_aug[i]['entities']:
        if ent['type'] not in ent_dict:
            ent_dict[ent['type']] = len(ent_dict)
            
for i in range(len(test_jsons_aug)):
    for ent in  test_jsons_aug[i]['entities']:
        if ent['type'] not in ent_dict:
            ent_dict[ent['type']] = len(ent_dict)
            
for i in range(len(inf_jsons_aug)):
    for ent in  inf_jsons_aug[i]['entities']:
        if ent['type'] not in ent_dict:
            ent_dict[ent['type']] = len(ent_dict)
ent_dict['O'] = len(ent_dict)

In [26]:
def prepare_df(data):
    df = pd.DataFrame({'tokens':[], 'pos_tags': [], 'dep_label': [], 'label': [], 'ent_type': []})
    for index in tqdm_notebook(range(len(data))):
        k = 0
        ss1 = []
        ent_types = []
        for r in range(len(data[index]['relations'])):
            k = 0
            ss1 = []
            label = data[index]['relations'][r]['type']
            ent_types = []
            # print(label)
            while k<len(data[index]['tokens']):
                if k == data[index]['entities'][data[index]['relations'][r]['head']]['start']:
                    m = data[index]['entities'][data[index]['relations'][r]['head']]['end']
                    ss1.append('[s1]')
                    ss1.extend(data[index]['tokens'][k:m])
                    ss1.append('[e1]')
                    if k==m:
                        m+=1
                    for e in range(k,m):
                        ent_types.append(data[index]['entities'][data[index]['relations'][r]['head']]['type'])
                    k = m
                elif k == data[index]['entities'][data[index]['relations'][r]['tail']]['start']:
                    m = data[index]['entities'][data[index]['relations'][r]['tail']]['end']
                    ss1.append('[s2]')
                    ss1.extend(data[index]['tokens'][k:m])
                    ss1.append('[e2]')
                    if k==m:
                        m+=1
                    for e in range(k,m):
                        ent_types.append(data[index]['entities'][data[index]['relations'][r]['tail']]['type'])
                    k = m
                else:
                    ss1.append(data[index]['tokens'][k])
                    ent_types.append('O')
                    k+=1
            df0 = pd.concat([pd.Series(' '.join(ss1)), pd.Series([[pos_tags[t] for t in data[index]['pos_tags']]]), \
                             pd.Series([[dep_tags[t] for t in data[index]['dep_label']]]), 
                            pd.Series([label]), pd.Series([[ent_dict[t] for t in ent_types]])], axis = 1)
            df0.columns = df.columns
            df = pd.concat([df, df0])
            # df.loc[index,'tokens'] = ss1
            # df.loc[index,'pos_tags'] = data[index]['pos_tags']
            # df.loc[index,'dep_label'] = data[index]['dep_label']
            # df.loc[i,'label'] = label
    return df.reset_index()

In [27]:
train_df = prepare_df(train_jsons_aug)
test_df = prepare_df(test_jsons_aug)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/983 [00:00<?, ?it/s]

  0%|          | 0/226 [00:00<?, ?it/s]

In [42]:
def prepare_features_aug(seq_2, var_dict, max_seq_length = 300, 
             zero_pad = True, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = seq_2

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    
    input_ids = tokens_a
    # input_ids = input_ids.squeeze(0)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [43]:
class Relations_Aug(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        utterance = self.data.tokens[index]
        label = self.data.label[index]
        pos_seq = self.data.pos_tags[index]
        dep_seq = self.data.dep_label[index]
        ent_seq = self.data.ent_type[index]
        pos_tensor, _ = prepare_features_aug(pos_seq, pos_tags, max_seq_length = 50, 
             zero_pad = True, include_CLS_token = True, include_SEP_token = True)
        dep_tensor, _ = prepare_features_aug(dep_seq, dep_tags, max_seq_length = 50, 
                     zero_pad = True, include_CLS_token = True, include_SEP_token = True)
        tok_tensor,_= prepare_features(rob_tokenizer, utterance, max_seq_length = 50, 
                     zero_pad = True, include_CLS_token = True, include_SEP_token = True)
        ent_tensor,_ = prepare_features_aug(ent_seq, ent_dict, max_seq_length = 50, 
             zero_pad = True, include_CLS_token = True, include_SEP_token = True)
        X = torch.cat((pos_tensor, dep_tensor, tok_tensor, ent_tensor), dim=1)
        y = label_to_ix[label]

        return X,y
    
    def __len__(self):
        return self.len

In [44]:
training_set_aug = Relations_Aug(train_df)
testing_set_aug = Relations_Aug(test_df)

In [45]:
training_loader_aug = DataLoader(training_set_aug, **params)
testing_loader_aug = DataLoader(testing_set_aug, **params)

In [43]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

max_epochs = 1
class CustomBERTModel(nn.Module):
    def __init__(self):
        super(CustomBERTModel, self).__init__()

        self.bert = BertModel.from_pretrained('biobert_v1.1_pubmed')
        for param in list(self.bert.parameters())[-190:]: # total  trainable 199 Params: 79 is 40%
            param.requires_grad = False
        self.dropout = nn.Dropout(0.1)
          ### New layers:
        self.linear1 = nn.Linear(768, 50)
        self.linear2 = nn.Linear(50, 2) ## 2 is the number of classes in this example
            
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(768, 50), 
            nn.ReLU(),
            nn.Linear(50, 1), nn.Sigmoid()
            ) 

    def forward(self, ids, mask):
        sequence_output, pooled_output = self.bert(
               ids, 
               attention_mask=mask)
            
#         outputs = self.bert(input_ids=input_ids,
#                             attention_mask=mask)

        linear1_output = sum([self.linear1(sequence_output[:,i,:].view(-1,768)) for i in range(len(sequence_output))])/len(sequence_output) ## extract the 1st token's embeddings

        # last_hidden_state_cls = sequence_output[:,0,:]
        # prediction = self.classifier(linear1_output)


          # sequence_output has the following shape: (batch_size, sequence_length, 768)
#         linear1_output = sum([self.linear1(sequence_output[:,i,:].view(-1,768)) for i in range(len(sequence_output))])/len(sequence_output) ## extract the 1st token's embeddings

        linear2_output = self.linear2(linear1_output)
        output = self.classifier(sequence_output)

        return linear2_output
    
    def get_embedding(self, ids, mask):
        sequence_output, pooled_output = self.bert(
               ids, 
               attention_mask=mask)

        return sequence_output

tokenizer = BertTokenizer(vocab_file='biobert_v1.1_pubmed/vocab.txt', do_lower_case=False)
model = CustomBERTModel() # You can pass the parameters if required to have more flexible model
model.to(torch.device("cuda")) ## can be gpu
criterion = nn.CrossEntropyLoss() ## If required define your own criterion
# optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = optim.AdamW(model.parameters(), lr=0.0001)

latest_best_score = 0

for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH: {}.".format(epoch))
    for i, batch in enumerate(training_loader): ## If you have a DataLoader()  object to get the data.

        data = batch[0]
        targets = batch[1].cuda() ## assuming that data loader returns a tuple of data and its targets
        
        optimizer.zero_grad()   
        encoding = tokenizer.batch_encode_plus(data, return_tensors='pt', padding=True, truncation=True,max_length=50, add_special_tokens = True)
        input_ids = encoding['input_ids'].cuda()
        attention_mask = encoding['attention_mask'].cuda()
        
        outputs = model(input_ids, mask=attention_mask)
        outputs = F.log_softmax(outputs, dim=1)
        
        # targets = targets.unsqueeze(1) #, dtype = torch.float32)
        
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            tp = 0
            fp = 0
            fn = 0
            for sent, label in testing_loader:
                # sent = sent.squeeze(1)
                # if torch.cuda.is_available():
                  # sent = sent.cuda()
                  # label = label.cuda()
                encoding = tokenizer.batch_encode_plus(sent, return_tensors='pt', padding=True, truncation=True,max_length=50, add_special_tokens = True)
                input_ids = encoding['input_ids'].cuda()
                attention_mask = encoding['attention_mask'].cuda()
                output = model.forward(input_ids, mask=attention_mask)
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
                tp += ((predicted.cpu() == label.cpu())&(label.cpu()==0)).sum()
                fn += ((predicted.cpu() != label.cpu())&(label.cpu()==0)).sum()
                fp += ((predicted.cpu() != label.cpu())&(label.cpu()==1)).sum()
            accuracy = 100.00 * correct.numpy() / total
            if accuracy>latest_best_score:
                latest_best_score = accuracy
                torch.save(model.state_dict(), 'model_biobert_vars_'+str(epoch)+ str(uuid4())+'.pth')
            if tp==0 and fp==0:
                precision=0
            else:
                precision = tp/(tp+fp)
            if tp==0 and fn==0:
                recall=0
            else:                
                recall = tp/(tp+fn)
            if precision == 0 and recall == 0:
                F1 = 0
            else:
                F1=2*precision*recall/(precision+recall)
            print('Iteration: {}. Loss: {}. Accuracy: {}%. precision: {}. recall: {}. F1: {}.'.format(i, loss.item(), accuracy, precision, recall, F1))

            # accuracy = 100.00 * correct.numpy() / total
            # print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))
        

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/1 [00:00<?, ?it/s]

EPOCH: 0.
Iteration: 0. Loss: 0.769384503364563. Accuracy: 61.016949152542374%. precision: 0.8620689511299133. recall: 0.5681818127632141. F1: 0.6849315166473389.


In [44]:
def get_embedding(bert, ids, mask):
    sequence_output, pooled_output = bert(
           ids, 
           attention_mask=mask)

    return sequence_output

In [45]:
def get_reply(test_df,index):
    model.eval()
    sent = test_df.utterance[index]
    encoding = tokenizer.batch_encode_plus(sent, return_tensors='pt', padding=True, truncation=True,max_length=50, add_special_tokens = True)
    input_ids = encoding['input_ids'].cuda()
    attention_mask = encoding['attention_mask'].cuda()
    outputs = model(input_ids, mask=attention_mask)[0]
    outputs = F.log_softmax(outputs, dim=1)
    # print(output)
    _, predicted = torch.max(outputs.data, 0)
    prediction=list(label_to_ix.keys())[predicted]
    # print(prediction)
    return prediction

In [63]:
def get_reply(testing_loader):
    predictions = []
    for sent, label in testing_loader:
                # sent = sent.squeeze(1)
                # if torch.cuda.is_available():
                  # sent = sent.cuda()
                  # label = label.cuda()
        encoding = tokenizer.batch_encode_plus(sent, return_tensors='pt', padding=True, truncation=True,max_length=50, add_special_tokens = True)
        input_ids = encoding['input_ids'].cuda()
        attention_mask = encoding['attention_mask'].cuda()
        output = model.forward(input_ids, mask=attention_mask)
        _, predicted = torch.max(output.data, 1)
        prediction=list(label_to_ix.keys())[predicted]
        predictions.append(prediction)
    return predictions

In [46]:
test_data.label.value_counts(), 44/69

(Rel       44
 NotRel    15
 Name: label, dtype: int64,
 0.6376811594202898)

In [47]:
model_path = 'model_biobert_vars_1217ade660-e24b-4296-83f6-4a69c856bb58.pth'
model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

In [64]:
test_rel = []
# for i in tqdm_notebook(range(test_data.shape[0])):
#     test_rel.append(get_reply(test_data, i))
pred = get_reply(testing_loader)

In [66]:
len(pred)

59

In [67]:
inf_df = prepare_df(inf_jsons_aug)
inf_df.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/30348 [00:00<?, ?it/s]

Unnamed: 0,index,tokens,pos_tags,dep_label,label,ent_type
0,0,Comparison of hemodynamic parameters in treatm...,"[25, 18, 19, 28, 18, 19, 12, 42, 28, 18, 19, 1...","[48, 43, 5, 40, 43, 5, 11, 14, 40, 43, 5, 5, 4...",NotRel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 5, 5,..."
1,0,Comparison of hemodynamic parameters in treatm...,"[25, 18, 19, 28, 18, 19, 12, 42, 28, 18, 19, 1...","[48, 43, 5, 40, 43, 5, 11, 14, 40, 43, 5, 5, 4...",NotRel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 5, 5,..."
2,0,E-mail address : nazzareno.galie @ unibo.it BA...,"[25, 25, 2, 25, 18, 26, 26, 2, 25, 18, 19, 19,...","[13, 48, 45, 19, 45, 13, 48, 45, 48, 43, 5, 5,...",NotRel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2..."
3,0,E-mail address : nazzareno.galie @ unibo.it BA...,"[25, 25, 2, 25, 18, 26, 26, 2, 25, 18, 19, 19,...","[13, 48, 45, 19, 45, 13, 48, 45, 48, 43, 5, 5,...",NotRel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2..."
4,0,E-mail address : nazzareno.galie @ unibo.it BA...,"[25, 25, 2, 25, 18, 26, 26, 2, 25, 18, 19, 19,...","[13, 48, 45, 19, 45, 13, 48, 45, 48, 43, 5, 5,...",NotRel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2..."


In [69]:
inf_df = inf_df[['tokens','label']]
inf_df.columns = ['utterance','label']

In [70]:
inf_set = Relations_B(inf_df)
inf_loader = DataLoader(inf_set, **params)

In [71]:
pred = get_reply(inf_loader)

In [73]:
len(pred), inf_df.shape

(74220, (74220, 2))

In [36]:
pd.DataFrame(test_rel)[0].value_counts()

NotRel    1034
Rel        334
Name: 0, dtype: int64

In [None]:
## Test Forward Pass
inp = training_set.__getitem__(0)[0].cuda()
output = model(inp)[0]
print(inp.shape, output.shape)

In [None]:
inp.size()

In [None]:
training_set.__getitem__(28)[0].shape

In [49]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52000,
    max_position_embeddings=400,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [50]:
from transformers import RobertaForSequenceClassification
from transformers import RobertaTokenizerFast,PreTrainedTokenizerFast

model = RobertaForSequenceClassification(config=config)
# for param in list(model.parameters())[-90:]: # total  trainable 199 Params: 79 is 40%
#     param.requires_grad = False
# rob_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
rob_tokenizer = RobertaTokenizerFast.from_pretrained("pah/tokenizer", max_len=200)
# tokenizer = PreTrainedTokenizerFast(tokenizer_file="pah/tokenizer/tokenizer_file")

model_bert = BertModel.from_pretrained('biobert_v1.1_pubmed') # You can pass the parameters if required to have more flexible model
for param in list(model_bert.parameters())[-190:]: # total  trainable 199 Params: 79 is 40%
    param.requires_grad = False
model_bert.to(torch.device("cuda")) ## can be gpu
model.to(torch.device("cuda")) ## can be gpu

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(400, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [97]:
criterion = nn.BCEWithLogitsLoss() ## If required define your own criterion
# optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
# optimizer = optim.Adam(model.parameters(), lr=0.0001)
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
optimizer_bert = torch.optim.AdamW(model_bert.parameters(), lr=2e-5, weight_decay=1e-1)


max_epochs = 1
i=0

for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH: {}.".format(epoch))
    i=0
    for batch, batch_aug in zip(training_loader,training_loader_aug): ## If you have a DataLoader()  object to get the data.
        
        data = batch[0]
        targets = batch[1].cuda() ## assuming that data loader returns a tuple of data and its targets
        if len(batch_aug[0].size())>2:
            data_aug = batch_aug[0].view(batch_aug[0].size()[0],200).cuda()
        
        optimizer.zero_grad()
        optimizer_bert.zero_grad() 
        encoding = tokenizer.batch_encode_plus(data, return_tensors='pt', padding=True, truncation=True,max_length=50, add_special_tokens = True)
        input_ids = encoding['input_ids'].cuda()
        attention_mask = encoding['attention_mask'].cuda()
        
        outputs_bert = get_embedding(model_bert, input_ids, mask=attention_mask)        
        outputs_bert = torch.mean(outputs_bert,dim=1)
        outputs_bert = outputs_bert.view(outputs_bert.size()[0],4,192)
        outputs_bert = torch.mean(outputs_bert,dim=1)
        
        inputs = torch.cat((outputs_bert, data_aug), dim = 1)
        input_ids = torch.cat((input_ids, data_aug), dim=1)

        # print(outputs_bert)
        # outputs = F.log_softmax(outputs, dim=1)
        # input_ids = torch.tensor(list(range(0,392)))
        # if outputs_bert.size()[0]==3:
        #     input_ids = torch.stack((input_ids,input_ids,input_ids), dim = 0).cuda()
        # else:
        #     input_ids = torch.stack((input_ids,input_ids,input_ids,input_ids,input_ids), dim = 0).cuda()
        outputs = model.forward(input_ids)[0][:,1]
        # _, predicted = torch.max(outputs, 1)
        
        # targets = targets.squeeze(1)
        targets = torch.tensor(targets, dtype=torch.float32)
        
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()
        optimizer_bert.step()
        
        if i%100 == 0:
            
            correct = 0
            total = 0
            tp = 0
            fp = 0
            fn = 0
            for sent, sent_aug in zip(testing_loader, testing_loader_aug):
                
                label = sent[1].cuda()
                sent = sent[0]
                data_test_aug = sent_aug[0].cuda()
                
                if len(data_test_aug.size())>2:
                    data_test_aug = data_test_aug.view(data_test_aug.size()[0],200).cuda()
                    
                # sent = sent.squeeze(1)
                # if torch.cuda.is_available():
                  # sent = sent.cuda()
                  # label = label.cuda()
                encoding = tokenizer.batch_encode_plus(sent, return_tensors='pt', padding=True, truncation=True,max_length=50, add_special_tokens = True)
                input_ids = encoding['input_ids'].cuda()
                attention_mask = encoding['attention_mask'].cuda()
                outputs_bert = get_embedding(model_bert, input_ids, mask=attention_mask)        
                outputs_bert = torch.mean(outputs_bert,dim=1)
                outputs_bert = outputs_bert.view(outputs_bert.size()[0],4,192)
                outputs_bert = torch.mean(outputs_bert,dim=1)
                
                inputs = torch.cat((outputs_bert, data_test_aug), dim = 1)
                input_ids = torch.cat((input_ids, data_test_aug), dim=1)
                # print(outputs_bert)
                # outputs = F.log_softmax(outputs, dim=1)
                # input_ids = torch.tensor(list(range(0,392)))
                # if outputs_bert.size()[0]==3:
                #     input_ids = torch.stack((input_ids,input_ids,input_ids), dim = 0).cuda()
                # else:
                #     input_ids = torch.stack((input_ids,input_ids,input_ids,input_ids,input_ids), dim = 0).cuda()
                outputs = model.forward(input_ids)[0]
                _, predicted = torch.max(outputs, 1)

                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
                tp += ((predicted.cpu() == label.cpu())&(label.cpu()==0)).sum()
                fn += ((predicted.cpu() != label.cpu())&(label.cpu()==0)).sum()
                fp += ((predicted.cpu() != label.cpu())&(label.cpu()==1)).sum()
            accuracy = 100.00 * correct.numpy() / total
            if tp==0 and fp==0:
                precision=0
            else:
                precision = tp/(tp+fp)
            if tp==0 and fn==0:
                recall=0
            else:                
                recall = tp/(tp+fn)
            if precision == 0 and recall == 0:
                F1 = 0
            else:
                F1=2*precision*recall/(precision+recall)
            print('Iteration: {}. Loss: {}. Accuracy: {}%. precision: {}. recall: {}. F1: {}.'.format(i, loss.item(), accuracy, precision, recall, F1))
        i+=1
        

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


  0%|          | 0/1 [00:00<?, ?it/s]

EPOCH: 0.




Iteration: 0. Loss: 0.5379652976989746. Accuracy: 65.64327485380117%. precision: 0. recall: 0.0. F1: 0.
Iteration: 100. Loss: 0.48615002632141113. Accuracy: 65.64327485380117%. precision: 0. recall: 0.0. F1: 0.
Iteration: 200. Loss: 0.5331752896308899. Accuracy: 65.64327485380117%. precision: 0. recall: 0.0. F1: 0.
Iteration: 300. Loss: 0.28439924120903015. Accuracy: 65.64327485380117%. precision: 0. recall: 0.0. F1: 0.
Iteration: 400. Loss: 0.9168493151664734. Accuracy: 65.64327485380117%. precision: 0. recall: 0.0. F1: 0.
Iteration: 500. Loss: 0.6019229292869568. Accuracy: 65.5701754385965%. precision: 0.4000000059604645. recall: 0.0042553190141916275. F1: 0.008421052247285843.
Iteration: 600. Loss: 0.6904578804969788. Accuracy: 65.71637426900585%. precision: 1.0. recall: 0.0021276595070958138. F1: 0.004246284253895283.
Iteration: 700. Loss: 0.6620378494262695. Accuracy: 65.5701754385965%. precision: 0.4444444477558136. recall: 0.008510638028383255. F1: 0.016701459884643555.
Iteratio

In [18]:
max_epochs = 10
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent.cuda()
          label = label.cuda()
        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent.cuda()
                  label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/10 [00:00<?, ?it/s]

EPOCH -- 0


NameError: name 'training_loader' is not defined

In [12]:
train_jsons_aug = json.load(open('pah/data/relations_marked_train_aug.json'))
len(train_jsons_aug)

1088

In [13]:
test_jsons_aug = json.load(open('pah/data/relations_marked_test_aug.json'))
len(test_jsons_aug)

121

In [14]:
TAG_LIST = [".",",",":", "-LRB-","-RRB-","``","\"\"","''",",","$","#","AFX","CC","CD","DT","EX","FW","HYPH","IN","JJ","JJR","JJS","LS","MD","NIL","NN","NNP","NNPS","NNS","PDT","POS","PRP","PRP$","RB","RBR","RBS","RP","SP","SYM","TO","UH","VB","VBD","VBG","VBN","VBP","VBZ","WDT","WP","WP$","WRB","ADD","NFP","GW","XX","BES","HVS","_SP"]
POS_LIST = ["ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X", "SPACE"]
DEP_LIST = ["acl", "acomp", "advcl", "advmod", "agent", "amod", "appos", "attr", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj", "cop", "csubj", "csubjpass", "dative", "dep", "det", "dobj", "expl", "intj", "mark", "meta", "neg", "nn", "npmod", "npadvmod", "nsubj", "nsubjpass", "nummod","nmod", "oprd", "obj", "obl", "parataxis", "pcomp","predet", "pobj", "poss", "preconj", "prep", "prt", "punct",  "quantmod", "relcl", "ROOT", "xcomp"]
NER_LIST = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"]

In [15]:
# nlp = spacy.load('en_core_web_sm')

# nlp.tokenizer.vocab.morphology.tag_map
pos_tags = {}
for i,p in enumerate(TAG_LIST):
    pos_tags[p] = i

In [16]:
dep_tags = {}
for i,p in enumerate(DEP_LIST):
    dep_tags[p] = i

In [88]:
for i in range(len(train_jsons_aug)):
    train_jsons_aug[i]['pos_tags_num'] = [pos_tags[t] for t in train_jsons_aug[i]['pos_tags']]
    train_jsons_aug[i]['dep_tags_num'] = [dep_tags[t] for t in train_jsons_aug[i]['dep_label']]

In [89]:
for i in range(len(test_jsons_aug)):
    test_jsons_aug[i]['pos_tags_num'] = [pos_tags[t] for t in test_jsons_aug[i]['pos_tags']]
    test_jsons_aug[i]['dep_tags_num'] = [dep_tags[t] for t in test_jsons_aug[i]['dep_label']]

In [31]:
train_data.head()

Unnamed: 0,label,utterance
0,Rel,"Methods In this double-blind , placebo-control..."
1,NotRel,"In a preliminary study , the orally administer..."
2,Rel,"Methods In this double-blind , placebo-control..."
3,Rel,"Methods In this double-blind , placebo-control..."
4,Rel,"Methods In this double-blind , placebo-control..."


In [17]:
def prepare_features_aug(seq_2, var_dict, max_seq_length = 300, 
             zero_pad = True, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = seq_2

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    
    input_ids = tokens_a
    # input_ids = input_ids.squeeze(0)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [123]:
i=0
pos_tensor, _ = prepare_features_aug(train_df['pos_tags'][i], pos_tags, max_seq_length = 300, 
             zero_pad = True, include_CLS_token = True, include_SEP_token = True)
dep_tensor, _ = prepare_features_aug(train_df['dep_label'][i], dep_tags, max_seq_length = 300, 
             zero_pad = True, include_CLS_token = True, include_SEP_token = True)
tok_tensor,_= prepare_features(' '.join(train_df['tokens'][0]), max_seq_length = 300, 
             zero_pad = True, include_CLS_token = True, include_SEP_token = True)
ent_tensor,_ = prepare_features_aug(train_df['ent_type'][i], ent_dict, max_seq_length = 300, 
             zero_pad = True, include_CLS_token = True, include_SEP_token = True)

In [108]:
# torch.cat((pos_tensor, dep_tensor,tok_tensor), dim=1).size()
# label
tok_tensor.size()

torch.Size([1, 300])

In [61]:
class Relations_Aug(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        utterance = self.data.tokens[index]
        label = self.data.label[index]
        pos_seq = self.data.pos_tags[index]
        dep_seq = self.data.dep_label[index]
        ent_seq = self.data.ent_type[index]
        pos_tensor, _ = prepare_features_aug(pos_seq, pos_tags, max_seq_length = 100, 
             zero_pad = True, include_CLS_token = True, include_SEP_token = True)
        dep_tensor, _ = prepare_features_aug(dep_seq, dep_tags, max_seq_length = 100, 
                     zero_pad = True, include_CLS_token = True, include_SEP_token = True)
        tok_tensor,_= prepare_features(utterance, max_seq_length = 100, 
                     zero_pad = True, include_CLS_token = True, include_SEP_token = True)
        ent_tensor,_ = prepare_features_aug(ent_seq, ent_dict, max_seq_length = 100, 
             zero_pad = True, include_CLS_token = True, include_SEP_token = True)
        X = torch.cat((pos_tensor, dep_tensor, tok_tensor, ent_tensor), dim=1)
        # y = label_to_ix[label]

        return X #,y
    
    def __len__(self):
        return self.len

In [19]:
def prepare_df(data):
    df = pd.DataFrame({'tokens':[], 'pos_tags': [], 'dep_label': [], 'label': [], 'ent_type': []})
    for index in tqdm_notebook(range(len(data))):
        k = 0
        ss1 = []
        ent_types = []
        for r in range(len(data[index]['relations'])):
            k = 0
            ss1 = []
            label = data[index]['relations'][r]['type']
            ent_types = []
            # print(label)
            while k<len(data[index]['tokens']):
                if k == data[index]['entities'][data[index]['relations'][r]['head']]['start']:
                    m = data[index]['entities'][data[index]['relations'][r]['head']]['end']
                    ss1.append('[s1]')
                    ss1.extend(data[index]['tokens'][k:m])
                    ss1.append('[e1]')
                    if k==m:
                        m+=1
                    for e in range(k,m):
                        ent_types.append(data[index]['entities'][data[index]['relations'][r]['head']]['type'])
                    k = m
                elif k == data[index]['entities'][data[index]['relations'][r]['tail']]['start']:
                    m = data[index]['entities'][data[index]['relations'][r]['tail']]['end']
                    ss1.append('[s2]')
                    ss1.extend(data[index]['tokens'][k:m])
                    ss1.append('[e2]')
                    if k==m:
                        m+=1
                    for e in range(k,m):
                        ent_types.append(data[index]['entities'][data[index]['relations'][r]['tail']]['type'])
                    k = m
                else:
                    ss1.append(data[index]['tokens'][k])
                    ent_types.append('O')
                    k+=1
            df0 = pd.concat([pd.Series(' '.join(ss1)), pd.Series([[pos_tags[t] for t in data[index]['pos_tags']]]), \
                             pd.Series([[dep_tags[t] for t in data[index]['dep_label']]]), 
                            pd.Series([label]), pd.Series([[ent_dict[t] for t in ent_types]])], axis = 1)
            df0.columns = df.columns
            df = pd.concat([df, df0])
            # df.loc[index,'tokens'] = ss1
            # df.loc[index,'pos_tags'] = data[index]['pos_tags']
            # df.loc[index,'dep_label'] = data[index]['dep_label']
            # df.loc[i,'label'] = label
    return df.reset_index()

In [23]:
train_df = prepare_df(train_jsons_aug)
test_df = prepare_df(test_jsons_aug)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/1088 [00:00<?, ?it/s]

  0%|          | 0/121 [00:00<?, ?it/s]

In [25]:
test_df.head()

Unnamed: 0,index,tokens,pos_tags,dep_label,label,ent_type
0,0,"In ARIES-2 , a statistically significant impro...","[18, 26, 8, 14, 33, 19, 25, 18, 25, 18, 19, 25...","[43, 40, 45, 20, 3, 5, 31, 43, 40, 43, 5, 40, ...",NotRel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2..."
1,0,"In ARIES-2 , a statistically significant impro...","[18, 26, 8, 14, 33, 19, 25, 18, 25, 18, 19, 25...","[43, 40, 45, 20, 3, 5, 31, 43, 40, 43, 5, 40, ...",NotRel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2..."
2,0,The 6-minute walk distance is an independent p...,"[14, 13, 25, 25, 46, 14, 19, 25, 18, 25, 18, 2...","[20, 33, 5, 30, 48, 20, 5, 7, 43, 40, 43, 40, ...",NotRel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2..."
3,0,The 6-minute walk distance is an independent p...,"[14, 13, 25, 25, 46, 14, 19, 25, 18, 25, 18, 2...","[20, 33, 5, 30, 48, 20, 5, 7, 43, 40, 43, 40, ...",NotRel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2..."
4,0,"From these data , it appears that both the [s1...","[18, 14, 28, 8, 31, 46, 18, 12, 14, 26, 13, 19...","[43, 20, 40, 45, 30, 48, 24, 39, 20, 32, 5, 13...",Rel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 2, 2, 0, ..."


In [22]:
ent_dict = {}
for i in range(len(train_jsons_aug)):
    for ent in  train_jsons_aug[i]['entities']:
        if ent['type'] not in ent_dict:
            ent_dict[ent['type']] = len(ent_dict)
            
for i in range(len(test_jsons_aug)):
    for ent in  test_jsons_aug[i]['entities']:
        if ent['type'] not in ent_dict:
            ent_dict[ent['type']] = len(ent_dict)
            
for i in range(len(inf_jsons_aug)):
    for ent in  inf_jsons_aug[i]['entities']:
        if ent['type'] not in ent_dict:
            ent_dict[ent['type']] = len(ent_dict)
ent_dict['O'] = len(ent_dict)

In [68]:
ent_dict

{'drug': 0,
 'reason': 1,
 '6mwd': 2,
 'dosage': 3,
 'duration': 4,
 'cnt_patients': 5,
 'fc': 6,
 'death': 7,
 'nt-probnp': 8,
 'hospitalization': 9,
 'progression': 10,
 'prev_treat': 11,
 'death_rate': 12,
 'O': 13}

In [25]:
train_df.head()

Unnamed: 0,index,tokens,pos_tags,dep_label,label
0,0,"[In, a, preliminary, study, ,, the, orally, ad...","[IN, DT, JJ, NN, ,, DT, RB, VBN, JJ, NN, NN, N...","[prep, det, amod, pobj, punct, det, advmod, am...",NotRel
1,0,"[Methods, In, this, double-blind, ,, placebo-c...","[NNS, IN, DT, NN, ,, JJ, NN, ,, PRP, RB, VBD, ...","[npadvmod, prep, det, pobj, punct, amod, pobj,...",NotRel
2,0,"[Results, At, week, 16, 16, ,, patients, treat...","[NNS, IN, NN, CD, CD, ,, NNS, VBN, IN, NNP, VB...","[nsubj, prep, pobj, nummod, nummod, punct, nsu...",Rel
3,0,"[Conclusions, The, endothelin-receptor, antago...","[NNS, DT, NN, NN, NNP, VBZ, JJ, IN, NNS, IN, J...","[nsubj, det, compound, compound, nsubj, ROOT, ...",NotRel
4,0,"[Endothelin-receptor, antagonism, with, oral, ...","[NN, NN, IN, JJ, NNP, VBZ, DT, JJ, NN, IN, NN,...","[compound, nsubj, prep, amod, pobj, ROOT, det,...",NotRel


In [55]:
data[index]['relations'][1]['head']

2

In [54]:
data[index]['entities'][]

[{'type': 'drug', 'start': 9, 'end': 10},
 {'type': 'reason', 'start': 21, 'end': 24},
 {'type': 'drug', 'start': 11, 'end': 12}]

In [62]:
training_set_aug = Relations_Aug(train_df)
testing_set_aug = Relations_Aug(test_df)

In [63]:
training_set_aug.__getitem__(0)[0].size()

torch.Size([400])

In [39]:
torch.cat((pos_tensor, dep_tensor), dim=1).size()

torch.Size([1, 600])

In [40]:
pos_tensor.squeeze(1).size()

torch.Size([1, 300])

In [31]:
params = {'batch_size': 1,
          'drop_last': False,
          # 'shuffle': True,
          'num_workers': 1}

In [75]:
sampler = torch.utils.data.sampler.BatchSampler(
    torch.utils.data.sampler.RandomSampler(training_set_aug),
    batch_size=1,
    drop_last=False)

In [76]:
training_loader_aug = DataLoader(training_set_aug, **params, sampler = sampler)
testing_loader_aug = DataLoader(testing_set_aug, **params)

In [77]:
for data in training_loader_aug:
    print(data)

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 58, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 58, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "<ipython-input-61-c59562ad31c8>", line 13, in __getitem__
    zero_pad = True, include_CLS_token = True, include_SEP_token = True)
  File "<ipython-input-17-db20c451c7f0>", line 28, in prepare_features_aug
    input_ids.append(0)
  File "/opt/conda/lib/python3.7/site-packages/pandas/core/series.py", line 2878, in append
    to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity
  File "/opt/conda/lib/python3.7/site-packages/pandas/util/_decorators.py", line 311, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/pandas/core/reshape/concat.py", line 304, in concat
    sort=sort,
  File "/opt/conda/lib/python3.7/site-packages/pandas/core/reshape/concat.py", line 384, in __init__
    raise TypeError(msg)
TypeError: cannot concatenate object of type '<class 'int'>'; only Series and DataFrame objs are valid


In [34]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [37]:
## Test Forward Pass
inp = training_set_aug.__getitem__(0)[0].cuda()
output = model(inp)[0]
print(inp.shape, output.shape)

torch.Size([1, 400]) torch.Size([1, 2])


In [None]:
max_epochs = 20
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader_aug):
        optimizer.zero_grad()
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent.cuda()
          label = label.cuda()
        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader_aug:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent.cuda()
                  label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/20 [00:00<?, ?it/s]

EPOCH -- 0


TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 58, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 58, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "<ipython-input-38-76a41a839139>", line 13, in __getitem__
    zero_pad = True, include_CLS_token = True, include_SEP_token = True)
  File "<ipython-input-17-db20c451c7f0>", line 28, in prepare_features_aug
    input_ids.append(0)
  File "/opt/conda/lib/python3.7/site-packages/pandas/core/series.py", line 2878, in append
    to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity
  File "/opt/conda/lib/python3.7/site-packages/pandas/util/_decorators.py", line 311, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/pandas/core/reshape/concat.py", line 304, in concat
    sort=sort,
  File "/opt/conda/lib/python3.7/site-packages/pandas/core/reshape/concat.py", line 384, in __init__
    raise TypeError(msg)
TypeError: cannot concatenate object of type '<class 'int'>'; only Series and DataFrame objs are valid


In [30]:
# for data in training_loader_aug:
#     print(data)

In [50]:
torch.save(model.state_dict(), 'model_biobert_vars_71'+ str(uuid4())+'.pth')

In [32]:
model_path = 'model_4_varsbfdfa1d5-0f06-4aa5-89db-c9f541216795.pth'

In [33]:
model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

In [37]:
def get_reply(data,index):
    model.eval()
    utterance = ' '.join(data.tokens[index])
    pos_seq = data.pos_tags[index]
    dep_seq = data.dep_label[index]
    ent_seq = data.ent_type[index]
    pos_tensor, _ = prepare_features_aug(pos_seq, pos_tags, max_seq_length = 100, 
         zero_pad = True, include_CLS_token = True, include_SEP_token = True)
    dep_tensor, _ = prepare_features_aug(dep_seq, dep_tags, max_seq_length = 100, 
                 zero_pad = True, include_CLS_token = True, include_SEP_token = True)
    tok_tensor,_= prepare_features(utterance, max_seq_length = 100, 
                 zero_pad = True, include_CLS_token = True, include_SEP_token = True)
    ent_tensor,_ = prepare_features_aug(ent_seq, ent_dict, max_seq_length = 100, 
         zero_pad = True, include_CLS_token = True, include_SEP_token = True)
    X = torch.cat((pos_tensor, dep_tensor, tok_tensor, ent_tensor), dim=1)

    if torch.cuda.is_available():
        X = X.cuda()
    output = model(X)[0]
    _, pred_label = torch.max(output.data, 1)
    prediction=list(label_to_ix.keys())[pred_label]
    return prediction

In [21]:
inf_jsons_aug = json.load(open('pah/data/articles_inf_aa_aug.json'))


In [30]:
inf_jsons_aug[10]['entities']

[{'type': 'duration', 'start': 7, 'end': 9}]

In [35]:
inf_df = prepare_df(inf_jsons_aug)
inf_df.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/30348 [00:00<?, ?it/s]

Unnamed: 0,index,tokens,pos_tags,dep_label,label,ent_type
0,0,Comparison of hemodynamic parameters in treatm...,"[25, 18, 19, 28, 18, 19, 12, 42, 28, 18, 19, 1...","[48, 43, 5, 40, 43, 5, 11, 14, 40, 43, 5, 5, 4...",NotRel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 5, 5,..."
1,0,Comparison of hemodynamic parameters in treatm...,"[25, 18, 19, 28, 18, 19, 12, 42, 28, 18, 19, 1...","[48, 43, 5, 40, 43, 5, 11, 14, 40, 43, 5, 5, 4...",NotRel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 5, 5,..."
2,0,E-mail address : nazzareno.galie @ unibo.it BA...,"[25, 25, 2, 25, 18, 26, 26, 2, 25, 18, 19, 19,...","[13, 48, 45, 19, 45, 13, 48, 45, 48, 43, 5, 5,...",NotRel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2..."
3,0,E-mail address : nazzareno.galie @ unibo.it BA...,"[25, 25, 2, 25, 18, 26, 26, 2, 25, 18, 19, 19,...","[13, 48, 45, 19, 45, 13, 48, 45, 48, 43, 5, 5,...",NotRel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2..."
4,0,E-mail address : nazzareno.galie @ unibo.it BA...,"[25, 25, 2, 25, 18, 26, 26, 2, 25, 18, 19, 19,...","[13, 48, 45, 19, 45, 13, 48, 45, 48, 43, 5, 5,...",NotRel,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2..."


In [38]:
inf_df = inf_df[['tokens']]
inf_df.columns = ['utterance']

In [39]:
inf_rel = []
for i in tqdm_notebook(range(inf_df.shape[0])):
    inf_rel.append(get_reply(inf_df, i))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/74220 [00:00<?, ?it/s]

IndexError: index 3 is out of bounds for dimension 1 with size 3

In [61]:
len(inf_rel)

57733

In [74]:
inf_rel = pred
inf_rel_df = pd.DataFrame(inf_rel)
inf_rel_df.columns = ['Relation']

In [75]:
inf_rel_df.Relation.value_counts()

NotRel    54145
Rel       20075
Name: Relation, dtype: int64

In [76]:
inf_df = pd.concat([inf_df, inf_rel_df], axis = 1)
inf_df.head()

Unnamed: 0,utterance,label,Relation
0,Comparison of hemodynamic parameters in treatm...,NotRel,NotRel
1,Comparison of hemodynamic parameters in treatm...,NotRel,Rel
2,E-mail address : nazzareno.galie @ unibo.it BA...,NotRel,Rel
3,E-mail address : nazzareno.galie @ unibo.it BA...,NotRel,Rel
4,E-mail address : nazzareno.galie @ unibo.it BA...,NotRel,NotRel


In [61]:
inf_df[inf_df['Relation']=='Rel']

Unnamed: 0,index,tokens,pos_tags,dep_label,label,ent_type,Relation
9,0,"[METHODS, :, Patients, with, [s1], PAH, [e1], ...","[NNP, :, NNS, IN, NNP, WP, VBD, NN, JJ, CC, VB...","[ROOT, punct, appos, prep, pobj, nsubj, relcl,...",NotRel,"[O, O, O, O, reason, O, O, O, O, O, O, O, O, O...",Rel
10,0,"[METHODS, :, Patients, with, [s1], PAH, [e1], ...","[NNP, :, NNS, IN, NNP, WP, VBD, NN, JJ, CC, VB...","[ROOT, punct, appos, prep, pobj, nsubj, relcl,...",NotRel,"[O, O, O, O, reason, O, O, O, O, O, O, O, O, O...",Rel
13,0,"[METHODS, :, Patients, with, PAH, who, were, t...","[NNP, :, NNS, IN, NNP, WP, VBD, NN, JJ, CC, VB...","[ROOT, punct, appos, prep, pobj, nsubj, relcl,...",NotRel,"[O, O, O, O, O, O, O, O, O, O, O, O, drug, O, ...",Rel
14,0,"[METHODS, :, Patients, with, [s2], PAH, [e2], ...","[NNP, :, NNS, IN, NNP, WP, VBD, NN, JJ, CC, VB...","[ROOT, punct, appos, prep, pobj, nsubj, relcl,...",NotRel,"[O, O, O, O, reason, O, O, O, O, O, O, O, O, O...",Rel
17,0,"[METHODS, :, Patients, with, [s2], PAH, [e2], ...","[NNP, :, NNS, IN, NNP, WP, VBD, NN, JJ, CC, VB...","[ROOT, punct, appos, prep, pobj, nsubj, relcl,...",NotRel,"[O, O, O, O, reason, O, O, O, O, O, O, O, O, O...",Rel
...,...,...,...,...,...,...,...
74181,0,"[In, this, trial, ,, a, total, of, 185, PAH, p...","[IN, DT, NN, ,, DT, NN, IN, CD, NNP, NNS, IN, ...","[prep, det, pobj, punct, det, nsubjpass, prep,...",NotRel,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Rel
74184,0,"[In, this, trial, ,, a, total, of, 185, PAH, p...","[IN, DT, NN, ,, DT, NN, IN, CD, NNP, NNS, IN, ...","[prep, det, pobj, punct, det, nsubjpass, prep,...",NotRel,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Rel
74185,0,"[In, this, trial, ,, a, total, of, 185, PAH, p...","[IN, DT, NN, ,, DT, NN, IN, CD, NNP, NNS, IN, ...","[prep, det, pobj, punct, det, nsubjpass, prep,...",NotRel,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Rel
74188,0,"[In, this, trial, ,, a, total, of, 185, PAH, p...","[IN, DT, NN, ,, DT, NN, IN, CD, NNP, NNS, IN, ...","[prep, det, pobj, punct, det, nsubjpass, prep,...",NotRel,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Rel


In [75]:
train_df.label.value_counts()

NotRel    932
Rel       179
Name: label, dtype: int64

In [77]:
inf_df[['utterance', 'Relation']].to_csv('inf_model_biobert_best.csv')
inf_df[200:250].to_csv('inf_model_bb_best_sample_50.csv', index = False)

In [93]:
test_fs = pd.read_csv('ClinicalTransformerRelationExtraction/CRE_PAH/v7/test_fs.tsv', delimiter = '\t', header = None)
test_fs.columns = ['label','ss1','tag1','tag2','T','D','F']
test_fs.head()

Unnamed: 0,label,ss1,tag1,tag2,T,D,F
0,NotRel,we hypothesized [s1] milrinone [e1] milrinone ...,Drug,Route,T_121,T_1170,F_1
1,NotRel,we hypothesized [s1] milrinone [e1] [s2] inhal...,Drug,Route,T_121,T_1254,F_1
2,NotRel,we hypothesized [s1] milrinone [e1] [s2] prost...,Drug,Drug,T_121,T_1262,F_1
3,NotRel,we hypothesized [s1] inhaled [e1] [s2] prostac...,Route,Drug,T_1170,T_1262,F_1
4,NotRel,"we hypothesized that [s1] inhaled [e1] , an ad...",Route,Drug,T_1254,T_1262,F_1


In [100]:
inf_df = pd.read_csv('inf_model_biobert_best.csv')
inf_df.head()

Unnamed: 0.1,Unnamed: 0,utterance,Relation
0,0,Comparison of hemodynamic parameters in treatm...,NotRel
1,1,Comparison of hemodynamic parameters in treatm...,Rel
2,2,E-mail address : nazzareno.galie @ unibo.it BA...,Rel
3,3,E-mail address : nazzareno.galie @ unibo.it BA...,Rel
4,4,E-mail address : nazzareno.galie @ unibo.it BA...,NotRel


In [103]:
inf_df.sample(150).to_csv('inf_model_biobert_best_sample_150.csv')

In [97]:
inf_df['utterance'] = inf_df['utterance'].str.lower().str.replace(' ','')
inf_df.utterance

0        comparisonofhemodynamicparametersintreatment-n...
1        comparisonofhemodynamicparametersintreatment-n...
2        e-mailaddress:nazzareno.galie@unibo.itbackgrou...
3        e-mailaddress:nazzareno.galie@unibo.itbackgrou...
4        e-mailaddress:nazzareno.galie@unibo.itbackgrou...
                               ...                        
74215    the[s1]2-and[s2]48-week[e2]results[s1]uggestth...
74216    inflammationalsohasanincreasinglyrecognisedrol...
74217    inflammationalsohasanincreasinglyrecognisedrol...
74218    severalformsof[s1]pah[e1],includingidiopathicp...
74219    severalformsof[s2]pah[e2],includingidiopathicp...
Name: utterance, Length: 74220, dtype: object

In [99]:
test_fs['ss1'] = test_fs['ss1'].str.lower().str.replace(' ','')
test_fs.merge(inf_df, left_on = 'ss1', right_on = 'utterance').shape

0         wehypothesized[s1]milrinone[e1]milrinone,anade...
1         wehypothesized[s1]milrinone[e1][s2]inhaled[e2]...
2         wehypothesized[s1]milrinone[e1][s2]prostacycli...
3         wehypothesized[s1]inhaled[e1][s2]prostacyclin[...
4         wehypothesizedthat[s1]inhaled[e1],anadenosine-...
                                ...                        
149981    tablev[s1]gelatin[e1][s2]3%[e2]invivooraldtcou...
149982    tablevshows[s1]3%[e1]invivooraldtcouldbearrang...
149983    [s1]sublingual[e1]meanplasmaconcentrationversu...
149984    [s1]sublingual[e1][s2]oral[e2]plasmaconcentrat...
149985    [s1]lyophylizedsublingual[e1]plasmaconcentrati...
Name: ss1, Length: 149986, dtype: object