In [None]:
#dataloader
#rnncharmodel

In [2]:
import torch
import torch.nn as nn

import pandas as pd
import numpy as np

import os
import time
from tqdm import tqdm
import argparse

from torch.utils.data import Dataset, DataLoader

In [3]:

parser = argparse.ArgumentParser(description="Running ner...")
parser.add_argument('--device', default = torch.device('cpu'), 
                    help='cpu or gpu')
parser.add_argument('--hs', default=768, type=int,
                    help='Hidden layer size')
parser.add_argument('--bs', default=32, 
                    help='batch size')
parser.add_argument('--nl', default=2, 
                    help='Number of layers')
parser.add_argument('--bidir', default=1,
                   help='bi directional')
parser.add_argument('--inplen', default=50,
                   help='sequence/sentence length')
parser.add_argument('--inpsize', default=768,
                   help='embedding size')
parser.add_argument('--vocabsize', default=768,
                   help='vocab size')
parser.add_argument('--lr', default= 0.001, type=float,
                    help="Learning rate of loss optimization")

'''
parser.add_argument('--data_dir', required=True, type=pathlib.Path, 
                    help='location to dataset files')
parser.add_argument('--device', default=torch.device('cpu'), 
                    help='gpu or cpu')
parser.add_argument('--mtype', default='linear', 
                    help='Type of model')
parser.add_argument('--load_model', action='store_true', 
                    help='To load and run model')
parser.add_argument('--save_dir', default='/home/amsinha/wsd-grid/wsd/scripts/runs/', 
                    help='model saving dir')
parser.add_argument('--model_num', default=1, type=str, 
                    help='saved model identifier')
parser.add_argument('--save-model', action='store_true', 
                    help='to save the model')
parser.add_argument('--early-stopping', action='store_true', 
                    help='to save checkpoint early')
parser.add_argument('--report-every', default=10, type=int,
                    help='Log report period')

parser.add_argument('--patience', default=5, 
                    help='patience for early_stopping')
parser.add_argument('--semantics', action='store_true', 
                    help='To load semantics A')
parser.add_argument('--trainable', action='store_true', help='to train adjancency')
parser.add_argument('--fragment', action='store_true')
'''

params,_ = parser.parse_known_args()


In [4]:
# rnn : inp_dim x rnn_hidden_dim x n_layer
# input : bs x seq_len x inp_dim
# h0 : n_layer x bs x rnn_hidden_dim

rnn = nn.GRU(10, 20, 2, batch_first=True)
input,h0 = torch.randn(3, 5, 10),torch.randn(2, 3, 20)
output, hn = rnn(input, h0)

In [5]:
class nermodel(torch.nn.Module):
    def __init__(self, params):
        super(nermodel, self).__init__()
        self.device = params.device
        self.hiddensize = params.hs
        self.num_layer = params.nl
        self.bidir = params.bidir
        self.bs = params.bs
        self.inplen = params.inplen
        self.inpsize = params.inpsize
        self.vocabsize = params.vocabsize
        
        self.char_embedding = nn.Embedding(self.vocabsize, self.inpsize)
        # keep batch first
        self.rnn = nn.GRU(params.inpsize, params.hs, \
                          num_layers=self.num_layer, batch_first=True)
        
        self.fc = torch.nn.Linear(self.bidir * self.hiddensize, 1)
        self.relu = torch.nn.ReLU()
        
    def forward(self, x):
        #print('x_i:',x.shape, 'h0:',self.h0.shape)
        self.h0 = torch.randn(self.bidir * self.num_layer, 
                              x.shape[0], self.hiddensize)
        x = self.char_embedding(x)
        rnn_otpt, hn = self.rnn(x, self.h0)
        #print(rnn_otpt.shape)
        fc_otpt = self.relu(self.fc(rnn_otpt))
        #print('o shape:',fc_otpt.shape)
        return fc_otpt  

In [6]:
# model testing
params.vocabsize = 805#len(train_set.vocab)
m = nermodel(params)
trial_inp = torch.randint(0, 805, (32,50))
print('model:', m)
m(trial_inp).shape

model: nermodel(
  (char_embedding): Embedding(805, 768)
  (rnn): GRU(768, 768, num_layers=2, batch_first=True)
  (fc): Linear(in_features=768, out_features=1, bias=True)
  (relu): ReLU()
)


torch.Size([32, 50, 1])

# Dataloader and Input preprocessing - char model

In [34]:
train_df = pd.read_csv('../data/BioCreative_TrainTask3.0.tsv', sep='\t')
#train_df = pd.read_csv('../data/SMM4H18_train_modified.csv', sep='\t')
dev_df = pd.read_csv('../data/BioCreative_ValTask3.tsv', sep='\t')
train_df

Unnamed: 0,tweet_id,user_id,created_at,text,start,end,span,drug
0,525872716411580416,2333890110,2014-10-25,@Rhy_QD10 yeah irking he need his ass whoop I ...,-,-,-,-
1,809577207597244417,165916824,2016-12-16,Panda Express 🐼😛,-,-,-,-
2,590918768269864960,2414667758,2015-04-22,Well..technology wins agains. People are fight...,-,-,-,-
3,237385144221192193,24324898,2012-08-20,@jennabennabear what happened?????,-,-,-,-
4,166288274300735488,181819579,2012-02-05,A first grade teacher asked her class to compl...,-,-,-,-
...,...,...,...,...,...,...,...,...
49995,760235205047320577,1143892999,2016-08-01,Damnnnnnnnnnn I didn't know it cost 20 to get ...,-,-,-,-
49996,860946084117577728,583610519,2017-05-06,I got good news today! @sieelyn_ &amp; Austin ...,-,-,-,-
49997,798113426644041729,21343364,2016-11-14,@MissSarahLou6 @CocaCola_GB what?? I never got...,-,-,-,-
49998,790015640174080000,65087044,2016-10-23,Every last episode was funny as fuck lol,-,-,-,-


In [35]:
#positive
tpdf = train_df.loc[train_df['start'] != '-']
tndf = train_df.loc[train_df['start'] == '-']
dpdf = dev_df.loc[dev_df['start'] != '-']
dndf = dev_df.loc[dev_df['start'] == '-']

In [74]:
# traindf - subsampled tdf
tdf = pd.concat([tpdf,tndf.iloc[:877]])
tdf = tdf.sample(frac=1)
# devdf - subsampled ddf
ddf = pd.concat([dpdf,dndf.iloc[:396]])
#ddf = ddf.sample(frac=1)

ddf

Unnamed: 0,tweet_id,user_id,created_at,text,start,end,span,drug
234,707715042477744128,218297421,2016-03-09,Just get on birth control and use two condoms....,12,25,birth control,birth control
550,841466126655664128,184620477,2017-03-14,I have a possible infection from my Zofran pum...,36,42,Zofran,zofran
921,612522575776907264,2206562811,2015-06-21,@bethanygiuffre it's time for the epidural!,34,42,epidural,epidural
968,767004439450451968,19546372,2016-08-20,"@elvisrockysly yeah, one is heparin. Jesus, th...",28,35,heparin,heparin
1410,765379008838131712,1486889246,2016-08-16,"*puts cocoa butter on, takes prenatal vitamin,...",29,45,prenatal vitamin,prenatal vitamins
...,...,...,...,...,...,...,...,...
392,629018857571614720,2557491804,2015-08-05,@AlexisBourque1 @jiannabroussard @Love_Nylaaaa...,-,-,-,-
393,644596144681697281,2783087789,2015-09-17,almost 19 with no h.s. diploma or job about to...,-,-,-,-
394,769520550012186624,252235856,2016-08-27,I loveee bananas 😋,-,-,-,-
395,226415059742638080,24324898,2012-07-20,Just posted a photo http://t.co/fFN4aShq,-,-,-,-


In [13]:
class biodata(Dataset):
    def __init__(self, df, vocab=None, name='train'):
        self.len = len(df)
        self.data = df
        self.max_len = max(df.text.apply(lambda x: len(x)).to_numpy())
        self.setname = name
        self.vocab = vocab
        self.vdict = None
        self.create_vocab(vocab)
        self.data['text'] = self.data['text'].apply(lambda x: x.lower())
        
    def __getitem__(self, index):
        sen = self.data['text'].iloc[index]
        start, end = self.data['start'].iloc[index], self.data['end'].iloc[index]
        start = 0 if start == '-' else int(start)
        end = 0 if end == '-' else int(end)
        return {'ids' : torch.tensor(self.transform_input(sen, pad=True), dtype=torch.long),
               'targets' : torch.tensor(self.make_label(self.max_len, start, end), dtype=torch.float64)}
    
    def transform_input(self, sentence, pad=False):
        es = []
        for e in sentence.lower():
            if e in self.vdict:
                es.append(self.vdict[e])
            else:
                es.append(self.vdict['<oov>'])
        diff = 0 if self.max_len<len(es) else self.max_len-len(es)
        diff = 0 if not pad else diff
        return es + [1]*diff
    
    def make_label(self,l, start, end):
        label = np.zeros(l)
        try:
            if start <= l:
                label[start:end] = 1
        except:
            print('======>',start, l)
            import sys;sys.exit()
        return label
    
    def create_vocab(self, vocab):
        if not vocab:
            iv = {'<oov>', '<pad>'}
            for line in self.data['text'].to_numpy():
                iv |= set(line)
            self.vocab = iv
        else:
            iv = vocab
        ivdict = {'<oov>':0, '<pad>':1}
        for e in iv:
            if e not in ivdict:
                ivdict[e] = len(ivdict)
        self.vdict = ivdict
        
        print(f'{self.setname} vocab created!')
    
    def __len__(self):
        return self.len
    
    def __name__(self):
        return self.setname

In [75]:
train_set = biodata(tdf, name='train')
dev_set = biodata(ddf, vocab=train_set.vocab, name='dev')

train vocab created!
dev vocab created!


In [76]:
train_params = {'batch_size': params.bs,
               'shuffle':True,
               'num_workers': 2}
dev_params = {'batch_size': params.bs,
              'shuffle': False,
              'num_workers': 2}

trainloader = DataLoader(train_set, **train_params)
devloader = DataLoader(dev_set, **dev_params)

In [17]:
def train(epoch):
    tr_loss, tr_steps = 0,0
    n_correct = 0
    
    model.train()
    for _, data in tqdm(enumerate(trainloader)):
        optimizer.zero_grad()
        ids = data['ids']#.to(device, dtype=torch.long)
        tar = data['targets']#.to(device, dtype=torch.long)
        output = model(ids).squeeze(-1)
        loss = loss_function(output, tar)
        tr_loss += loss.item()
        tr_steps += 1
        
        #acc_light = 

        if _ % 50 == 0:
            print(f'training loss per 50 step : {tr_loss/ tr_steps}')
            
        loss.backward()
        optimizer.step()
        
def eval(testloader):
    test_loss, test_steps = 0,0
    n_correct = 0
    
    model.eval()
    with torch.no_grad():
        for _, data in tqdm(enumerate(testloader)):
            
            ids = data['ids']#.to(device, dtype=torch.long)
            tar = data['targets']#.to(device, dtype=torch.long)
            output = model(ids).squeeze(-1)
            loss = loss_function(output, tar)
            test_loss += loss.item()
            test_steps += 1

            if _ % 50 == 0:
                print(f'Testing loss per 50 step : {test_loss/ test_steps}')

In [15]:
m.parameters

<bound method Module.parameters of nermodel(
  (char_embedding): Embedding(805, 768)
  (rnn): GRU(768, 768, num_layers=2, batch_first=True)
  (fc): Linear(in_features=768, out_features=1, bias=True)
  (relu): ReLU()
)>

In [38]:
loss_function = torch.nn.BCEWithLogitsLoss() #torch.nn.CrossEntropyLoss()
model = nermodel(params)
optimizer = torch.optim.Adam(params =model.parameters(), lr=params.lr)
EPOCHS = 2

for e in range(EPOCHS):
    train(e)
   # if e:
        

0it [00:00, ?it/s]

training loss per 50 step : 0.7210162563664788


50it [01:40,  2.09s/it]

training loss per 50 step : 0.6937965346892583


100it [03:36,  2.53s/it]

training loss per 50 step : 0.6934781471940069


150it [05:32,  2.21s/it]

training loss per 50 step : 0.6933707815770979


200it [07:40,  2.88s/it]

training loss per 50 step : 0.6933163647809388


220it [08:25,  2.30s/it]
0it [00:00, ?it/s]

training loss per 50 step : 0.6931471824645996


50it [02:13,  2.55s/it]

training loss per 50 step : 0.6931498325339908


100it [04:23,  2.42s/it]

training loss per 50 step : 0.6931491550368587


150it [06:42,  2.59s/it]

training loss per 50 step : 0.6931502173916612


200it [09:01,  2.57s/it]

training loss per 50 step : 0.6931489466073624


220it [09:56,  2.71s/it]


In [110]:
## evaluating labels
outputs = []
with torch.no_grad():
    for _, data in tqdm(enumerate(devloader)):
        #optimizer.zero_grad()
        ids = data['ids']#.to(device, dtype=torch.long)
        tar = data['targets']#.to(device, dtype=torch.long)
        output = model(ids).squeeze(-1)
        outputs.append(output)
    #loss = loss_function(output, tar)
        

16it [00:08,  1.91it/s]


In [22]:
loss_function(output, tar)

tensor(0.7015, dtype=torch.float64,
       grad_fn=<BinaryCrossEntropyWithLogitsBackward>)

In [42]:
for t,o in zip(tar, output):
    print(o)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [33]:
# nonzero or threshold
# consider all comparison for correct with 1s vs compare with all
# loss consider all
sum((output > 0).float())

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [109]:
# eval post processing
#ddf : evaluation data

predques = ddf.iloc[:32, 0:4].copy()
spans = []
for t in output:
    #print(t)
    span = []
    start,end = -1,-1
    for i,tt in enumerate(t):
        if start == -1 and tt == 1:
            start,end = i,i
        if start != -1:
            if tt == 1:
                end +=1
            else:
                span.append((start,end))
                start, end = -1,-1
    spans.append(span)
print(spans)

########################################
rest_columens = []
for i,sp in enumerate(spans):
    if len(sp) == 0:
        rest_columens.append(('-','-','-','-'))
    else:
        # one span detected else first*
        if len(sp) == 1:
            #print(sp)
            s = ddf.iloc[i]['text']
            wrd = s[sp[0][0]:sp[0][1]]
            rest_columens.append((*(sp[0]),wrd, wrd.lower()))
        
predans = pd.DataFrame(rest_columens, columns=['start', 'end', 'span', 'drug'])      

pred = pd.concat([predques.reset_index(drop=True), predans.reset_index(drop=True)], axis = 1, )
pred

[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]


Unnamed: 0,tweet_id,user_id,created_at,text,start,end,span,drug
0,707715042477744128,218297421,2016-03-09,just get on birth control and use two condoms....,-,-,-,-
1,841466126655664128,184620477,2017-03-14,i have a possible infection from my zofran pum...,-,-,-,-
2,612522575776907264,2206562811,2015-06-21,@bethanygiuffre it's time for the epidural!,-,-,-,-
3,767004439450451968,19546372,2016-08-20,"@elvisrockysly yeah, one is heparin. jesus, th...",-,-,-,-
4,765379008838131712,1486889246,2016-08-16,"*puts cocoa butter on, takes prenatal vitamin,...",-,-,-,-
5,448063246231015424,113610499,2014-03-24,"update: i got the epidural, back contractions ...",-,-,-,-
6,448063246231015424,113610499,2014-03-24,"update: i got the epidural, back contractions ...",-,-,-,-
7,502162994231791617,17084008,2014-08-20,i seem to have to wash most cups of tea down w...,-,-,-,-
8,534528525379055617,2551446475,2014-11-18,our kid is now 25 weeks and the size of a turn...,-,-,-,-
9,786755084231270401,177251944,2016-10-14,still have one day of antibiotics left for my ...,-,-,-,-


In [108]:
len(tar)

32