In [None]:
#dataloader
#rnncharmodel

In [1]:
import torch
import torch.nn as nn

import pandas as pd
import numpy as np

import os
import time
from tqdm import tqdm
import argparse

from torch.utils.data import Dataset, DataLoader

In [2]:

# Hyper-parameters for the character-level NER model, exposed as CLI options.
# Every numeric option declares an explicit ``type`` so that a value supplied
# on the command line (which argparse receives as a string) is converted
# before it reaches layer constructors or the DataLoader.
parser = argparse.ArgumentParser(description="Running ner...")
parser.add_argument('--device', default = torch.device('cpu'),
                    help='cpu or gpu')
parser.add_argument('--hs', default=768, type=int,
                    help='Hidden layer size')
parser.add_argument('--bs', default=32, type=int,
                    help='batch size')
parser.add_argument('--nl', default=2, type=int,
                    help='Number of layers')
# Used downstream as a direction multiplier (1 = one direction, 2 = two).
parser.add_argument('--bidir', default=1, type=int,
                   help='bi directional')
parser.add_argument('--inplen', default=50, type=int,
                   help='sequence/sentence length')
parser.add_argument('--inpsize', default=768, type=int,
                   help='embedding size')
parser.add_argument('--vocabsize', default=768, type=int,
                   help='vocab size')
parser.add_argument('--lr', default= 0.001, type=float,
                    help="Learning rate of loss optimization")

'''
parser.add_argument('--data_dir', required=True, type=pathlib.Path, 
                    help='location to dataset files')
parser.add_argument('--device', default=torch.device('cpu'), 
                    help='gpu or cpu')
parser.add_argument('--mtype', default='linear', 
                    help='Type of model')
parser.add_argument('--load_model', action='store_true', 
                    help='To load and run model')
parser.add_argument('--save_dir', default='/home/amsinha/wsd-grid/wsd/scripts/runs/', 
                    help='model saving dir')
parser.add_argument('--model_num', default=1, type=str, 
                    help='saved model identifier')
parser.add_argument('--save-model', action='store_true', 
                    help='to save the model')
parser.add_argument('--early-stopping', action='store_true', 
                    help='to save checkpoint early')
parser.add_argument('--report-every', default=10, type=int,
                    help='Log report period')

parser.add_argument('--patience', default=5, 
                    help='patience for early_stopping')
parser.add_argument('--semantics', action='store_true', 
                    help='To load semantics A')
parser.add_argument('--trainable', action='store_true', help='to train adjancency')
parser.add_argument('--fragment', action='store_true')
'''

# parse_known_args() returns (namespace, leftover_argv); the leftovers are
# discarded here. Presumably chosen over parse_args() so that the extra argv
# entries a notebook kernel is started with do not abort parsing — confirm.
params,_ = parser.parse_known_args()


In [3]:
# Shape cheat-sheet for a batch-first GRU:
#   nn.GRU(inp_dim, rnn_hidden_dim, n_layer)
#   input : bs x seq_len x inp_dim
#   h0    : n_layer x bs x rnn_hidden_dim
# NOTE: the name `input` shadows the Python builtin for the rest of the session.
rnn = nn.GRU(input_size=10, hidden_size=20, num_layers=2, batch_first=True)
input = torch.randn(3, 5, 10)
h0 = torch.randn(2, 3, 20)
output, hn = rnn(input, h0)

In [102]:
class nermodel(torch.nn.Module):
    """Character-level tagger: embedding -> GRU -> one linear score per position.

    Parameters
    ----------
    params : namespace-like object
        Must expose ``device``, ``hs`` (GRU hidden size), ``nl`` (number of
        GRU layers), ``bidir`` (direction multiplier: 1 = unidirectional,
        2 = bidirectional), ``bs``, ``inplen``, ``inpsize`` (embedding size)
        and ``vocabsize`` (number of rows in the embedding table).

    Raises
    ------
    ValueError
        If ``params.bidir`` is neither 1 nor 2.
    """

    def __init__(self, params):
        super().__init__()
        self.device = params.device
        self.hiddensize = params.hs
        self.num_layer = params.nl
        self.bidir = params.bidir
        self.bs = params.bs
        self.inplen = params.inplen
        self.inpsize = params.inpsize
        self.vocabsize = params.vocabsize

        if self.bidir not in (1, 2):
            raise ValueError(f'bidir must be 1 or 2, got {self.bidir!r}')

        self.char_embedding = nn.Embedding(self.vocabsize, self.inpsize)
        # keep batch first; `bidir` sizes h0 and the linear head, so the GRU
        # itself has to be bidirectional whenever the multiplier is 2.
        self.rnn = nn.GRU(self.inpsize, self.hiddensize,
                          num_layers=self.num_layer, batch_first=True,
                          bidirectional=(self.bidir == 2))

        self.fc = torch.nn.Linear(self.bidir * self.hiddensize, 1)
        # Kept only so existing attribute access keeps working; it is no
        # longer applied to the output (see forward()).
        self.relu = torch.nn.ReLU()
        self.h0 = None

    def forward(self, x):
        """Score every position of a batch of id sequences.

        Parameters
        ----------
        x : integer tensor indexed as (batch, seq_len) holding embedding ids.

        Returns
        -------
        Tensor of shape (batch, seq_len, 1) with raw, unbounded logits.
        """
        # Zero initial state created on the input's device: a random h0 made
        # repeated calls on the same input disagree, and a CPU-only h0 broke
        # any run where the model lives on another device.
        self.h0 = torch.zeros(self.bidir * self.num_layer,
                              x.shape[0], self.hiddensize,
                              device=x.device,
                              dtype=self.char_embedding.weight.dtype)
        x = self.char_embedding(x)
        rnn_otpt, hn = self.rnn(x, self.h0)
        # Return logits as-is: a ReLU here would clamp them to >= 0, so a
        # sigmoid-based loss could never see a probability below 0.5.
        fc_otpt = self.fc(rnn_otpt)
        return fc_otpt

In [37]:
# Smoke test: push one random batch of character ids through a fresh model
# and look at the output shape.
params.vocabsize = 805  # len(train_set.vocab)
m = nermodel(params)
trial_inp = torch.randint(low=0, high=805, size=(32, 50))
print('model:', m)
m(trial_inp).shape

model: nermodel(
  (char_embedding): Embedding(805, 768)
  (rnn): GRU(768, 768, num_layers=2, batch_first=True)
  (fc): Linear(in_features=768, out_features=1, bias=True)
  (relu): ReLU()
)
x_i: torch.Size([32, 50]) h0: torch.Size([2, 32, 768])
torch.Size([32, 50, 768])
o shape: torch.Size([32, 50, 1])


torch.Size([32, 50, 1])

# Dataloader and Input preprocessing - char model

In [9]:
# Read the tab-separated train / validation files into DataFrames.
train_path = '../data/BioCreative_TrainTask3.0.tsv'
dev_path = '../data/BioCreative_ValTask3.tsv'
train_df, dev_df = (pd.read_csv(path, sep='\t') for path in (train_path, dev_path))
train_df.head()

Unnamed: 0,tweet_id,user_id,created_at,text,start,end,span,drug
0,525872716411580416,2333890110,2014-10-25,@Rhy_QD10 yeah irking he need his ass whoop I ...,-,-,-,-
1,809577207597244417,165916824,2016-12-16,Panda Express 🐼😛,-,-,-,-
2,590918768269864960,2414667758,2015-04-22,Well..technology wins agains. People are fight...,-,-,-,-
3,237385144221192193,24324898,2012-08-20,@jennabennabear what happened?????,-,-,-,-
4,166288274300735488,181819579,2012-02-05,A first grade teacher asked her class to compl...,-,-,-,-


In [91]:
# Partition each split into positive rows (a span is annotated) and negative
# rows (`start` holds the placeholder '-').
train_has_span = train_df['start'] != '-'
dev_has_span = dev_df['start'] != '-'
tpdf, tndf = train_df.loc[train_has_span], train_df.loc[~train_has_span]
dpdf, dndf = dev_df.loc[dev_has_span], dev_df.loc[~dev_has_span]

In [96]:
# Build the working train / dev frames: every positive row plus a fixed-size
# head of the negatives, then shuffle the rows.
# NOTE(review): 877 and 396 are presumably the positive-row counts of the two
# splits (i.e. a 1:1 balance) — confirm, and consider len(tpdf) / len(dpdf).
# A fixed seed keeps the shuffled order identical across kernel restarts.
SHUFFLE_SEED = 42

# traindf - subsampled tdf
tdf = pd.concat([tpdf, tndf.iloc[:877]]).sample(frac=1, random_state=SHUFFLE_SEED)
# devdf - subsampled ddf
ddf = pd.concat([dpdf, dndf.iloc[:396]]).sample(frac=1, random_state=SHUFFLE_SEED)

ddf

Unnamed: 0,tweet_id,user_id,created_at,text,start,end,span,drug
22964,665056375295184896,1000206378,2015-11-13,anyone have muscle relaxers or vicodeine !??,31,40,Vicodeine,vicodin
56,488852816237039618,2333890110,2014-07-15,@muslimah_fatima bye girl he can't see shit he...,-,-,-,-
301,709112139575795712,250358074,2016-03-13,@fash_chronicles we're dress twins!!👯 yay! i t...,-,-,-,-
273,549622749896130561,151190725,2014-12-29,@misskd @ashstronge lol so did we! my waist is...,-,-,-,-
342,550132579274620928,2799031971,2014-12-31,the official nesting has started. i have gotte...,-,-,-,-
...,...,...,...,...,...,...,...,...
157,852222771170095108,1267370436,2017-04-12,@_reynaportillo @siml_25 dude go swimming !! 😩,-,-,-,-
19955,407728229857112064,220948289,2013-12-03,"the nightly pill arsenal... acetaminophen, 3 d...",28,41,Acetaminophen,acetaminophen
305,825077608794689538,2511267649,2017-01-27,stop posting them. stop tweeting about them. s...,-,-,-,-
2,553397745701376000,2333890110,2015-01-09,@mamas_ripdad thank yu 😘,-,-,-,-


In [75]:
class biodata(Dataset):
    """Character-level dataset over a frame with `text`, `start`, `end` columns.

    Each item is a dict with
      * ``ids``     - long tensor of character ids, padded to ``max_len``;
      * ``targets`` - float32 tensor of length ``max_len`` holding 1 on
        positions ``start:end`` and 0 elsewhere ('-' offsets mean no span).

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows. The frame is not modified; a lower-cased copy is kept.
    vocab : iterable of str, optional
        Symbols to index. When omitted, the vocabulary is collected from the
        lower-cased texts of ``df``.
    name : str
        Label used in the log line and returned by ``__name__()``.
    """

    def __init__(self, df, vocab=None, name='train'):
        self.len = len(df)
        # Lower-case on a private copy *before* measuring lengths or building
        # the vocabulary, so both describe exactly what __getitem__ encodes and
        # the caller's DataFrame is left untouched.
        self.data = df.assign(text=df['text'].apply(lambda x: x.lower()))
        self.max_len = int(self.data['text'].apply(len).max()) if self.len else 0
        self.setname = name
        self.vocab = vocab
        self.vdict = None
        self.create_vocab(vocab)

    @staticmethod
    def _to_offset(value):
        """Convert a `start`/`end` cell to an int offset ('-' becomes 0)."""
        return 0 if value == '-' else int(value)

    def __getitem__(self, index):
        sen = self.data['text'].iloc[index]
        start = self._to_offset(self.data['start'].iloc[index])
        end = self._to_offset(self.data['end'].iloc[index])
        ids = self.transform_input(sen, pad=True)
        label = self.make_label(self.max_len, start, end)
        # float32 targets match the dtype of the model's logits.
        return {'ids': torch.tensor(ids, dtype=torch.long),
                'targets': torch.tensor(label, dtype=torch.float32)}

    def transform_input(self, sentence, pad=False):
        """Map a sentence to a list of character ids.

        Unknown characters map to the '<oov>' id. With ``pad=True`` the list
        is right-padded with the '<pad>' id up to ``max_len``; longer inputs
        are returned unpadded and untruncated.
        """
        oov = self.vdict['<oov>']
        es = [self.vdict.get(ch, oov) for ch in sentence.lower()]
        if pad and len(es) < self.max_len:
            es += [self.vdict['<pad>']] * (self.max_len - len(es))
        return es

    def make_label(self,l, start, end):
        """Return a length-``l`` float array with ones on ``start:end``.

        Empty or inverted ranges (including the 0/0 produced for rows without
        a span) yield an all-zero label; ``end`` is clipped to ``l``.
        """
        label = np.zeros(l)
        if 0 <= start < end:
            label[start:min(end, l)] = 1
        return label

    def create_vocab(self, vocab):
        """Populate ``self.vocab`` (when not supplied) and ``self.vdict``."""
        if not vocab:
            iv = {'<oov>', '<pad>'}
            for line in self.data['text'].to_numpy():
                iv |= set(line)
            self.vocab = iv
        else:
            iv = vocab
        ivdict = {'<oov>':0, '<pad>':1}
        # Sorted iteration gives every symbol the same id on every run; raw
        # set order depends on string hashing and changes between processes.
        for e in sorted(iv):
            if e not in ivdict:
                ivdict[e] = len(ivdict)
        self.vdict = ivdict

        print(f'{self.setname} vocab created!')

    def __len__(self):
        return self.len

    def __name__(self):
        return self.setname

In [97]:
# The dev set reuses the vocabulary collected on the training set.
train_set = biodata(df=tdf, name='train')
dev_set = biodata(df=ddf, vocab=train_set.vocab, name='dev')

train vocab created!
dev vocab created!


In [98]:
# Loader settings: same batch size and worker count for both splits; only the
# training loader shuffles.
train_params = dict(batch_size=params.bs, shuffle=True, num_workers=2)
dev_params = dict(batch_size=params.bs, shuffle=False, num_workers=2)

trainloader = DataLoader(dataset=train_set, **train_params)
devloader = DataLoader(dataset=dev_set, **dev_params)

In [103]:
def train(epoch):
    """Run one optimisation pass over ``trainloader``.

    Relies on the notebook-level ``model``, ``optimizer``, ``loss_function``
    and ``trainloader``. Each batch is a dict with ``ids`` and ``targets``.

    Parameters
    ----------
    epoch : int
        Epoch number; used only to label the progress bar.

    Returns
    -------
    float
        Mean loss over the batches seen (0.0 if the loader is empty).
    """
    tr_loss, tr_steps = 0.0, 0

    model.train()

    # Wrap the loader itself (not enumerate) so tqdm can see its length.
    progress = tqdm(trainloader, desc=f'epoch {epoch}')
    for step, data in enumerate(progress):
        optimizer.zero_grad()
        ids = data['ids']
        output = model(ids).squeeze(-1)
        # The loss needs targets in the same dtype / on the same device as
        # the logits, whatever dtype the dataset emitted.
        tar = data['targets'].to(device=output.device, dtype=output.dtype)
        loss = loss_function(output, tar)
        tr_loss += loss.item()
        tr_steps += 1

        if step % 50 == 0:
            print(f'training loss per step : {tr_loss/ tr_steps}')

        loss.backward()
        optimizer.step()

    return tr_loss / tr_steps if tr_steps else 0.0

In [79]:
# NOTE(review): without parentheses this only displays the bound method's repr
# (which embeds the module summary); use list(m.parameters()) to inspect the
# actual parameter tensors.
m.parameters

<bound method Module.parameters of nermodel(
  (char_embedding): Embedding(805, 768)
  (rnn): GRU(768, 768, num_layers=2, batch_first=True)
  (fc): Linear(in_features=768, out_features=1, bias=True)
  (relu): ReLU()
)>

In [None]:
# Objective, model and optimiser used by train().
# (An earlier alternative for the loss was torch.nn.CrossEntropyLoss().)
loss_function = torch.nn.BCEWithLogitsLoss()
model = nermodel(params)
optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)

# Number of full passes over the training loader.
EPOCHS = 2
for e in range(EPOCHS):
    train(e)

0it [00:00, ?it/s]

training loss per step : 0.733324588171955


8it [00:13,  1.65s/it]