# 3.3.5. LSTM

In [1]:
# load data and take a quick look
import pandas as pd
df = pd.read_csv('clinic_c.csv')#cleaned dataset
df.head()

Unnamed: 0,Sent1,Sent2,Score
0,insulin nph human novolin unit suspension subc...,insulin nph human novolin unit suspension unit...,3.5
1,patient arrives ambulatory gait steady history...,complex assessment performed patient arrives a...,2.5
2,peripheral iv site established right forearm g...,peripheral iv site present prior arrival estab...,3.45
3,new confusion inability stay alert awake curre...,new confusion inability stay alert awake chest...,4.0
4,spent minute patient greater time wa spent cou...,nurse visit ten minute half wa spent counselin...,3.0


In [2]:
# convert to list

sent1 = df['Sent1'].tolist()
sent2 = df['Sent2'].tolist()
lables = df['Score'].tolist()

In [3]:
# data split. 
# %store -r cleaned1
# %store -r cleaned2
# %store -r dtree1
# %store -r dtree2

train_text1 = sent1[:600]
train_text2 = sent2[:600]
train_lables =lables[:600]

dev_text1 = sent1[600:750]
dev_text2 = sent2[600:750]
dev_lables =lables[600:750]


test_text1 = sent1[750:]
test_text2 = sent2[750:]
test_lables =lables[750:]



In [4]:
#concatenating sentences

sentences_pair_tr =  [x1 +' '+ x2 for x1, x2 in zip(train_text1, train_text2)]
sentences_pair_tr[1]
#len(sentences_pair_tr)

sentences_pair_dv = [x1 +' '+ x2 for x1, x2 in zip(dev_text1, dev_text2)]
sentences_pair_dv[1]
#len(sentences_pair_dv)


sentences_pair_tt = [x1 +' '+ x2 for x1, x2 in zip(test_text1, test_text2)]
sentences_pair_tt[1]
#len(sentences_pair_tt)



'negative gastrointestinal review system historian denies abdominal pain nausea vomiting negative ear nose throat review system historian denies otalgia sore throat stridor'

In [5]:
# load pre-trained Pubmed embeddings
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
path_of_downloaded_bin = "/Users/aswath/PycharmProjects/mfac038/IndividualProject/PubMed-and-PMC-w2v.bin"
word_vectors = KeyedVectors.load_word2vec_format(datapath(path_of_downloaded_bin), binary=True)


In [6]:
# then we define the RNN-based Regressor
import torch
import torch.nn as nn

class RNN_Regressor(nn.Module):
    def __init__(self, embd_dim, hidden_dim, model_type, cls_num, pooler_type, dropout, gpu):
        super(RNN_Regressor, self).__init__()
        assert model_type in ['rnn','lstm','bilstm','gru']
        assert pooler_type in ['max','avg']
        # rnn type
        if model_type == 'rnn':
            self.rnn = nn.RNN(hidden_size=hidden_dim, batch_first=True, input_size=embd_dim, dropout=dropout)
        elif model_type == 'lstm':
            self.rnn = nn.LSTM(hidden_size=hidden_dim, batch_first=True, input_size=embd_dim, dropout=dropout)
        elif model_type == 'bilstm':
            self.rnn = nn.LSTM(hidden_size=hidden_dim, batch_first=True, input_size=embd_dim, bidirectional=True, dropout=dropout)
        else: # model_type == 'gru'
            self.rnn = nn.GRU(hidden_size=hidden_dim, batch_first=True, input_size=embd_dim, dropout=dropout)
        # map from rnn output to logits
        if model_type == 'bilstm':
            self.fc = nn.Linear(2*hidden_dim, cls_num)
        else:
            self.fc = nn.Linear(hidden_dim, cls_num)
        # pooler type
        self.pooler_type = pooler_type
        # gpu or not
        self.gpu = gpu
        if gpu: self.to('cuda')
            
    def forward(self, input_matrix):
        token_num = input_matrix.shape[1]
        hidden_vecs = self.rnn(input_matrix)[0]
        if self.pooler_type == 'max':
            pooler = nn.MaxPool1d(token_num)
        else: 
            pooler = nn.AvgPool1d(token_num)
        if self.gpu: pooler.to('cuda')
        pooled_hidden = pooler(torch.transpose(hidden_vecs,1,2)).squeeze()
        return self.fc(pooled_hidden)

In [10]:
# define functions that build mini-batches
from nltk.tokenize import word_tokenize
import numpy as np

embd_dim = 200
hidden_dim = 200
rnn_type = 'bilstm'
pooler_type = 'avg'
dropout = 1
gpu = False

oov_vec = oov_vec = np.random.rand(embd_dim)

def get_sent_word_vecs(word_vectors, sent_words, largest_len):
    vecs = []
    for ww in sent_words:
        if ww in word_vectors:
            vecs.append(word_vectors[ww])
        else:
            vecs.append(oov_vec)
    return np.array(vecs)

def build_mini_batch(sent_list, word_vectors):
    tokenized_sents = [word_tokenize(ss.lower()) for ss in sent_list]
    largest_len = np.max([len(tokens) for tokens in tokenized_sents])
    text_vecs = []
    for ts in tokenized_sents:
        vv = get_sent_word_vecs(word_vectors, ts, largest_len)
        text_vecs.append(vv)
    # print('mini batch shape',np.array(text_vecs).shape)
    return np.array(text_vecs)

def make_batch_prediction(sent_list, word_vectors, model, use_gpu=False):
    batch = build_mini_batch(sent_list, word_vectors)
    batch_logits = torch.tensor([])
    if use_gpu: batch_logits = batch_logits.to('cuda')
    for i in range(batch.shape[0]):
        input_sents = torch.from_numpy(batch[i]).float()
        if use_gpu: input_sents = input_sents.to('cuda')
        logits = model(input_sents.unsqueeze(0))
        batch_logits = torch.cat( (batch_logits, logits) )
    return batch_logits.view(batch.shape[0],-1)
  
# sanity check 
vc=[]
for i in range(0,600):
    m = build_mini_batch([sentences_pair_tr[i]],word_vectors)
    vc.append(m)
print(vc)

[array([[[-0.00273595,  0.43090543,  0.30492294, ..., -0.10342743,
         -0.32617137,  0.14303872],
        [ 0.11711555, -0.25026593, -0.01221739, ...,  0.16170172,
         -0.10028663,  0.0927908 ],
        [ 0.04796707, -0.01414114,  0.17470697, ...,  0.12878786,
         -0.02805804, -0.23365951],
        ...,
        [-0.14847782,  0.29313922,  0.04621191, ...,  0.09179814,
          0.05813854,  0.11894796],
        [ 0.29317576, -0.06439125,  0.08923125, ...,  0.14544795,
         -0.03383052, -0.35494852],
        [-0.40184683,  0.11974438,  0.19560894, ...,  0.20655231,
         -0.34781277, -0.3281492 ]]]), array([[[-0.29100496,  0.03303194,  0.26286083, ...,  0.27469832,
          0.05622837, -0.21587971],
        [-0.16989006, -0.10291775,  0.12711562, ...,  0.08290271,
          0.40714577, -0.36411515],
        [-0.87558216, -0.03858835,  0.2959018 , ..., -0.08977405,
          0.19316487, -0.4915945 ],
        ...,
        [-0.09387331,  0.2521318 ,  0.06766402, ...,

In [13]:
loss_fnc = torch.nn.MSELoss() # mse loss
model = RNN_Regressor(embd_dim, hidden_dim, rnn_type,1, pooler_type, dropout, gpu)

# hyper parameters
n_epochs = 10 # number of epoch (i.e. number of iterations)
batch_size = 32
lr = 0.001 # initial learning rate

# init optimizer and scheduler (lr adjustor)
import torch.optim as optim
optimizer = optim.Adam(params=model.parameters(), lr=lr) # use Adam as the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.999) # after each epoch, the learning rate is discounted to its 95%

In [16]:
# training the LSTM model

best_mse = 1.285
best_model = None
import copy
import numpy as np
from sklearn.metrics import mean_squared_error
from nltk.tokenize import word_tokenize
from tqdm import tqdm

for epoch_i in tqdm(range(n_epochs)):
    # the inner loop is over the batches in the dataset
    model.train() # let pytorch know that gradients should be computed, so as to update the model
    ep_loss = []
    for idx in range(0,len(sentences_pair_tr),batch_size):
        # Step 0: Get the data
        sents = sentences_pair_tr[idx:idx+batch_size]
        if len(sents) == 0: break
        y_target = torch.tensor([train_lables[idx:idx+batch_size]], dtype=torch.float32).squeeze()
        if gpu:
            y_target = y_target.to('cuda')
        
        # Step 1: Clear the gradients 
        optimizer.zero_grad()

        # Step 2: Compute the forward pass of the model
        y_pred = make_batch_prediction(sents, word_vectors, model, gpu)
        pred_labels = [entry for entry in y_pred.cpu().detach().numpy()]
        #print('pred labels', pred_labels)
        #print('true labels', y_target)

        # Step 3: Compute the loss value that we wish to optimize
        loss = loss_fnc(y_pred, y_target)
        #print(loss)
        ep_loss.append(loss.cpu().detach().numpy())

        # Step 4: Propagate the loss signal backward
        loss.backward()
        
        # Step 4+: clip the gradient, to avoid gradient explosion
        nn.utils.clip_grad_value_(model.parameters(), clip_value=3.)

        # Step 5: Trigger the optimizer to perform one update
        optimizer.step()
    
    print('\n======epoch {} loss======'.format(epoch_i),np.mean(ep_loss))
    
    # after each epoch, we can test the model's performance on the dev set
    with torch.no_grad(): # let pytorch know that no gradient should be computed
        model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
        predictions = []
        test_docs = sentences_pair_tt
        test_labels = test_lables
        
        for idx in range(0,len(sentences_pair_tt),batch_size):
            y_pred = make_batch_prediction(
                sentences_pair_tt[idx:idx+batch_size], word_vectors, model, gpu)
            pred_labels = [entry for entry in y_pred.cpu().detach().numpy()]
            predictions += pred_labels
        mse = mean_squared_error(test_labels, predictions)

        print('\n---> after epoch {} the mse on test set is {}'.format(epoch_i, mse))
        for param_group in optimizer.param_groups:
            print('learning rate', param_group['lr'])
        
        # save the best model
        if mse < best_mse:
            best_mse = mse
            best_model = copy.deepcopy(model.state_dict())
            print('best model updated; new best mse',mse)
    
    # (optional) adjust learning rate according to the scheduler
    scheduler.step()

  0%|          | 0/10 [00:00<?, ?it/s]




 10%|█         | 1/10 [00:45<06:53, 45.97s/it]


---> after epoch 0 the mse on test set is 1.2521612410172118
learning rate 0.0009900448802097482
best model updated; new best mse 1.2521612410172118



 20%|██        | 2/10 [01:16<05:31, 41.39s/it]


---> after epoch 1 the mse on test set is 1.252472011157654
learning rate 0.0009890548353295385



 30%|███       | 3/10 [02:01<04:57, 42.50s/it]


---> after epoch 2 the mse on test set is 1.2528238760743862
learning rate 0.000988065780494209



 40%|████      | 4/10 [02:40<04:08, 41.34s/it]


---> after epoch 3 the mse on test set is 1.253208102852632
learning rate 0.0009870777147137147



 50%|█████     | 5/10 [03:09<03:09, 37.81s/it]


---> after epoch 4 the mse on test set is 1.2536173873876246
learning rate 0.000986090636999001



 60%|██████    | 6/10 [03:38<02:19, 34.95s/it]


---> after epoch 5 the mse on test set is 1.2540445495555232
learning rate 0.000985104546362002



 70%|███████   | 7/10 [04:10<01:41, 33.99s/it]


---> after epoch 6 the mse on test set is 1.254482415071221
learning rate 0.00098411944181564



 80%|████████  | 8/10 [04:38<01:04, 32.25s/it]


---> after epoch 7 the mse on test set is 1.2549243535408716
learning rate 0.0009831353223738245



 90%|█████████ | 9/10 [05:06<00:30, 30.97s/it]


---> after epoch 8 the mse on test set is 1.255364590906488
learning rate 0.0009821521870514505



100%|██████████| 10/10 [05:34<00:00, 33.43s/it]


---> after epoch 9 the mse on test set is 1.255798509803564
learning rate 0.000981170034864399





In [17]:
print("the best MSE is ----> ", best_mse)

the best MSE is ---->  1.2521612410172118


In [18]:
#storing variables
%store best_mse


Stored 'best_mse' (float64)
