# 3.3.4. CNN  

In [1]:
# load data and take a quick look
import pandas as pd
df = pd.read_csv('clinic_c.csv')#cleaned dataset
df.head()

Unnamed: 0,Sent1,Sent2,Score
0,insulin nph human novolin unit suspension subc...,insulin nph human novolin unit suspension unit...,3.5
1,patient arrives ambulatory gait steady history...,complex assessment performed patient arrives a...,2.5
2,peripheral iv site established right forearm g...,peripheral iv site present prior arrival estab...,3.45
3,new confusion inability stay alert awake curre...,new confusion inability stay alert awake chest...,4.0
4,spent minute patient greater time wa spent cou...,nurse visit ten minute half wa spent counselin...,3.0


In [2]:
# convert ot list

sent1 = df['Sent1'].tolist()
sent2 = df['Sent2'].tolist()
lables = df['Score'].tolist()


In [3]:
# data split. 
# %store -r cleaned1
# %store -r cleaned2
# %store -r dtree1
# %store -r dtree2

train_text1 = sent1[:600]
train_text2 = sent2[:600]
train_lables =lables[:600]

dev_text1 = sent1[600:750]
dev_text2 = sent2[600:750]
dev_lables =lables[600:750]


test_text1 = sent1[750:]
test_text2 = sent2[750:]
test_lables =lables[750:]


In [4]:
#concatenating sentences

sentences_pair_tr =  [x1 +' '+ x2 for x1, x2 in zip(train_text1, train_text2)]
sentences_pair_tr[1] #training sentence pair
#len(sentences_pair_tr)

sentences_pair_dv = [x1 +' '+ x2 for x1, x2 in zip(dev_text1, dev_text2)]
sentences_pair_dv[1] #dev sentence pair 
#len(sentences_pair_dv)


sentences_pair_tt = [x1 +' '+ x2 for x1, x2 in zip(test_text1, test_text2)]
sentences_pair_tt[1] #test sentence pair
#len(sentences_pair_tt)


'negative gastrointestinal review system historian denies abdominal pain nausea vomiting negative ear nose throat review system historian denies otalgia sore throat stridor'

In [5]:
# load pre-trained Pubmed embeddings
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
path_of_downloaded_bin = "/Users/aswath/PycharmProjects/mfac038/IndividualProject/PubMed-and-PMC-w2v.bin"
word_vectors = KeyedVectors.load_word2vec_format(datapath(path_of_downloaded_bin), binary=True)


In [6]:
# define functions that build mini-batches
from nltk.tokenize import word_tokenize
import numpy as np

word_vec_dim =200 # make sure this number matches the embedding
oov_vec = oov_vec = np.random.rand(word_vec_dim)

def get_sent_word_vecs(word_vectors, sent_words, largest_len):
    vecs = []
    for ww in sent_words:
        if ww in word_vectors:
            vecs.append(word_vectors[ww])
        else:
            vecs.append(oov_vec)
    for i in range(largest_len-len(sent_words)):
        vecs.append([0.]*word_vec_dim)
    return np.array(np.transpose(vecs))

def build_mini_batch(sent_list, word_vectors):
    tokenized_sents = [word_tokenize(ss.lower()) for ss in sent_list]
    largest_len = np.max([len(tokens) for tokens in tokenized_sents])
    text_vecs = []
    for ts in tokenized_sents:
        vv = get_sent_word_vecs(word_vectors, ts, largest_len)
        text_vecs.append(vv)
    #print('mini batch shape',np.array(text_vecs))
    return np.array(text_vecs)
    


In [7]:
# define the CNN model

import numpy as np
import torch
import torch.nn as nn

class CNN_reg(nn.Module):
    def __init__(self, embd_dim, filter_size_list, filter_num_list, class_num, dp_rate=0.5, gpu=False):
        super(CNN_reg, self).__init__()
        self.embd_dim = embd_dim
        assert len(filter_size_list) == len(filter_num_list)
        self.output_dim = class_num
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(dp_rate)
        self.fc = nn.Linear(np.sum(filter_num_list), class_num)
        self.gpu = gpu
        self.convs = self.build_convs(filter_size_list, filter_num_list, gpu)
        if self.gpu:
            self.to('cuda')
            
    def build_convs(self, f_sizes, f_nums, gpu):
        convs = nn.ModuleList()
        for fs, fn in zip(f_sizes, f_nums):
            padding_size = fs-1
            m = nn.Conv1d(self.embd_dim, fn, fs, padding=padding_size)
            if gpu: m.to('cuda')
            convs.append(m)
        return convs
        
    def get_conv_output(self, input_matrix, conv, gpu):
        # step 1: compute convolution 
        assert input_matrix.shape[1] == self.embd_dim
        conv_output = conv(input_matrix)
        # step 2: pass through an activation function 
        conv_relu = self.tanh(conv_output)
        # step 3: max-over-time pooling
        maxp = nn.MaxPool1d(conv_relu.shape[2])
        maxp_output = maxp(conv_relu)
        return maxp_output
       
    def forward(self, all_text_vectors):
        cnn_repr = torch.tensor([])
        if self.gpu: cnn_repr = cnn_repr.to('cuda')
        for cv in self.convs:
            cv_output = self.get_conv_output(all_text_vectors, cv, self.gpu)
            cnn_repr = torch.cat((cnn_repr, cv_output), dim=1)
        # print(cnn_repr.shape)
        after_dp = self.dropout(cnn_repr.squeeze())
        #print('xxxx',after_dp.shape)
        logit = self.fc(after_dp)
        return logit

In [8]:
dropout_rate = 0.5 # dropout rate
filter_sizes = [2,3,4]
filter_nums = [100]*len(filter_sizes)

gpu = False
model = CNN_reg(word_vec_dim, filter_sizes, filter_nums,1, dropout_rate, gpu)
loss_fnc = torch.nn.MSELoss() # MSE loss

# hyper parameters
n_epochs = 10 # number of epoch 
batch_size = 32
lr = 0.001 # initial learning rate

# init optimizer and scheduler
import torch.optim as optim
optimizer = optim.Adam(params=model.parameters(), lr=lr) # use Adam as the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95) # after each epoch, the learning rate is discounted to its 98%

In [9]:
# let's first see its performance on the dev set
from sklearn.metrics import mean_squared_error

with torch.no_grad():
    model.eval()
    dev_predictions = []
    for idx in range(0,len(sentences_pair_dv),batch_size):
        x_data = build_mini_batch(sentences_pair_dv[idx:idx+batch_size], word_vectors)
        if x_data.shape[0] == 0: continue # to avoid empty batch
        #print(x_data.shape)
        x_tensor = torch.tensor(x_data, dtype=torch.float)
        #print(x_tensor.shape)
        y_pred = model(x_tensor).cpu().detach().numpy()
        #print(y_pred.shape)
        #print(y_pred)
        pred_labels = [entry for entry in y_pred]
        #print(pred_labels)
        dev_predictions += pred_labels
    d_mse = mean_squared_error(dev_lables, dev_predictions)
    print('\n MSE on dev set is ',d_mse)
    


 MSE on dev set is  10.283840240817177


In [12]:
# training the CNN model

a_best_mse = 1.30
best_model = None
import copy
import numpy as np
from sklearn.metrics import mean_squared_error
from nltk.tokenize import word_tokenize
from tqdm import tqdm

for epoch_i in tqdm(range(n_epochs)):
    # the inner loop is over the batches in the dataset
    model.train() # let pytorch know that gradients should be computed, so as to update the model
    ep_loss = []
    for idx in range(0,len(sentences_pair_tr),batch_size):
        # Step 0: Get the data
        x_data = build_mini_batch(sentences_pair_tr[idx:idx+batch_size], word_vectors)
        if x_data.shape[0] == 0: continue # to avoid empty batch
        y_target = torch.tensor([train_lables[idx:idx+batch_size]], dtype=torch.float32).squeeze()
        if gpu:
            y_target = y_target.to('cuda')
        
        # Step 1: Clear the gradients 
        optimizer.zero_grad()

        # Step 2: Compute the forward pass of the model
        x_tensor = torch.tensor(x_data, dtype=torch.float)
        if gpu:
            x_tensor = x_tensor.to('cuda')
        y_pred = model(x_tensor)
        pred_labels = [entry for entry in y_pred.cpu().detach().numpy()]
        # print('pred labels', pred_labels)
        # print('true labels', y_target)

        # Step 3: Compute the loss value that we wish to optimize
        loss = loss_fnc(y_pred, y_target)
        # print(loss)
        ep_loss.append(loss.cpu().detach().numpy())

        # Step 4: Propagate the loss signal backward
        loss.backward()

        # Step 5: Trigger the optimizer to perform one update
        optimizer.step()
    
    print('\n======epoch {} loss======'.format(epoch_i),np.mean(ep_loss))
    
    # after each epoch, we can test the model's performance on the dev set
    with torch.no_grad(): # let pytorch know that no gradient should be computed
        model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
        dev_predictions = []
        for idx in range(0,len(sentences_pair_tt),batch_size):
            x_data = build_mini_batch(sentences_pair_tt[idx:idx+batch_size], word_vectors)
            if x_data.shape[0] == 0: continue # to avoid empty batch
            x_tensor = torch.tensor(x_data, dtype=torch.float)
            if gpu:
                x_tensor = x_tensor.to('cuda')
            y_pred = model(x_tensor).cpu().detach().numpy()
            pred_labels = [entry for entry in y_pred]
            dev_predictions += pred_labels
            # print(pred_labels)
        mse = mean_squared_error(test_lables, dev_predictions)
        print('\n---> after epoch {} the mse on test set is {}'.format(epoch_i, mse))
        for param_group in optimizer.param_groups:
            print('learning rate', param_group['lr'])
        
        # save the best model
        if mse < a_best_mse:
            a_best_mse = mse
            best_model = copy.deepcopy(model.state_dict())
            print('best model updated; new best mse',mse)
    
    # (optional) adjust learning rate according to the scheduler
    scheduler.step()

  0%|          | 0/10 [00:00<?, ?it/s]




 10%|█         | 1/10 [00:02<00:23,  2.64s/it]


---> after epoch 0 the mse on test set is 1.307258729828282
learning rate 0.0005987369392383787



 20%|██        | 2/10 [00:05<00:20,  2.60s/it]


---> after epoch 1 the mse on test set is 1.29105153324419
learning rate 0.0005688000922764596
best model updated; new best mse 1.29105153324419



 30%|███       | 3/10 [00:07<00:18,  2.57s/it]


---> after epoch 2 the mse on test set is 1.2955990274585925
learning rate 0.0005403600876626366



 40%|████      | 4/10 [00:10<00:15,  2.55s/it]


---> after epoch 3 the mse on test set is 1.2952210727257094
learning rate 0.0005133420832795048



 50%|█████     | 5/10 [00:12<00:12,  2.55s/it]


---> after epoch 4 the mse on test set is 1.3039320666424896
learning rate 0.00048767497911552955



 60%|██████    | 6/10 [00:15<00:10,  2.56s/it]


---> after epoch 5 the mse on test set is 1.3030044165502765
learning rate 0.000463291230159753



 70%|███████   | 7/10 [00:17<00:07,  2.54s/it]


---> after epoch 6 the mse on test set is 1.3041544198802868
learning rate 0.00044012666865176535



 80%|████████  | 8/10 [00:20<00:05,  2.55s/it]


---> after epoch 7 the mse on test set is 1.3004125307607617
learning rate 0.0004181203352191771



 90%|█████████ | 9/10 [00:22<00:02,  2.54s/it]


---> after epoch 8 the mse on test set is 1.30141225729155
learning rate 0.0003972143184582182



100%|██████████| 10/10 [00:25<00:00,  2.54s/it]


---> after epoch 9 the mse on test set is 1.301798128139949
learning rate 0.00037735360253530727





In [13]:
print("the best MSE is ----> ", a_best_mse)

the best MSE is ---->  1.29105153324419


##### Cosine Similarity 

In [17]:
# sanity check 
vc=[]
for i in range(0,750):
    m = build_mini_batch(sent1[:750][i], word_vectors)
    vc.append(m)
print(vc)

vc2=[]
for i in range(0,318):
    m = build_mini_batch(sent2[:750][i], word_vectors)
    vc2.append(m)
#print(vc2)



vc1=[]
for i in range(0,318):
    m = build_mini_batch(sent1[750:][i], word_vectors)
    vc1.append(m)
#print(vc1)

vc3=[]
for i in range(0,318):
    m = build_mini_batch(sent2[750:][i], word_vectors)
    vc3.append(m)
#print(vc3)

[array([[[ 0.08174162],
        [ 0.28768539],
        [ 0.13625345],
        ...,
        [ 0.06149415],
        [-0.15567118],
        [-0.0334017 ]],

       [[ 0.21948846],
        [ 0.20526265],
        [-0.00687633],
        ...,
        [ 0.11510338],
        [-0.4214884 ],
        [-0.00622143]],

       [[-0.18975188],
        [ 0.1358013 ],
        [ 0.17584071],
        ...,
        [-0.16897507],
        [-0.02902463],
        [-0.1564308 ]],

       ...,

       [[ 0.19613242],
        [ 0.15864876],
        [ 0.0175749 ],
        ...,
        [-0.07291438],
        [-0.08997208],
        [-0.06185786]],

       [[ 0.34642762],
        [ 0.04969971],
        [ 0.07817987],
        ...,
        [-0.23744947],
        [-0.16245706],
        [ 0.12942381]],

       [[-0.03321759],
        [-0.15070027],
        [ 0.06131196],
        ...,
        [-0.32916379],
        [-0.08992583],
        [ 0.17061436]]]), array([[[-0.10638105],
        [-0.03233418],
        [ 0.37419999]

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

cc=[]
for i in range(0,750):
    c= cosine_similarity(vc1[i], vc3[i])
    cc.append(c)
#print(cc)
#len(cc)

In [20]:
%store a_best_mse
%store d_mse

Stored 'a_best_mse' (float64)
Stored 'd_mse' (float64)
