#Start

In [None]:
# useful libraries
import numpy as np
import pandas as pd
import nltk 
from nltk.corpus import stopwords                   
from nltk.stem import PorterStemmer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer       
from sklearn.utils import shuffle   
from gensim.models import Word2Vec
import re
import matplotlib.pyplot as plt
import random


In [None]:
from google.colab import drive
# Mount Google Drive so the pickled inputs read by later cells are reachable.
# NOTE(review): later cells use the relative path './gdrive/MyDrive/...', which only
# resolves to this mount point when the working directory is /content — confirm.
drive.mount('/content/gdrive/')

In [None]:
from google.colab import files

# Load preprocessed data

In [None]:
# Install the pickle5 package that the loading cells below import under the name `pickle`.
# NOTE(review): the version is not pinned; presumably only needed on older Python runtimes — confirm.
!pip3 install pickle5

In [None]:
# Load the embeddings stored in 'emb_by_essay_id.pickle'.
path='./gdrive/MyDrive/nlp_data/'

# Number of features per word in this file (used as the models' input_size).
emb_size=302

# pickle5 is a backport that may not be installable on the current runtime;
# the standard-library pickle is used when it is missing.
try:
    import pickle5 as pickle
except ImportError:
    import pickle

# NOTE: unpickling executes code embedded in the file; only load trusted files.
with open(path+'emb_by_essay_id.pickle', 'rb') as f:
    # A dict with one entry per essay; each word is a 302-dimensional vector whose
    # last two components are a category and a "word is misspelled" indicator.
    data = pickle.load(f)

In [None]:
# Load the embeddings stored in 'emb_by_essay_id_no_correction.pickle'.
# Running this cell replaces `data` and `emb_size` set by any earlier loading cell.
path='./gdrive/MyDrive/nlp_data/'

# Number of features per word in this file (used as the models' input_size).
emb_size=301

# pickle5 is a backport that may not be installable on the current runtime;
# the standard-library pickle is used when it is missing.
try:
    import pickle5 as pickle
except ImportError:
    import pickle

# NOTE: unpickling executes code embedded in the file; only load trusted files.
with open(path+'emb_by_essay_id_no_correction.pickle', 'rb') as f:
    # A dict with one entry per essay; each word is a 301-dimensional vector.
    # NOTE(review): presumably the misspelling indicator is absent in this variant — confirm.
    data = pickle.load(f)

In [None]:
# Convert the essay dictionary into a list of per-essay matrices and report
# the shortest / longest essay length (in words).
lens=[]
for essay_id in data.keys():
    essay_words=data[essay_id]
    lens.append(len(essay_words))
    # Reshape every word vector to a single row, in place, so rows can be stacked below.
    for j, word_vec in enumerate(essay_words):
        essay_words[j]=word_vec.reshape(1,-1)

# Running minimum / maximum, starting from the same sentinels (10000 and 0).
mini=10000
maxi=0
for n_words in lens:
    mini=n_words if mini>n_words else mini
    maxi=n_words if maxi<n_words else maxi

# Every essay becomes a (number of words) x (embedding size) matrix.
data_list=[np.concatenate(data[essay_id],axis=0) for essay_id in data.keys()]

print(mini)
print(maxi)


In [None]:
# Partition the essays into train / test using stored test ids, for one score category.
chosen_score=1

with open('scores.pickle', 'rb') as f:
    scores = pickle.load(f)

with open('ids_for_test.pickle', 'rb') as f:
    ids_essays_test = pickle.load(f)

ids_all=np.asarray(list(data.keys()))
print(ids_all.shape)

x_train, scores_train, y_train_reg = [], [], []
x_test, scores_test, y_test_reg = [], [], []

# NOTE(review): `scores` is indexed by position in data.keys() order — confirm the rows align.
for pos, essay_id in enumerate(ids_all):
    if essay_id in ids_essays_test[chosen_score]:
        features, class_labels, total_scores = x_test, scores_test, y_test_reg
    else:
        features, class_labels, total_scores = x_train, scores_train, y_train_reg
    features.append(data_list[pos])
    class_labels.append(scores[pos,chosen_score])   # label of the chosen category
    total_scores.append(np.sum(scores[pos,:]))      # sum over all categories

y_train=np.asarray(scores_train)
y_test=np.asarray(scores_test)

y_train_reg=np.asarray(y_train_reg).astype('float')
y_test_reg=np.asarray(y_test_reg).astype('float')

print(y_train.shape)
print(len(x_train))
print(y_test.shape)
print(len(x_test))
print(y_train_reg.shape)

# Models

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
import torch.optim as optim

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seeding is left disabled here, so no torch seed is fixed by this cell.
#torch.manual_seed(1)

In [None]:
import numpy as np
from random import shuffle
from torch.utils.data import Sampler
import torch
import math

def collate(examples):
    """Merge a list of (sequence, length, label) examples into one padded batch.

    Returns a tuple of:
      * the sequences zero-padded to the longest one, batch dimension first,
      * a numpy array with the length entry of every example,
      * a numpy array with the label entry of every example.
    """
    sequences, lengths, targets = [], [], []
    for example in examples:
        sequences.append(torch.tensor(example[0]))
        lengths.append(example[1])
        targets.append(example[2])

    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, np.asarray(lengths), np.asarray(targets)

class BySequenceLengthSampler(Sampler):
    """Batch sampler that only batches together examples of similar length.

    Every example of ``data_source`` must be indexable as
    ``(sequence, length, label)``. Examples are assigned to length buckets
    delimited by ``bucket_boundaries``; each bucket is shuffled and cut into
    batches of ``batch_size`` indices, and the batches of all buckets are then
    shuffled together. Intended to be passed as ``batch_sampler`` to a
    ``DataLoader``.

    Args:
        data_source: iterable of (sequence, length, label) examples.
        bucket_boundaries: increasing list of lengths separating the buckets.
        batch_size: maximum number of indices per yielded batch.
        drop_last: drop the final, smaller batch of every bucket when True.
    """

    def __init__(self, data_source,
                bucket_boundaries, batch_size=64, drop_last=True):

        ind_n_len = []  # (index, length) of every example
        data = []       # (sequence, label) of every example
        for i, p in enumerate(data_source):
            ind_n_len.append((i, p[1]))
            data.append((p[0], p[2]))

        self.data_source = data
        self.ind_n_len = ind_n_len
        self.bucket_boundaries = bucket_boundaries
        self.batch_size = batch_size
        self.drop_last = drop_last

        if self.drop_last:
            print("WARNING: drop_last=True, dropping last non batch-size batch in every bucket ... ")

        # Bucket k covers lengths in [buckets_min[k], buckets_max[k]).
        self.boundaries = list(self.bucket_boundaries)
        self.buckets_min = torch.tensor([np.iinfo(np.int32).min] + self.boundaries)
        self.buckets_max = torch.tensor(self.boundaries + [np.iinfo(np.int32).max])
        self.boundaries = torch.tensor(self.boundaries)

    def shuffle_tensor(self, t):
        """Return the elements of ``t`` in a random order."""
        return t[torch.randperm(len(t))]

    def _bucket_members(self):
        """Map every non-empty bucket id to the list of example indices it holds."""
        data_buckets = dict()
        for p, seq_len in self.ind_n_len:
            pid = self.element_to_bucket_id(p, seq_len)
            data_buckets.setdefault(pid, []).append(p)
        return data_buckets

    def __iter__(self):
        """Yield batches (lists of example indices) whose members share a bucket."""
        iter_list = []
        for members in self._bucket_members().values():
            t = self.shuffle_tensor(torch.tensor(members))  # shuffle inside the bucket
            batch = torch.split(t, self.batch_size, dim=0)  # cut the bucket into batches

            if self.drop_last and len(batch[-1]) != self.batch_size:
                batch = batch[:-1]

            iter_list += batch

        shuffle(iter_list)  # mix batches so they are not ordered by bucket
        for i in iter_list:
            yield i.numpy().tolist()

    def __len__(self):
        """Number of batches one pass of ``__iter__`` yields.

        A batch sampler's length is counted in batches, not in examples;
        ``DataLoader`` reports it as ``len(loader)``.
        """
        total = 0
        for members in self._bucket_members().values():
            if self.drop_last:
                total += len(members) // self.batch_size
            else:
                total += -(-len(members) // self.batch_size)  # ceiling division
        return total

    def element_to_bucket_id(self, x, seq_length):
        """Return the id of the bucket whose length range contains ``seq_length``.

        ``x`` (the example index) is accepted for interface compatibility and not used.
        """
        seq_length = int(seq_length)  # accept numpy scalars as well as Python ints
        valid_buckets = (seq_length >= self.buckets_min)*(seq_length < self.buckets_max)
        bucket_id = valid_buckets.nonzero()[0].item()

        return bucket_id

## Attention

In [None]:
class Attention(nn.Module):
    """Attention over RNN outputs, using the final hidden state as the query.

    Each time step is scored by the dot product between a linear projection
    of its RNN output and the final hidden state; the softmax-weighted sum of
    the RNN outputs (the context) is concatenated with the final hidden state
    and passed through a tanh-activated linear layer.

    NOTE(review): the scores are not masked, so zero-padded time steps also
    receive attention weight — confirm this is intended.
    """

    def __init__(self, device,hidden_size):
        super().__init__()
        self.device = device
        self.hidden_size = hidden_size

        self.concat_linear = nn.Linear(2 * hidden_size, hidden_size)
        self.attn = nn.Linear(hidden_size, hidden_size)

        # Keep this module on the same device as the RNN that feeds it.
        self.to(device)

    def forward(self, rnn_outputs, final_hidden_state):
        """Return ``(attn_hidden, attn_weights)``.

        ``rnn_outputs`` is (batch, seq_len, hidden) and ``final_hidden_state``
        is (batch, hidden); the weights come back as (batch, seq_len).
        """
        query = final_hidden_state.unsqueeze(2)                       # (batch, hidden, 1)
        scores = torch.bmm(self.attn(rnn_outputs), query).squeeze(2)  # (batch, seq_len)
        attn_weights = F.softmax(scores, dim=1)

        # Weighted sum of the RNN outputs over time.
        context = torch.bmm(rnn_outputs.transpose(1, 2), attn_weights.unsqueeze(2)).squeeze(2)
        merged = torch.cat((context, final_hidden_state), dim=1)
        attn_hidden = torch.tanh(self.concat_linear(merged))
        return attn_hidden, attn_weights

## CNN-BiLSTM

In [None]:
from torch.utils.data import Dataset
class MyDataset(Dataset):
    """Dataset of variable-length sequences with their lengths and labels.

    ``x`` is any indexable collection of sequences; ``lens`` and ``y`` must
    expose ``shape[0]`` equal to ``len(x)``.
    """

    def __init__(self, x, lens, y):
        super().__init__()
        n_examples = len(x)
        assert n_examples == y.shape[0] # assuming shape[0] = dataset size
        assert n_examples == lens.shape[0]
        self.x, self.y, self.lens = x, y, lens

    def __len__(self):
        """Number of examples, taken from the label array."""
        return self.y.shape[0]

    def __getitem__(self, index):
        """Return the ``(sequence, length, label)`` triple stored at ``index``."""
        example = (self.x[index], self.lens[index], self.y[index])
        return example


def masked_mean(tensor, lens, axis):
    """Average ``tensor`` along ``axis`` while ignoring padded time steps.

    Args:
        tensor: (batch, seq_len, features) tensor whose entries beyond each
            sequence's length are padding.
        lens: per-example valid lengths (list, numpy array or tensor), one per batch row.
        axis: dimension to reduce; callers in this file pass 1 (time).

    Returns:
        Sum of the unmasked entries divided by the number of unmasked positions.
    """
    # Build the mask on the tensor's own device instead of a global `device`,
    # so the function also works when the two differ.
    lengths = torch.as_tensor(lens, dtype=torch.float, device=tensor.device)
    positions = torch.arange(tensor.shape[1], device=tensor.device)[None, :]

    # 1 where the time step is inside the sequence, 0 where it is padding.
    mask = (positions < lengths[:, None]).unsqueeze(2).to(tensor.dtype)

    masked = torch.mul(tensor, mask)
    return masked.sum(axis) / mask.sum(axis)


In [None]:
class CNNLSTM(nn.Module):
    """CNN-BiLSTM classifier for padded batches of word-embedding sequences.

    Two parallel convolution branches (kernel sizes 3 and 5) are concatenated
    per time step, fed to a 2-layer bidirectional LSTM, averaged over the
    valid time steps and classified by a small MLP.

    Args:
        input_size: number of features per word (channels of the first convolutions).
        num_class: number of output logits.
    """

    def __init__(self, input_size, num_class):
        super().__init__()

        # Kernel-3 branch: cnn -> pool -> cnn2. "Same" padding keeps the sequence length.
        self.cnn = nn.Conv1d(in_channels=input_size, out_channels=150, kernel_size=3,
                             stride=1, padding=1, padding_mode='zeros')
        self.cnn2 = nn.Conv1d(in_channels=75, out_channels=150, kernel_size=3,
                              stride=1, padding=1, padding_mode='zeros')

        # Kernel-5 branch: cnn1 -> pool -> cnn3.
        self.cnn1 = nn.Conv1d(in_channels=input_size, out_channels=150, kernel_size=5,
                              stride=1, padding=2, padding_mode='zeros')
        self.cnn3 = nn.Conv1d(in_channels=75, out_channels=150, kernel_size=5,
                              stride=1, padding=2, padding_mode='zeros')

        # Applied after a transpose, so it halves the channel axis (150 -> 75), not time.
        self.max_pool1 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Input is the concatenation of both branches (150 + 150 features).
        self.lstm = nn.LSTM(input_size=300, hidden_size=64, batch_first=True,
                            num_layers=2, dropout=0.4, bidirectional=True)

        self.linear1 = nn.Linear(128, 200)
        self.linear2 = nn.Linear(200, 100)
        self.linear_end = nn.Linear(100, num_class)

    def _conv_branch(self, first_conv, second_conv, input_seq):
        """Run conv -> channel pooling -> conv and return (batch, seq_len, 150)."""
        features = F.leaky_relu(first_conv(input_seq))
        pooled = self.max_pool1(features.transpose(1, 2)).transpose(1, 2)
        return F.leaky_relu(second_conv(pooled)).transpose(1, 2)

    def forward(self, input_seq, lens):
        """Return logits of shape (batch, num_class).

        ``input_seq`` is (batch, input_size, seq_len); ``lens`` holds the
        unpadded length of every sequence in the batch.
        """
        branch_k3 = self._conv_branch(self.cnn, self.cnn2, input_seq)
        branch_k5 = self._conv_branch(self.cnn1, self.cnn3, input_seq)
        cnn_concat = torch.cat((branch_k3, branch_k5), 2)

        # Packing makes the LSTM skip the padded time steps.
        packed_seq = pack_padded_sequence(cnn_concat, lens, batch_first=True, enforce_sorted=False)
        lstm_out, self.hidden_cell = self.lstm(packed_seq)
        result_unpacked, _ = pad_packed_sequence(lstm_out, batch_first=True)

        # Average the LSTM outputs over the valid time steps only.
        mean_over_time = masked_mean(result_unpacked, lens, 1)

        hidden = F.relu(self.linear1(mean_over_time))
        hidden = F.relu(self.linear2(hidden))
        return self.linear_end(hidden)


## A-CNN-BiLSTM

In [None]:
class ACNNLSTM(nn.Module):
    """CNN-BiLSTM classifier with an attention layer over the LSTM outputs.

    Two parallel convolution branches (kernel sizes 3 and 5) are concatenated
    per time step and fed to a 2-layer bidirectional LSTM. The last layer's
    final hidden states (both directions) query an attention layer over the
    LSTM outputs, and the attended vector is classified by a small MLP.

    Args:
        input_size: number of features per word (channels of the first convolutions).
        num_class: number of output logits.
    """

    def __init__(self, input_size, num_class):
        super().__init__()

        # Kernel-3 branch: cnn -> pool -> cnn2. "Same" padding keeps the sequence length.
        self.cnn = nn.Conv1d(in_channels=input_size, out_channels=150, kernel_size=3,
                             stride=1, padding=1, padding_mode='zeros')
        self.cnn2 = nn.Conv1d(in_channels=75, out_channels=150, kernel_size=3,
                              stride=1, padding=1, padding_mode='zeros')

        # Kernel-5 branch: cnn1 -> pool -> cnn3.
        self.cnn1 = nn.Conv1d(in_channels=input_size, out_channels=150, kernel_size=5,
                              stride=1, padding=2, padding_mode='zeros')
        self.cnn3 = nn.Conv1d(in_channels=75, out_channels=150, kernel_size=5,
                              stride=1, padding=2, padding_mode='zeros')

        # Applied after a transpose, so it halves the channel axis (150 -> 75), not time.
        self.max_pool1 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Input is the concatenation of both branches (150 + 150 features).
        self.lstm = nn.LSTM(input_size=300, hidden_size=64, batch_first=True,
                            num_layers=2, dropout=0.4, bidirectional=True)

        self.linear1 = nn.Linear(128, 200)
        self.linear2 = nn.Linear(200, 100)

        # Attention over the 128-dimensional (2 x 64) BiLSTM outputs.
        self.attention = Attention(device,128)

        self.linear_end = nn.Linear(100, num_class)

    def _conv_branch(self, first_conv, second_conv, input_seq):
        """Run conv -> channel pooling -> conv and return (batch, seq_len, 150)."""
        features = F.leaky_relu(first_conv(input_seq))
        pooled = self.max_pool1(features.transpose(1, 2)).transpose(1, 2)
        return F.leaky_relu(second_conv(pooled)).transpose(1, 2)

    def forward(self, input_seq, lens):
        """Return logits of shape (batch, num_class).

        ``input_seq`` is (batch, input_size, seq_len); ``lens`` holds the
        unpadded length of every sequence in the batch.
        """
        branch_k3 = self._conv_branch(self.cnn, self.cnn2, input_seq)
        branch_k5 = self._conv_branch(self.cnn1, self.cnn3, input_seq)
        cnn_concat = torch.cat((branch_k3, branch_k5), 2)

        # Packing makes the LSTM skip the padded time steps.
        packed_seq = pack_padded_sequence(cnn_concat, lens, batch_first=True, enforce_sorted=False)
        lstm_out, self.hidden_cell = self.lstm(packed_seq)
        result_unpacked, _ = pad_packed_sequence(lstm_out, batch_first=True)

        # h_n is (layers * directions, batch, 64): regroup it as
        # (layers, directions, batch, 64) and keep the last layer.
        batch_size = result_unpacked.shape[0]
        last_layer = self.hidden_cell[0].view(2, 2, batch_size, 64)[-1]
        # Join the two directions' final states into one 128-dimensional query.
        final_hidden_state = torch.cat((last_layer[0], last_layer[1]), 1)

        att_out, attention_weights = self.attention(result_unpacked, final_hidden_state)

        hidden = F.relu(self.linear1(att_out))
        hidden = F.relu(self.linear2(hidden))
        return self.linear_end(hidden)


# Debug model

In [None]:
# Smoke test: push a tiny all-ones batch (4 essays, 3 words, 302 features)
# through a freshly built ACNNLSTM and print the output shape.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_mat = np.ones((4,3,302))
lengths = np.array([3., 3., 1., 2.])  # valid length of each of the 4 sequences

# The model expects (batch, features, seq_len), hence the transpose.
input_tens = torch.tensor(input_mat, dtype = torch.float).transpose(1,2).to(device)

model = ACNNLSTM(302,4).to(device)
result = model(input_tens,lengths)
print(result.shape)




# Split function

In [None]:
# Random split to get indexes of test set 10% for diff scores

def train_test_split(labels, test_perc=0.1):
    """Stratified random split of example indices into train and test parts.

    For every distinct label, ``int(count * test_perc)`` randomly chosen
    examples go to the test part and the rest to the train part.

    Args:
        labels: 1-D sequence (list or array) with the class label of every example.
        test_perc: fraction of every class assigned to the test part.

    Returns:
        ``(train_ind, test_ind, class_lens_train, num_classes)`` where the first
        two are index arrays into ``labels``, ``class_lens_train`` holds the
        number of training examples per class in ascending label order, and
        ``num_classes`` is the number of distinct labels.
    """
    # Accept plain lists too: `list == scalar` would not compare element-wise.
    labels = np.asarray(labels)

    # np.unique returns the labels sorted, so class_lens_train has a
    # deterministic order (set iteration order is not guaranteed to be sorted).
    all_labels = np.unique(labels)

    if all_labels.size == 0:
        empty = np.asarray([], dtype=int)
        return empty, empty.copy(), empty.copy(), 0

    test_ind = []
    train_ind = []
    class_lens_train = []
    for l in all_labels:
        class_idx = np.where(labels == l)[0]
        test_size = int(len(class_idx) * test_perc)

        np.random.shuffle(class_idx)  # random assignment inside the class

        test_ind.append(class_idx[:test_size])
        train_ind.append(class_idx[test_size:])
        class_lens_train.append(len(class_idx) - test_size)

    test_ind = np.concatenate(test_ind)
    train_ind = np.concatenate(train_ind)
    class_lens_train = np.asarray(class_lens_train)

    return train_ind, test_ind, class_lens_train, len(all_labels)



# Training full batch A-CNN-BiLSTM

In [None]:
# Hyperparameter grids for Adam; each *_choice is an index into the matching list.
lr_values=[1e-1,1e-2,1e-3,1e-4]
w_decay_values=[0.01,0.001,0.0001]
amsgrad_values=[False,True]

# Selected entries: lr=0.0001, weight decay=0.001, amsgrad=True.
lr_choice,w_decay_choice,ams_choice=3,1,1

In [None]:
# Full-batch training of A-CNN-BiLSTM. After every epoch the model is evaluated
# on a validation split; the weights with the best validation accuracy are
# saved to PATH.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ACNNLSTM(emb_size,4)
model.to(device)

optimizer =  torch.optim.Adam(model.parameters(),lr=lr_values[lr_choice],weight_decay=w_decay_values[w_decay_choice],amsgrad=amsgrad_values[ams_choice])

PATH="model_checkpoint.pt"

# Stratified split of the training essays into train (90%) and validation (10%).
train_ind, val_ind , class_lens_train, num_class = train_test_split(y_train, 0.1)

x_cross_train=[x_train[i] for i in train_ind]
x_cross_val=[x_train[i] for i in val_ind]

y_cross_train=y_train[train_ind]
y_cross_val=y_train[val_ind]

# Class weights inversely proportional to class size (mean class size / class size).
# Named `mean_class_size` so the builtin `max` is not shadowed.
# NOTE(review): assumes class_lens_train[k] is the count of label k — confirm.
mean_class_size = np.sum(class_lens_train)/class_lens_train.shape[0]
weight = torch.tensor(mean_class_size / class_lens_train, dtype = torch.float).to(device)

loss_function = nn.CrossEntropyLoss(weight=weight)

# The validation set is never shuffled, so pad and tensorise it once.
len_cross_val=np.asarray([example.shape[0] for example in x_cross_val])
seq = [torch.tensor(e) for e in x_cross_val]
x_cross_val_padded=pad_sequence(seq,batch_first=True)

# The models expect (batch, features, seq_len), hence the transpose.
input_val_tensor = x_cross_val_padded.to(dtype=torch.float).transpose(1,2).to(device)
y_val_tensor=torch.tensor(y_cross_val, dtype = torch.long).to(device)

print(input_val_tensor.shape)
print(y_val_tensor)

epochs = 600
loss_plot=[]
val_loss_plot=[]
train_acc_plot=[]
val_acc_plot=[]

max_acc=0
for epoch_idx in range(epochs):

    model.train()
    print(f"Epoch {epoch_idx+1}")
    optimizer.zero_grad()

    # Shuffle the train set for this epoch.
    c = list(zip(x_cross_train, y_cross_train))
    random.shuffle(c)
    x_cross_train, y_cross_train = zip(*c)
    x_cross_train=list(x_cross_train)
    y_cross_train=np.asarray(list(y_cross_train))

    # Length of every essay, then pad all essays to the longest one.
    len_cross_train=np.asarray([example.shape[0] for example in x_cross_train])
    seq = [torch.tensor(e) for e in x_cross_train]
    x_cross_train_padded=pad_sequence(seq,batch_first=True)

    inputs_tensor = x_cross_train_padded.to(dtype=torch.float).transpose(1,2).to(device)
    y_train_tensor=torch.tensor(y_cross_train, dtype = torch.long).to(device)

    # Forward pass, loss, gradients and weight update on the whole train set.
    y_pred = model(inputs_tensor,len_cross_train)
    single_loss = loss_function(y_pred, y_train_tensor)
    single_loss.backward()
    optimizer.step()

    loss=single_loss.item()
    print(f"Loss {loss}")
    loss_plot.append(loss)

    train_acc=np.mean((np.argmax(y_pred.cpu().detach().numpy(), axis=1)==y_cross_train)*1)
    train_acc_plot.append(train_acc)
    print(f"Acc train {train_acc}")

    # Validation: no gradients are needed, so do not build the autograd graph.
    model.eval()
    with torch.no_grad():
        y_pred_val = model(input_val_tensor,len_cross_val)
        single_loss = loss_function(y_pred_val, y_val_tensor)

    val_acc=np.mean((np.argmax(y_pred_val.cpu().detach().numpy(), axis=1)==y_cross_val)*1)
    val_acc_plot.append(val_acc)
    print(f"Acc val {val_acc}")

    val_loss=single_loss.item()
    print(f"Loss val {val_loss}")
    val_loss_plot.append(val_loss)

    if max_acc<val_acc: # save best weights
        torch.save({
                  'epoch': epoch_idx,
                  'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'loss': single_loss,
                  }, PATH)
        max_acc=val_acc
  


In [None]:
# One figure per curve recorded during training: (values, title, y-axis label).
curves = [
    (loss_plot, "Training loss", 'train loss'),
    (train_acc_plot, "Training accuracy", 'train acc'),
    (val_loss_plot, "Validation loss", 'val loss'),
    (val_acc_plot, "Validation accuracy", 'val acc'),
]
for series, title, y_label in curves:
    plt.figure()
    plt.plot(series)
    plt.title(title)
    plt.xlabel('epochs')
    plt.ylabel(y_label)

# Training full batch CNN-BiLSTM

In [None]:
# Hyperparameter grids for Adam; each *_choice is an index into the matching list.
lr_values=[1e-1,1e-2,1e-3,1e-4]
w_decay_values=[0.01,0.001,0.0001]
amsgrad_values=[False,True]

# Selected entries: lr=0.001, weight decay=0.001, amsgrad=True.
lr_choice,w_decay_choice,ams_choice=2,1,1

In [None]:
# Full-batch training of CNN-BiLSTM. After every epoch the model is evaluated
# on a validation split; the weights with the best validation accuracy are
# saved to PATH.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNLSTM(emb_size,4)
model.to(device)

optimizer =  torch.optim.Adam(model.parameters(),lr=lr_values[lr_choice],weight_decay=w_decay_values[w_decay_choice],amsgrad=amsgrad_values[ams_choice])

PATH="model_checkpoint.pt"

# Stratified split of the training essays into train (90%) and validation (10%).
train_ind, val_ind , class_lens_train, num_class = train_test_split(y_train, 0.1)

x_cross_train=[x_train[i] for i in train_ind]
x_cross_val=[x_train[i] for i in val_ind]

y_cross_train=y_train[train_ind]
y_cross_val=y_train[val_ind]

print(y_cross_val)
print(y_cross_train)

# Class weights inversely proportional to class size (mean class size / class size).
# Named `mean_class_size` so the builtin `max` is not shadowed.
# NOTE(review): assumes class_lens_train[k] is the count of label k — confirm.
mean_class_size = np.sum(class_lens_train)/class_lens_train.shape[0]
weight = torch.tensor(mean_class_size / class_lens_train, dtype = torch.float).to(device)

loss_function = nn.CrossEntropyLoss(weight=weight)

# The validation set is never shuffled, so pad and tensorise it once.
len_cross_val=np.asarray([example.shape[0] for example in x_cross_val])
seq = [torch.tensor(e) for e in x_cross_val]
x_cross_val_padded=pad_sequence(seq,batch_first=True)

# The models expect (batch, features, seq_len), hence the transpose.
input_val_tensor = x_cross_val_padded.to(dtype=torch.float).transpose(1,2).to(device)
y_val_tensor=torch.tensor(y_cross_val, dtype = torch.long).to(device)

print(input_val_tensor.shape)
print(y_val_tensor)

epochs = 800
loss_plot=[]
val_loss_plot=[]
train_acc_plot=[]
val_acc_plot=[]

max_acc=0
for epoch_idx in range(epochs):

    model.train()
    print(f"Epoch {epoch_idx+1}")
    optimizer.zero_grad()

    # Shuffle the train set for this epoch.
    c = list(zip(x_cross_train, y_cross_train))
    random.shuffle(c)
    x_cross_train, y_cross_train = zip(*c)
    x_cross_train=list(x_cross_train)
    y_cross_train=np.asarray(list(y_cross_train))

    # Length of every essay, then pad all essays to the longest one.
    len_cross_train=np.asarray([example.shape[0] for example in x_cross_train])
    seq = [torch.tensor(e) for e in x_cross_train]
    x_cross_train_padded=pad_sequence(seq,batch_first=True)

    inputs_tensor = x_cross_train_padded.to(dtype=torch.float).transpose(1,2).to(device)
    y_train_tensor=torch.tensor(y_cross_train, dtype = torch.long).to(device)

    # Forward pass, loss, gradients and weight update on the whole train set.
    y_pred = model(inputs_tensor,len_cross_train)
    single_loss = loss_function(y_pred, y_train_tensor)
    single_loss.backward()
    optimizer.step()

    loss=single_loss.item()
    print(f"Loss {loss}")
    loss_plot.append(loss)

    train_acc=np.mean((np.argmax(y_pred.cpu().detach().numpy(), axis=1)==y_cross_train)*1)
    train_acc_plot.append(train_acc)
    print(f"Acc train {train_acc}")

    # Validation: no gradients are needed, so do not build the autograd graph.
    model.eval()
    with torch.no_grad():
        y_pred_val = model(input_val_tensor,len_cross_val)
        single_loss = loss_function(y_pred_val, y_val_tensor)

    val_acc=np.mean((np.argmax(y_pred_val.cpu().detach().numpy(), axis=1)==y_cross_val)*1)
    val_acc_plot.append(val_acc)
    print(f"Acc val {val_acc}")

    val_loss=single_loss.item()
    print(f"Loss val {val_loss}")
    val_loss_plot.append(val_loss)

    if max_acc<val_acc: # save best weights
        torch.save({
                  'epoch': epoch_idx,
                  'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'loss': single_loss,
                  }, PATH)
        max_acc=val_acc

  


In [None]:
# One figure per curve recorded during training: (values, title, y-axis label).
curves = [
    (loss_plot, "Training loss", 'train loss'),
    (train_acc_plot, "Training accuracy", 'train acc'),
    (val_loss_plot, "Validation loss", 'val loss'),
    (val_acc_plot, "Validation accuracy", 'val acc'),
]
for series, title, y_label in curves:
    plt.figure()
    plt.plot(series)
    plt.title(title)
    plt.xlabel('epochs')
    plt.ylabel(y_label)

In [None]:
# Sanity check of the length-bucketed DataLoader: build it on a train split and
# print the shapes of the batches it produces over two passes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# CNNLSTM takes (input_size, num_class); the previous three-argument call raised a TypeError.
model = CNNLSTM(emb_size,4)
model.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer =  torch.optim.Adam(model.parameters(), lr=0.001)

PATH="model_checkpoint.pt"

# Stratified split into train (80%) and validation (20%).
# NOTE(review): this cell referred to `x_train1` / `y_train1`, which no cell defines;
# the train split built earlier (`x_train` / `y_train`) is used instead — confirm.
train_ind, val_ind ,class_lens_train, num_class =train_test_split(y_train, 0.2)

x_cross_train=[x_train[i] for i in train_ind]
x_cross_val=[x_train[i] for i in val_ind]

y_cross_train=y_train[train_ind]
y_cross_val=y_train[val_ind]

print(class_lens_train)

# Length of every essay.
len_cross_train=np.asarray([example.shape[0] for example in x_cross_train])
len_cross_val=np.asarray([example.shape[0] for example in x_cross_val])
print(len_cross_train.dtype)

traindata = MyDataset(x_cross_train, len_cross_train, y_cross_train)

# Bucket boundaries for the sampler (an earlier, finer list was immediately
# overwritten by this one, so only this assignment is kept).
buckets=[0,301]
# batch_size here is the real batch size; large enough to take a whole bucket at once.
sampler = BySequenceLengthSampler(traindata, buckets, batch_size=1000, drop_last=False)

# With batch_sampler set, the DataLoader takes its batches from the sampler, so
# batch_size / drop_last stay at their defaults (other values are rejected).
trainloader = torch.utils.data.DataLoader(traindata, batch_size=1,
                        batch_sampler=sampler,
                        num_workers=0,
                        collate_fn=collate,
                        drop_last=False, pin_memory=False)
print(trainloader)
for j in range(2):
    # `batch` (not `data`) so the essay dictionary loaded earlier is not overwritten.
    for batch in trainloader:
        inputs, lens, labels = batch

        print(inputs.shape)
        print(lens.shape)
        print(labels.shape)
        print(labels)

# Training mini batch CNN-BiLSTM

In [None]:
# Hyperparameter grids for Adam; each *_choice is an index into the matching list.
lr_values=[1e-1,1e-2,1e-3,1e-4]
w_decay_values=[0.01,0.001,0.0001]
amsgrad_values=[False,True]

# Selected entries: lr=0.001, weight decay=0.001, amsgrad=True.
lr_choice,w_decay_choice,ams_choice=2,1,1

In [None]:
# Mini-batch training of CNN-BiLSTM with length-bucketed batches. The weights
# with the lowest validation loss are saved to PATH.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

PATH="model_checkpoint.pt"

# Stratified split into train (80%) and validation (20%).
# NOTE(review): this cell referred to `x_train1` / `y_train1`, which no cell defines;
# the train split built earlier (`x_train` / `y_train`) is used instead — confirm.
train_ind, val_ind ,class_lens_train, num_class=train_test_split(y_train, 0.2)

model = CNNLSTM(emb_size,num_class)
model.to(device)

# The optimizer must be created AFTER the model it updates; building it first
# would bind it to the parameters of whatever `model` existed before this cell.
optimizer =  torch.optim.Adam(model.parameters(),lr=lr_values[lr_choice],weight_decay=w_decay_values[w_decay_choice],amsgrad=amsgrad_values[ams_choice])

x_cross_train=[x_train[i] for i in train_ind]
x_cross_val=[x_train[i] for i in val_ind]

y_cross_train=y_train[train_ind]
y_cross_val=y_train[val_ind]

# Class weights inversely proportional to class size (mean class size / class size).
# Named `mean_class_size` so the builtin `max` is not shadowed.
# NOTE(review): assumes class_lens_train[k] is the count of label k — confirm.
mean_class_size = np.sum(class_lens_train)/class_lens_train.shape[0]
weight = torch.tensor(mean_class_size / class_lens_train, dtype = torch.float).to(device)

loss_function = nn.CrossEntropyLoss(weight=weight)

# Length of every essay.
len_cross_train=np.asarray([example.shape[0] for example in x_cross_train])
len_cross_val=np.asarray([example.shape[0] for example in x_cross_val])
print(len_cross_train.dtype)

traindata = MyDataset(x_cross_train, len_cross_train, y_cross_train)

buckets=[0,21,41,61,81,101,151,301]

# batch_size here is the real batch size; large enough to take whole buckets.
sampler = BySequenceLengthSampler(traindata, buckets, batch_size=2000, drop_last=False)

# With batch_sampler set, the DataLoader takes its batches from the sampler.
trainloader = torch.utils.data.DataLoader(traindata, batch_size=1,
                        batch_sampler=sampler,
                        num_workers=0,
                        collate_fn=collate,
                        drop_last=False, pin_memory=False)

# Validation data is batched (and padded) the same way.
valdata = MyDataset(x_cross_val, len_cross_val, y_cross_val)

sampler_val = BySequenceLengthSampler(valdata, buckets, batch_size=1000, drop_last=False)

valloader = torch.utils.data.DataLoader(valdata, batch_size=1,
                        batch_sampler=sampler_val,
                        num_workers=0,
                        collate_fn=collate,
                        drop_last=False, pin_memory=False)

epochs = 10
loss_plot=[]
val_loss_plot=[]
train_acc_plot=[]
val_acc_plot=[]

min_loss=float("inf")
for epoch_idx in range(epochs):

    model.train()
    print(f"Epoch {epoch_idx+1}")

    loss=0
    predictions=[]
    # `batch` (not `data`) so the essay dictionary loaded earlier is not overwritten.
    for batch in trainloader:
        inputs, lens, labels = batch  # padded to the longest sequence of this batch

        inputs_tensor=inputs.to(dtype=torch.float).transpose(1,2).to(device)
        y_train_tensor=torch.tensor(labels, dtype = torch.long).to(device)

        # Clear the gradients of the previous batch; otherwise they accumulate
        # across all batches of the epoch.
        optimizer.zero_grad()

        y_pred = model(inputs_tensor,lens)

        single_loss = loss_function(y_pred, y_train_tensor)
        loss+=single_loss.item()*lens.shape[0]

        single_loss.backward()
        optimizer.step()

        predictions.append((np.argmax(y_pred.cpu().detach().numpy(), axis=1)==labels)*1)

    predictions=np.concatenate(predictions)
    train_acc=np.mean(predictions)
    train_acc_plot.append(train_acc)
    print(f"Acc train {train_acc}")

    loss/=len_cross_train.shape[0]
    print(f"Loss {loss}")
    loss_plot.append(loss)

    # Validation: no gradients are needed, so do not build the autograd graph.
    model.eval()
    val_loss=0
    predictions=[]
    with torch.no_grad():
        for batch in valloader:
            inputs, lens, labels = batch

            input_val=inputs.to(dtype=torch.float).transpose(1,2).to(device)
            y_val_tensor=torch.tensor(labels, dtype = torch.long).to(device)

            y_pred_val = model(input_val,lens)

            single_loss = loss_function(y_pred_val, y_val_tensor)
            val_loss+=single_loss.item()*lens.shape[0]

            # The loss is cross-entropy, so predictions are the arg-max class.
            predictions.append((np.argmax(y_pred_val.cpu().detach().numpy(), axis=1)==labels)*1)

    predictions=np.concatenate(predictions)
    val_acc=np.mean(predictions)
    val_acc_plot.append(val_acc)
    print(f"Acc val {val_acc}")

    val_loss/=len_cross_val.shape[0]
    print(f"Loss val {val_loss}")
    val_loss_plot.append(val_loss)

    if min_loss>val_loss: # save best weights
        torch.save({
                  'epoch': epoch_idx,
                  'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'loss': single_loss,
                  }, PATH)
        min_loss=val_loss



In [None]:
# One figure per curve recorded during mini-batch training: (values, title).
curves = [
    (loss_plot, "Train loss"),
    (train_acc_plot, "Train acc"),
    (val_loss_plot, "Val loss"),
    (val_acc_plot, "Val acc"),
]
for series, title in curves:
    plt.figure()
    plt.plot(series)
    plt.title(title)

# Predicting mini batch

In [None]:
# Accuracy of the best checkpoint on the train set, evaluated batch by batch.

# map_location lets a checkpoint saved on one device be loaded on another.
checkpoint = torch.load(PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

model.eval()
predictions=[]
with torch.no_grad():  # inference only
    # `batch` (not `data`) so the essay dictionary loaded earlier is not overwritten.
    for batch in trainloader:
        inputs, lens, labels = batch

        inputs_tensor=inputs.to(dtype=torch.float).transpose(1,2).to(device)

        y_pred = model(inputs_tensor,lens)

        predictions.append((np.argmax(y_pred.cpu().detach().numpy(), axis=1)==labels)*1)

# NOTE: `predictions` holds 0/1 correctness flags in batch order, not predicted
# labels, and is not aligned with the order of y_cross_train.
predictions=np.concatenate(predictions)
print(predictions)
print(y_cross_train)

accuracy_1 = np.mean(predictions) # fraction of exactly matching scores
print(accuracy_1)

In [None]:
# Accuracy of the best checkpoint on the validation set, evaluated batch by batch.

# map_location lets a checkpoint saved on one device be loaded on another.
checkpoint = torch.load(PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

model.eval()

predictions_val=[]
with torch.no_grad():  # inference only
    # `batch` (not `data`) so the essay dictionary loaded earlier is not overwritten.
    for batch in valloader:
        inputs, lens, labels = batch

        inputs_tensor=inputs.to(dtype=torch.float).transpose(1,2).to(device)

        y_pred = model(inputs_tensor,lens)
        predictions_val.append((np.argmax(y_pred.cpu().detach().numpy(), axis=1)==labels)*1)

# NOTE: `predictions_val` holds 0/1 correctness flags in batch order, not predicted labels.
predictions_val=np.concatenate(predictions_val)

accuracy_1 = np.mean(predictions_val)
print(accuracy_1)

# Predicting full batch

In [None]:
# Predicted labels and accuracy of the best checkpoint on the full train set.
# Uses `inputs_tensor` / `len_cross_train` / `y_cross_train` left by the
# full-batch training cell.

# map_location lets a checkpoint saved on one device be loaded on another.
checkpoint = torch.load(PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

model.eval()

with torch.no_grad():  # inference only
    y_pred = model(inputs_tensor,len_cross_train)

predictions=np.argmax(y_pred.cpu().detach().numpy(), axis=1)

accuracy_1=np.mean((predictions==y_cross_train)*1) # fraction of exactly matching scores

print(accuracy_1)

In [None]:
# Predicted labels and accuracy of the best checkpoint on the validation set.
# Uses `input_val_tensor` / `len_cross_val` / `y_cross_val` left by the
# full-batch training cell.

# map_location lets a checkpoint saved on one device be loaded on another.
checkpoint = torch.load(PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

model.eval()

with torch.no_grad():  # inference only
    y_pred = model(input_val_tensor,len_cross_val)

predictions_val=np.argmax(y_pred.cpu().detach().numpy(), axis=1)

accuracy_1=np.mean((predictions_val==y_cross_val)*1) # fraction of exactly matching scores

print(accuracy_1)

# Kappa

In [None]:
# Kappa score

import numpy as np
from kappa import quadratic_weighted_kappa as qwk
from kappa import linear_weighted_kappa as lwk

def assert_inputs(rater_a, rater_b):
	"""Check that both rating collections hold integer values.

	Each argument is viewed through ``np.asarray`` before its dtype is
	inspected, so plain lists/tuples of ints are accepted in addition to
	NumPy arrays (which are not copied).

	Raises:
		AssertionError: if either argument does not have an integer dtype.
	"""
	for rater in (rater_a, rater_b):
		dtype = np.asarray(rater).dtype
		assert np.issubdtype(dtype, np.integer), 'Integer array expected, got ' + str(dtype)

def quadratic_weighted_kappa(rater_a, rater_b, min_rating, max_rating):
	"""Return the quadratic weighted kappa between two integer rating arrays.

	The inputs are first checked with ``assert_inputs``; the computation itself
	is delegated to ``kappa.quadratic_weighted_kappa`` (imported as ``qwk``),
	with all four arguments forwarded positionally.
	"""
	assert_inputs(rater_a, rater_b)
	score = qwk(rater_a, rater_b, min_rating, max_rating)
	return score

def linear_weighted_kappa(rater_a, rater_b, min_rating, max_rating):
	"""Return the linear weighted kappa between two integer rating arrays.

	The inputs are first checked with ``assert_inputs``; the computation itself
	is delegated to ``kappa.linear_weighted_kappa`` (imported as ``lwk``),
	with all four arguments forwarded positionally.
	"""
	assert_inputs(rater_a, rater_b)
	score = lwk(rater_a, rater_b, min_rating, max_rating)
	return score

In [None]:
# Quadratic weighted kappa for the train and validation sets (ratings 0..3)
train_kappa = quadratic_weighted_kappa(
    predictions.astype('int'), y_cross_train.astype('int'), min_rating=0, max_rating=3
)
print(train_kappa)
val_kappa = quadratic_weighted_kappa(
    predictions_val.astype('int'), y_cross_val.astype('int'), min_rating=0, max_rating=3
)
print(val_kappa)

# Predicting for test set

In [None]:
 #Get lengths of sequences

len_test=np.asarray([example.shape[0] for example in x_test])

# Pad sequences to max length

seq = [torch.tensor(e) for e in x_test]
x_test_padded=pad_sequence(seq,batch_first=True)

# Build the test tensors (one full batch, no shuffling).
# pad_sequence already returns a tensor, so it is cast with as_tensor instead of
# being re-wrapped with torch.tensor (which copies and emits a warning).
# transpose(1,2) swaps the sequence and embedding axes of the batch-first tensor.
input_test_tensor = torch.as_tensor(x_test_padded, dtype = torch.float).transpose(1,2).to(device)
y_test_tensor=torch.tensor(y_test, dtype = torch.long).to(device)# long

# Predict with best weights
PATH="model_checkpoint.pt"
model = ACNNLSTM(emb_size,4)  # presumably 4 = number of score classes (0..3) - confirm against ACNNLSTM
model.to(device)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on GPU still loads on a CPU-only runtime.
checkpoint = torch.load(PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
# NOTE(review): `optimizer` is not created in this cell; it must already exist
# from the training cells, and it is not used for the prediction below.
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

print(epoch)
model.eval()

# Forward pass; no_grad avoids recording the autograd graph during inference
with torch.no_grad():
    y_pred = model(input_test_tensor,len_test)

# Stored under its own name so the train-set `predictions` used by the
# train/validation kappa cell are not overwritten.
predictions_test=np.argmax(y_pred.detach().cpu().numpy(), axis=1)

# Exact-match accuracy: fraction of predictions equal to the true score
accuracy=np.mean(predictions_test==y_test)
print(accuracy)

# Kappa

print(quadratic_weighted_kappa(predictions_test.astype('int'), y_test.astype('int'),min_rating=0, max_rating=3))

print(y_test)

print(predictions_test)

# baseline trait 2 is 0.5813467168548725