In [1]:
# Colab only: mount Google Drive at /content/drive. The dataset and GloVe paths
# configured below point into ./drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import json
import logging
from collections import Counter
from pathlib import Path
from random import random, seed
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import OneHotEncoder

import os 
import tqdm
import math


# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyper-parameters and file locations shared by the cells below.
config = {
    'seed': 520,      # passed to same_seed() and used as random_state of the train/validation split
    'batch_size': 100,   # batch size of the train/valid/test DataLoaders
    'learning_rate':0.001,   # learning rate handed to the optimizer in trainer()
    'n_epochs':500,   # upper bound on the number of training epochs
    'data_dir':  "./drive/MyDrive/ADL_hw1/data/slot/",   # Directory holding train.json / eval.json / test.json
    'glove_path': "./drive/MyDrive/ADL_hw1/glove.840B.300d.txt",   # Path to the raw GloVe embedding file
    'word2vector_path': "./gensim_glove.840B.300d.txt",     # Output path of the GloVe file converted to word2vec format
    'save_path': './models/model.ckpt',  # where trainer() stores the best model's state_dict
    'early_stop': 200,  # stop after this many consecutive epochs without a better validation loss
    'valid_ratio': 0.2     # fraction of the labelled data held out for validation
}

In [3]:

def construct_word2vec(): 
    """Convert the raw GloVe file to word2vec text format and load it.

    Reads ``config['glove_path']``, writes the converted copy to
    ``config['word2vector_path']``, prints the two values reported by the
    converter and returns the model loaded from the converted file.

    Returns:
        The ``KeyedVectors`` instance loaded from ``config['word2vector_path']``.
    """
    glove_file = config['glove_path']
    converted_file = config['word2vector_path']

    # The converter reports (count, dimensions) for the embedding table.
    conversion_info = glove2word2vec(glove_file, converted_file)
    count, dimensions = conversion_info
    print(count, '\n', dimensions)

    return KeyedVectors.load_word2vec_format(converted_file, binary=False)

def read_json():
    """Load the slot-tagging splits and encode the training labels as integers.

    The ``train`` and ``eval`` splits under ``config['data_dir']`` are merged
    into a single labelled pool; ``test.json`` supplies unlabelled sentences.
    Every tag other than ``"O"`` loses its first two characters (the ``B-`` /
    ``I-`` style prefix) before being mapped to an integer id.

    Returns:
        tuple: ``(slots, texts, intent2idx, maxi, testing)`` where
            slots      -- list of per-sentence lists of label ids,
            texts      -- list of per-sentence token lists (same order as ``slots``),
            intent2idx -- the label-name -> id mapping used for the encoding,
            maxi       -- number of tokens in the longest labelled sentence (0 if none),
            testing    -- list of per-sentence token lists read from ``test.json``.

    Raises:
        KeyError: if a tag, after prefix removal, is not a key of ``intent2idx``.
    """
    # Fixed mapping (not derived from the data) so label ids are identical between runs.
    intent2idx = {'date': 1, 'last_name': 2, 'time': 3, 'people': 4, 'first_name': 5, 'O': 0}

    # Path joining works whether or not data_dir ends with a separator.
    data_dir = Path(config['data_dir'])

    def load_split(name):
        """Parse ``<data_dir>/<name>.json`` and log where it was read from."""
        dataset_path = data_dir / f"{name}.json"
        dataset = json.loads(dataset_path.read_text(encoding="utf-8"))
        logging.info(f"Dataset loaded at {str(dataset_path.resolve())}")
        return dataset

    slots = []
    texts = []
    maxi = 0

    for split in ["train", "eval"]:
        for instance in load_split(split):
            # Keep "O" untouched; drop the two-character prefix of every other tag.
            names = [tag if tag == "O" else tag[2:] for tag in instance["tags"]]
            tokens = list(instance["tokens"])

            slots.append([intent2idx[name] for name in names])
            texts.append(tokens)
            maxi = max(maxi, len(tokens))

    testing = [list(instance["tokens"]) for instance in load_split("test")]

    return slots, texts, intent2idx, maxi, testing



# Labelled pool (train + eval splits merged), the label mapping, the longest
# labelled sentence length and the unlabelled test sentences.
train_tags , train_texts , intent2idx , maxi_text , test_text = read_json()
# Pretrained word vectors; the conversion and load read the full GloVe file.
glove = construct_word2vec()



2196018 
 300




In [4]:

import numpy as np 
from sklearn.model_selection import train_test_split

def Word2Vector(data_list, word2vec_model, maxi_len):
    """
    Look up the pretrained vector of every token in every sentence.

    Each token is first cleaned by deleting a fixed list of substrings
    (clitics and punctuation). If the cleaned token is unknown to the model,
    the vector of a stand-in word is used instead, chosen by the first
    matching rule in ``fallbacks`` (or ``"name"`` when no rule matches).

    Args:
        data_list: iterable of sentences, each an iterable of token strings.
        word2vec_model: object whose ``get_vector(word)`` returns a 1-D vector
            and raises ``KeyError`` for unknown words.
        maxi_len: unused; kept so existing callers keep working.

    Returns:
        numpy.ndarray: one entry per sentence, each a 2-D array with one row
        per token. When the sentences have different lengths the result is a
        1-D ``dtype=object`` array holding those 2-D arrays; when all entries
        share one shape it is a regular stacked array, as before.

    Raises:
        KeyError: if a stand-in word itself is missing from the model.
    """
    # Substrings deleted from every token before the lookup. str.replace removes
    # them anywhere in the token, in this order - not only at the end.
    # NOTE(review): the last entry looks like mojibake (possibly of an inverted
    # question mark) - confirm against the dataset before changing it.
    suffixs = ["’m","'s","’d","'ll","'ve","’s","s'","'ve","'m","'","'re","’ll","’re","!",";","]","驴"]

    # Ordered (marker, stand-in word) rules for out-of-vocabulary tokens;
    # the first marker contained in the cleaned token wins.
    fallbacks = (
        (":", "pm"),
        (".", "pm"),
        ("/", "february"),
        ("pm", "pm"),
        ("august", "february"),
    )
    default_word = "name"

    x = []
    for sentence in data_list:
        vecs = []
        for word in sentence:
            for kk in suffixs:
                word = word.replace(kk, '')

            try:
                vec = word2vec_model.get_vector(word)
            except KeyError:
                stand_in = next(
                    (substitute for marker, substitute in fallbacks if marker in word),
                    default_word,
                )
                vec = word2vec_model.get_vector(stand_in)
            vecs.append(vec)

        x.append(np.array(vecs))

    print("number of sentence :", len(x))

    # Sentences of different lengths cannot form a regular array: recent NumPy
    # raises ValueError for np.array() on such input, so the ragged case is
    # stored explicitly as an object array of per-sentence matrices.
    if len({sentence_vecs.shape for sentence_vecs in x}) <= 1:
        return np.array(x)

    ragged = np.empty(len(x), dtype=object)
    for position, sentence_vecs in enumerate(x):
        ragged[position] = sentence_vecs
    return ragged


#x_train, x_val, y_train, y_val = train_test_split( train_texts, train_tags , test_size=config["valid_ratio"], random_state=config["seed"])



# Turn every labelled sentence into its per-token embedding matrix.
x_total = Word2Vector(train_texts, glove , maxi_text)
#x_val = Word2Vector(x_val, glove , maxi_text)
# Same lookup for the unlabelled test sentences.
x_test = Word2Vector(test_text, glove , maxi_text)



number of sentence : 8244
number of sentence : 3731




In [5]:
# Pad every sequence to the length of the longest one in its own collection
# (batch_first=True puts the sentence axis first).
# pad_sequence fills with 0 by default, so padded label positions carry id 0,
# which is also the id of the "O" tag in intent2idx.

x = pad_sequence([torch.from_numpy(np.array(x)) for x in x_total],batch_first = True).float()
y = pad_sequence([torch.from_numpy(np.array(x)) for x in train_tags],batch_first = True).float()


# NOTE(review): x_test is padded to its own longest sentence, independently of the training data.
x_test = pad_sequence([torch.from_numpy(np.array(x)) for x in x_test],batch_first = True).float()



# Hold out config["valid_ratio"] of the padded examples for validation; the fixed
# random_state keeps the split identical between runs.
x_train, x_val, y_train, y_val = train_test_split( x, y , test_size=config["valid_ratio"], random_state=config["seed"])




In [6]:
def same_seed(seed): 
    '''Fixes random number generator seeds for reproducibility.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and, when
    available, every CUDA device) and switches cuDNN to deterministic mode
    with auto-tuning disabled.

    Args:
        seed: integer seed applied to every generator.
    '''
    # Imported locally under an alias: the notebook binds the bare name
    # `random` to the function via `from random import random, seed`.
    import random as py_random

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    py_random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def trainer(train_loader, valid_loader, model, config, device):
    """Train ``model`` with early stopping and checkpoint the best weights.

    Each epoch runs one optimisation pass over ``train_loader`` followed by an
    evaluation pass over ``valid_loader``. Whenever the mean validation loss
    improves, ``model.state_dict()`` is written to ``config['save_path']``.

    Args:
        train_loader: iterable of ``(x, y)`` training batches.
        valid_loader: iterable of ``(x, y)`` validation batches.
        model: ``torch.nn.Module`` being optimised (updated in place).
        config: dict providing ``learning_rate``, ``n_epochs``, ``early_stop``
            and ``save_path``.
        device: device the batches are moved to before the forward pass.

    Returns:
        float: the lowest mean validation loss observed.

    Raises:
        ZeroDivisionError: if either loader yields no batches.
    """
    # Huber loss between the model output and the targets.
    criterion = nn.HuberLoss(reduction='mean', delta=1.0)
    optimizer = torch.optim.RAdam(model.parameters(), lr=config["learning_rate"], betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

    # Create the directory the checkpoint is actually written to, instead of a
    # hard-coded './models' that only matches the default save_path.
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    n_epochs = config['n_epochs']
    best_loss = math.inf
    early_stop_count = 0

    for epoch in range(n_epochs):
        model.train() # Set your model to train mode.
        loss_record = []

        # `tqdm` must be bound to the callable (`from tqdm import tqdm`), not to
        # the module object that a plain `import tqdm` would bind.
        train_pbar = tqdm(train_loader, position=0, leave=True)

        for x, y in train_pbar:
            optimizer.zero_grad()               # Set gradient to zero.
            x, y = x.to(device), y.to(device)   # Move your data to device.
            pred = model(x)

            loss = criterion(pred, y)
            loss.backward()                     # Compute gradient(backpropagation).
            optimizer.step()                    # Update parameters.

            batch_loss = loss.detach().item()
            loss_record.append(batch_loss)

            # Display current epoch number and loss on tqdm progress bar.
            train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            train_pbar.set_postfix({'loss': batch_loss})

        mean_train_loss = sum(loss_record)/len(loss_record)

        model.eval() # Set your model to evaluation mode.
        loss_record = []
        with torch.no_grad():
            for x, y in valid_loader:
                x, y = x.to(device), y.to(device)
                pred = model(x)
                loss = criterion(pred, y)
                loss_record.append(loss.item())

        mean_valid_loss = sum(loss_record)/len(loss_record)

        print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path']) # Save your best model
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else:
            early_stop_count += 1

        # Give up once validation has not improved for `early_stop` epochs in a row.
        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            return best_loss
    return best_loss

In [33]:

class Data_Converter(Dataset):
    """Dataset pairing feature tensors with optional targets.

    Args:
        x: features, indexed along the first axis; anything ``torch.as_tensor``
            accepts (tensor, NumPy array, nested list).
        y: targets aligned with ``x`` along the first axis, or ``None`` for
            unlabelled data.

    Both inputs are stored as ``float32`` tensors. ``torch.as_tensor`` replaces
    the legacy ``torch.FloatTensor`` constructor, which rejects tensors of
    another dtype or device.
    """

    def __init__(self, x, y=None):
        self.x = torch.as_tensor(x, dtype=torch.float32)
        self.y = None if y is None else torch.as_tensor(y, dtype=torch.float32)

    def __getitem__(self, idx):
        """Return ``x[idx]`` alone for unlabelled data, else ``(x[idx], y[idx])``."""
        if self.y is None:
            return self.x[idx]
        return self.x[idx], self.y[idx]

    def __len__(self):
        """Number of examples (size of the first axis of ``x``)."""
        return len(self.x)


# Wrap the padded tensors as datasets; the test set has no targets.
train_dataset, valid_dataset, test_dataset = Data_Converter(x_train, y_train), \
                                            Data_Converter(x_val, y_val), \
                                            Data_Converter(x_test)
                                            
# The test loader keeps the original order (shuffle=False) so predictions line up
# with test_text. NOTE(review): the validation loader is shuffled here; a later
# cell rebuilds it with shuffle=False before predicting on it.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)



In [32]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm

class My_Model(torch.nn.Module):
    """Bidirectional multi-layer GRU followed by a linear read-out.

    The final hidden states of all layers and both directions are averaged and
    projected to ``output_size`` values per input sequence.

    Args:
        input_size: number of features per time step.
        hidden_size: GRU hidden size; also the input width of the linear layer.
        num_layers: number of stacked GRU layers.
        batch_first: whether inputs are ``[batch, sequence, channel]`` (True)
            or ``[sequence, batch, channel]`` (False).
        drop: dropout applied between GRU layers.
        output_size: number of values produced per sequence. Defaults to 35,
            the width that used to be hard-coded (presumably the padded
            sequence length of the targets - confirm against the data).
    """

    def __init__(self, input_size, hidden_size, num_layers, batch_first, drop=0.3, output_size=35):
        super().__init__()
        # Attribute names are part of the checkpoint format (state_dict keys).
        self.GRU = torch.nn.GRU(
            input_size,
            hidden_size,
            num_layers,
            batch_first=batch_first,
            dropout=drop,
            bidirectional=True,
        )
        self.fc = nn.Linear(hidden_size, output_size)
        # Not applied in forward(); kept so the module's attributes are unchanged.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return a ``[batch, output_size]`` tensor for the input sequences ``x``."""
        # x =  [batch, sequence, channel] if batch_first else [sequence, batch, channel]
        # The per-step outputs are not needed, only the final hidden states,
        # whose shape is [num_layers * 2, batch, hidden_size].
        _, hidden = self.GRU(x)

        # Average over layers and directions -> [batch, hidden_size].
        pooled = hidden.mean(0)
        return self.fc(pooled)

# Seed the random number generators with the configured seed.
same_seed(config['seed'])





In [34]:
# input_size is the length of one token vector (last axis of x_train).
model = My_Model(input_size=len(x_train[0][0]), hidden_size = 300 ,num_layers = 3 ,batch_first = True).to(device) # put your model and data on the same computation device.
# trainer() checkpoints the best weights to config['save_path'] and returns the best validation loss.
best_loss = trainer(train_loader, valid_loader, model, config, device)
print("best_loss :" + str(best_loss))


Epoch [1/500]: 100%|██████████| 66/66 [00:08<00:00,  8.01it/s, loss=0.0827]


Epoch [1/500]: Train loss: 0.0915, Valid loss: 0.0879
Saving model with loss 0.088...


Epoch [2/500]: 100%|██████████| 66/66 [00:08<00:00,  8.20it/s, loss=0.0784]


Epoch [2/500]: Train loss: 0.0818, Valid loss: 0.0772
Saving model with loss 0.077...


Epoch [3/500]: 100%|██████████| 66/66 [00:08<00:00,  8.21it/s, loss=0.065]


Epoch [3/500]: Train loss: 0.0716, Valid loss: 0.0661
Saving model with loss 0.066...


Epoch [4/500]: 100%|██████████| 66/66 [00:08<00:00,  8.20it/s, loss=0.0703]


Epoch [4/500]: Train loss: 0.0623, Valid loss: 0.0568
Saving model with loss 0.057...


Epoch [5/500]: 100%|██████████| 66/66 [00:08<00:00,  8.18it/s, loss=0.0477]


Epoch [5/500]: Train loss: 0.0537, Valid loss: 0.0512
Saving model with loss 0.051...


Epoch [6/500]: 100%|██████████| 66/66 [00:08<00:00,  8.21it/s, loss=0.0412]


Epoch [6/500]: Train loss: 0.0468, Valid loss: 0.0455
Saving model with loss 0.046...


Epoch [7/500]: 100%|██████████| 66/66 [00:08<00:00,  8.16it/s, loss=0.0371]


Epoch [7/500]: Train loss: 0.0418, Valid loss: 0.0416
Saving model with loss 0.042...


Epoch [8/500]: 100%|██████████| 66/66 [00:08<00:00,  8.19it/s, loss=0.0353]


Epoch [8/500]: Train loss: 0.0384, Valid loss: 0.0388
Saving model with loss 0.039...


Epoch [9/500]: 100%|██████████| 66/66 [00:08<00:00,  8.18it/s, loss=0.0309]


Epoch [9/500]: Train loss: 0.0351, Valid loss: 0.0359
Saving model with loss 0.036...


Epoch [10/500]: 100%|██████████| 66/66 [00:08<00:00,  8.19it/s, loss=0.0397]


Epoch [10/500]: Train loss: 0.0323, Valid loss: 0.0330
Saving model with loss 0.033...


Epoch [11/500]: 100%|██████████| 66/66 [00:08<00:00,  8.18it/s, loss=0.033]


Epoch [11/500]: Train loss: 0.0295, Valid loss: 0.0317
Saving model with loss 0.032...


Epoch [12/500]: 100%|██████████| 66/66 [00:08<00:00,  8.16it/s, loss=0.0298]


Epoch [12/500]: Train loss: 0.0276, Valid loss: 0.0318


Epoch [13/500]: 100%|██████████| 66/66 [00:08<00:00,  8.20it/s, loss=0.0371]


Epoch [13/500]: Train loss: 0.0259, Valid loss: 0.0284
Saving model with loss 0.028...


Epoch [14/500]: 100%|██████████| 66/66 [00:08<00:00,  8.18it/s, loss=0.0297]


Epoch [14/500]: Train loss: 0.0234, Valid loss: 0.0273
Saving model with loss 0.027...


Epoch [15/500]: 100%|██████████| 66/66 [00:08<00:00,  8.17it/s, loss=0.0261]


Epoch [15/500]: Train loss: 0.0212, Valid loss: 0.0246
Saving model with loss 0.025...


Epoch [16/500]: 100%|██████████| 66/66 [00:08<00:00,  8.19it/s, loss=0.0187]


Epoch [16/500]: Train loss: 0.0197, Valid loss: 0.0234
Saving model with loss 0.023...


Epoch [17/500]: 100%|██████████| 66/66 [00:08<00:00,  8.16it/s, loss=0.0211]


Epoch [17/500]: Train loss: 0.0186, Valid loss: 0.0237


Epoch [18/500]: 100%|██████████| 66/66 [00:08<00:00,  8.16it/s, loss=0.0183]


Epoch [18/500]: Train loss: 0.0174, Valid loss: 0.0226
Saving model with loss 0.023...


Epoch [19/500]: 100%|██████████| 66/66 [00:08<00:00,  8.15it/s, loss=0.0184]


Epoch [19/500]: Train loss: 0.0167, Valid loss: 0.0228


Epoch [20/500]: 100%|██████████| 66/66 [00:08<00:00,  8.15it/s, loss=0.0161]


Epoch [20/500]: Train loss: 0.0159, Valid loss: 0.0218
Saving model with loss 0.022...


Epoch [21/500]: 100%|██████████| 66/66 [00:08<00:00,  8.19it/s, loss=0.0135]


Epoch [21/500]: Train loss: 0.0149, Valid loss: 0.0224


Epoch [22/500]: 100%|██████████| 66/66 [00:08<00:00,  8.15it/s, loss=0.0138]


Epoch [22/500]: Train loss: 0.0153, Valid loss: 0.0202
Saving model with loss 0.020...


Epoch [23/500]: 100%|██████████| 66/66 [00:08<00:00,  8.14it/s, loss=0.0209]


Epoch [23/500]: Train loss: 0.0133, Valid loss: 0.0218


Epoch [24/500]: 100%|██████████| 66/66 [00:08<00:00,  8.18it/s, loss=0.0117]


Epoch [24/500]: Train loss: 0.0133, Valid loss: 0.0206


Epoch [25/500]: 100%|██████████| 66/66 [00:08<00:00,  8.18it/s, loss=0.00893]


Epoch [25/500]: Train loss: 0.0132, Valid loss: 0.0201
Saving model with loss 0.020...


Epoch [26/500]: 100%|██████████| 66/66 [00:08<00:00,  8.12it/s, loss=0.0105]


Epoch [26/500]: Train loss: 0.0118, Valid loss: 0.0198
Saving model with loss 0.020...


Epoch [27/500]: 100%|██████████| 66/66 [00:08<00:00,  8.14it/s, loss=0.0111]


Epoch [27/500]: Train loss: 0.0113, Valid loss: 0.0195
Saving model with loss 0.020...


Epoch [28/500]: 100%|██████████| 66/66 [00:08<00:00,  8.15it/s, loss=0.0101]


Epoch [28/500]: Train loss: 0.0108, Valid loss: 0.0194
Saving model with loss 0.019...


Epoch [29/500]: 100%|██████████| 66/66 [00:08<00:00,  8.12it/s, loss=0.0111]


Epoch [29/500]: Train loss: 0.0100, Valid loss: 0.0189
Saving model with loss 0.019...


Epoch [30/500]: 100%|██████████| 66/66 [00:08<00:00,  8.11it/s, loss=0.00973]


Epoch [30/500]: Train loss: 0.0097, Valid loss: 0.0196


Epoch [31/500]: 100%|██████████| 66/66 [00:08<00:00,  8.17it/s, loss=0.013]


Epoch [31/500]: Train loss: 0.0095, Valid loss: 0.0192


Epoch [32/500]: 100%|██████████| 66/66 [00:08<00:00,  8.18it/s, loss=0.0112]


Epoch [32/500]: Train loss: 0.0096, Valid loss: 0.0188
Saving model with loss 0.019...


Epoch [33/500]: 100%|██████████| 66/66 [00:08<00:00,  8.15it/s, loss=0.00622]


Epoch [33/500]: Train loss: 0.0084, Valid loss: 0.0181
Saving model with loss 0.018...


Epoch [34/500]: 100%|██████████| 66/66 [00:08<00:00,  8.14it/s, loss=0.00754]


Epoch [34/500]: Train loss: 0.0081, Valid loss: 0.0178
Saving model with loss 0.018...


Epoch [35/500]: 100%|██████████| 66/66 [00:08<00:00,  8.14it/s, loss=0.00971]


Epoch [35/500]: Train loss: 0.0084, Valid loss: 0.0193


Epoch [36/500]: 100%|██████████| 66/66 [00:08<00:00,  8.15it/s, loss=0.00733]


Epoch [36/500]: Train loss: 0.0081, Valid loss: 0.0192


Epoch [37/500]: 100%|██████████| 66/66 [00:08<00:00,  8.11it/s, loss=0.00353]


Epoch [37/500]: Train loss: 0.0071, Valid loss: 0.0177
Saving model with loss 0.018...


Epoch [38/500]: 100%|██████████| 66/66 [00:08<00:00,  8.16it/s, loss=0.00627]


Epoch [38/500]: Train loss: 0.0063, Valid loss: 0.0175
Saving model with loss 0.017...


Epoch [39/500]: 100%|██████████| 66/66 [00:08<00:00,  8.16it/s, loss=0.0063]


Epoch [39/500]: Train loss: 0.0062, Valid loss: 0.0178


Epoch [40/500]: 100%|██████████| 66/66 [00:08<00:00,  8.15it/s, loss=0.00771]


Epoch [40/500]: Train loss: 0.0059, Valid loss: 0.0176


Epoch [41/500]: 100%|██████████| 66/66 [00:08<00:00,  8.15it/s, loss=0.00779]


Epoch [41/500]: Train loss: 0.0064, Valid loss: 0.0179


Epoch [42/500]: 100%|██████████| 66/66 [00:08<00:00,  8.18it/s, loss=0.00641]


Epoch [42/500]: Train loss: 0.0060, Valid loss: 0.0172
Saving model with loss 0.017...


Epoch [43/500]: 100%|██████████| 66/66 [00:08<00:00,  8.17it/s, loss=0.0128]


Epoch [43/500]: Train loss: 0.0063, Valid loss: 0.0178


Epoch [44/500]: 100%|██████████| 66/66 [00:08<00:00,  8.13it/s, loss=0.00515]


Epoch [44/500]: Train loss: 0.0050, Valid loss: 0.0174


Epoch [45/500]: 100%|██████████| 66/66 [00:08<00:00,  8.18it/s, loss=0.00411]


Epoch [45/500]: Train loss: 0.0047, Valid loss: 0.0174


Epoch [46/500]: 100%|██████████| 66/66 [00:08<00:00,  8.15it/s, loss=0.0034]


Epoch [46/500]: Train loss: 0.0054, Valid loss: 0.0169
Saving model with loss 0.017...


Epoch [47/500]: 100%|██████████| 66/66 [00:08<00:00,  8.15it/s, loss=0.00421]


Epoch [47/500]: Train loss: 0.0049, Valid loss: 0.0171


Epoch [48/500]: 100%|██████████| 66/66 [00:08<00:00,  8.18it/s, loss=0.0103]


Epoch [48/500]: Train loss: 0.0047, Valid loss: 0.0169
Saving model with loss 0.017...


Epoch [49/500]: 100%|██████████| 66/66 [00:08<00:00,  8.15it/s, loss=0.00573]


Epoch [49/500]: Train loss: 0.0044, Valid loss: 0.0171


Epoch [50/500]: 100%|██████████| 66/66 [00:08<00:00,  8.16it/s, loss=0.00452]


Epoch [50/500]: Train loss: 0.0044, Valid loss: 0.0176


Epoch [51/500]: 100%|██████████| 66/66 [00:08<00:00,  8.17it/s, loss=0.00977]


Epoch [51/500]: Train loss: 0.0046, Valid loss: 0.0178


Epoch [52/500]: 100%|██████████| 66/66 [00:08<00:00,  8.16it/s, loss=0.00391]


Epoch [52/500]: Train loss: 0.0054, Valid loss: 0.0174


Epoch [53/500]: 100%|██████████| 66/66 [00:08<00:00,  8.18it/s, loss=0.00494]


Epoch [53/500]: Train loss: 0.0045, Valid loss: 0.0163
Saving model with loss 0.016...


Epoch [54/500]: 100%|██████████| 66/66 [00:08<00:00,  8.13it/s, loss=0.00399]


Epoch [54/500]: Train loss: 0.0041, Valid loss: 0.0176


Epoch [55/500]: 100%|██████████| 66/66 [00:08<00:00,  8.12it/s, loss=0.00512]


Epoch [55/500]: Train loss: 0.0047, Valid loss: 0.0170


Epoch [56/500]: 100%|██████████| 66/66 [00:08<00:00,  8.15it/s, loss=0.00567]


Epoch [56/500]: Train loss: 0.0042, Valid loss: 0.0180


Epoch [57/500]: 100%|██████████| 66/66 [00:08<00:00,  8.13it/s, loss=0.00212]


Epoch [57/500]: Train loss: 0.0050, Valid loss: 0.0173


Epoch [58/500]: 100%|██████████| 66/66 [00:08<00:00,  8.18it/s, loss=0.00559]


Epoch [58/500]: Train loss: 0.0050, Valid loss: 0.0181


Epoch [59/500]: 100%|██████████| 66/66 [00:08<00:00,  8.12it/s, loss=0.00727]


Epoch [59/500]: Train loss: 0.0055, Valid loss: 0.0171


Epoch [60/500]: 100%|██████████| 66/66 [00:08<00:00,  8.15it/s, loss=0.0027]


Epoch [60/500]: Train loss: 0.0046, Valid loss: 0.0182


Epoch [61/500]: 100%|██████████| 66/66 [00:08<00:00,  8.16it/s, loss=0.00376]


Epoch [61/500]: Train loss: 0.0042, Valid loss: 0.0169


Epoch [62/500]: 100%|██████████| 66/66 [00:08<00:00,  8.13it/s, loss=0.00193]


Epoch [62/500]: Train loss: 0.0041, Valid loss: 0.0167


Epoch [63/500]: 100%|██████████| 66/66 [00:08<00:00,  8.14it/s, loss=0.00804]


Epoch [63/500]: Train loss: 0.0053, Valid loss: 0.0198


Epoch [64/500]: 100%|██████████| 66/66 [00:08<00:00,  8.16it/s, loss=0.0146]


Epoch [64/500]: Train loss: 0.0057, Valid loss: 0.0187


Epoch [65/500]: 100%|██████████| 66/66 [00:08<00:00,  7.86it/s, loss=0.0025]


Epoch [65/500]: Train loss: 0.0047, Valid loss: 0.0165


Epoch [66/500]:  36%|███▋      | 24/66 [00:03<00:05,  7.83it/s, loss=0.0056]


KeyboardInterrupt: ignored

In [37]:
import csv

def predict(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` and stack the outputs.

    Args:
        test_loader: iterable yielding input batches (tensors).
        model: ``torch.nn.Module``; switched to evaluation mode by this call.
        device: device each batch is moved to before the forward pass.

    Returns:
        numpy.ndarray: all batch outputs concatenated along the first axis.
    """
    model.eval() # Set your model to evaluation mode.
    collected = []
    with torch.no_grad():
        for batch in test_loader:
            outputs = model(batch.to(device))
            # Move results to the CPU so they can be converted to NumPy afterwards.
            collected.append(outputs.detach().cpu())

    return torch.cat(collected, dim=0).numpy()


def save_pred(preds, file):
    ''' Save predictions to specified file

    Writes a CSV with the header ``id,tags`` and one row per sentence. The id
    is ``test-<index>``; the tags are the sentence's labels joined by single
    spaces, where ``"O"`` is written as is and every other label is prefixed
    with ``"I-"`` when the preceding token carries the same label and with
    ``"B-"`` otherwise.

    Args:
        preds: sequence of sentences, each an indexable sequence of labels.
        file: path of the CSV file to create or overwrite.
    '''
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires; otherwise some platforms emit blank rows.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tags'])

        for i, labels in enumerate(preds):
            tags = []
            for j, token in enumerate(labels):
                if token == 'O':
                    tags.append(token)
                elif j > 0 and labels[j - 1] == token:
                    # Same label as the previous token: continue that span.
                    tags.append("I-" + str(token))
                else:
                    # First token, or a label change: open a new span.
                    tags.append("B-" + str(token))

            writer.writerow(["test-" + str(i), " ".join(tags)])
  

In [38]:

     


# Rebuild the network with the training hyper-parameters and restore the best checkpoint.
model = My_Model(input_size=len(x_train[0][0]), hidden_size = 300 ,num_layers = 3 ,batch_first = True).to(device)
# map_location lets a checkpoint written on a GPU be restored on a CPU-only runtime.
model.load_state_dict(torch.load(config['save_path'], map_location=device))
preds = predict(test_loader, model, device) 

# The model regresses the label id of each position, so every score is mapped
# to the id whose interval (id - 0.5, id + 0.5] contains it. Scores up to 0.5
# (and NaN) decode to "O"; scores above 4.5 decode to the last label.
# Upper bounds are inclusive so that a score of exactly 1.5, 2.5, 3.5 or 4.5
# no longer falls through to "O".
# The order must mirror the ids assigned in read_json().
slot_names = ("O", "date", "last_name", "time", "people", "first_name")
slot_bounds = (0.5, 1.5, 2.5, 3.5, 4.5)

final = []
for sent_idx, sentence in enumerate(preds):
    # Only the positions of real tokens are kept; the rest is padding.
    n_tokens = len(test_text[sent_idx])
    final.append([
        # Number of bounds strictly below the score == decoded label id.
        slot_names[sum(float(item) > bound for bound in slot_bounds)]
        for item in sentence[:n_tokens]
    ])

save_pred(final, 'pred.csv')


In [16]:
# Decode the gold validation label ids (y_val) into slot names, one list per sentence.
# NOTE(review): the cut-off length is taken from test_text[...] although the rows
# come from the validation split, so sentences are truncated at an unrelated
# length. The cell that decodes the validation predictions uses the same
# lengths, so both must be changed together.
label_names = {1.0: "date", 2.0: "last_name", 3.0: "time", 4.0: "people", 5.0: "first_name"}

yy = []
for row_idx, label_row in enumerate(y_val):
    kept = label_row[:len(test_text[row_idx])]
    # Any value that is not exactly one of the ids above (including 0) is "O".
    yy.append([label_names.get(float(value), "O") for value in kept])

In [None]:
# Sanity check: number of decoded sentences, then the first gold and predicted sequence.
# The count is taken with len() because the sentences have different lengths and
# cannot be packed into a regular NumPy array (np.array() warns or fails on them).
print((len(final),))
print(yy[0])
print(final[0])

(3731,)
['O', 'O', 'O', 'time', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'people', 'people', 'O', 'time']


  """Entry point for launching an IPython kernel.


In [12]:
# Same construction as the earlier dataset/loader cell, except that the
# validation loader is not shuffled, so predictions made from it follow the
# row order of y_val.
train_dataset, valid_dataset, test_dataset = Data_Converter(x_train, y_train), \
                                            Data_Converter(x_val, y_val), \
                                            Data_Converter(x_test)
                                            
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)


In [13]:
# Raw model outputs for the validation set.
# NOTE(review): predict() fails on (x, y) batches - confirm valid_loader yields inputs only here.
pp_val = predict(valid_loader, model, device)

In [22]:
# Decode the validation predictions (pp_val) into slot names, one list per sentence.
# Every score is mapped to the id whose interval (id - 0.5, id + 0.5] contains
# it: scores up to 0.5 (and NaN) decode to "O", scores above 4.5 to the last
# label. Upper bounds are inclusive so that a score of exactly 1.5, 2.5, 3.5 or
# 4.5 no longer falls through to "O".
# NOTE(review): the cut-off length is taken from test_text[...] although the rows
# come from the validation split; it is kept unchanged here because the gold
# labels in `yy` are truncated the same way and both must be changed together.
slot_names = ("O", "date", "last_name", "time", "people", "first_name")
slot_bounds = (0.5, 1.5, 2.5, 3.5, 4.5)

final = []
for sent_idx, sentence in enumerate(pp_val):
    n_tokens = len(test_text[sent_idx])
    final.append([
        # Number of bounds strictly below the score == decoded label id.
        slot_names[sum(float(item) > bound for bound in slot_bounds)]
        for item in sentence[:n_tokens]
    ])

In [24]:
# Spot check that a predicted and a gold sequence have the same length.
print(len(final[190]))
print(len(yy[190]))

# NOTE(review): f1_score is not imported in any visible cell - presumably
# seqeval.metrics.f1_score (it receives lists of tag sequences); confirm and
# add the import so the notebook runs on a fresh kernel.
f1_score(yy, final)

8
8




0.7484662576687116

In [25]:


# Joint accuracy: share of sentences whose decoded sequence equals the gold one.
# Token accuracy: share of decoded positions that equal the gold label.
count_sentence = 0
right_sentence = 0
count_token = 0
right_token = 0

for row_idx, predicted in enumerate(final):
    gold = yy[row_idx]

    count_sentence += 1
    if predicted == gold:
        right_sentence += 1

    for position, label in enumerate(predicted):
        count_token += 1
        if label == gold[position]:
            right_token += 1


print("Joint Accuracy : " , right_sentence/count_sentence)
print("Token Accuracy : " , right_token / count_token )
    

Joint Accuracy :  0.8611279563371741
Token Accuracy :  0.9755653961727029
