In [1]:
import numpy as np
import torch
#import psutil
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import time
import matplotlib.pyplot as plt
import random
import sys
import pickle
from torch.optim import SGD

In [2]:
sys.path.insert(1, '/home/akshatgu/Intents-Analysis/Analysis')
#sys.path.insert(1, '/Users/manjugupta/Desktop/CMU_Courses/Intents/getting_intents/Analysis')

In [3]:
from get_vocab import load_data, get_vocab
from get_frequency import get_frequency

In [4]:
#Check if cuda is available
cuda = torch.cuda.is_available()
print('CUDA is', cuda)
CUDA_LAUNCH_BLOCKING=1

num_workers = 8 if cuda else 0

print(num_workers)

CUDA is False
0


  return torch._C._cuda_getDeviceCount() > 0


In [5]:
##Needed Functions
def load_data(filename):
    a_file = open(filename, "rb")
    output = pickle.load(a_file)
    a_file.close()
    return output


def create_vocabulary(train_file):
    '''This function creates an indexed vocabulary dictionary from the training file'''
    
    vocab, _ = get_vocab(1, train_file)
    
    phone_to_idx = {'eos':1, 'unk': 2}#Padding indx = 0, eos = 1, unkown_idx = 2, indexing starts from 3
    for i, phone in enumerate(vocab):
        phone_to_idx[phone] = i + 3
        
    return phone_to_idx

In [6]:
class MyDataset(Dataset):
    def __init__(self, data_file, intent_labels, phone_to_idx):
        data = load_data(data_file)
        self.all_data = []
        
        for intent in data:
            for utterance in data[intent]:
                if len(utterance) != 0:
                    utterance_to_idx = []

                    for phone in utterance:
                        if phone not in phone_to_idx:
                            phone = 'unk'

                        utterance_to_idx.append(phone_to_idx[phone])

                    self.all_data.append([utterance_to_idx, intent_labels[intent]])
            
    def __len__(self):
        return len(self.all_data)

    def __getitem__(self,index):
        input_vector = self.all_data[index][0]
        label = self.all_data[index][1]

        return input_vector, label

In [7]:
def collate_indic(tuple_lst):

    x_lst = [x[0] for x in tuple_lst]
    y_lst = [x[1] for x in tuple_lst]

    # collate x
    B = len(tuple_lst)#Number of training samples
    T = max(len(x) for x in x_lst)#Max length of a sentence

    # x values
    x = torch.zeros([B, T], dtype=torch.int64)
    lengths = torch.zeros(B, dtype=torch.int64)

    for i, x_np in enumerate(x_lst):
        lengths[i] = len(x_np)
        x[i,:len(x_np)] = torch.tensor(x_np)

    # collate y
    y = torch.tensor(y_lst)

    ids = torch.argsort(lengths, descending=True)

    return x[ids], lengths[ids], y[ids]

In [8]:
class MyLMDataset(Dataset):
    def __init__(self, data_file, intent_labels, phone_to_idx):
        data = load_data(data_file)
        self.all_data = []
        
        for intent in data:
            for utterance in data[intent]:
                if len(utterance) != 0:
                    utterance_to_idx = []

                    for phone in utterance:
                        if phone not in phone_to_idx:
                            phone = 'unk'

                        utterance_to_idx.append(phone_to_idx[phone])

                    self.all_data.append([utterance_to_idx, intent_labels[intent]])
            
    def __len__(self):
        return len(self.all_data)

    def __getitem__(self,index):
        input_vector = self.all_data[index][0]
        output_vector = self.all_data[index][0][1:] + [1]

        return input_vector, output_vector

In [9]:
def collate_LM(tuple_lst):

    x_lst = [x[0] for x in tuple_lst]
    y_lst = [x[1] for x in tuple_lst]
    

    # collate x
    B = len(tuple_lst)#Number of training samples
    T = max(len(x) for x in x_lst)#Max length of a sentence

    # x values
    x = torch.zeros([B, T], dtype=torch.int64)
    y = torch.zeros([B, T], dtype=torch.int64)
    lengths = torch.zeros(B, dtype=torch.int64)

    for i, x_np in enumerate(x_lst):
        lengths[i] = len(x_np)
        x[i,:len(x_np)] = torch.tensor(x_np)
        y[i,:len(x_np)] = torch.tensor(y_lst[i])


    ids = torch.argsort(lengths, descending=True)

    return x[ids], lengths[ids], y[ids]

In [10]:
def get_intents():
    all_intents = ['1', '2', '3', '4', '5', '6']
    return all_intents

def get_intent_labels(class_type):
    all_intents = get_intents()
        
    intent_labels = {}
    labels_to_intents = {}
    for i, intent in enumerate(all_intents):
        intent_labels[intent] = i
        labels_to_intents[i] = intent
        
    return intent_labels, labels_to_intents

In [11]:
class_type = 'intents'

intent_labels, labels_to_intents = get_intent_labels(class_type)

#Loading data
split = '1'
train_file = '../../Tamil_Dataset/datasplit_top5_split1/tamil_train_split_' + split + '.pkl'
test_file = '../../Tamil_Dataset/datasplit_top5_split1/tamil_test_split_' + split + '.pkl'
#create vocabulary and phone_to_idx
phone_to_idx = create_vocabulary(train_file)
vocab_size = len(phone_to_idx) + 1
print(vocab_size)

48


In [12]:
train_dataset = MyLMDataset(train_file, intent_labels, phone_to_idx)
train_loader_args = dict(shuffle=True, batch_size=128, num_workers=num_workers, pin_memory=True) if cuda\
                    else dict(shuffle=True, batch_size=64)
train_loader_LM = DataLoader(train_dataset, **train_loader_args, collate_fn=collate_LM)

In [13]:
train_dataset = MyDataset(train_file, intent_labels, phone_to_idx)
train_loader_args = dict(shuffle=True, batch_size=128, num_workers=num_workers, pin_memory=True) if cuda\
                    else dict(shuffle=True, batch_size=64)
train_loader = DataLoader(train_dataset, **train_loader_args, collate_fn=collate_indic)

test_dataset = MyDataset(test_file, intent_labels, phone_to_idx)
test_loader_args = dict(shuffle=False, batch_size=128, num_workers=num_workers, pin_memory=True) if cuda\
                    else dict(shuffle=False, batch_size=1)
valid_loader = DataLoader(test_dataset, **test_loader_args, collate_fn=collate_indic)

In [27]:
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, pretrained_emb = None, embed_size=128, hidden_size=128, label_size=6):
        super().__init__()
        self.embed_size = embed_size
        self.embed = nn.Embedding(vocab_size, embed_size)

        self.cnn  = nn.Conv1d(embed_size, embed_size, kernel_size=3, padding=1)
        self.cnn2 = nn.Conv1d(embed_size, embed_size, kernel_size=5, padding=2)
        #self.cnn3 = nn.Conv1d(embed_size, embed_size, kernel_size=7, padding=3)

        self.batchnorm = nn.BatchNorm1d(embed_size*2)

        self.lstm_LM = nn.LSTM(embed_size, hidden_size, num_layers=1)
        self.lstm = nn.LSTM(embed_size*2, hidden_size, num_layers=1)

        self.linear_LM = nn.Linear(hidden_size, embed_size)
        self.output_LM = nn.Linear(embed_size, vocab_size)
        self.output_LM.weight = self.embed.weight
        
        
        self.linear = nn.Linear(hidden_size, label_size)
        
        if pretrained_emb != None:
            self.pretrained_embed = nn.Embedding(vocab_size, embed_size)
            self.pretrained_embed.weight = nn.Parameter(pretrained_emb)
            

    def forward(self, x, lengths, lm = True, pretrained = False):
        """
        padded_x: (B,T) padded LongTensor
        """
        # B,T,H
        
        if pretrained == False:
            input = self.embed(x)
        
        if lm:
            pack_tensor = nn.utils.rnn.pack_padded_sequence(input, lengths, batch_first=True)
            output, (hn, cn) = self.lstm_LM(pack_tensor)
            output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
            output = F.relu(self.linear_LM(output))
            logits = self.output_LM(output)

        else:
            
            ######if using pre-trained embeddings for top 5 
            if pretrained:
                batch_size = x.size(0)
                max_len = x.size(1)
                new_max_len = max_len//6 + 1
                input = torch.zeros((batch_size, new_max_len, self.embed_size ))

                counter = 0
                for j in range(0, max_len, 6):
                    temp = self.pretrained_embed(x[:,j:j+5])
                    temp = temp.mean(dim = 1)
                    input[:,counter,:] = temp
                    counter += 1

                lengths = lengths//6 + 1

                if cuda:
                    input = input.cuda()
            ##########ending using top 5
            
            
            

            # (B,T,H) -> (B,H,T)
            input = input.transpose(1,2)

            #cnn_output = torch.cat([self.cnn(input), self.cnn2(input), self.cnn3(input)], dim=1)
            cnn_output = torch.cat([self.cnn(input), self.cnn2(input)], dim=1)

            # (B,H,T)
            input = F.relu(self.batchnorm(cnn_output))

            input = input.transpose(1,2)

            pack_tensor = nn.utils.rnn.pack_padded_sequence(input, lengths, batch_first=True)
            output, (hn, cn) = self.lstm(pack_tensor)

            logits = self.linear(hn[0])

        return logits

In [30]:
pretrained = True
if pretrained:
    pretrained_emb = torch.from_numpy(np.load('classifier_embedding_weights.npy'))
    model = RNNClassifier(vocab_size, pretrained_emb)
else:
    model = RNNClassifier(vocab_size)
opt = optim.Adam(model.parameters(), lr = 0.001)
criterion = nn.CrossEntropyLoss()
#opt = SGD(model.parameters(), lr=0.01)
device = torch.device("cuda" if cuda else "cpu")
model.to(device)

RNNClassifier(
  (embed): Embedding(48, 128)
  (cnn): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (cnn2): Conv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,))
  (batchnorm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm_LM): LSTM(128, 128)
  (lstm): LSTM(256, 128)
  (linear_LM): Linear(in_features=128, out_features=128, bias=True)
  (output_LM): Linear(in_features=128, out_features=48, bias=True)
  (linear): Linear(in_features=128, out_features=6, bias=True)
  (pretrained_embed): Embedding(48, 128)
)

In [18]:
#Train language model
print(class_type, split)
max_acc = 0

for j in range(100):
    #print("epoch ", i)
    loss_accum = 0.0
    batch_cnt = 0

    model.train()
    start_time = time.time()
    for batch, (x, lengths, y) in enumerate(train_loader_LM):

        x = x.to(device)
        lengths = lengths.to(device)
        y = y.to(device)
        opt.zero_grad()

        logits = model(x, lengths)
        
        loss = criterion(logits.permute(0,2,1), y)
        loss_score = loss.cpu().item()

        loss_accum += loss_score
        batch_cnt += 1
        loss.backward()
        opt.step()
                    

    print(j, " train loss: ", loss_accum / batch_cnt, '--time:', time.time() - start_time)

intents 1
0  train loss:  4.189269924163819 --time: 6.7287726402282715
1  train loss:  3.8274669647216797 --time: 7.1290366649627686
2  train loss:  3.5428661346435546 --time: 7.128594636917114
3  train loss:  3.2561341285705567 --time: 6.790801763534546
4  train loss:  3.0536319732666017 --time: 7.0973474979400635
5  train loss:  2.8292887687683104 --time: 6.8741655349731445
6  train loss:  2.659030294418335 --time: 7.137286424636841
7  train loss:  2.4670814514160155 --time: 6.927358388900757
8  train loss:  2.2899849891662596 --time: 6.797726154327393
9  train loss:  2.1423481941223144 --time: 6.861461639404297
10  train loss:  1.9984972715377807 --time: 6.8908960819244385
11  train loss:  1.8685822010040283 --time: 6.82082200050354
12  train loss:  1.740103578567505 --time: 7.1625916957855225
13  train loss:  1.6260349750518799 --time: 7.093270540237427
14  train loss:  1.5216002702713012 --time: 6.957597255706787
15  train loss:  1.4139403581619263 --time: 7.179672002792358
16  tr

KeyboardInterrupt: 

In [19]:
if False:
    parameters2 = model.embed.weight.detach().cpu().numpy()
    np.save('classifier_embedding_weights.npy', parameters2)

In [31]:
print(class_type, split)
max_acc = 0

for j in range(200):
    #print("epoch ", i)
    loss_accum = 0.0
    batch_cnt = 0

    acc_cnt = 0
    err_cnt = 0

    model.train()
    start_time = time.time()
    for batch, (x, lengths, y) in enumerate(train_loader):

        x = x.to(device)
        lengths = lengths.to(device)
        y = y.to(device)
        opt.zero_grad()

        logits = model(x, lengths, False, pretrained)

        loss = criterion(logits, y)
        loss_score = loss.cpu().item()

        loss_accum += loss_score
        batch_cnt += 1
        loss.backward()
        opt.step()

        out_val, out_indices = torch.max(logits, dim=1)
        tar_indices = y

        for i in range(len(out_indices)):
            if out_indices[i] == tar_indices[i]:
                acc_cnt += 1
            else:
                err_cnt += 1
                    

    print("train acc: ", acc_cnt/(err_cnt+acc_cnt), " train loss: ", loss_accum / batch_cnt, '--time:', time.time() - start_time)

    model.eval()
    acc_cnt = 0
    err_cnt = 0

    #start_time = time.time()
    for x, lengths, y in valid_loader:
        
        x = x.to(device)
        lengths = lengths.to(device)
        y = y.to(device)
        
        logits = model(x, lengths, False, pretrained)

        out_val, out_indices = torch.max(logits, dim=1)
        tar_indices = y
    
        for i in range(len(out_indices)):
            if out_indices[i] == tar_indices[i]:
                acc_cnt += 1
            else:
                err_cnt += 1

    current_acc = acc_cnt/(err_cnt+acc_cnt)
    if current_acc > max_acc:
        max_acc = current_acc
                
    print(j, "validation: ", current_acc, '--max', max_acc, '--time:', time.time() - start_time)

intents 1
train acc:  0.234375  train loss:  1.774974513053894 --time: 0.6522302627563477
0 validation:  0.2875 --max 0.2875 --time: 1.0606937408447266
train acc:  0.3375  train loss:  1.6742762565612792 --time: 0.6787009239196777
1 validation:  0.3625 --max 0.3625 --time: 1.08732008934021
train acc:  0.36875  train loss:  1.5840079545974732 --time: 0.6678898334503174
2 validation:  0.35 --max 0.3625 --time: 1.0899434089660645
train acc:  0.4625  train loss:  1.4209823846817016 --time: 0.7056839466094971
3 validation:  0.525 --max 0.525 --time: 1.1270074844360352
train acc:  0.575  train loss:  1.2795114278793336 --time: 0.66668701171875
4 validation:  0.525 --max 0.525 --time: 1.0753934383392334
train acc:  0.54375  train loss:  1.2264013528823852 --time: 0.6768698692321777
5 validation:  0.55 --max 0.55 --time: 1.0857722759246826
train acc:  0.69375  train loss:  1.0105744361877442 --time: 0.6704621315002441
6 validation:  0.6125 --max 0.6125 --time: 1.0791642665863037
train acc:  0.

59 validation:  0.9 --max 0.9 --time: 1.0878159999847412
train acc:  1.0  train loss:  0.0015195765998214483 --time: 0.6682863235473633
60 validation:  0.9 --max 0.9 --time: 1.0768110752105713
train acc:  1.0  train loss:  0.0013429088285192847 --time: 0.685553789138794
61 validation:  0.9 --max 0.9 --time: 1.0945963859558105
train acc:  1.0  train loss:  0.0013704062439501286 --time: 0.6707956790924072
62 validation:  0.9 --max 0.9 --time: 1.0798630714416504
train acc:  1.0  train loss:  0.0012689422583207488 --time: 0.6631002426147461
63 validation:  0.9 --max 0.9 --time: 1.0724318027496338
train acc:  1.0  train loss:  0.001248785015195608 --time: 0.6506338119506836
64 validation:  0.9 --max 0.9 --time: 1.059323787689209
train acc:  1.0  train loss:  0.001261731586419046 --time: 0.6642398834228516
65 validation:  0.9 --max 0.9 --time: 1.0853466987609863
train acc:  1.0  train loss:  0.001174606056883931 --time: 0.7132470607757568
66 validation:  0.9 --max 0.9 --time: 1.1221733093261

train acc:  1.0  train loss:  0.0004204548429697752 --time: 0.6956043243408203
120 validation:  0.9 --max 0.9 --time: 1.1139469146728516
train acc:  1.0  train loss:  0.0004224275762680918 --time: 0.6468467712402344
121 validation:  0.9 --max 0.9 --time: 1.0597503185272217
train acc:  1.0  train loss:  0.0004350206349045038 --time: 0.6633293628692627
122 validation:  0.8875 --max 0.9 --time: 1.0752029418945312
train acc:  1.0  train loss:  0.00040714633651077746 --time: 0.6584987640380859
123 validation:  0.8875 --max 0.9 --time: 1.0728037357330322
train acc:  1.0  train loss:  0.0004149166110437363 --time: 0.6467995643615723
124 validation:  0.8875 --max 0.9 --time: 1.0585107803344727
train acc:  1.0  train loss:  0.0003998524509370327 --time: 0.6976003646850586
125 validation:  0.9 --max 0.9 --time: 1.1092073917388916
train acc:  1.0  train loss:  0.00038583410205319526 --time: 0.6617221832275391
126 validation:  0.9 --max 0.9 --time: 1.074068307876587
train acc:  1.0  train loss:  0

KeyboardInterrupt: 

In [None]:
1 layer, 512, only 2 CNN contexts