In [2]:
import os
import glob
import pandas as pd
import numpy as np
import seaborn as sns

from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

%matplotlib inline

## data prep

In [3]:
data_path = 'data'

In [4]:
files = glob.glob(os.path.join(data_path, '*'))

In [5]:
files

['data/all_tickets.csv']

In [6]:
d_data = pd.read_csv(files[0])

In [8]:
d_data.drop(d_data[d_data.category.isin([0,2,10])].index, inplace=True)

In [21]:
d_data.reset_index(drop=True, inplace=True)

In [14]:
label2idx = dict((w, k) for k, w in enumerate(d_data.category.value_counts().sort_index().index))

In [15]:
d_data['category'] = d_data.category.map(label2idx)

## enrichment

In [18]:
d_data['title'] = d_data.title.fillna('')

In [19]:
d_data['title_body'] = d_data.title + d_data.body

## encoding

In [23]:
text = " ".join(list(d_data.title_body))

In [24]:
word_unique = set(word_tokenize(text))

In [25]:
idx2word = dict((i, w) for i, w in enumerate(word_unique, 1))
word2idx = dict((w, i) for i, w in enumerate(word_unique, 1))

In [27]:
row, col = d_data.shape

In [29]:
train, test = d_data.iloc[:int(row * 0.8)], d_data.iloc[int(row * 0.8):]

In [30]:
train.head()

Unnamed: 0,title,body,ticket_type,category,sub_category1,sub_category2,business_service,urgency,impact,title_body
0,,hi since recruiter lead permission approve req...,1,2,2,21,71,3,4,hi since recruiter lead permission approve req...
1,connection with icon,icon dear please setup icon per icon engineers...,1,4,22,7,26,3,4,connection with iconicon dear please setup ico...
2,work experience user,work experience user hi work experience studen...,1,3,13,7,32,3,4,work experience userwork experience user hi wo...
3,requesting for meeting,requesting meeting hi please help follow equip...,1,3,13,7,32,3,4,requesting for meetingrequesting meeting hi pl...
4,reset passwords for external accounts,re expire days hi ask help update passwords co...,1,2,2,76,4,3,4,reset passwords for external accountsre expire...


## Torch 

In [43]:
class Dataset():
    def __init__(self):
        self.lookup = {
            'train': (train, len(train)),
            'test': (test, len(test))
        }
        self.set_split('train')
        
    def set_split(self, split = 'train'):
        self.target, self.length = self.lookup[split]
        
    def word_encoding(self, sentence, encoder):
        wordidx = []
        for word in word_tokenize(sentence):
            wordidx.append(encoder[word])
        
        return wordidx
    
    def __getitem__(self, index):
        x = self.target.loc[index, 'title_body']
        x = self.word_encoding(x, word2idx)
        y = self.target.loc[index, 'ticket_type']
        
        return {
            'x': x,
            'y': y
        }
    
    def __len__(self):
        return self.length

In [55]:
class Classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Classifier, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm = nn.LSTM(embedding_dim, 512)
        
        self.fc = nn.Linear(512, 1)
        
    def  forward(self, input_, apply_sigmoid = False):
        x = self.embedding(input_)
        x = self.lstm(x)
        x = self.fc(x)
        
        if apply_sigmoid:
            x = F.sigmoid(x).squeeze()
            
        return x

In [60]:
def compute_accuracy(y_pred, y_true):
    y_indicies = torch.LongTensor(y_pred > 0.5)
    n_correct = torch.eq(y_indicies, y_true).long()
    accuracy = (n_correct / len(y_true)) * 100
    
    return accuracy

In [56]:
dataset = Dataset()
classifier = Classifier(vocab_size=len(word2idx), embedding_dim=512)

In [57]:
loss_func = nn.BCEWithLogitsLoss()
optimzer = optim.Adam(classifier.parameters(), lr=1e-3)

In [58]:
running_loss = 0
running_acc = 0
running_loss_val = 0
running_acc_val = 0

In [59]:
history = {
    'loss': [],
    'acc': [],
    'loss_val': [],
    'acc_val': []
}

In [None]:
for epoch in range(100):
    classifier.train()
    dataset.set_split('train')
    data_gen = DataLoader(dataset=dataset, shuffle=1, batch_size=128)
    for batch_index, batch_dict in enumerate(data_gen, 1):
        # step 1
        optimzer.zero_grad()
        
        # step 2
        y_pred = classifier(batch_dict['x'])
        
        # step 3
        loss = loss_func(y_pred, batch_dict['y'])
        
        # step 4
        compute_accuracy(y_pred, batch_dict['y'])
        
        # step 5
        loss.backward()
        
        # step 6
        optimzer.step()