In [1]:
import os
import glob
import pandas as pd
import numpy as np
import seaborn as sns

from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

%matplotlib inline

## data preparation

In [2]:
data_path = 'data'

In [3]:
files = glob.glob(os.path.join(data_path, '*'))

In [4]:
d_data = pd.read_csv(files[0])

## data exploration

In [5]:
d_data.shape

(48549, 9)

In [6]:
d_data.columns

Index(['title', 'body', 'ticket_type', 'category', 'sub_category1',
       'sub_category2', 'business_service', 'urgency', 'impact'],
      dtype='object')

In [7]:
d_data.category.value_counts()

4     34061
5      9634
6      2628
7       921
11      612
8       239
9       191
3       137
1        72
12       45
0         4
2         3
10        2
Name: category, dtype: int64

In [8]:
d_data.ticket_type.value_counts()

1    34621
0    13928
Name: ticket_type, dtype: int64

In [9]:
d_data.urgency.value_counts()

3    34621
1     6748
2     5528
0     1652
Name: urgency, dtype: int64

In [10]:
d_data.impact.value_counts()

4    34621
3    13184
0      471
2      228
1       45
Name: impact, dtype: int64

In [11]:
d_data.drop(d_data[d_data.category.isin([0,2,10])].index, inplace=True)

In [12]:
d_data.category.value_counts().sort_index()

1        72
3       137
4     34061
5      9634
6      2628
7       921
8       239
9       191
11      612
12       45
Name: category, dtype: int64

In [13]:
cat2new = dict((index, value) for index, value in enumerate(d_data.category.value_counts().sort_index().index))
new2cat = dict((value, index) for index, value in enumerate(d_data.category.value_counts().sort_index().index))

In [14]:
d_data['category'] = d_data.category.map(new2cat)

## value encoding

fill na in title

In [15]:
d_data.title = d_data.title.fillna('')

In [16]:
d_data.isna().sum()

title               0
body                0
ticket_type         0
category            0
sub_category1       0
sub_category2       0
business_service    0
urgency             0
impact              0
dtype: int64

In [17]:
d_data['title_body'] = d_data['title'] + d_data['body']

In [18]:
d_data.title_body = d_data.title_body.str.lower()

In [19]:
text = " ".join(list(d_data.title_body))

In [20]:
word_set = set(word_tokenize(text))

In [21]:
idx2word = dict((index, word) for index, word in enumerate(word_set, 1))
word2idx = dict((word, index) for index, word in enumerate(word_set, 1))

In [22]:
def word_encoding(sentence, encoding):
    word_list = word_tokenize(sentence)
    sent2idx = []
    for word in word_list:
        try:
            sent2idx.append(encoding[word])
        except:
            sent2idx.append(0)
            
    return sent2idx

In [23]:
word2idx['hi']

457

In [24]:
d_data = d_data.sample(frac=1)

In [25]:
d_data_selected = d_data[['title_body', 'category', 'ticket_type', 'urgency', 'impact']]

In [26]:
d_data_selected.reset_index(drop=True, inplace=True)

In [27]:
row, col = d_data_selected.shape

In [28]:
train, test = d_data_selected.iloc[:int(row * 0.8)], d_data_selected.iloc[int(row * 0.8):]

In [29]:
train.shape

(38832, 5)

In [30]:
test.shape

(9708, 5)

In [31]:
col_label = ['category', 'ticket_type', 'urgency', 'impact']

In [32]:
for col in col_label:
    print(col)
    print(train[col].value_counts().sort_index())
    print('\n')

category
0       63
1      105
2    27272
3     7733
4     2101
5      709
6      192
7      148
8      475
9       34
Name: category, dtype: int64


ticket_type
0    11094
1    27738
Name: ticket_type, dtype: int64


urgency
0     1289
1     5355
2     4450
3    27738
Name: urgency, dtype: int64


impact
0      374
1       38
2      170
3    10512
4    27738
Name: impact, dtype: int64




In [33]:
for col in col_label:
    print(col)
    print(test[col].value_counts().sort_index())
    print('\n')

category
0       9
1      32
2    6789
3    1901
4     527
5     212
6      47
7      43
8     137
9      11
Name: category, dtype: int64


ticket_type
0    2831
1    6877
Name: ticket_type, dtype: int64


urgency
0     362
1    1391
2    1078
3    6877
Name: urgency, dtype: int64


impact
0      97
1       7
2      58
3    2669
4    6877
Name: impact, dtype: int64




In [34]:
# len_sent = train.title_body.apply(word_tokenize)

In [35]:
# sns.distplot(len_sent)

In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [37]:
class Dataset():
    def __init__(self, word2idx):
        self.lookup = {
            'train': (train, len(train)),
            'test': (test, len(test))
        }
        
        self.set_split('train')
    
    def set_split(self, split = 'train'):
        self.data, self.length = self.lookup[split]
        
    def set_length_pad(self, x):
        if len(x) > 200:
            x = x[:200]
        else:
            len_pad = 200 - len(x)
            pad = [0] * len_pad
            x.extend(pad)
            
        return x
    
    def __getitem__(self, index):
        x = self.data.iloc[index, 0]
        x = word_encoding(x, word2idx)
        x = self.set_length_pad(x)
        x = torch.Tensor(x)
        
        category = self.data.iloc[index, 1]
        ticket_type = self.data.iloc[index, 2]
        urgency = self.data.iloc[index, 3]
        impact = self.data.iloc[index, 4]
        
        return {
            'x': x,
            'category': category,
            'ticket': ticket_type,
            'urgency': urgency,
            'impact': impact
        }
        
        
    def __len__(self):

        return self.length

In [38]:
class Classifier(nn.Module):
    def __init__(self, len_category, len_ticket_type, len_urgency, len_impact):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(200, 300)
        
        self.fc2_1 = nn.Linear(300, 300)
        self.fc2_2 = nn.Linear(300, 300)
        self.fc2_3 = nn.Linear(300, 300)
        self.fc2_4 = nn.Linear(300, 300)
        
        self.fc3_1 = nn.Linear(300, 200)
        self.fc3_2 = nn.Linear(300, 200)
        self.fc3_3 = nn.Linear(300, 200)
        self.fc3_4 = nn.Linear(300, 200)
        
        self.category = nn.Linear(100, len_category)
        self.ticket_type = nn.Linear(100, len_ticket_type)
        self.urgency = nn.Linear(100, len_urgency)
        self.impact = nn.Linear(100, len_impact)
        
        
    def forward(self, input_, apply_softmax = False):
        x = self.fc(input_)
        
        y_category = self.fc2_1(x)
        y_category = self.fc3_1(y_category)
        y_category = self.category(y_category)
        
        y_ticket = self.fc2_2(x)
        y_ticket = self.fc3_2(y_ticket)
        y_ticket = self.ticket_type(y_ticket)
        
        y_urgency = self.fc2_3(x)
        y_urgency = self.fc3_3(y_urgency)
        y_urgency = self.urgency(y_urgency)
        
        
        y_impact = self.fc2_4(x)
        y_impact = self.fc3_4(y_impact)
        y_impact = self.impact(y_impact)
        
        if apply_softmax:
            y_category = F.softmax(y_category)
            y_ticket = F.softmax(y_ticket)
            y_urgency = F.softmax(y_urgency)
            y_impact = F.softmax(y_impact)
            
        return {'category': y_category,
               'ticket': y_ticket,
               'urgency': y_urgency,
               'impact': y_impact}

In [39]:
def compute_accuracy(y_pred, y_true):
    y_pred = y_pred.max(dim=1)[1]
    n_correct = torch.eq(y_pred, y_true).sum().item()
    accuracy = n_correct / len(y_true)
    
    return accuracy

In [40]:
dataset = Dataset(word2idx)
classifier = Classifier(10, 2, 4, 5)

classifier = classifier.to(device)

In [None]:
loss_fun_category = nn.CrossEntropyLoss()
loss_fun_ticket = nn.CrossEntropyLoss()
loss_fun_urgency = nn.CrossEntropyLoss()
loss_fun_impact = nn.CrossEntropyLoss()

optimizer = optim.Adam(classifier.parameters(), lr = 1e-3)

In [None]:
running_acc = 0
running_loss = 0
running_acc_val = 0
running_loss_val = 0

In [None]:
for epoch in range(100):
    dataset.set_split('train')
    data_gen = DataLoader(dataset=dataset, shuffle=True, batch_size=2048)
    classifier.train()
    for batch_index, batch_dict in enumerate(data_gen , 1):
        # step 1
        optimizer.zero_grad()
        
        # step 2
        y_pred = classifier(batch_dict['x'].to(device))
        
        # step 3 loss
        loss_category = loss_fun_category(y_pred['category'], batch_dict['category'].to(device))
        loss_ticket = loss_fun_category(y_pred['ticket'], batch_dict['ticket'].to(device))
        loss_urgency = loss_fun_category(y_pred['urgency'], batch_dict['urgency'].to(device))
        loss_impact = loss_fun_category(y_pred['impact'], batch_dict['impact'].to(device))
        
        loss = loss_category + loss_ticket + loss_urgency + loss_impact
        running_loss += ((loss.item() / 4) - running_loss) / batch_index
        
        # step 4 accuracy
        acc_category = compute_accuracy(y_pred['category'], batch_dict['category'].to(device))
        acc_ticket = compute_accuracy(y_pred['ticket'], batch_dict['ticket'].to(device))
        acc_urgency = compute_accuracy(y_pred['urgency'], batch_dict['urgency'].to(device))
        acc_impact = compute_accuracy(y_pred['impact'], batch_dict['impact'].to(device))
        
        accuracy = (acc_category + acc_ticket + acc_urgency + acc_impact) / 4
        running_acc += (accuracy - running_acc) / batch_index
        
        # step 6 backward
        loss.backward()
        
        # step 6 step
        optimizer.step()
    
    dataset.set_split('test')
    data_gen = DataLoader(dataset=dataset, shuffle=True, batch_size=2048)
    classifier.eval()
    for batch_index, batch_dict in enumerate(data_gen , 1):
        # step 1
        y_pred = classifier(batch_dict['x'].to(device))
        
        # step 2
        # step 3 loss
        loss_category_val = loss_fun_category(y_pred['category'], batch_dict['category'].to(device))
        loss_ticket_val = loss_fun_category(y_pred['ticket'], batch_dict['ticket'].to(device))
        loss_urgency_val = loss_fun_category(y_pred['urgency'], batch_dict['urgency'].to(device))
        loss_impact_val = loss_fun_category(y_pred['impact'], batch_dict['impact'].to(device))
        
        loss_val = loss_category_val + loss_ticket_val + loss_urgency_val + loss_impact_val
        running_loss_val += ((loss_val.item() / 4) - running_loss_val) / batch_index
        
        # step 3
        acc_category_val = compute_accuracy(y_pred['category'], batch_dict['category'].to(device))
        acc_ticket_val = compute_accuracy(y_pred['ticket'], batch_dict['ticket'].to(device))
        acc_urgency_val = compute_accuracy(y_pred['urgency'], batch_dict['urgency'].to(device))
        acc_impact_val = compute_accuracy(y_pred['impact'], batch_dict['impact'].to(device))
        
        accuracy_val = (acc_category_val + acc_ticket_val + acc_urgency_val + acc_impact_val) / 4
        running_acc += (accuracy_val - running_acc_val) / batch_index
    
    print("epoch {}/{} loss: {:.2f} accuracy: {:.2f} loss_val: {:.2f} accuracy_val: {:.2f}". format(
        epoch, 100, running_loss, running_acc, running_loss_val, running_acc_val
    ))