In [6]:
import platform
import os
import glob
import pandas as pd
import numpy as np
import seaborn as sns

from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

%matplotlib inline

## data prep

In [9]:
if platform.system() == 'Linux':
    data_path = 'data'
else:
    data_path = 'D:/Dataset/support-ticket-classification'

In [10]:
files = glob.glob(os.path.join(data_path, '*'))

In [11]:
d_data = pd.read_csv(files[0])

## data exploration

In [12]:
d_data.shape

(48549, 9)

In [13]:
d_data.columns

Index(['title', 'body', 'ticket_type', 'category', 'sub_category1',
       'sub_category2', 'business_service', 'urgency', 'impact'],
      dtype='object')

In [14]:
d_data.category.value_counts()

4     34061
5      9634
6      2628
7       921
11      612
8       239
9       191
3       137
1        72
12       45
0         4
2         3
10        2
Name: category, dtype: int64

In [15]:
d_data.ticket_type.value_counts()

1    34621
0    13928
Name: ticket_type, dtype: int64

In [16]:
d_data.urgency.value_counts()

3    34621
1     6748
2     5528
0     1652
Name: urgency, dtype: int64

In [17]:
d_data.impact.value_counts()

4    34621
3    13184
0      471
2      228
1       45
Name: impact, dtype: int64

In [18]:
d_data.drop(d_data[d_data.category.isin([0,2,10])].index, inplace=True)

In [19]:
d_data.category.value_counts().sort_index()

1        72
3       137
4     34061
5      9634
6      2628
7       921
8       239
9       191
11      612
12       45
Name: category, dtype: int64

In [20]:
cat2new = dict((index, value) for index, value in enumerate(d_data.category.value_counts().sort_index().index))
new2cat = dict((value, index) for index, value in enumerate(d_data.category.value_counts().sort_index().index))

In [21]:
d_data['category'] = d_data.category.map(new2cat)

## value encoding

fill na in title

In [22]:
d_data.title = d_data.title.fillna('')

In [23]:
d_data.isna().sum()

title               0
body                0
ticket_type         0
category            0
sub_category1       0
sub_category2       0
business_service    0
urgency             0
impact              0
dtype: int64

In [24]:
d_data['title_body'] = d_data['title'] + d_data['body']

In [25]:
d_data.title_body = d_data.title_body.str.lower()

In [26]:
text = " ".join(list(d_data.title_body))

In [27]:
word_set = set(word_tokenize(text))

In [28]:
idx2word = dict((index, word) for index, word in enumerate(word_set, 1))
word2idx = dict((word, index) for index, word in enumerate(word_set, 1))

In [29]:
def word_encoding(sentence, encoding):
    word_list = word_tokenize(sentence)
    sent2idx = []
    for word in word_list:
        try:
            sent2idx.append(encoding[word])
        except:
            sent2idx.append(0)
            
    return sent2idx

In [30]:
word2idx['hi']

22093

In [31]:
d_data = d_data.sample(frac=1)

In [32]:
d_data_selected = d_data[['title_body', 'category', 'ticket_type', 'urgency', 'impact']]

In [33]:
d_data_selected.reset_index(drop=True, inplace=True)

In [34]:
row, col = d_data_selected.shape

In [35]:
train, test = d_data_selected.iloc[:int(row * 0.8)], d_data_selected.iloc[int(row * 0.8):]

In [36]:
train.shape

(38832, 5)

In [37]:
test.shape

(9708, 5)

In [38]:
col_label = ['category', 'ticket_type', 'urgency', 'impact']

In [39]:
for col in col_label:
    print(col)
    print(train[col].value_counts().sort_index())
    print('\n')

category
0       54
1      117
2    27252
3     7662
4     2136
5      749
6      192
7      149
8      486
9       35
Name: category, dtype: int64


ticket_type
0    11149
1    27683
Name: ticket_type, dtype: int64


urgency
0     1336
1     5389
2     4424
3    27683
Name: urgency, dtype: int64


impact
0      377
1       38
2      191
3    10543
4    27683
Name: impact, dtype: int64




In [40]:
for col in col_label:
    print(col)
    print(test[col].value_counts().sort_index())
    print('\n')

category
0      18
1      20
2    6809
3    1972
4     492
5     172
6      47
7      42
8     126
9      10
Name: category, dtype: int64


ticket_type
0    2776
1    6932
Name: ticket_type, dtype: int64


urgency
0     315
1    1357
2    1104
3    6932
Name: urgency, dtype: int64


impact
0      94
1       7
2      37
3    2638
4    6932
Name: impact, dtype: int64




In [41]:
# len_sent = train.title_body.apply(word_tokenize)

In [42]:
# sns.distplot(len_sent)

In [43]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [45]:
class Dataset():
    def __init__(self, word2idx):
        self.lookup = {
            'train': (train, len(train)),
            'test': (test, len(test))
        }
        
        self.set_split('train')
    
    def set_split(self, split = 'train'):
        self.data, self.length = self.lookup[split]
        
    def set_length_pad(self, x):
        if len(x) > 200:
            x = x[:200]
        else:
            len_pad = 200 - len(x)
            pad = [0] * len_pad
            x.extend(pad)
            
        return x
    
    def __getitem__(self, index):
        x = self.data.iloc[index, 0]
        x = word_encoding(x, word2idx)
        x = self.set_length_pad(x)
        x = torch.Tensor(x)
        
        category = self.data.iloc[index, 1]
        ticket_type = self.data.iloc[index, 2]
        urgency = self.data.iloc[index, 3]
        impact = self.data.iloc[index, 4]
        
        return {
            'x': x,
            'category': category,
            'ticket': ticket_type,
            'urgency': urgency,
            'impact': impact
        }
        
        
    def __len__(self):

        return self.length

In [46]:
class Classifier(nn.Module):
    def __init__(self, len_category, len_ticket_type, len_urgency, len_impact):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(200, 300)
        
        self.fc2_1 = nn.Linear(300, 1024)
        self.fc2_2 = nn.Linear(300, 1024)
        self.fc2_3 = nn.Linear(300, 1024)
        self.fc2_4 = nn.Linear(300, 1024)
        
        self.fc3_1 = nn.Linear(1024, 2048)
        self.fc3_2 = nn.Linear(1024, 2048)
        self.fc3_3 = nn.Linear(1024, 2048)
        self.fc3_4 = nn.Linear(1024, 2048)
        
        self.fc4_1 = nn.Linear(2048, 2048)
        self.fc4_2 = nn.Linear(2048, 2048)
        self.fc4_3 = nn.Linear(2048, 2048)
        self.fc4_4 = nn.Linear(2048, 2048)
        
        self.fc5_1 = nn.Linear(2048, 2048)
        self.fc5_2 = nn.Linear(2048, 2048)
        self.fc5_3 = nn.Linear(2048, 2048)
        self.fc5_4 = nn.Linear(2048, 2048)
        
        self.fc6_1 = nn.Linear(2048, 1024)
        self.fc6_2 = nn.Linear(2048, 1024)
        self.fc6_3 = nn.Linear(2048, 1024)
        self.fc6_4 = nn.Linear(2048, 1024)
        
        self.fc7_1 = nn.Linear(1024, 512)
        self.fc7_2 = nn.Linear(1024, 512)
        self.fc7_3 = nn.Linear(1024, 512)
        self.fc7_4 = nn.Linear(1024, 512)
        
        self.fc8_1 = nn.Linear(512, 100)
        self.fc8_2 = nn.Linear(512, 100)
        self.fc8_3 = nn.Linear(512, 100)
        self.fc8_4 = nn.Linear(512, 100)
        
        self.category = nn.Linear(100, len_category)
        self.ticket_type = nn.Linear(100, len_ticket_type)
        self.urgency = nn.Linear(100, len_urgency)
        self.impact = nn.Linear(100, len_impact)
        
        
    def forward(self, input_, apply_softmax = False):
        x = self.fc1(input_)
        
        y_category = F.dropout(self.fc2_1(x), p=0.3)
        y_category = self.fc3_1(y_category)
        y_category = self.fc4_1(y_category)
        y_category = self.fc5_1(y_category)
        y_category = self.fc6_1(y_category)
        y_category = self.fc7_1(y_category)
        y_category = self.fc8_1(y_category)
        y_category = self.category(y_category)
        
        y_ticket = F.dropout(self.fc2_2(x), p=0.3)
        y_ticket = self.fc3_2(y_ticket)
        y_ticket = self.fc4_2(y_ticket)
        y_ticket = self.fc5_2(y_ticket)
        y_ticket = self.fc6_2(y_ticket)
        y_ticket = self.fc7_2(y_ticket)
        y_ticket = self.fc8_2(y_ticket)
        y_ticket = self.ticket_type(y_ticket)
        
        y_urgency = F.dropout(self.fc2_3(x), p=0.3)
        y_urgency = self.fc3_3(y_urgency)
        y_urgency = self.fc4_3(y_urgency)
        y_urgency = self.fc5_3(y_urgency)
        y_urgency = self.fc6_3(y_urgency)
        y_urgency = self.fc7_3(y_urgency)
        y_urgency = self.fc8_3(y_urgency)
        y_urgency = self.urgency(y_urgency)
        
        
        y_impact = F.dropout(self.fc2_4(x), p=0.3)
        y_impact = self.fc3_4(y_impact)
        y_impact = self.fc4_4(y_impact)
        y_impact = self.fc5_4(y_impact)
        y_impact = self.fc6_4(y_impact)
        y_impact = self.fc7_4(y_impact)
        y_impact = self.fc8_4(y_impact)
        y_impact = self.impact(y_impact)
        
        if apply_softmax:
            y_category = F.softmax(y_category)
            y_ticket = F.softmax(y_ticket)
            y_urgency = F.softmax(y_urgency)
            y_impact = F.softmax(y_impact)
            
        return {'category': y_category,
               'ticket': y_ticket,
               'urgency': y_urgency,
               'impact': y_impact}

In [47]:
def compute_accuracy(y_pred, y_true):
    y_pred = y_pred.max(dim=1)[1]
    n_correct = torch.eq(y_pred, y_true).sum().item()
    accuracy = (n_correct / len(y_true)) * 100
    
    return accuracy

In [48]:
dataset = Dataset(word2idx)
classifier = Classifier(10, 2, 4, 5)

classifier = classifier.to(device)

In [49]:
loss_fun_category = nn.CrossEntropyLoss()
loss_fun_ticket = nn.CrossEntropyLoss()
loss_fun_urgency = nn.CrossEntropyLoss()
loss_fun_impact = nn.CrossEntropyLoss()

optimizer = optim.Adam(classifier.parameters(), lr = 1e-3, weight_decay=0.0001)

In [50]:
running_acc = 0
running_loss = 0

running_acc_val = 0
running_acc_cat_val = 0
running_acc_ticket_val = 0
running_acc_urgency_val = 0
running_acc_impact_val = 0

running_loss_val = 0

In [51]:
history_dict = {
    'loss_train': [],
    'acc_train': [],
    'loss_val': [],
    'acc_val': [],
}

In [None]:
for epoch in range(100):
    dataset.set_split('train')
    data_gen = DataLoader(dataset=dataset, shuffle=True, batch_size=2048)
    classifier.train()
    for batch_index, batch_dict in enumerate(data_gen , 1):
        # step 1
        optimizer.zero_grad()
        
        # step 2
        y_pred = classifier(batch_dict['x'].to(device))
        
        # step 3 loss
        loss_category = loss_fun_category(y_pred['category'], batch_dict['category'].to(device))
        loss_ticket = loss_fun_category(y_pred['ticket'], batch_dict['ticket'].to(device))
        loss_urgency = loss_fun_category(y_pred['urgency'], batch_dict['urgency'].to(device))
        loss_impact = loss_fun_category(y_pred['impact'], batch_dict['impact'].to(device))
        
        loss = loss_category + loss_ticket + loss_urgency + loss_impact
        running_loss += ((loss.item() / 4) - running_loss) / batch_index
        
        # step 4 accuracy
        acc_category = compute_accuracy(y_pred['category'], batch_dict['category'].to(device))
        acc_ticket = compute_accuracy(y_pred['ticket'], batch_dict['ticket'].to(device))
        acc_urgency = compute_accuracy(y_pred['urgency'], batch_dict['urgency'].to(device))
        acc_impact = compute_accuracy(y_pred['impact'], batch_dict['impact'].to(device))
        
        accuracy = (acc_category + acc_ticket + acc_urgency + acc_impact) / 4
        running_acc += (accuracy - running_acc) / batch_index
        
        # step 6 backward
        loss.backward()
        
        # step 6 step
        optimizer.step()
    
    dataset.set_split('test')
    data_gen = DataLoader(dataset=dataset, shuffle=True, batch_size=2048)
    classifier.eval()
    for batch_index, batch_dict in enumerate(data_gen , 1):
        # step 1
        y_pred = classifier(batch_dict['x'].to(device))
        
        # step 2
        # step 3 loss
        loss_category_val = loss_fun_category(y_pred['category'], batch_dict['category'].to(device))
        loss_ticket_val = loss_fun_category(y_pred['ticket'], batch_dict['ticket'].to(device))
        loss_urgency_val = loss_fun_category(y_pred['urgency'], batch_dict['urgency'].to(device))
        loss_impact_val = loss_fun_category(y_pred['impact'], batch_dict['impact'].to(device))
        
        loss_val = loss_category_val + loss_ticket_val + loss_urgency_val + loss_impact_val
        running_loss_val += ((loss_val.item() / 4) - running_loss_val) / batch_index
        
        # step 3
        acc_category_val = compute_accuracy(y_pred['category'], batch_dict['category'].to(device))
        acc_ticket_val = compute_accuracy(y_pred['ticket'], batch_dict['ticket'].to(device))
        acc_urgency_val = compute_accuracy(y_pred['urgency'], batch_dict['urgency'].to(device))
        acc_impact_val = compute_accuracy(y_pred['impact'], batch_dict['impact'].to(device))
        
        accuracy_val = (acc_category_val + acc_ticket_val + acc_urgency_val + acc_impact_val) / 4
        running_acc_val += (accuracy_val - running_acc_val) / batch_index
        
        running_acc_cat_val += (acc_category_val - running_acc_cat_val) / batch_index
        running_acc_ticket_val += (acc_ticket_val - running_acc_ticket_val) / batch_index
        running_acc_urgency_val += (acc_urgency_val - running_acc_urgency_val) / batch_index
        running_acc_impact_val += (acc_impact_val - running_acc_impact_val) / batch_index
    
    print("epoch {}/{} loss: {:.2f} accuracy: {:.2f} loss_val: {:.2f} accuracy_val: {:.2f} \n \
          [cat:{:.2f} ticket:{:.2f} urgency: {:.2f} impact: {:.2f}]". format(
        epoch, 100, running_loss, running_acc, running_loss_val, running_acc_val,
        running_acc_cat_val, running_acc_ticket_val, running_acc_urgency_val, running_acc_impact_val
    ))
    history_dict['loss_train'].append(running_loss)
    history_dict['acc_train'].append(running_acc)
    history_dict['loss_val'].append(running_loss_val)
    history_dict['acc_val'].append(running_acc_val)

epoch 0/100 loss: 4306.88 accuracy: 38.46 loss_val: 1061.86 accuracy_val: 42.51 
           [cat:10.19 ticket:37.00 urgency: 59.67 impact: 63.17]
epoch 1/100 loss: 762.35 accuracy: 53.46 loss_val: 266.15 accuracy_val: 53.15 
           [cat:43.63 ticket:56.59 urgency: 54.97 impact: 57.42]
epoch 2/100 loss: 118.45 accuracy: 52.85 loss_val: 49.46 accuracy_val: 55.99 
           [cat:44.69 ticket:58.83 urgency: 55.19 impact: 65.26]
epoch 3/100 loss: 28.02 accuracy: 55.86 loss_val: 19.62 accuracy_val: 56.68 
           [cat:41.77 ticket:64.60 urgency: 51.39 impact: 68.96]
epoch 4/100 loss: 13.17 accuracy: 56.45 loss_val: 10.32 accuracy_val: 50.84 
           [cat:37.27 ticket:44.91 urgency: 55.10 impact: 66.09]
epoch 5/100 loss: 7.27 accuracy: 57.13 loss_val: 4.90 accuracy_val: 60.97 
           [cat:56.85 ticket:67.77 urgency: 53.39 impact: 65.87]
epoch 6/100 loss: 3.57 accuracy: 58.93 loss_val: 2.76 accuracy_val: 59.71 
           [cat:51.38 ticket:68.65 urgency: 55.71 impact: 63.12]
epo