In [1]:
import os
import glob
import pandas as pd
import numpy as np
import seaborn as sns

from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

%matplotlib inline

In [2]:
# Directory (relative to the notebook) that is searched for the input file.
data_path = "data"

In [3]:
# glob returns matches in arbitrary, OS-dependent order; sort them so that
# files[0] refers to the same file on every run and on every machine.
files = sorted(glob.glob(os.path.join(data_path, '*')))

In [4]:
# Load the first entry of `files` as the raw ticket table.
d_data = pd.read_csv(files[0])

In [5]:
# (number of rows, number of columns) of the raw table.
d_data.shape

(48549, 9)

In [6]:
# Column names available in the raw table.
d_data.columns

Index(['title', 'body', 'ticket_type', 'category', 'sub_category1',
       'sub_category2', 'business_service', 'urgency', 'impact'],
      dtype='object')

In [7]:
# Class frequencies of the `category` label.
d_data['category'].value_counts()

4     34061
5      9634
6      2628
7       921
11      612
8       239
9       191
3       137
1        72
12       45
0         4
2         3
10        2
Name: category, dtype: int64

In [8]:
# Class frequencies of the `ticket_type` label.
d_data['ticket_type'].value_counts()

1    34621
0    13928
Name: ticket_type, dtype: int64

In [9]:
# Class frequencies of the `urgency` label.
d_data['urgency'].value_counts()

3    34621
1     6748
2     5528
0     1652
Name: urgency, dtype: int64

In [10]:
# Class frequencies of the `impact` label.
d_data['impact'].value_counts()

4    34621
3    13184
0      471
2      228
1       45
Name: impact, dtype: int64

In [11]:
# Remove the rows whose category is 0, 2 or 10 (each has at most four rows
# in the counts shown above).
d_data = d_data[~d_data.category.isin([0, 2, 10])].copy()

In [12]:
# Remaining category frequencies, ordered by category code.
d_data['category'].value_counts().sort_index()

1        72
3       137
4     34061
5      9634
6      2628
7       921
8       239
9       191
11      612
12       45
Name: category, dtype: int64

In [13]:
# Lookup tables between the remaining category codes and contiguous labels.
# NOTE: the names read backwards relative to their contents:
#   cat2new: contiguous label       -> original category code
#   new2cat: original category code -> contiguous label
cat2new = {position: code for position, code in enumerate(d_data.category.value_counts().sort_index().index)}
new2cat = {code: position for position, code in enumerate(d_data.category.value_counts().sort_index().index)}

In [14]:
# Replace the original category codes by their contiguous labels.
# NOTE: not idempotent -- running this cell a second time sends the already
# remapped labels through new2cat again.
d_data['category'] = d_data.category.map(new2cat)

## value encoding

Fill missing values (NaN) in the `title` column with empty strings.

In [15]:
# Missing titles become empty strings so they can be concatenated later.
d_data['title'] = d_data['title'].fillna('')

In [16]:
# Number of missing values per column.
d_data.isna().sum()

title               0
body                0
ticket_type         0
category            0
sub_category1       0
sub_category2       0
business_service    0
urgency             0
impact              0
dtype: int64

In [17]:
# Join title and body with a space: plain concatenation would glue the last
# word of the title to the first word of the body and create a bogus token.
# strip() removes the leading space left behind by empty titles.
d_data['title_body'] = (d_data['title'] + ' ' + d_data['body']).str.strip()

In [18]:
# Lower-case the text so that the vocabulary is case-insensitive.
d_data['title_body'] = d_data['title_body'].str.lower()

In [19]:
# Whole corpus as one space-separated string (input for the tokenizer).
text = " ".join(d_data['title_body'].tolist())

In [20]:
# Vocabulary: the distinct tokens of the whole corpus.
# NOTE(review): built before the train/test split, so it also contains
# tokens that occur only in test rows -- confirm this is intended.
word_set = set(word_tokenize(text))

In [21]:
# Indices start at 1; 0 stays unassigned because word_encoding falls back to
# 0 for tokens that are missing from the lookup.
# The vocabulary is sorted so that the word <-> index assignment is identical
# after every kernel restart (iteration order of a set of strings is not
# stable between interpreter sessions).
idx2word = dict(enumerate(sorted(word_set), 1))
word2idx = {word: index for index, word in idx2word.items()}

In [22]:
def word_encoding(sentence, encoding, unknown_index=0):
    """Convert a sentence into a list of integer token indices.

    Parameters
    ----------
    sentence : str
        Raw text; it is split into tokens with ``word_tokenize``.
    encoding : mapping
        Token -> index lookup (e.g. ``word2idx``).
    unknown_index : int, default 0
        Index emitted for tokens that are not present in ``encoding``.

    Returns
    -------
    list
        One index per token, in token order.
    """
    # An explicit lookup with a default replaces the former bare ``except:``,
    # which also hid unrelated errors (including KeyboardInterrupt).
    return [encoding.get(token, unknown_index) for token in word_tokenize(sentence)]

In [23]:
# Spot check: index assigned to the token 'hi'.
word2idx['hi']

9859

In [24]:
# Shuffle all rows before splitting. The fixed seed makes the shuffle, and
# therefore the train/test split derived from it, reproducible.
d_data = d_data.sample(frac=1, random_state=42)

In [25]:
# Keep only the text column and the four label columns.
d_data_selected = d_data[['title_body', 'category', 'ticket_type', 'urgency', 'impact']]

In [26]:
# Renumber the rows 0..n-1 and discard the old index.
d_data_selected = d_data_selected.reset_index(drop=True)

In [27]:
# Row and column count of the selected frame; `row` determines the split point.
# NOTE: `col` is reused as a loop variable further down and is overwritten there.
row, col = d_data_selected.shape

In [28]:
# 80/20 positional split. `test` keeps its original row labels, i.e. its
# index does not restart at 0.
train = d_data_selected.iloc[:int(row * 0.8)]
test = d_data_selected.iloc[int(row * 0.8):]

In [29]:
# Size of the training split.
train.shape

(38832, 5)

In [30]:
# Size of the test split.
test.shape

(9708, 5)

In [31]:
# Names of the four target columns.
col_label = [
    "category",
    "ticket_type",
    "urgency",
    "impact",
]

In [33]:
# Class distribution of every target column in the training split.
for col in col_label:
    print(col, train[col].value_counts().sort_index(), '\n', sep='\n')

category
0       50
1      111
2    27276
3     7725
4     2076
5      734
6      195
7      147
8      483
9       35
Name: category, dtype: int64


ticket_type
0    11105
1    27727
Name: ticket_type, dtype: int64


urgency
0     1329
1     5354
2     4422
3    27727
Name: urgency, dtype: int64


impact
0      366
1       37
2      188
3    10514
4    27727
Name: impact, dtype: int64




In [34]:
# Class distribution of every target column in the test split.
for col in col_label:
    print(col, test[col].value_counts().sort_index(), '\n', sep='\n')

category
0      22
1      26
2    6785
3    1909
4     552
5     187
6      44
7      44
8     129
9      10
Name: category, dtype: int64


ticket_type
0    2820
1    6888
Name: ticket_type, dtype: int64


urgency
0     322
1    1392
2    1106
3    6888
Name: urgency, dtype: int64


impact
0     105
1       8
2      40
3    2667
4    6888
Name: impact, dtype: int64




In [35]:
# Number of tokens per training document. Storing the count (instead of the
# token list itself) yields a numeric series whose distribution can be plotted.
len_sent = train.title_body.apply(lambda sentence: len(word_tokenize(sentence)))

In [None]:
# Plot the distribution of len_sent.
# NOTE(review): seaborn deprecated distplot in v0.11 in favour of
# histplot/displot -- consider migrating.
sns.distplot(len_sent)

In [None]:
class Dataset():
    """Map-style dataset over the notebook's ``train`` / ``test`` frames.

    Every item is a dict with the fixed-length word-index sequence of
    ``title_body`` under ``'x'`` and the labels ``'category'``,
    ``'ticket_type'``, ``'urgency'`` and ``'impact'``.

    Parameters
    ----------
    word2idx : mapping
        Token -> index lookup that is handed to ``word_encoding``.
    max_len : int, default 100
        Length every ``'x'`` sequence is truncated or padded to.
    """

    def __init__(self, word2idx, max_len=100):
        # Keep the lookup on the instance; previously the argument was ignored
        # and __getitem__ read a global that happened to share its name.
        self.word2idx = word2idx
        self.max_len = max_len
        self.lookup = {
            'train': (train, len(train)),
            'test': (test, len(test))
        }

        self.set_split('train')

    def set_split(self, split='train'):
        """Select the split (``'train'`` or ``'test'``) that items are read from."""
        self.data, self.length = self.lookup[split]

    @staticmethod
    def set_length(x, max_len=100):
        """Return a copy of ``x`` that is exactly ``max_len`` items long.

        Longer sequences are truncated, shorter ones are right-padded with 0.
        """
        fixed = list(x[:max_len])
        fixed.extend([0] * (max_len - len(fixed)))
        return fixed

    def __getitem__(self, index):
        """Return the encoded text and the labels of the row at position ``index``."""
        # Positional access (.iloc): the test split keeps the row labels it had
        # before the split, so a label lookup such as .loc[0] fails there.
        def field(column):
            return self.data[column].iloc[index]

        x = word_encoding(field('title_body'), self.word2idx)
        x = self.set_length(x, self.max_len)

        return {
            'x': x,
            'category': field('category'),
            'ticket_type': field('ticket_type'),
            'urgency': field('urgency'),
            'impact': field('impact')
        }

    def __len__(self):
        """Number of rows in the currently selected split."""
        return self.length

In [None]:
class Classifier(nn.Module):
    """Shared fully connected trunk with one linear output head per label.

    Parameters
    ----------
    len_category, len_ticket_type, len_urgency, len_impact : int
        Number of classes of the respective output head.
    input_dim : int, default 200
        Width of the feature vectors fed into the first linear layer.
    """

    def __init__(self, len_category, len_ticket_type, len_urgency, len_impact, input_dim=200):
        super(Classifier, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 150),
            nn.ReLU(),
            nn.Linear(150, 150),
            nn.ReLU(),
            nn.Linear(150, 100),
            nn.ReLU()
        )

        self.category = nn.Linear(100, len_category)
        self.ticket_type = nn.Linear(100, len_ticket_type)
        self.urgency = nn.Linear(100, len_urgency)
        self.impact = nn.Linear(100, len_impact)

    def forward(self, input_, apply_softmax=False):
        """Run the trunk and all four heads.

        Parameters
        ----------
        input_ : torch.Tensor
            Float tensor whose last dimension equals ``input_dim``.
        apply_softmax : bool, default False
            If True, return class probabilities instead of raw logits. Keep
            it False when the outputs are passed to ``nn.CrossEntropyLoss``,
            which expects logits.

        Returns
        -------
        tuple of torch.Tensor
            ``(category, ticket_type, urgency, impact)`` outputs.
        """
        x = self.fc(input_)
        outputs = (
            self.category(x),
            self.ticket_type(x),
            self.urgency(x),
            self.impact(x),
        )

        if apply_softmax:
            # Normalise over the class dimension (the last one). torch.softmax
            # is used because no functional alias is imported in this file.
            outputs = tuple(torch.softmax(logits, dim=-1) for logits in outputs)

        # The original forward computed the heads but never returned them.
        return outputs

In [None]:
# Head sizes follow the label counts printed above: 10 categories,
# 2 ticket types, 4 urgency levels and 5 impact levels.
dataset = Dataset(word2idx=word2idx)
classifier = Classifier(
    len_category=10,
    len_ticket_type=2,
    len_urgency=4,
    len_impact=5,
)

In [None]:
# One independent cross-entropy criterion per output head.
loss_fun_category, loss_fun_ticket, loss_fun_urgency, loss_fun_impact = (
    nn.CrossEntropyLoss() for _ in range(4)
)

# A single Adam optimizer updates the trunk and all heads.
optimizer = optim.Adam(params=classifier.parameters(), lr=1e-3)

In [None]:
# Inspect the first sample of the active split.
dataset[0]

In [None]:
# Smoke-test the forward pass without tracking gradients. forward() needs an
# input batch, so feed one all-zero sample whose width matches the first
# linear layer of the network.
with torch.no_grad():
    classifier(torch.zeros(1, classifier.fc[0].in_features))