### 1. Download Tickets Data

In [None]:
#Download file using wget
!wget https://privdatastorage.blob.core.windows.net/github/support-tickets-classification/datasets/all_tickets.csv --quiet

In [None]:
#List the working directory - all_tickets.csv should appear here if the download succeeded
!ls -l

### 2. Tickets Data Exploration

Load tickets data as dataframe

In [None]:
import pandas as pd
import numpy as np
import textwrap

In [None]:
# Load the downloaded comma-separated file into a DataFrame.
# The path is relative to the notebook's working directory - adjust it if the file lives elsewhere.
tix_df = pd.read_csv('all_tickets.csv')

In [None]:
#Size of the dataframe as (rows, columns); presumably one row per ticket
tix_df.shape

In [None]:
#Column labels of the dataframe
tix_df.columns

In [None]:
#Look at 5 randomly chosen rows (no seed is passed, so the rows differ from run to run)
tix_df.sample(n=5)

Checking ticket content

In [None]:
#Text wrapper with default settings; used in the next cell to print a ticket body as wrapped lines
my_wrap = textwrap.TextWrapper()

In [None]:
#Show the title and body of one randomly chosen ticket.
#The row is selected by position (iloc), so this also works when the index is not 0..n-1.
tix_num = np.random.randint(0, tix_df.shape[0])
ticket = tix_df.iloc[tix_num]
print('Title: ')
print(ticket['title'])
print('Body: ')
#A missing body is a NaN float, which textwrap cannot wrap - treat it as empty text
body_text = '' if pd.isna(ticket['body']) else str(ticket['body'])
for line in my_wrap.wrap(body_text):
    print(line)

In [None]:
#Count the missing values in each column
tix_df.isnull().sum()

Visualization

In [None]:
#How many tickets there are of each ticket type (horizontal bars)
tix_df['ticket_type'].value_counts().plot.barh()

In [None]:
#How many tickets there are in each category (horizontal bars)
tix_df['category'].value_counts().plot.barh()

In [None]:
#How many tickets there are for each impact value (horizontal bars)
tix_df['impact'].value_counts().plot.barh()

In [None]:
#How many tickets there are for each urgency value (horizontal bars)
tix_df['urgency'].value_counts().plot.barh()

In [None]:
#How many tickets there are in each sub-category 1 value; the figure is enlarged to fit the bars
tix_df['sub_category1'].value_counts().plot.barh(figsize=(20,15))

In [None]:
#Sub-category 1 for a specific category (here: category 4).
#isin() does not coerce types: a list holding only the string '4' matches no rows when pandas
#parsed 'category' as integers, so both the integer and the string form are listed.
in_category = tix_df['category'].isin([4, '4'])
tix_df.loc[in_category, 'sub_category1'].value_counts().plot(kind='barh')

### 3. Create Training & Test Dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
#Build a new column which combines 'title' and 'body'.
#- A space separator keeps the last word of the title from fusing with the first word of the body.
#- Missing values become empty strings; otherwise a missing title turns into the literal text 'nan'
#  and a missing body makes the whole combined value NaN.
tix_df['title_body'] = (tix_df['title'].fillna('').astype('str') + ' ' +
                        tix_df['body'].fillna('').astype('str'))

#Column to predict
column_to_predict = 'ticket_type'

In [None]:
# Hold out part of the tickets for testing; the fixed random_state keeps the split repeatable
trainX, testX, trainY, testY = train_test_split(
    tix_df['title_body'],
    tix_df[column_to_predict],
    random_state=2,
)

In [None]:
#Training data: shape of the text column, then shape of the labels
print(trainX.shape, trainY.shape, sep='\n')

In [None]:
#Test data: shape of the text column, then shape of the labels
print(testX.shape, testY.shape, sep='\n')

### 4. Tokenization & Vectorization

In [None]:
import tensorflow as tf

In [None]:
#Build Tokenizer
#num_words=10000 caps the vocabulary that is used when texts are later converted to index sequences.
#The tokenizer is fitted on the training text only, so the test tickets do not influence the vocabulary.
t = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
t.fit_on_texts(trainX)

In [None]:
#Map every ticket to its list of word indexes with the fitted tokenizer - train first, then test
trainX_index, testX_index = (t.texts_to_sequences(texts) for texts in (trainX, testX))

In [None]:
#Tickets contain different numbers of words - compare the lengths of two training sequences
tuple(len(trainX_index[pos]) for pos in (100, 500))

Padding

In [None]:
max_length = 100 #this can be changed
#Bring every sequence to exactly max_length entries: pad at the front, cut overlong ones at the end
pad_options = dict(maxlen=max_length, padding='pre', truncating='post')
trainX_index = tf.keras.preprocessing.sequence.pad_sequences(trainX_index, **pad_options)
testX_index = tf.keras.preprocessing.sequence.pad_sequences(testX_index, **pad_options)

In [None]:
#Shape of the padded training matrix: one row per training ticket, max_length columns
trainX_index.shape

### 5. Building a Ticket Classifier

In [None]:
#Reset the Keras global state so that re-running the cells below starts from a fresh model
tf.keras.backend.clear_session()

In [None]:
#Build the model as a stack of layers
model = tf.keras.Sequential([
    #Explicit input: one row of max_length word indexes per ticket.
    #This replaces the Embedding `input_length` argument, which newer Keras releases deprecate.
    tf.keras.Input(shape=(max_length,)),

    #Trainable word-embedding layer. The vectors are learned from scratch while fitting;
    #this is not a pre-trained Word2Vec model.
    tf.keras.layers.Embedding(10001, #Vocab size +1
                              50),   #Embedding size

    #Add LSTM
    tf.keras.layers.LSTM(200),

    #Add dropout layer
    tf.keras.layers.Dropout(0.4),

    #Add output layer - this is for ticket_type which has only 2 classes
    #For other columns, check number of classes which are more than 2 and
    #change output layer. Use softmax if number of classes is more than 2
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
#Compile the model
#binary_crossentropy matches the single sigmoid output unit; 'acc' reports accuracy while training
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
#Fit Model
#NOTE: the held-out test split is passed as validation data, so the per-epoch validation
#metrics are computed on the same tickets that are used for the final evaluation
model.fit(trainX_index, trainY, 
          validation_data=(testX_index, testY), 
          epochs=5, batch_size=64)

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [None]:
def print_confusion_matrix(testY, predicted_test_y):
    """Plot a confusion-matrix heatmap of true versus predicted labels.

    Args:
        testY: True labels (array-like; flattened to 1-D).
        predicted_test_y: Predicted labels for the same samples (array-like;
            a column vector is flattened to 1-D).

    The heatmap puts predicted labels on the rows and true labels on the
    columns, and is shown immediately with plt.show().
    """
    y_true = np.asarray(testY).ravel()
    y_pred = np.asarray(predicted_test_y).ravel()

    #Thresholded predictions may arrive as booleans; express them as 0/1 integers
    #so they compare cleanly with integer class labels
    if y_true.dtype == bool:
        y_true = y_true.astype(int)
    if y_pred.dtype == bool:
        y_pred = y_pred.astype(int)

    #One label list drives both the matrix and the tick labels, so the ticks stay
    #aligned even when a class appears only in the predictions (or only in the truth)
    labels = np.unique(np.concatenate([y_true, y_pred]))
    mat = confusion_matrix(y_true, y_pred, labels=labels)

    plt.figure(figsize=(4, 4))
    sns.set()
    #confusion_matrix has true labels on rows; transpose so rows are predictions
    sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
                xticklabels=labels,
                yticklabels=labels)

    plt.xlabel('true label')
    plt.ylabel('predicted label')
    plt.show()

In [None]:
#Predict on the test dataset and show the confusion matrix
predicted_test_y = model.predict(testX_index)

#model.predict returns one sigmoid probability per ticket as a column vector.
#Threshold at 0.5, then flatten and cast to 0/1 integers so the predictions have
#the same form as the labels they are compared with.
predicted_test_y_binary = (predicted_test_y >= 0.5).astype(int).ravel()

print_confusion_matrix(testY, predicted_test_y_binary)