Note: this notebook is meant to be run on Google Colab, using a runtime with a GPU and the High-RAM option enabled.

In [1]:
# Mount Google Drive at /content/gdrive; the data set and the saved model both live under this mount.
# Colab asks for an authorization code the first time this runs.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


## Set Up

### Imports

In [2]:
# Imports taken from DNNModelAnalysis.ipynb
import pandas as pd
import numpy as np
import random
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split

# For examining results
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import seaborn as sn

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout, Flatten, BatchNormalization
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

  import pandas.util.testing as tm


In [0]:
# Needed to serialize the model
import pickle

### Common Functions

In [0]:
def plot_training_history(history, model, test_data, test_labels, eval_images=False):
    """Plot training/validation curves and report test-set performance.

    Draws two side-by-side panels (accuracy and loss per epoch) from the
    training history, then evaluates ``model`` on the supplied test set and
    prints the resulting loss and accuracy.

    Args:
        history: Object whose ``history`` dict holds the per-epoch series
            'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
        model: Object exposing ``evaluate(data, labels, verbose=...)`` that
            returns a ``(loss, accuracy)`` pair.
        test_data: Test inputs. If ``None`` or empty, evaluation is skipped.
        test_labels: Test targets matching ``test_data``.
        eval_images: Unused; kept so existing callers keep working.

    Returns:
        None. The figure is shown and the metrics are printed.
    """
    figure, (accuracy_ax, loss_ax) = plt.subplots(1, 2)

    # Both panels share the same layout; only the plotted metric differs.
    for ax, metric in ((accuracy_ax, 'accuracy'), (loss_ax, 'loss')):
        ax.plot(history.history[metric])
        ax.plot(history.history['val_' + metric])
        ax.set_title('model ' + metric)
        ax.set_ylabel(metric)
        ax.set_xlabel('epoch')
        ax.legend(['training', 'validation'], loc='best')

    figure.tight_layout()
    plt.show()

    # Evaluating on an empty test set raises inside the model, so skip it
    # when the caller only wants the curves.
    no_test_data = test_data is None or (
        hasattr(test_data, '__len__') and len(test_data) == 0
    )
    if no_test_data:
        print('No test data supplied; skipping evaluation.')
        return

    loss, accuracy = model.evaluate(test_data, test_labels, verbose=0)

    print(f'Test loss: {loss:.3}')
    print(f'Test accuracy: {accuracy:.3}')

### Formatting

In [0]:
# Pandas display settings: cap the number of printed rows and widen text
# columns so long text cells are not truncated too early.
# Fully-qualified option names are used because pandas resolves shorthand keys
# by pattern matching, which can break if a similarly named option is added.
pd.set_option('display.max_rows', 60)
pd.set_option('display.min_rows', None)
pd.set_option('display.max_colwidth', 150)

## Load and Prepare Data

In [0]:
# Pre-processed data set; the "Increase size of data set" section of
# DNNModelAnalysis.ipynb describes how it was produced.
processed_csv_path = '/content/gdrive/My Drive/Colab Notebooks/Octopus2/JIRA_OPEN_DATA_LARGESET_PROCESSED.csv'
df3 = pd.read_csv(processed_csv_path)

In [0]:
# Encode each priority name as an integer class id; the id is also the
# position of that name in name_labels.
priority_to_label = {'Optional': 0, 'Trivial': 1, 'Minor': 2, 'Major': 3, 'Blocker': 4, 'Critical': 5}
df3['labels'] = df3['priority'].map(priority_to_label)

# Series.map yields NaN for any priority missing from the mapping. Fail fast
# with a clear message instead of letting NaN reach to_categorical.
unmapped_priorities = df3.loc[df3['labels'].isna(), 'priority'].unique()
if len(unmapped_priorities) > 0:
    raise ValueError(f'Unmapped priority values: {list(unmapped_priorities)}')

Y1 = df3['labels'].values

In [0]:
# The full data set is used for training, so no train/test split is made here.
Ytrain1 = Y1
df_train1 = df3['features']

In [0]:
# Convert sentences to sequences
# Vocabulary cap handed to the tokenizer as num_words
MAX_VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
# Build the word index from the training text first, then encode each text as a list of word indices
tokenizer.fit_on_texts(df_train1)
sequences_train = tokenizer.texts_to_sequences(df_train1)

In [10]:
# get word -> integer mapping
word2idx = tokenizer.word_index
# V is the size of the full word index; it is not capped by MAX_VOCAB_SIZE
# (the count printed below is larger than that cap)
V = len(word2idx)
print('Found %s unique tokens.' % V)

Found 59814 unique tokens.


In [11]:
# pad sequences so that we get a N x T matrix
# No maxlen is passed, so the width is determined by the sequences themselves
data_train1 = pad_sequences(sequences_train)
print('Shape of data train tensor:', data_train1.shape)

# get sequence length
# T is reused as the model's input length and as maxlen when padding new text
T = data_train1.shape[1]

Shape of data train tensor: (41513, 9524)


In [0]:
# One class per priority level (integer labels 0-5 from the priority mapping)
num_classes = 6
# One-hot encode the integer labels, as expected by the categorical_crossentropy loss used in training
training_labels1 = to_categorical(Ytrain1, num_classes)

In [0]:
# Human-readable class names: position i holds the name for integer label i,
# so the order must stay in sync with the priority -> label mapping.
name_labels = [
    'Optional',
    'Trivial',
    'Minor',
    'Major',
    'Blocker',
    'Critical'
]

## Training the Model

In [22]:
D = 20  # embedding dimensionality

# Tokenizer(num_words=MAX_VOCAB_SIZE) only emits indices below MAX_VOCAB_SIZE
# (0 is the padding value), so the embedding table needs at most
# MAX_VOCAB_SIZE rows. Sizing it from the full word index (V + 1) would
# allocate rows that can never be looked up.
embedding_rows = min(V + 1, MAX_VOCAB_SIZE)

# Stop once validation loss has not improved for 5 epochs and roll back to the best weights.
early_stopper = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


def _add_conv_stack(net, filters, depth):
    """Append `depth` Conv1D(filters, 3, relu) layers, each followed by BatchNormalization."""
    for _ in range(depth):
        net.add(Conv1D(filters, 3, activation='relu'))
        net.add(BatchNormalization())


model = Sequential()
model.add(Input(shape=(T,)))
model.add(Embedding(embedding_rows, D))

# Convolutional feature extractor: 6 x 32 filters, 4 x 64 filters, 2 x 128 filters,
# with max-pooling between the stages and global max-pooling at the end.
_add_conv_stack(model, 32, 6)
model.add(MaxPooling1D(3))

_add_conv_stack(model, 64, 4)
model.add(MaxPooling1D(3))

_add_conv_stack(model, 128, 2)
model.add(GlobalMaxPooling1D())

# Fully connected classifier head; each hidden layer is followed by batch norm and dropout.
for hidden_units in (800, 400, 200, 100):
    model.add(Dense(units=hidden_units, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
model.add(Dense(units=num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# The last 20% of the training data is held out for validation and drives early stopping.
history = model.fit(data_train1, training_labels1, epochs=40, batch_size=8, verbose=True, validation_split=.2, callbacks=[early_stopper])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40


## Save the model

In [25]:
# Persist the trained network in TensorFlow SavedModel format.
model.save("/content/gdrive/My Drive/Colab Notebooks/Octopus2/jira_open_data_classifier.model", save_format='tf')

# The saved network alone cannot score raw text: inference has to reuse the
# same fitted tokenizer and the same padded length T that were used in
# training. Store both next to the model so it can be reloaded and used.
# NOTE: only unpickle this file from a trusted location; pickle.load can execute arbitrary code.
with open("/content/gdrive/My Drive/Colab Notebooks/Octopus2/jira_open_data_classifier.tokenizer.pickle", "wb") as tokenizer_file:
    pickle.dump({'tokenizer': tokenizer, 'max_sequence_length': T}, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: /content/gdrive/My Drive/Colab Notebooks/Octopus2/jira_open_data_classifier.model/assets


## Test the Model

In [0]:
# Single example sentence used to sanity-check the trained model
test_sentance = ['The hard coded host of the client can only let it run on the same host as the thrift server.']

In [0]:
# Encode the example with the tokenizer fitted on the training text so its word indices match those the model was trained on
test_seq = tokenizer.texts_to_sequences(test_sentance)

In [28]:
# Inspect the encoded word indices
print(test_seq)

[[1, 1485, 3042, 1686, 10, 1, 332, 32, 130, 1445, 28, 26, 21, 1, 144, 1686, 36, 1, 2536, 103]]


In [29]:
# Pad to the training sequence length T, which is the input length the model was built with
test_padded = pad_sequences(test_seq, maxlen=T)
print(test_padded)

[[   0    0    0 ...    1 2536  103]]


In [30]:
# prediction
# Sequential.predict_classes was removed in TensorFlow 2.6. Taking the argmax
# over the softmax output is the replacement for a multi-class model.
p = np.argmax(model.predict(test_padded), axis=-1)
print(p)

[2]


In [31]:
# Translate the predicted class index into its priority name
name_labels[p[0]]

'Minor'