In [2]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import keras
from keras.models import Model
from keras import optimizers
from keras.layers import Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Input, concatenate, Dropout, Reshape
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.metrics import precision_recall_fscore_support as fscore

Using TensorFlow backend.


In [3]:
# Notebook-wide configuration.
#
# Token separator: any single whitespace character or one of the listed
# punctuation marks. Written as a raw string so that `\s` and `\-` reach the
# regex engine untouched instead of being (invalid) Python string escapes.
# `\s` already covers newline, carriage return and tab.
# NOTE: the identifier's spelling is kept as-is because other cells use it.
CLEANING_PATTERSN = re.compile(r"[\s.,:;\-_'\"?!#&()*]")
LSTM_HIDDEN_SIZE = 200   # units in the LSTM layer
MAX_TIME = 30            # tokens per comment after padding/truncation
VOCAB_SIZE = 10000       # input dimension of the embedding layer
DROPOUT = 0.2            # dropout rate passed to the LSTM layer
LEARNING_RATE = 0.0001   # optimizer learning rate
NUM_EPOCHS = 100         # epochs requested from model.fit
BATCH_SIZE = 2000        # batch size for fit and predict
FILE_TYPE = 'all'        # selects the feature CSV and the label layout

In [4]:
# Read the tab-separated comment table and the feature table that matches
# FILE_TYPE, then preview the first rows of the feature table.
Z = pd.read_csv(
    'DATA/GENERATED/TRAIN/Z_CONCATED_commentType.csv',
    sep='\t',
)
FEATS = pd.read_csv(
    'DATA/GENERATED/TRAIN/CONCATED_commentType_' + FILE_TYPE + '.csv'
)
FEATS.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,ProgramDomain,ProjectManagement,ProblemDomain,Index
0,0.75,0.05,0.05,0.77,0.23,0.05,0.05,0.05,0.41,0.05,0.05,0.05,0,0,0,0
1,9.15,3.65,0.05,2.21,0.59,0.05,0.05,0.05,3.61,0.05,0.05,0.05,1,0,0,1
2,5.65,3.65,0.05,0.13,1.67,0.05,0.05,0.05,15.727505,0.05,0.05,0.05,1,0,1,2
3,3.55,0.05,0.05,0.77,0.23,0.05,0.05,0.05,0.41,0.05,0.05,0.05,0,0,1,3
4,0.75,0.05,0.05,0.77,0.23,0.05,0.05,0.05,0.41,0.05,0.05,0.05,0,0,0,4


In [8]:
# Column 'F2' of the comment table is the text that gets tokenised later on.
comments = np.array(Z['F2'])

# Pick the twelve numeric features by NAME rather than by position: the
# preview of FEATS lists a leading 'Unnamed: 0' column, so a positional
# `[:, :12]` slice can pull in that column and drop 'F12'.
# NOTE(review): assumes every FILE_TYPE variant has columns F1..F12 -- confirm.
FEATURE_COLUMNS = ['F%d' % i for i in range(1, 13)]
X = np.array(FEATS[FEATURE_COLUMNS])

if FILE_TYPE == 'all':
    # Multi-label target: one 0/1 column per category.
    Y = np.array(FEATS[['ProgramDomain','ProjectManagement','ProblemDomain']])
else:
    # Single target column named 'Class'.
    Y = np.array(FEATS['Class'])

In [9]:
# Break every comment into tokens and collect, along the way:
#   sentences - the token list of each comment, in input order
#   ctr       - how many comments have a given token count
#   mp        - token count -> list of the token lists with that count
ctr = Counter()
mp = {}
sentences = []
for comment in comments:
    # Splitting on the separator class yields empty strings between adjacent
    # separators; those are discarded.
    sent = [piece.strip() for piece in CLEANING_PATTERSN.split(comment) if piece != '']
    sentences.append(sent)
    ctr[len(sent)] += 1
    mp.setdefault(len(sent), []).append(sent)

In [10]:
# Rebind `ctr` to a word-frequency table over all tokenised comments
# (this replaces the token-count histogram built in the previous cell).
ctr = Counter(word for sent in sentences for word in sent)

In [11]:
# VOCAB_SIZE = len(ctr)+1
# VOCAB = ['<UNK>']
# for el in ctr:
#     VOCAB.append(el)

In [12]:
# Fit the word -> integer-id mapping on the tokenised comments.
# num_words caps the ids emitted by texts_to_sequences so they always fit the
# embedding table, which is sized with VOCAB_SIZE; without the cap a corpus
# with more distinct words than VOCAB_SIZE would produce out-of-range ids.
# word_index itself still lists every word seen.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)

In [13]:
# Number of distinct words the tokenizer has indexed.
len(tokenizer.word_index)

7732

In [14]:
# Turn each token list into integer ids and bring every sequence to exactly
# MAX_TIME entries; padding='post' places the fill values after the tokens.
train_sent = pad_sequences(
    tokenizer.texts_to_sequences(sentences),
    padding='post',
    maxlen=MAX_TIME,
)

In [15]:
# For the 'all' file the labels are already a 0/1 matrix and are used as-is;
# otherwise the class ids are one-hot encoded.
train_y = Y if FILE_TYPE == 'all' else to_categorical(Y)
print(train_y.shape)

(12774, 3)


In [16]:
# Hold out the last 10% of the rows as the test set (no shuffling).
#
# Guard first: features, labels and token sequences come from two different
# CSV files and must line up row for row. The check also catches re-running
# this cell, which would otherwise slice the already-sliced train_y/train_sent
# a second time and leave the test arrays empty.
if not (len(X) == len(train_y) == len(train_sent)):
    raise ValueError(
        "Row count mismatch before split: X=%d, train_y=%d, train_sent=%d. "
        "Re-run the cells that build train_y and train_sent before splitting."
        % (len(X), len(train_y), len(train_sent))
    )

NUM_TRAIN = int(0.9*len(X))
print(NUM_TRAIN)
train_x = X[:NUM_TRAIN]
test_x = X[NUM_TRAIN:]
# train_y / train_sent are rebound to their training slices here.
train_y, test_y = train_y[:NUM_TRAIN], train_y[NUM_TRAIN:]
train_sent, test_sent = train_sent[:NUM_TRAIN], train_sent[NUM_TRAIN:]
print(train_x.shape, train_y.shape, train_sent.shape, test_x.shape, test_y.shape, test_sent.shape)

11496
(11496, 12) (11496, 3) (11496, 30) (1278, 12) (1278, 3) (1278, 30)


In [57]:
# Model

In [23]:
def build_model(num_extra_feats=12, num_classes=3):
    """Build and compile the two-input comment classifier.

    The model embeds a padded token-id sequence (length MAX_TIME, vocabulary
    VOCAB_SIZE, 100-dimensional embeddings), runs it through an LSTM with
    LSTM_HIDDEN_SIZE units, concatenates the LSTM's final hidden state with
    the hand-crafted feature vector, and feeds the result through a 64-unit
    ReLU layer followed by a sigmoid output layer.

    Args:
        num_extra_feats: width of the hand-crafted feature input (default 12).
        num_classes: number of sigmoid output units (default 3).

    Returns:
        A compiled keras Model taking ``[token_ids, extra_feats]`` as inputs,
        trained with binary cross-entropy and RMSprop at LEARNING_RATE.
    """
    sent_input = Input(shape=(MAX_TIME,))
    extracted_feats = Input(shape=(num_extra_feats,))
    embeddingLayer = Embedding(VOCAB_SIZE, 100, input_length=MAX_TIME,  trainable=True)
    sent = embeddingLayer(sent_input)
    # return_state=True yields (output, hidden state, cell state); only the
    # final hidden state is used downstream.
    _, h1, c1 = LSTM(LSTM_HIDDEN_SIZE,dropout=DROPOUT,return_state=True)(sent)
    print(h1.shape)
    # Concatenate the sentence encoding with the hand-crafted features.
    feature_vector = concatenate([h1,extracted_feats],axis=1)
    print(feature_vector.shape)
    probs = Dense(64,activation='relu')(feature_vector)
    print(probs.shape)
    # Independent sigmoid per output unit (multi-label setup).
    probs = Dense(num_classes,activation='sigmoid')(probs)
    print(probs.shape)
    model = Model(inputs=[sent_input,extracted_feats],outputs=probs)
    # Use the RMSprop class itself: the lowercase `optimizers.rmsprop` alias
    # is not available in every Keras release.
    rmsprop = optimizers.RMSprop(lr=LEARNING_RATE)
    model.compile(loss='binary_crossentropy',
                 optimizer=rmsprop,
                 metrics=['binary_accuracy','categorical_accuracy'])
    return model

In [24]:
# Instantiate the network; build_model prints the intermediate tensor shapes.
model = build_model()

(?, 200)
(?, 212)
(?, 64)
(?, 3)


In [26]:
# Train on the first 90% of the rows; the held-out rows are passed as
# validation data so per-epoch metrics are reported on them as well.
model.fit(
    x=[train_sent, train_x],
    y=train_y,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    verbose=1,
    validation_data=([test_sent, test_x], test_y),
)

Train on 11496 samples, validate on 1278 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100


Epoch 37/100
Epoch 38/100
Epoch 39/100

KeyboardInterrupt: 

In [33]:
# Score the held-out rows, then turn the scores into hard decisions:
# a 0.5 cut-off per output for the multi-label 'all' file, otherwise the
# index of the highest-scoring output.
predictions = model.predict([test_sent, test_x], batch_size=BATCH_SIZE)
predictions = (
    (predictions > 0.5).astype(int)
    if FILE_TYPE == 'all'
    else predictions.argmax(axis=1)
)

In [40]:
# Per-class precision, recall, F-score and support on the held-out rows.
# One-hot targets are collapsed back to class ids unless the labels are the
# multi-label matrix of the 'all' file.
fs = fscore(
    test_y if FILE_TYPE == 'all' else test_y.argmax(axis=1),
    predictions,
)
fs

(array([0.87406015, 0.90740741, 0.82517483]),
 array([0.93467337, 0.33333333, 0.48163265]),
 array([0.90335114, 0.48756219, 0.60824742]),
 array([995, 147, 245]))

In [25]:
# Persist the model to an HDF5 file named after FILE_TYPE.
# NOTE(review): this cell's recorded execution count (25) precedes the fit
# cell (26), so the file written in that session may hold an untrained model;
# re-run after training.
model.save('model_'+FILE_TYPE+".h5")