In [17]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import keras
from keras.models import Model
from keras import optimizers
from keras.layers import Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Input, concatenate, Dropout, Reshape
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.metrics import precision_recall_fscore_support as fscore
from sklearn.metrics.pairwise import cosine_similarity as CS
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

In [18]:
# Notebook-wide configuration.
# Token separator: whitespace plus common punctuation. Written as a raw string so
# the regex escapes (\s, \-, ...) reach the regex engine untouched instead of
# relying on Python passing through invalid string escapes (which emits a warning).
# NOTE: the identifier is misspelt, but other cells reference it under this name.
CLEANING_PATTERSN = re.compile(r"[\s\n\r\t.,:;\-_\'\"?!#&()*]")
LSTM_HIDDEN_SIZE = 200                  # number of units in the LSTM layer
MAX_TIME = 30                           # token sequence length after padding
VOCAB_SIZE = 10000                      # input dimension of the embedding layer
DROPOUT = 0.2                           # dropout rate passed to the LSTM layer
LEARNING_RATE = 0.0001                  # learning rate of the RMSprop optimizer
NUM_EPOCHS = 1                          # epochs per call to model.fit
BATCH_SIZE = 2000                       # batch size for fit and predict
FILE_TYPE = 'all'                       # selects the feature file; 'all' means multi-label targets
MIDDLE_LAYER_ACTIVATION = 'relu'        # activation of the hidden Dense layer
FINAL_LAYER_ACTIVATION = 'sigmoid'      # activation of the output Dense layer
K = 5                                   # number of cross-validation folds

In [19]:
# Resolve both input files first, then read them.
z_path = 'DATA/GENERATED/TRAIN/Z_CONCATED_commentType.csv'
feats_path = 'DATA/GENERATED/TRAIN/CONCATED_commentType_'+FILE_TYPE+'.csv'

Z = pd.read_csv(z_path, sep='\t')   # tab-separated table; its F2 column is read as the comment text later
FEATS = pd.read_csv(feats_path)     # numeric features plus label columns
FEATS.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,ProgramDomain,ProjectManagement,ProblemDomain,Index
0,0.75,0.05,0.05,0.77,0.23,0.05,0.05,0.05,0.41,0.05,0.05,0.05,0,0,0,0
1,9.15,3.65,0.05,2.21,0.59,0.05,0.05,0.05,3.61,0.05,0.05,0.05,1,0,0,1
2,5.65,3.65,0.05,0.13,1.67,0.05,0.05,0.05,15.727505,0.05,0.05,0.05,1,0,1,2
3,3.55,0.05,0.05,0.77,0.23,0.05,0.05,0.05,0.41,0.05,0.05,0.05,0,0,1,3
4,0.75,0.05,0.05,0.77,0.23,0.05,0.05,0.05,0.41,0.05,0.05,0.05,0,0,0,4


In [20]:
# Text of every sample, taken from column F2 of the Z table.
comments = np.array(Z['F2'])

# Pick the 12 numeric features by NAME rather than by position: a positional
# slice silently picks up any leading bookkeeping column (the displayed header
# starts with "Unnamed: 0") and would then drop F12.
FEATURE_COLUMNS = ['F%d' % n for n in range(1, 13)]
X = np.array(FEATS[FEATURE_COLUMNS])

if FILE_TYPE == 'all':
    # Multi-label targets: one 0/1 column per category.
    Y = np.array(FEATS[['ProgramDomain','ProjectManagement','ProblemDomain']])
else:
    # Single-label targets stored in a 'Class' column.
    Y = np.array(FEATS['Class'])

In [21]:
# Split each comment into tokens and keep simple length statistics:
#   ctr       -> how many sentences have a given token count
#   mp        -> token count -> list of sentences with that length
#   sentences -> every tokenized comment, in input order
ctr = Counter()
mp = {}
sentences = []
for comment in comments:
    pieces = CLEANING_PATTERSN.split(comment)
    sent = [piece.strip() for piece in pieces if piece != '']
    n_tokens = len(sent)
    sentences.append(sent)
    ctr[n_tokens] += 1
    mp.setdefault(n_tokens, []).append(sent)

In [22]:
# Rebind ctr to a word-frequency table over all tokenized sentences.
ctr = Counter()
for sent in sentences:
    ctr.update(sent)

In [23]:
# Cap the vocabulary at VOCAB_SIZE so that every id produced by
# texts_to_sequences is a valid row of the VOCAB_SIZE-wide embedding matrix,
# even if the corpus grows beyond that many distinct tokens.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)

In [24]:
# Number of distinct tokens indexed by the fitted tokenizer.
len(tokenizer.word_index)

7732

In [25]:
# Convert token lists to integer id sequences and bring every sequence to
# exactly MAX_TIME ids, padding at the end.
train_sent = pad_sequences(
    tokenizer.texts_to_sequences(sentences),
    maxlen=MAX_TIME,
    padding='post',
)

In [26]:
# Multi-label targets ('all') are used as they are; otherwise the class ids
# are one-hot encoded.
train_y = Y if FILE_TYPE == 'all' else to_categorical(Y)
print(train_y.shape)

(12774, 3)


In [27]:
# Hold out the last 10% of the samples (in file order) as the test split.
NUM_TRAIN = int(0.9*len(X))
print(NUM_TRAIN)

train_x, test_x = X[:NUM_TRAIN], X[NUM_TRAIN:]

# Take the tail before rebinding the name to the head of the same array.
test_y = train_y[NUM_TRAIN:]
train_y = train_y[:NUM_TRAIN]
test_sent = train_sent[NUM_TRAIN:]
train_sent = train_sent[:NUM_TRAIN]

print(*(part.shape for part in (train_x, train_y, train_sent, test_x, test_y, test_sent)))

11496
(11496, 12) (11496, 3) (11496, 30) (1278, 12) (1278, 3) (1278, 30)


In [28]:
def _as_fold_array(folds):
    """Pack a list of per-fold arrays into a single array indexable by fold.

    Equal-shaped folds are stacked into a regular array. Folds of different
    shapes are stored in a 1-D object array, built explicitly because NumPy no
    longer creates ragged object arrays implicitly.
    """
    if len({np.shape(fold) for fold in folds}) == 1:
        return np.array(folds)
    packed = np.empty(len(folds), dtype=object)
    for idx, fold in enumerate(folds):
        packed[idx] = fold
    return packed


def divide_into_k_folds(train_x, train_y, train_sent,k):
    """Split three aligned arrays into k contiguous folds.

    The first k-1 folds each hold ``len(train_x) // k`` samples; the last fold
    takes all remaining samples, so it may be larger than the others.

    Args:
        train_x: feature rows.
        train_y: targets aligned with ``train_x``.
        train_sent: token sequences aligned with ``train_x``.
        k: number of folds, at least 1.

    Returns:
        Three arrays (features, targets, sequences) whose first axis indexes
        the fold. They are regular arrays when all folds have the same shape
        and 1-D object arrays otherwise.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    each = len(train_x) // k
    # (start, stop) per fold; the final fold runs to the end of the data.
    bounds = [(i * each, (i + 1) * each) for i in range(k - 1)]
    bounds.append(((k - 1) * each, None))
    xs = [train_x[lo:hi] for lo, hi in bounds]
    ys = [train_y[lo:hi] for lo, hi in bounds]
    sents = [train_sent[lo:hi] for lo, hi in bounds]
    return _as_fold_array(xs), _as_fold_array(ys), _as_fold_array(sents)

# Rebind the training arrays to their K-fold partition (first axis = fold).
# NOTE: the same names are reused for input and output, so running this cell
# a second time would try to split the already-split data.
train_x, train_y, train_sent = divide_into_k_folds(train_x, train_y, train_sent, K)
print(train_x.shape)

def get_fold(train_x, train_y, train_sent,i,k):
    """Assemble the training data for fold ``i``: every fold except ``i``.

    Args:
        train_x: per-fold feature arrays (any sequence indexable by fold).
        train_y: per-fold target arrays.
        train_sent: per-fold token-sequence arrays.
        i: index of the fold to leave out.
        k: total number of folds.

    Returns:
        Tuple ``(x, y, sent)`` with the remaining folds concatenated along
        axis 0, in fold order.

    Raises:
        ValueError: if leaving out fold ``i`` leaves no fold to train on.
    """
    ids = [x for x in range(k) if x != i]
    print(i,k,ids)
    if not ids:
        raise ValueError("no training folds left when holding out fold %r of %r" % (i, k))

    def _join(folds):
        # Index fold by fold so lists, object arrays and stacked arrays all work.
        return np.concatenate([folds[j] for j in ids], axis=0)

    return _join(train_x), _join(train_y), _join(train_sent)

(5,)


In [29]:
# Model

In [30]:
def build_model(num_features=12, num_classes=3):
    """Build and compile the two-input classifier.

    One input is a padded token-id sequence of length MAX_TIME, which is
    embedded and fed to an LSTM. The LSTM's final hidden state is concatenated
    with the second input (the hand-crafted feature vector) and passed through
    a 64-unit Dense layer and an output Dense layer.

    Args:
        num_features: width of the hand-crafted feature input.
        num_classes: number of output units.

    Returns:
        A compiled Keras ``Model`` taking ``[sequences, features]`` as input.
    """
    sent_input = Input(shape=(MAX_TIME,))
    extracted_feats = Input(shape=(num_features,))
    embeddingLayer = Embedding(VOCAB_SIZE, 100, input_length=MAX_TIME,  trainable=True)
    sent = embeddingLayer(sent_input)
    # return_state=True yields (output, hidden state, cell state); only the
    # final hidden state is used downstream.
    _, h1, c1 = LSTM(LSTM_HIDDEN_SIZE,dropout=DROPOUT,return_state=True)(sent)
    print(h1.shape)
    # Concatenate the LSTM state with the extracted features.
    feature_vector = concatenate([h1,extracted_feats],axis=1)
    print(feature_vector.shape)
    probs = Dense(64,activation=MIDDLE_LAYER_ACTIVATION)(feature_vector)
    print(probs.shape)
    probs = Dense(num_classes,activation=FINAL_LAYER_ACTIVATION)(probs)
    print(probs.shape)
    model = Model(inputs=[sent_input,extracted_feats],outputs=probs)
    # Use the canonical class name; the lowercase `rmsprop` alias is not
    # available in every Keras release.
    rmsprop = optimizers.RMSprop(lr=LEARNING_RATE)
    model.compile(loss='binary_crossentropy',
                 optimizer=rmsprop,
                 metrics=['binary_accuracy','categorical_accuracy'])
    return model

In [31]:
def find_fs(model):
    """Score ``model`` on the module-level test split.

    Returns the result of ``precision_recall_fscore_support`` (imported as
    ``fscore``). For FILE_TYPE 'all' the outputs are thresholded at 0.5 and
    compared with the multi-label targets; otherwise both predictions and
    targets are reduced with argmax.
    """
    scores = model.predict([test_sent,test_x],batch_size=BATCH_SIZE)
    if FILE_TYPE == 'all':
        return fscore(test_y, np.where(scores > 0.5,1,0))
    return fscore(test_y.argmax(axis=1), scores.argmax(axis=1))

In [32]:
# Train one fresh model per fold: fit on the other K-1 folds, validate on the
# held-out fold, record test-split scores and save the weights.
MODELS, FSS = [], []
for k in range(K):
    print("-----------------Running Fold - ",k+1," of ",K,"-------------------")
    model = build_model()
    MODELS.append(model)

    curr_train_x, curr_train_y, curr_train_sent = get_fold(train_x, train_y, train_sent,k,K)
    print(curr_train_x.shape)

    held_out = ([train_sent[k], train_x[k]], train_y[k])
    model.fit(
        [curr_train_sent, curr_train_x],
        curr_train_y,
        epochs=NUM_EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=1,
        validation_data=held_out,
    )

    FSS.append(find_fs(model))
    model.save('model_'+FILE_TYPE+'_fold_'+str(k)+'.h5')

-----------------Running Fold -  1  of  5 -------------------
(?, 200)
(?, 212)
(?, 64)
(?, 3)
0 5 [1, 2, 3, 4]
(9197, 12)
Train on 9197 samples, validate on 2299 samples
Epoch 1/1
-----------------Running Fold -  2  of  5 -------------------
(?, 200)
(?, 212)
(?, 64)
(?, 3)
1 5 [0, 2, 3, 4]
(9197, 12)
Train on 9197 samples, validate on 2299 samples
Epoch 1/1
-----------------Running Fold -  3  of  5 -------------------
(?, 200)
(?, 212)
(?, 64)
(?, 3)
2 5 [0, 1, 3, 4]
(9197, 12)
Train on 9197 samples, validate on 2299 samples
Epoch 1/1
-----------------Running Fold -  4  of  5 -------------------
(?, 200)
(?, 212)
(?, 64)
(?, 3)
3 5 [0, 1, 2, 4]
(9197, 12)
Train on 9197 samples, validate on 2299 samples
Epoch 1/1
-----------------Running Fold -  5  of  5 -------------------
(?, 200)
(?, 212)
(?, 64)
(?, 3)
4 5 [0, 1, 2, 3]
(9196, 12)
Train on 9196 samples, validate on 2300 samples
Epoch 1/1


In [None]:
# TO CONTINUE TRAINING FOR MORE EPOCHS
# Each model keeps the same split it was first trained with: fit on every fold
# except k and validate on fold k. Fitting on fold k itself would train the
# model on exactly the data it is supposed to hold out.
for k in range(K):
    print("-----------------Running Fold - ",k+1," of ",K,"-------------------")
    model = MODELS[k]
    curr_train_x, curr_train_y, curr_train_sent = get_fold(train_x, train_y, train_sent,k,K)
    model.fit([curr_train_sent,curr_train_x],curr_train_y,epochs=NUM_EPOCHS,batch_size=BATCH_SIZE,verbose=1,
          validation_data=([train_sent[k], train_x[k]],train_y[k]))
    model.save('model_'+FILE_TYPE+'_fold_'+str(k)+'.h5')

In [None]:
# Predict on the test split with whichever model `model` currently refers to,
# then turn the raw scores into hard labels.
raw_scores = model.predict([test_sent,test_x],batch_size=BATCH_SIZE)
predictions = np.where(raw_scores > 0.5,1,0) if FILE_TYPE == 'all' else raw_scores.argmax(axis=1)

In [None]:
# Precision / recall / F-score / support of `predictions` on the test split.
true_labels = test_y if FILE_TYPE == 'all' else test_y.argmax(axis=1)
fs = fscore(true_labels, predictions)
fs

In [None]:
# Persist the model currently bound to `model` under a FILE_TYPE-specific name.
model.save('model_'+FILE_TYPE+".h5")

In [None]:
# Visualising embeddings
# Find the Embedding layer by type instead of by position, so the lookup keeps
# working if the layer order of the model changes.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embeddings = embedding_layer.get_weights()[0]

In [None]:
def embed(word):
    """Return the embedding row of ``word`` as a 2-D array with a single row.

    Raises ``KeyError`` when the word is not in the tokenizer's index.
    """
    row_id = tokenizer.word_index[word]
    vector = embeddings[row_id]
    return vector.reshape(1,-1)

In [None]:
# Cosine similarity for every unordered pair among the first
# NUM_WORDS_FOR_ANALYSIS words of the tokenizer's index.
NUM_WORDS_FOR_ANALYSIS = 50
ALL_WORDS = list(tokenizer.word_index.keys())
all_words = ALL_WORDS
SIM = [
    (all_words[i], all_words[j], CS(embed(all_words[i]), embed(all_words[j]))[0][0])
    for i in range(NUM_WORDS_FOR_ANALYSIS)
    for j in range(i+1, NUM_WORDS_FOR_ANALYSIS)
]

In [None]:
# Word pairs ordered by absolute similarity, largest magnitude first.
SS = sorted(SIM,reverse=True,key=(lambda x:abs(x[2])))

In [None]:
def tsne_plot(num_words=50, out_path='SP.svg'):
    """Create a t-SNE projection of word embeddings and save it as an SVG plot.

    Args:
        num_words: how many leading entries of the module-level ``all_words``
            list are plotted.
        out_path: destination file of the figure (written in SVG format).

    Raises:
        ValueError: if fewer than two words are available to plot.
    """
    labels = list(all_words[:num_words])
    if len(labels) < 2:
        raise ValueError("need at least two words to build a t-SNE plot")

    # One row per word; embed() returns a (1, dim) array, so take its only row.
    tokens = np.array([embed(word)[0] for word in labels])

    # Perplexity has to stay below the number of samples; 40 is used whenever
    # enough words are plotted.
    perplexity = min(40, len(labels) - 1)
    tsne_model = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One scatter call per point, each annotated with its word.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig(out_path, format='svg')

In [None]:
# Render the embedding plot; the figure is written to 'SP.svg'.
tsne_plot()

In [42]:
# Shape of the per-fold feature array.
# NOTE(review): the recorded output (5, 1) differs from the (5,) printed right
# after the fold split, and this cell's execution count is out of sequence —
# it was presumably run against different kernel state; re-check after Restart & Run All.
train_x.shape

(5, 1)