In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import keras
from keras.models import Model
from keras import optimizers
from keras.layers import Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Input, concatenate, Dropout, Reshape
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.metrics import precision_recall_fscore_support as fscore
from sklearn.metrics.pairwise import cosine_similarity as CS
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import tensorflow as tf
import os
%matplotlib inline

In [None]:
# Characters treated as token separators when cleaning comments. A raw string is used so
# that regex escapes such as \s and \- are not parsed as (invalid) Python string escapes.
CLEANING_PATTERN = re.compile(r"[\s\n\r\t.,:;\-_\'\"?!#&()*]")
CLEANING_PATTERSN = CLEANING_PATTERN  # legacy (misspelled) name, still referenced by later cells
LSTM_HIDDEN_SIZE = 200
MAX_TIME = 30 #MAXIMUM SIZE OF A COMMENT TO BE PASSED TO LSTM
VOCAB_SIZE = 10000 #MAX VOCAB SIZE
DROPOUT = 0.2
LEARNING_RATE = 0.0001
NUM_EPOCHS = 1
BATCH_SIZE = 2000
FILE_TYPE = 'all' #should be one of 'all', 'ProgramDomain', 'ProblemDomain', 'ProjectManagement'
MIDDLE_LAYER_ACTIVATION = 'relu' #Activation function in middle layers.
FINAL_LAYER_ACTIVATION = 'sigmoid' #Activation function of final layer.
K = 5 #Parameter for K-fold Cross Validation

In [None]:
# Z contains the comment text (tab-separated file).
Z = pd.read_csv('DATA/GENERATED/TRAIN/Z_CONCATED_commentType.csv', sep='\t')
# FEATS contains the features used for training; the file is selected by FILE_TYPE.
feats_path = 'DATA/GENERATED/TRAIN/CONCATED_commentType_' + FILE_TYPE + '.csv'
FEATS = pd.read_csv(feats_path)
FEATS.head()

In [None]:
# Raw comment strings, taken from column 'F2' of Z.
comments = np.array(Z['F2'])
# The first 12 columns of FEATS are the model's numeric input features.
X = np.array(FEATS)[:, :12]
# 'all' selects the three per-domain label columns; otherwise the single 'Class' column is used.
label_columns = ['ProgramDomain','ProjectManagement','ProblemDomain'] if FILE_TYPE == 'all' else 'Class'
Y = np.array(FEATS[label_columns])

In [None]:
# Comments Cleaning
# sentences: one token list per comment; mp: token lists grouped by their length;
# ctr: how many comments have each token count.
sentences = []
mp = {}
for raw_comment in comments:
    tokens = [piece.strip() for piece in CLEANING_PATTERSN.split(raw_comment) if piece != '']
    sentences.append(tokens)
    mp.setdefault(len(tokens), []).append(tokens)
ctr = Counter(len(tokens) for tokens in sentences)

In [None]:
# Word frequencies over every cleaned comment (replaces the length histogram held in ctr).
ctr = Counter(word for sent in sentences for word in sent)

In [None]:
# For creating a vocabulary and converting a sentence (vector of words) to a vector of indices.
# The tokenizer is capped at VOCAB_SIZE so that texts_to_sequences never emits an index
# >= VOCAB_SIZE; the Embedding layer in build_model only has VOCAB_SIZE rows, so an uncapped
# vocabulary would produce out-of-range lookups. word_index itself still lists every word seen.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)

In [None]:
# Number of entries in the tokenizer's word index (shown as the cell output).
len(tokenizer.word_index)

In [None]:
# train_sent are the comment texts passed to the model for training (input 1):
# each comment becomes a sequence of word indices, padded at the end up to MAX_TIME.
train_sent = pad_sequences(
    tokenizer.texts_to_sequences(sentences),
    maxlen=MAX_TIME,
    padding='post',
)

In [None]:
# With FILE_TYPE == 'all' the labels are used as loaded; otherwise the class ids in Y
# are one-hot encoded.
train_y = Y if FILE_TYPE == 'all' else to_categorical(Y)
print(train_y.shape)

In [None]:
# Train/Test Split: the first 90% of the rows are kept for training, the rest for testing.
NUM_TRAIN = int(0.9*len(X))
print(NUM_TRAIN)
head, tail = slice(None, NUM_TRAIN), slice(NUM_TRAIN, None)
train_x, test_x = X[head], X[tail]
train_y, test_y = train_y[head], train_y[tail]
train_sent, test_sent = train_sent[head], train_sent[tail]
print(*(part.shape for part in (train_x, train_y, train_sent, test_x, test_y, test_sent)))

In [None]:
def _as_fold_array(folds):
    """Pack a list of folds into one array whose first axis is the fold index.

    Equal-length folds are stacked into a regular array. Folds of different lengths are
    stored in a 1-D object array, because np.array() on ragged input raises on current
    NumPy versions.
    """
    if len({len(fold) for fold in folds}) == 1:
        return np.array(folds)
    packed = np.empty(len(folds), dtype=object)
    for idx, fold in enumerate(folds):
        packed[idx] = fold
    return packed


def divide_into_k_folds(train_x, train_y, train_sent,k):
    """Split the three parallel training arrays into k contiguous folds.

    The first k-1 folds each hold len(train_x) // k rows; the last fold also takes the
    remainder, so it is longer whenever the row count is not divisible by k.

    Parameters:
        train_x, train_y, train_sent: sliceable sequences, split with the same row boundaries.
        k: number of folds.

    Returns:
        (xs, ys, sents): arrays indexed by fold. They are regular stacked arrays when all
        folds have the same length, and 1-D object arrays of per-fold arrays otherwise.
    """
    each = len(train_x) // k
    # Row boundaries shared by the three arrays; the final fold runs to the end of the data.
    bounds = [(i*each, (i+1)*each) for i in range(k-1)] + [((k-1)*each, None)]
    xs = [train_x[lo:hi] for lo, hi in bounds]
    ys = [train_y[lo:hi] for lo, hi in bounds]
    sents = [train_sent[lo:hi] for lo, hi in bounds]
    return _as_fold_array(xs), _as_fold_array(ys), _as_fold_array(sents)

def get_fold(train_x, train_y, train_sent,i,k):
    """Return the training data for cross-validation round i.

    Every fold except fold i is concatenated along the row axis, separately for the
    features, the labels and the comment sequences. The inputs must be fold-indexed
    numpy arrays. The fold index, k and the list of kept folds are printed.
    """
    ids = [fold for fold in range(k) if fold != i]
    print(i,k,ids)
    merged = [np.concatenate(folds[ids],axis=0) for folds in (train_x, train_y, train_sent)]
    return tuple(merged)

def get_all_data_from_folds(train_x, train_y, train_sent):
    """Undo the fold split: join all folds of each input back together along the row axis."""
    return tuple(
        np.concatenate(folds,axis=0) for folds in (train_x, train_y, train_sent)
    )

In [None]:
# Re-bind train_x / train_y / train_sent to their K-fold partitions (indexed by fold).
# NOTE(review): this overwrites the flat training arrays, so re-running this cell alone
# would split the already-folded data again — re-run from the train/test split cell.
train_x, train_y, train_sent = divide_into_k_folds(train_x, train_y, train_sent, K)
print(train_x.shape)

In [None]:
def build_model(optimizer='rmsprop',lr=LEARNING_RATE,middle_act=MIDDLE_LAYER_ACTIVATION,
               final_act=FINAL_LAYER_ACTIVATION,dropout=DROPOUT,lstm_hidden=LSTM_HIDDEN_SIZE,
               num_feats=12,num_outputs=3,embedding_dim=100):
    """Build and compile the two-input comment classifier.

    The comment text (word indices, length MAX_TIME) goes through a trainable embedding and
    an LSTM; the LSTM's final hidden state is concatenated with the extracted feature vector
    and passed through a 64-unit dense layer and the output layer.

    Parameters:
        optimizer: 'rmsprop' or 'adam'.
        lr: learning rate handed to the optimizer.
        middle_act: activation of the 64-unit dense layer.
        final_act: activation of the output layer.
        dropout: input dropout rate of the LSTM.
        lstm_hidden: number of LSTM units.
        num_feats: width of the extracted-feature input (default 12).
        num_outputs: number of output units (default 3).
        embedding_dim: size of each word embedding (default 100).

    Returns:
        A Keras Model compiled with binary cross-entropy loss.

    Raises:
        ValueError: if `optimizer` is not one of the supported names. Previously this
            printed a message and returned None, which only failed later at model.fit.
    """
    # Use the optimizer classes: the lowercase aliases (optimizers.rmsprop / optimizers.adam)
    # only exist in old standalone Keras releases.
    optimizer_classes = {'rmsprop': optimizers.RMSprop, 'adam': optimizers.Adam}
    if optimizer not in optimizer_classes:
        raise ValueError("Optimizer not supported! Got %r, expected one of %s"
                         % (optimizer, sorted(optimizer_classes)))

    sent_input = Input(shape=(MAX_TIME,)) #Input 1 - Comment text
    extracted_feats = Input(shape=(num_feats,)) #Input 2 - extracted features
    print(sent_input.shape, extracted_feats.shape)

    embeddingLayer = Embedding(VOCAB_SIZE, embedding_dim, input_length=MAX_TIME,  trainable=True)
    sent = embeddingLayer(sent_input)
    # return_state=True yields (output, hidden state, cell state); only the hidden state is used.
    _, h1, _ = LSTM(lstm_hidden,dropout=dropout,return_state=True)(sent) #Feed the comments to LSTM
    print(h1.shape)
    # Concat the LSTM hidden state with the extracted features
    feature_vector = concatenate([h1,extracted_feats],axis=1)
    print(feature_vector.shape)
    probs = Dense(64,activation=middle_act)(feature_vector) #Dense layer over lstm_hidden + num_feats values
    print(probs.shape)
    probs = Dense(num_outputs,activation=final_act)(probs) #Final Activation. Use sigmoid and NOT Softmax here.
    print(probs.shape)
    model = Model(inputs=[sent_input,extracted_feats],outputs=probs)
    model.compile(loss='binary_crossentropy',
                 optimizer=optimizer_classes[optimizer](lr=lr),
                 metrics=['binary_accuracy','categorical_accuracy'])
    return model

In [None]:
# Find fscore for a model
def find_fs(model):
    """Score `model` on the held-out test split.

    Predictions are made for (test_sent, test_x). With FILE_TYPE == 'all' every output is
    thresholded at 0.5 and compared with test_y directly; otherwise the arg-max class of
    the prediction is compared with the arg-max class of test_y.

    Returns the result of sklearn's precision_recall_fscore_support.
    """
    scores = model.predict([test_sent,test_x],batch_size=BATCH_SIZE)
    if FILE_TYPE == 'all':
        return fscore(test_y, np.where(scores > 0.5,1,0))
    return fscore(test_y.argmax(axis=1), scores.argmax(axis=1))

In [None]:
# Run, takes parameters for model. Returns K-models from K-cross validation (We use only final one) 
# and Fscore Statistics from all of them.
def run(optimizer='rmsprop',lr=LEARNING_RATE,middle_act=MIDDLE_LAYER_ACTIVATION,
               final_act=FINAL_LAYER_ACTIVATION,dropout=DROPOUT,lstm_hidden=LSTM_HIDDEN_SIZE):
    """Train one freshly built model per cross-validation fold.

    For each of the K folds a new model is built from the given hyper-parameters, fitted
    on the remaining folds with the current fold as validation data, scored with
    find_fs, and saved to 'model_<FILE_TYPE>_fold_<fold>.h5'.

    Returns:
        (MODELS, FSS): the K trained models and their find_fs results, in fold order.
    """
    MODELS, FSS = [], []
    for fold in range(K):
        print("-----------------Running Fold - ",fold+1," of ",K,"-------------------")
        model = build_model(optimizer,lr,middle_act,final_act,dropout,lstm_hidden)
        MODELS.append(model)
        fit_x, fit_y, fit_sent = get_fold(train_x, train_y, train_sent,fold,K)
        print(fit_x.shape)
        # The fold left out of training doubles as the validation set for this round.
        held_out = ([train_sent[fold], train_x[fold]],train_y[fold])
        model.fit([fit_sent,fit_x],fit_y,epochs=NUM_EPOCHS,batch_size=BATCH_SIZE,verbose=1,
              validation_data=held_out)
        FSS.append(find_fs(model))
        model.save('model_'+FILE_TYPE+'_fold_'+str(fold)+'.h5')
    return MODELS, FSS

In [None]:
# TO CONTINUE TRAINING FOR MORE EPOCHS
# for k in range(K):
#     print("-----------------Running Fold - ",k+1," of ",K,"-------------------")
#     model = MODELS[k]
#     model.fit([train_sent[k],train_x[k]],train_y[k],epochs=NUM_EPOCHS,batch_size=BATCH_SIZE,verbose=1,
#           validation_data=([test_sent, test_x],test_y))
#     model.save('model_'+FILE_TYPE+'_fold_'+str(k)+'.h5')

In [None]:
# Get predictions for an ensemble for models. 
def get_predictions(test_x, test_sent,models_arr=None):
    """Combine the predictions of several models by voting.

    Parameters:
        test_x: extracted-feature input for the models.
        test_sent: padded comment-sequence input for the models.
        models_arr: non-empty sequence of trained models exposing predict().

    Returns:
        FILE_TYPE == 'all': a 0/1 array with one column per model output; an entry is 1
        when more than half of the models predicted that output above 0.5.
        Otherwise: a 1-D array holding, per sample, the class that received the most votes
        (each model votes for its arg-max class).

    Raises:
        ValueError: if `models_arr` is None or empty.
    """
    if models_arr is None or len(models_arr) == 0:
        raise ValueError("get_predictions needs at least one model in models_arr")
    multilabel = FILE_TYPE == 'all'
    prediction_scores = None
    for mod in models_arr:
        probs = np.asarray(mod.predict([test_sent, test_x],batch_size=BATCH_SIZE))
        if multilabel:
            ballot = np.where(probs > 0.5,1,0)
        else:
            # One-hot vote for the arg-max class. Adding the raw (n,) arg-max vector to the
            # (n, classes) accumulator, as was done before, fails to broadcast.
            ballot = np.zeros(probs.shape)
            ballot[np.arange(len(probs)), probs.argmax(axis=1)] = 1
        if prediction_scores is None:
            # Size the accumulator from the model output instead of assuming 3 columns.
            prediction_scores = np.zeros(ballot.shape)
        prediction_scores += ballot
    print(prediction_scores)
    if multilabel:
        return np.where(prediction_scores > len(models_arr)/2, 1, 0)
    return prediction_scores.argmax(axis=1)

In [None]:
# predictions = get_predictions(test_x, test_sent)

In [None]:
# if FILE_TYPE == 'all':
#     fs = fscore(test_y,predictions)
# else:
#     fs = fscore(test_y.argmax(axis=1),predictions)
# fs

In [None]:
# model.save('model_'+FILE_TYPE+".h5")

# Ensemble

In [None]:
import pickle

In [None]:
# Directory that receives the saved models of every experiment.
if not os.path.exists('ensemble_models'):
    os.mkdir('ensemble_models')

# Key - experiment name. Value - FScore Statistics of the experiment.
# Results of earlier sessions are reloaded when the summary file is present.
# NOTE: pickle.load can execute arbitrary code; only load a summary file you wrote yourself.
ENSEMBLE_FSS = {}
if os.path.exists('LSTM_ENSEMBLE_MODELS_SUMMARY.map'):
    with open('LSTM_ENSEMBLE_MODELS_SUMMARY.map','rb') as summary_file:
        ENSEMBLE_FSS = pickle.load(summary_file)
# Saves all the information for an experiment.
def _put(m,f,name):
    """Persist one experiment.

    Parameters:
        m: the models of the experiment (MODELS as returned by run()).
        f: the FScore statistics of the experiment (FSS as returned by run()).
        name: name of the experiment.

    Each model is saved as 'ensemble_models/model_<name><index>.h5', `f` is stored in
    ENSEMBLE_FSS under `name`, and ENSEMBLE_FSS is dumped to
    'LSTM_ENSEMBLE_MODELS_SUMMARY.map' so it can be read back later.
    """
    for j, fold_model in enumerate(m):
        fold_model.save('ensemble_models/model_'+name+str(j)+'.h5')
    ENSEMBLE_FSS[name] = f
    with open('LSTM_ENSEMBLE_MODELS_SUMMARY.map','wb') as summary_file:
        pickle.dump(ENSEMBLE_FSS,summary_file)
# Running different experiments.
# Each entry is (experiment name, keyword overrides passed to run()).
for exp_name, overrides in (
    ('default', {}),                                         # Default model
    ('2LSTM_HIDDEN', {'lstm_hidden': 2*LSTM_HIDDEN_SIZE}),   # 2*LSTM_HIDDEN
    ('4LSTM_HIDDEN', {'lstm_hidden': 4*LSTM_HIDDEN_SIZE}),   # 4*LSTM_HIDDEN
):
    m, f = run(**overrides)
    _put(m,f,exp_name)

In [None]:
# Ensemble Prediction
# Reload the experiment summary, load one saved model per experiment and score their vote.
# NOTE: pickle.load can execute arbitrary code; only load a summary file you wrote yourself.
with open('LSTM_ENSEMBLE_MODELS_SUMMARY.map','rb') as summary_file:
    ENSEMBLE_FSS = pickle.load(summary_file)
ENSEMBLE_MODELS = []
for exp_name, fold_stats in ENSEMBLE_FSS.items():
    # Taking only last fold model
    last_fold = len(fold_stats)-1
    ENSEMBLE_MODELS.append(
        keras.models.load_model('ensemble_models/model_'+exp_name+str(last_fold)+'.h5')
    )
predictions = get_predictions(test_x, test_sent, ENSEMBLE_MODELS)
y_true = test_y if FILE_TYPE == 'all' else test_y.argmax(axis=1)
fs = fscore(y_true,predictions)
fs

# Embeddings Visualisation

In [None]:
# Visualising Embeddings
# `model` was never bound at notebook scope (run() and build_model() only hold it as a
# local), so this cell raised NameError on a fresh kernel. Use a reloaded ensemble member.
# TODO confirm which model the visualisation is meant to inspect; the first one is used here.
model = ENSEMBLE_MODELS[0]
# Find the embedding layer by type rather than by position in model.layers.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embeddings = embedding_layer.get_weights()[0]

In [None]:
def embed(word):
    """Return the embedding row of `word` as a 2-D array with a single row."""
    row = tokenizer.word_index[word]
    vector = embeddings[row]
    return vector.reshape(1,-1)

In [None]:
NUM_WORDS_FOR_ANALYSIS = 50
# Vocabulary as a list, in the iteration order of the tokenizer's word index.
ALL_WORDS = list(tokenizer.word_index.keys())
all_words = ALL_WORDS
# One (word_a, word_b, cosine similarity) triple for every unordered pair among the
# first NUM_WORDS_FOR_ANALYSIS words.
SIM = [
    (all_words[i], all_words[j], CS(embed(all_words[i]),embed(all_words[j]))[0][0])
    for i in range(NUM_WORDS_FOR_ANALYSIS)
    for j in range(i+1,NUM_WORDS_FOR_ANALYSIS)
]

In [None]:
# Word pairs ordered by decreasing absolute cosine similarity (x[2] is the similarity score).
SS = sorted(SIM,reverse=True,key=(lambda x:abs(x[2])))

In [None]:
def tsne_plot():
    """Create a t-SNE model of the first 50 word embeddings and plot it.

    The embeddings are projected to two dimensions, drawn as one scatter point per word
    with the word as annotation, and the figure is saved to 'SP.svg'.
    """
    labels = list(all_words[:50])
    tokens = [embed(label)[0] for label in labels]

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    for label, point in zip(labels, new_values):
        px, py = point[0], point[1]
        plt.scatter(px,py)
        plt.annotate(label,
                     xy=(px, py),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    #plt.show()
    plt.savefig('SP.svg',format='svg')

In [None]:
# Render the t-SNE projection; tsne_plot() saves the figure to 'SP.svg'.
tsne_plot()

In [None]:
# Sanity check: shape of the K-fold training feature array.
train_x.shape