# Configurations

In [None]:
# --- Training ---
batch_size, epochs = 64, 5        # samples per batch, epochs per training round

# --- Network sizes ---
latent_dim = 200                  # units of each recurrent (GRU) layer
max_vocab = 30000                 # upper bound applied to the largest token index
embeding_output_dim = 50          # width of the token embedding vectors

# --- Clustering ---
CLUSTERING_METHOD = 1             # 0: Kmeans, 1: NMF, 2: LDA
NUMBER_OF_CLUSTERS = 3
CLUSTERING_METHOD_NAME = ('Kmeans', 'NMF', 'LDA')[CLUSTERING_METHOD]

In [None]:
from __future__ import print_function

import pickle
import numpy as np
from random import choices,shuffle
from os import path
import tensorflow  as tf
from tqdm.notebook import tqdm
from keras.models import load_model
from keras.models import Model
from keras.layers import Input, LSTM, Dense,GRU
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Bidirectional,Concatenate
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF, LatentDirichletAllocation
from scipy.sparse import vstack

# --------------------------------------------------

## Load Dataset

In [None]:
# Because this data is private and may not be published, only the
# already-indexed (tokenized) data is loaded here.

def _load_pickle(file_name, data_dir='data'):
    """Load one pickled object from ``data_dir`` and close the file afterwards.

    The path is built with ``os.path.join`` so the notebook also runs on
    systems whose path separator is not a backslash.

    NOTE(security): ``pickle.load`` can execute arbitrary code while loading;
    only point this at files you produced yourself or otherwise trust.
    """
    with open(path.join(data_dir, file_name), 'rb') as handle:
        return pickle.load(handle)


# Tokenized questions: train / validation / test
Q_train_tokenized = _load_pickle('Q_train_tokenized.pkl')
Q_valid_tokenized = _load_pickle('Q_valid_tokenized.pkl')
Q_test_tokenized = _load_pickle('Q_test_tokenized.pkl')
# Tokenized answers: train / validation / test
A_train_tokenized = _load_pickle('A_train_tokenized.pkl')
A_valid_tokenized = _load_pickle('A_valid_tokenized.pkl')
A_test_tokenized = _load_pickle('A_test_tokenized.pkl')

# TF-IDF vectors (one per question-answer pair): train / validation / test
train_TFIDFvec = _load_pickle('train_TFIDFvec.pkl')
valid_TFIDFvec = _load_pickle('valid_TFIDFvec.pkl')
test_TFIDFvec = _load_pickle('test_TFIDFvec.pkl')

# Token ids wrapped around every answer fed to the decoder.
SOS = 2
EOS = 3
data_length = len(Q_train_tokenized)+len(Q_valid_tokenized)+len(Q_test_tokenized)
train_count = len(Q_train_tokenized)
print('total records %i, %i train, %i validation %i test'%(data_length
                                                           ,len(Q_train_tokenized),len(Q_valid_tokenized),len(Q_test_tokenized)))

# Embedding table sizes: largest token index seen in the training split
# (capped at max_vocab) plus 2. Empty sequences are skipped because max()
# of an empty sequence raises.
num_encoder_tokens = min(max(max(Q) for Q in Q_train_tokenized if len(Q)>0),max_vocab)+2
num_decoder_tokens = min(max(max(A) for A in A_train_tokenized if len(A)>0),max_vocab)+2
print('max tokens:',num_encoder_tokens,num_decoder_tokens)

# Clustering 

## Running the clustering on Answers

In [None]:
print('clustering dataset using %s'%CLUSTERING_METHOD_NAME)

# The clustering model is fitted on the train and validation TF-IDF rows
# stacked together; the test rows are only transformed afterwards.
feature_matrix = vstack((train_TFIDFvec, valid_TFIDFvec))

clustering_switcher = {
    0: KMeans(n_clusters=NUMBER_OF_CLUSTERS),
    1: NMF(n_components=NUMBER_OF_CLUSTERS),
    2: LatentDirichletAllocation(n_components=NUMBER_OF_CLUSTERS),
}

clustering = clustering_switcher[CLUSTERING_METHOD]
clustering.fit(feature_matrix)
print('Clustering method fitted to dataset')

# Method 0 (KMeans) is reduced with argmin over the transformed columns,
# every other method with argmax.
_select_cluster = np.argmin if CLUSTERING_METHOD == 0 else np.argmax
train_cluster_labels, valid_cluster_labels, test_cluster_labels = (
    _select_cluster(clustering.transform(split), axis=-1)
    for split in (train_TFIDFvec, valid_TFIDFvec, test_TFIDFvec)
)
print('Clustering done.')


# --------------------------------------------------

# Classification

## Classifier AUX 

In [None]:
def indices_to_one_hot(data, nb_classes):
    """Convert an iterable of class indices to one-hot encoded rows.

    Args:
        data: integer class indices (any shape; it is flattened).
        nb_classes: number of columns of the result.

    Returns:
        A float array of shape ``(len(flattened data), nb_classes)`` with a
        single 1 per row. Empty input gives an array with zero rows.

    Raises:
        IndexError: if an index does not fit into ``nb_classes`` columns or
            the indices are not integers. Errors are raised instead of being
            printed and swallowed, so callers never receive ``None``.
    """
    targets = np.asarray(data).reshape(-1)
    output = np.zeros((len(targets), nb_classes))
    # An empty input has a float dtype and cannot be used as an index,
    # so only index when there is something to set.
    if len(targets):
        output[np.arange(len(targets)), targets] = 1
    return output
def generate_data_classification(data, batch_size):
    """Endlessly yield ``(padded_questions, one_hot_labels)`` batches.

    Args:
        data: sequence of ``(token_id_sequence, cluster_label)`` pairs.
        batch_size: number of pairs per batch.

    Yields:
        ``(inputs, targets)`` where ``inputs`` are the questions of the batch
        padded to the longest one in that batch and ``targets`` are the labels
        one-hot encoded over ``NUMBER_OF_CLUSTERS`` columns.

    Raises:
        ValueError: on the first ``next()`` if ``data`` is empty.

    Only full batches are cycled through. When ``data`` holds fewer than
    ``batch_size`` pairs, the whole data set is yielded as one smaller batch
    instead of failing with a modulo-by-zero.
    """
    if len(data) == 0:
        raise ValueError('generate_data_classification: data is empty')
    # At least one batch, so the wrap-around below never divides by zero.
    max_batch = max(1, len(data)//batch_size)
    idx = 0
    while True:
        idx %= max_batch
        batch = data[idx * batch_size:(idx + 1) * batch_size]
        input_sequences = [question for question, _ in batch]
        target = [label for _, label in batch]

        max_encoder_seq_length = max(len(txt) for txt in input_sequences)
        encoder_input_data = pad_sequences(input_sequences, maxlen=max_encoder_seq_length)
        output = indices_to_one_hot(target,NUMBER_OF_CLUSTERS)
        idx += 1
        yield (np.array(encoder_input_data),np.array(output))

## Balancing Classifier Training Data (random oversampling of every cluster, with replacement, to the size of the largest cluster)

In [None]:
# Group the (question, label) pairs by cluster in a single pass.
_pairs_by_cluster = [[] for _ in range(NUMBER_OF_CLUSTERS)]
for _question, _label in zip(Q_train_tokenized, train_cluster_labels):
    _pairs_by_cluster[_label].append((_question, _label))

# Every cluster is resampled (with replacement) to the size of the largest one.
max_class_count = max(len(_pairs) for _pairs in _pairs_by_cluster)
train_data_bal = []
for j, _pairs in enumerate(_pairs_by_cluster):
    if not _pairs:
        # random.choices raises IndexError on an empty population; an empty
        # cluster simply contributes nothing to the balanced set.
        print("warning: cluster %i has no training samples and is left out of train_data_bal" % j)
        continue
    train_data_bal += choices(_pairs, k=max_class_count)
shuffle(train_data_bal)
print("train_data_bal : %i#"%len(train_data_bal))

## Training Question Classifier

In [None]:
    # Build the question -> cluster classifier: embedding -> GRU -> softmax.
    encoder_inputs = Input(shape=(None, ))
    print(encoder_inputs.shape)
    embed_encoder = Embedding(
                        input_dim=num_encoder_tokens,
                        output_dim=embeding_output_dim)(encoder_inputs)
    encoder = GRU(latent_dim, dropout=0.5)
    encoder_outputs = encoder(embed_encoder)

    decoder_dense = Dense(NUMBER_OF_CLUSTERS, activation='softmax')
    decoder_outputs = decoder_dense(encoder_outputs)
    classifier = Model(encoder_inputs, decoder_outputs)
    classifier.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                  metrics=['accuracy'])


    def _classifier_batches(questions, labels, size):
        """Return a batch generator over (question, cluster label) pairs."""
        return generate_data_classification(list(zip(questions, labels)), size)


    def _predict_clusters(questions, labels):
        """Return the classifier's arg-max cluster for every question.

        The whole split is requested as a single batch (batch size equal to
        the number of questions), so all questions are padded to the longest
        one in the split. ``labels`` are only needed to build the pairs the
        generator expects; they do not influence the prediction.
        """
        whole_split = next(_classifier_batches(questions, labels, len(questions)))[0]
        return classifier.predict(whole_split).argmax(axis=-1)


    # Start below every attainable accuracy so that the first round is always
    # checkpointed; otherwise a first accuracy of 0 would leave no file for
    # load_model() below.
    best = float('-inf')
    # Keep training for as long as the validation accuracy improves.
    while True:
        classifier.fit_generator(generator=generate_data_classification(train_data_bal,batch_size),
                        steps_per_epoch = len(Q_train_tokenized)//batch_size,
                        epochs=5)
        # evaluate_generator returns [loss, accuracy]; only accuracy is used.
        acc = classifier.evaluate_generator(generator=_classifier_batches(Q_valid_tokenized,valid_cluster_labels,batch_size),
                     steps= len(Q_valid_tokenized)//batch_size)[1]
        print('Accuracy on validation data: ',acc)

        if not acc > best:
            # Stop training: the accuracy did not improve any more.
            break
        best = acc
        # Save the best classifier so far.
        classifier.save('cached_models\\classifier.h5')


    print("training done, running on data...")
    # Continue with the best checkpoint, not with the last (worse) round.
    print('loading classifier from cached model...')
    classifier = load_model('cached_models\\classifier.h5')

    print('checking accuracy on test data....')
    print(classifier.evaluate_generator(generator=_classifier_batches(Q_test_tokenized,test_cluster_labels,batch_size),
                         steps= len(Q_test_tokenized)//batch_size))

    # Predict the cluster of every question in each split.
    print('running on dataset...')
    train_cluster_predicted_labels = _predict_clusters(Q_train_tokenized, train_cluster_labels)
    valid_cluster_predicted_labels = _predict_clusters(Q_valid_tokenized, valid_cluster_labels)
    test_cluster_predicted_labels = _predict_clusters(Q_test_tokenized, test_cluster_labels)
    

# --------------------------------------------------

# Proposed Method

## Preparing Data

In [None]:
# Share of training questions the classifier assigned to each cluster.
clusters_prob = [
    sum(1 for predicted in train_cluster_predicted_labels if predicted == cluster)
    / len(train_cluster_predicted_labels)
    for cluster in range(NUMBER_OF_CLUSTERS)
]

# One list of (question, answer) pairs per predicted cluster, for each split.
_train_data = [
    [(question, answer)
     for question, answer, predicted in zip(Q_train_tokenized, A_train_tokenized, train_cluster_predicted_labels)
     if predicted == cluster]
    for cluster in range(NUMBER_OF_CLUSTERS)
]
_valid_data = [
    [(question, answer)
     for question, answer, predicted in zip(Q_valid_tokenized, A_valid_tokenized, valid_cluster_predicted_labels)
     if predicted == cluster]
    for cluster in range(NUMBER_OF_CLUSTERS)
]
_test_data = [
    [(question, answer)
     for question, answer, predicted in zip(Q_test_tokenized, A_test_tokenized, test_cluster_predicted_labels)
     if predicted == cluster]
    for cluster in range(NUMBER_OF_CLUSTERS)
]

## Data Generator Method 

In [None]:
import random
def generate_data(data, batch_size,Infinite=True,Random=False):
    """Yield seq2seq training/evaluation batches from (question, answer) pairs.

    Args:
        data: sequence of ``(question_token_ids, answer_token_ids)`` pairs.
        batch_size: number of pairs per batch.
        Infinite: when true, cycle over the full batches forever; when false,
            walk over the data once, including a final partial batch.
        Random: when true, start at a random batch index instead of 0.

    Yields:
        ``([encoder_input, decoder_input], decoder_target)``.
        ``encoder_input`` holds the padded questions, ``decoder_input`` the
        answers wrapped in ``SOS``/``EOS`` and post-padded, and
        ``decoder_target`` the decoder input shifted left by one step and
        one-hot encoded over ``num_decoder_tokens``.

    Raises:
        ValueError: on the first ``next()`` if ``Infinite`` is true and
            ``data`` is empty.

    In finite mode no trailing empty batch is produced when ``len(data)`` is a
    multiple of ``batch_size``. In infinite mode a data set smaller than
    ``batch_size`` is yielded as one smaller batch instead of failing with a
    modulo-by-zero.
    """
    total = len(data)
    full_batches = total // batch_size
    if Infinite and total == 0:
        raise ValueError('generate_data: cannot cycle over an empty data set')
    # Wrap-around length for infinite mode; never 0.
    cycle_length = max(1, full_batches)
    idx = random.randint(0, full_batches) if Random else 0
    while Infinite or idx * batch_size < total:
        if Infinite:
            idx %= cycle_length
        batch = data[idx * batch_size:(idx + 1) * batch_size]
        input_sequences = [input_text for input_text, _ in batch]
        # list() keeps the concatenation a list join even if an answer is an array.
        target_sequences = [[SOS] + list(target_text) + [EOS] for _, target_text in batch]

        max_encoder_seq_length = max([1]+[len(txt) for txt in input_sequences])
        max_decoder_seq_length = max([1]+[len(txt) for txt in target_sequences])
        encoder_input_data = pad_sequences(input_sequences, maxlen=max_encoder_seq_length)
        decoder_input_data = pad_sequences(target_sequences, maxlen=max_decoder_seq_length,padding='post')
        # Target at step t is the decoder input at step t+1; the last step is padded with 0.
        decoder_target_data = np.array([indices_to_one_hot(np.concatenate((a[1:],[0])),num_decoder_tokens) for a in decoder_input_data])
        idx+=1
        yield ([np.array(encoder_input_data),np.array(decoder_input_data)], np.array(decoder_target_data))

## Create Model for each Class

In [None]:
from keras.layers import Bidirectional,Concatenate
from os import path
# Cache of compiled per-cluster models, filled lazily by get_model().
models =  {}
# The layers below are created once at module level. get_model() builds every
# cluster model from these same tensors (encoder_inputs, decoder_inputs,
# gru_outputs), so the embeddings and both GRUs are shared by all clusters.
encoder_inputs = Input(shape=(None, ))
print(encoder_inputs.shape)
embed_encoder = Embedding(
                    input_dim=num_encoder_tokens,
                    output_dim=embeding_output_dim)(encoder_inputs)
# return_state=True makes the bidirectional GRU return its output plus the
# final state of the forward and of the backward pass.
encoder = Bidirectional(GRU(latent_dim, return_state=True,dropout=0.5))
encoder_outputs, forward_h,backward_h = encoder(embed_encoder)
# Both final states are concatenated and used as the decoder's initial state.
state_h = Concatenate()([forward_h,backward_h])



decoder_inputs = Input(shape=(None, ))
embed_decoder = Embedding(num_decoder_tokens, embeding_output_dim)(decoder_inputs)
# latent_dim*2 units so the decoder state matches the concatenated encoder state.
decoder_gru = GRU(latent_dim*2,return_sequences=True,return_state=True, dropout=0.5)
gru_outputs,_ = decoder_gru(embed_decoder, initial_state=state_h)
def get_model(i):
    """Return the compiled seq2seq model of cluster ``i``, building it on first use.

    Every cluster model reuses the module-level encoder/decoder tensors and
    only adds its own softmax ``Dense`` layer on top of ``gru_outputs``.
    Built models are cached in ``models``.
    """
    if i not in models:
        token_probabilities = Dense(num_decoder_tokens, activation='softmax')(gru_outputs)
        cluster_model = Model([encoder_inputs, decoder_inputs], token_probabilities)
        cluster_model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                              metrics=['accuracy'])
        models[i] = cluster_model
    return models[i]

## Extract Test Model of each Class

In [None]:
def reset_test_models():
    """Rebuild the inference encoder and empty the per-cluster decoder cache.

    ``idea_encoder`` maps question tokens to the concatenated forward/backward
    state, reusing layers 1 and 3 of ``get_model(0)`` (assumed to be the
    encoder embedding and the bidirectional GRU - verify against the model
    summary). ``decoder`` is reset to an empty dict that ``get_decoder``
    refills on demand.
    """
    global idea_encoder, decoder
    question_tokens = Input(shape=(None, ))
    reference_model = get_model(0)
    embedded_question = reference_model.layers[1](question_tokens)
    _, fwd_state, bwd_state = reference_model.layers[3](embedded_question)
    idea_encoder = Model(question_tokens, Concatenate()([fwd_state, bwd_state]))
    decoder = {}
def get_decoder(i):
    """Return the single-step inference decoder of cluster ``i`` (cached).

    The decoder takes ``[tokens, previous_state]`` and returns
    ``[token_probabilities, new_state]``. It reuses layers 4, 6 and 7 of
    ``get_model(i)`` (assumed to be the decoder embedding, the decoder GRU and
    the cluster's softmax layer - verify against the model summary).
    """
    if i not in decoder:
        previous_state = Input(shape=(latent_dim*2,))
        previous_tokens = Input(shape=(None, ))
        cluster_layers = get_model(i).layers
        embedded_tokens = cluster_layers[4](previous_tokens)
        step_outputs, next_state = cluster_layers[6](
            embedded_tokens, initial_state=previous_state)
        token_probabilities = cluster_layers[7](step_outputs)
        decoder[i] = Model([previous_tokens, previous_state],
                           [token_probabilities, next_state])
    return decoder[i]

## Measures Method

In [None]:
import keras.backend as K
from scipy.stats import entropy
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from collections import defaultdict

# Running totals updated by compute_recall_precision():
# distinct reference tokens, distinct predicted tokens, and their overlap.
AllTrue = AllPred = Correct = 0
# Metric name -> list of per-sample values.
Measures = defaultdict(list)
# Separate score lists; the functions in this cell do not reference them.
Rouge1, Rouge2, RougeL = [], [], []
Bleu1, Bleu2 = [], []


scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2','rougeL'], use_stemmer=False)


def compute_recall_precision(y_true, y_pred):
    """Score one predicted sequence against its reference and accumulate the results.

    Args:
        y_true: per-step one-hot (or probability) rows of the reference.
        y_pred: per-step probability rows of the prediction.

    Both inputs are reduced to token ids with arg-max; id 0 is dropped.
    ROUGE-1/2/L, BLEU-1/2 and the entropy of ``y_pred`` are appended to
    ``Measures``; the set-based token counts are added to the module-level
    ``AllTrue``, ``AllPred`` and ``Correct`` totals.
    """
    global AllTrue,AllPred,Correct
    reference = [m for m in np.argmax(y_true,axis=-1) if m >0]
    candidate = [m for m in np.argmax(y_pred,axis=-1) if m >0]
    scores = scorer.score(' '.join(str(token) for token in reference),
                          ' '.join(str(token) for token in candidate))

    # Entropy summed over all steps. Exact zeros are skipped (0*log(0) is
    # taken as 0) so one underflowed probability cannot turn the mean into
    # nan, and a plain float is stored rather than a tensor.
    probabilities = np.asarray(y_pred, dtype=np.float64)
    positive = probabilities[probabilities > 0]
    Measures['Entrophy'].append(float(-np.sum(positive * np.log(positive))))

    for label, rouge_key in (('Rouge1', 'rouge1'), ('Rouge2', 'rouge2'), ('RougeL', 'rougeL')):
        Measures[label + '.precision'].append(scores[rouge_key].precision)
        Measures[label + '.recall'].append(scores[rouge_key].recall)
        Measures[label + '.fmeasure'].append(scores[rouge_key].fmeasure)
    Measures['Bleu1'].append(sentence_bleu([reference], candidate, weights=(1,0)))
    Measures['Bleu2'].append(sentence_bleu([reference], candidate, weights=(0, 1)))

    # Token-level totals are computed on the sets of distinct token ids.
    A = set(reference)
    B = set(candidate)
    AllTrue+=len(A)
    AllPred+=len(B)
    Correct+=len(A.intersection(B))
    
#    Sum += (float)(K.mean(K.equal(K.argmax(y_true, axis=-1),
#                  K.argmax(y_pred, axis=-1))))*len(y_true)
#    Count+=len(y_true)
def reset_measure():
    """Reset the token totals and drop all per-sample metric values.

    ``Measures`` is cleared in place so that the means printed by
    ``compute_measure`` only cover samples scored since this reset,
    instead of every sample scored since the kernel started.
    """
    global AllTrue,AllPred,Correct
    AllTrue = 0
    AllPred = 0
    Correct = 0
    Measures.clear()
    
def compute_measure():
    """Print all accumulated measures and return the token-level F-measure.

    Precision and recall come from the module-level ``Correct``, ``AllPred``
    and ``AllTrue`` totals; a tiny constant in each denominator avoids a
    division by zero. The mean of every list in ``Measures`` is printed too.
    """
    tiny = .00000000000000000000001
    recall = Correct / (AllTrue + tiny)
    precision = Correct / (AllPred + tiny)
    f_measure = 2 * precision * recall / (precision + recall + tiny)

    print('Precision:\t%.2f%%'%(precision * 100))
    print('Recall:\t%.2f%%'%(recall * 100))
    print('F-measure:\t%.2f%%'%(f_measure * 100))
    for name, values in Measures.items():
        print('%s:\t%.2f%%'%(name, np.mean(values) * 100))
    return f_measure
    

## Model Runner

In [None]:
def run_model(encoder_model,decoder_model,encoder_input_data,max_len):
    """Greedily decode one sequence and return the per-step token distributions.

    The encoder output is used as the first decoder state and ``SOS`` as the
    first decoder token. Each step feeds the arg-max token and the returned
    state back into the decoder. Decoding stops after ``max_len`` steps or
    once ``EOS`` is the arg-max token; the distribution of that last step is
    still part of the result. A batch of size 1 is assumed.

    Returns:
        Array of shape ``(1, steps, vocabulary)``.
    """
    state = encoder_model.predict(encoder_input_data)

    # Length-1 target sequence, updated in place with the last sampled token.
    current_token = np.full((1, 1), SOS, dtype=float)

    step_distributions = []
    while True:
        token_probs, state = decoder_model.predict([current_token, state])
        step_distributions.append(token_probs[0][0])

        best_token = np.argmax(token_probs[0, -1, :])
        # Exit condition: either hit max length or find the stop token.
        if len(step_distributions) >= max_len or best_token == EOS:
            break
        current_token[0, 0] = best_token

    return np.array([step_distributions])


## Testing Method 

In [None]:
max_len= 100
def test_proposed(data,seperated=False):
    """Evaluate the per-cluster models and return the overall F-measure.

    Args:
        data: one list of (question, answer) pairs per cluster.
        seperated: when true, the measures are reset before each cluster and
            printed after it, so every cluster is reported on its own.

    Each sample is decoded with the shared ``idea_encoder`` and the decoder of
    its cluster, for at most ``max_len`` steps. A failing sample is printed
    and skipped; the number of skipped samples is reported at the end so an
    incomplete evaluation is not mistaken for a complete one.
    """
    reset_measure()
    attempted = 0
    failed = 0
    for index in tqdm(range(NUMBER_OF_CLUSTERS)):

        if seperated:
            reset_measure()
        # Batch size 1, single pass over the cluster's data.
        for X,Y in tqdm(generate_data(data[index],1,False)):
            attempted += 1
            try:
                y_pred = run_model(idea_encoder,get_decoder(index),X[0],max_len)
                compute_recall_precision(Y[0],y_pred[0])
            except Exception as ex:
                print(ex)
                failed += 1
        if seperated:
            print()
            compute_measure()
    if failed:
        print('%i of %i samples failed and are not part of the measures' % (failed, attempted))
    return compute_measure()

## Training...

In [None]:
    reset_test_models()
    steps = len(Q_train_tokenized)//batch_size*epochs
    # Start below every attainable F-measure so that the first round is always
    # checkpointed; otherwise a first score of 0 would leave no model files
    # for the testing cell to load.
    best = float('-inf')
    while(True):
        loss = 0
        accuracy = 0
        for k in tqdm(range(steps)):
            # Pick a cluster in proportion to its share of the training data.
            index = np.random.choice(NUMBER_OF_CLUSTERS,1,p=clusters_prob)[0]
            hist = get_model(index).fit_generator(generator=generate_data(_train_data[index],batch_size,Random=True),
                            steps_per_epoch = 1,#1/len(_train_data)//batch_size,
                            epochs=epochs,
                            verbose=0)
            # Only the first of the `epochs` single-step epochs is recorded.
            loss += hist.history['loss'][0]
            accuracy += hist.history['accuracy'][0]
        if steps:
            print('mean training loss: %.4f, mean training accuracy: %.4f' % (loss/steps, accuracy/steps))


        m = test_proposed(_valid_data)

        if not m > best:
            # The validation F-measure stopped improving.
            break
        best = m
        for i in range(NUMBER_OF_CLUSTERS):
            get_model(i).save('cached_models\\model%i.h5'%i)



## Testing

In [None]:
print('loading models from file')
# Load the checkpoints first and rebuild the inference models afterwards.
# reset_test_models() builds idea_encoder from get_model(0); calling it before
# the load would pair the in-memory encoder with the decoders of the loaded
# checkpoints.
for i in range(NUMBER_OF_CLUSTERS):
    models[i] = load_model('cached_models\\model%i.h5'%i)
    # models[i].summary()
reset_test_models()
test_proposed(_test_data)

# --------------------------------------------------

# Base Model

## Create Base Model 

In [None]:
# Lazily built baseline model; see get_basemodel().
basemodel = None
def get_basemodel():
    """Return the compiled baseline seq2seq model, building it on first use.

    The baseline is a single model for all data: an embedding plus
    bidirectional GRU encoder whose two final states are concatenated and
    used as the initial state of a GRU decoder, followed by a softmax over
    ``num_decoder_tokens``. The built model is cached in the module-level
    ``basemodel``.

    NOTE: get_base_encoder()/get_base_decoder() pick layers of this model by
    position, so the construction order below should not be changed.
    """
    global basemodel
    if basemodel != None:
        return basemodel
    
    encoder_inputs = Input(shape=(None, ))
    embed_encoder = Embedding(
                        input_dim=num_encoder_tokens,
                        output_dim=embeding_output_dim)(encoder_inputs)
    # return_state=True: output plus the final forward and backward states.
    encoder = Bidirectional(GRU(latent_dim, return_state=True,dropout=0.5))
    encoder_outputs, forward_h,backward_h = encoder(embed_encoder)
    state_h = Concatenate()([forward_h,backward_h])



    decoder_inputs = Input(shape=(None, ))
    embed_decoder = Embedding(num_decoder_tokens, embeding_output_dim)(decoder_inputs)
    # latent_dim*2 units so the decoder state matches the concatenated encoder state.
    decoder_gru = GRU(latent_dim*2,return_sequences=True,return_state=True, dropout=0.5)
    gru_outputs,_ = decoder_gru(embed_decoder, initial_state=state_h)
    decoder_dense = Dense(num_decoder_tokens, activation='softmax')
    decoder_outputs = decoder_dense(gru_outputs)


    basemodel = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    basemodel.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return basemodel

## Create Test model

In [None]:
# Lazily built inference models of the baseline.
base_encoder = None
base_decoder = None
def get_base_encoder():
    """Return the baseline inference encoder, building it on first use.

    The encoder maps question tokens to the concatenated forward/backward
    state, reusing layers 1 and 3 of the baseline model (assumed to be the
    encoder embedding and the bidirectional GRU - verify against the model
    summary).
    """
    global base_encoder
    if base_encoder is None:
        trained = get_basemodel()
        question_tokens = Input(shape=(None, ))
        embedded_question = trained.layers[1](question_tokens)
        _, fwd_state, bwd_state = trained.layers[3](embedded_question)
        base_encoder = Model(question_tokens, Concatenate()([fwd_state, bwd_state]))
    return base_encoder

def get_base_decoder():
    """Return the baseline single-step inference decoder, building it on first use.

    The decoder takes ``[tokens, previous_state]`` and returns
    ``[token_probabilities, new_state]``. It reuses layers 4, 6 and 7 of the
    baseline model (assumed to be the decoder embedding, the decoder GRU and
    the softmax layer - verify against the model summary).
    """
    global base_decoder
    if base_decoder is None:
        trained = get_basemodel()
        previous_state = Input(shape=(latent_dim*2,))
        previous_tokens = Input(shape=(None, ))
        embedded_tokens = trained.layers[4](previous_tokens)
        step_outputs, next_state = trained.layers[6](
            embedded_tokens, initial_state=previous_state)
        token_probabilities = trained.layers[7](step_outputs)
        base_decoder = Model([previous_tokens, previous_state],
                             [token_probabilities, next_state])
    return base_decoder
def test_base(data):
    """Evaluate the baseline model on ``data`` and return the F-measure.

    Args:
        data: list of (question, answer) pairs.

    Each sample is decoded with the baseline encoder/decoder for at most
    ``max_len`` steps. A failing sample is printed and skipped; the number of
    skipped samples is reported at the end so an incomplete evaluation is not
    mistaken for a complete one.
    """
    reset_measure()
    attempted = 0
    failed = 0
    # Batch size 1, single pass over the data.
    for X,Y in tqdm( generate_data(data,1,False)):
        attempted += 1
        try:
            y_pred = run_model(get_base_encoder(),get_base_decoder(),X[0],max_len)
            compute_recall_precision(Y[0],y_pred[0])
        except Exception as ex:
            print(ex)
            failed += 1
    if failed:
        print('%i of %i samples failed and are not part of the measures' % (failed, attempted))
    return compute_measure()

## Train Base

In [None]:
    # Start below every attainable F-measure so that the first round is always
    # checkpointed; otherwise a first score of 0 would leave no model file
    # for the testing cell to load.
    best = float('-inf')
    while(True):
        get_basemodel().fit_generator(generator=generate_data(list(zip(Q_train_tokenized,A_train_tokenized)),batch_size,Random=False),
                            steps_per_epoch = len(Q_train_tokenized)//batch_size,
                            epochs=epochs)

        m= test_base(list(zip(Q_valid_tokenized,A_valid_tokenized)))
        if not m > best:
            # The validation F-measure stopped improving.
            break
        best = m
        get_basemodel().save('cached_models\\model_base.h5')




## Test Base model

In [None]:
# Replace the in-memory baseline with the saved checkpoint and drop the cached
# inference models so that they are rebuilt from the loaded layers.
basemodel = load_model('cached_models\\model_base.h5')
base_encoder = base_decoder = None
test_base(list(zip(Q_test_tokenized, A_test_tokenized)))