In [0]:
from google.colab import files
import requests

def download_file_from_google_drive(id, destination):
    """Download a Google Drive file to a local path.

    A first GET is issued for the file id. If that response carries a
    ``download_warning*`` cookie (see ``get_confirm_token``), the cookie value
    is sent back as the ``confirm`` parameter in a second GET. The body of the
    final response is streamed to ``destination``.

    Args:
        id: Google Drive file id (passed as the ``id`` query parameter).
        destination: Local path the response body is written to.

    Raises:
        requests.HTTPError: if either request returns a 4xx/5xx status.
            Without this check an error page would be silently written to
            ``destination`` and only fail later when parsed as data.
    """
    URL = "https://docs.google.com/uc?export=download"

    # The context manager releases the pooled connections once the body has
    # been fully written (the original never closed the session).
    with requests.Session() as session:
        response = session.get(URL, params={'id': id}, stream=True)
        response.raise_for_status()

        token = get_confirm_token(response)
        if token:
            # Repeat the request with the confirmation token taken from the cookie.
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)
            response.raise_for_status()

        # Must run inside the `with`: the streamed body is read lazily.
        save_response_content(response, destination)

def get_confirm_token(response):
    """Return the value of the first ``download_warning*`` cookie, or None.

    Args:
        response: Object exposing a ``cookies`` mapping with ``items()``.

    Returns:
        The cookie value of the first cookie whose name starts with
        ``download_warning``; ``None`` when no such cookie exists.
    """
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(matching_values, None)

def save_response_content(response, destination):
    """Stream the body of ``response`` into the binary file ``destination``.

    Args:
        response: Object exposing ``iter_content(chunk_size)`` yielding bytes.
        destination: Path of the file to (over)write.
    """
    chunk_bytes = 32768

    with open(destination, "wb") as out_file:
        # Empty chunks (keep-alive) are dropped before writing.
        out_file.writelines(
            piece for piece in response.iter_content(chunk_bytes) if piece
        )
# Fetch the three input files (training data, unlabelled evaluation set and
# the GloVe embeddings) from Google Drive, one (file id, local name) pair at a time.
for file_id, destination in (
    ('1nUXvuTgaICF7Um9NPgHH_hlVG8jMK9WX', 'Data.tsv'),
    ('1DhunfTzOTZHSTyOYLmIaVobQlLU_AG8u', 'eval1_unlabelled.tsv'),
    ('1SRwaKKUE-x-deox5cPFntVvKYMu7BpEO', 'glove.6B.50d.txt'),
):
    download_file_from_google_drive(file_id, destination)


In [2]:
!apt-get install --no-install-recommends openmpi-bin libopenmpi-dev libopencv-dev python3-opencv python-opencv && ln -sf /usr/lib/x86_64-linux-gnu/libmpi_cxx.so /usr/lib/x86_64-linux-gnu/libmpi_cxx.so.1 && ln -sf /usr/lib/x86_64-linux-gnu/openmpi/lib/libmpi.so /usr/lib/x86_64-linux-gnu/openmpi/lib/libmpi.so.12 && ln -sf /usr/lib/x86_64-linux-gnu/libmpi.so /usr/lib/x86_64-linux-gnu/libmpi.so.12 && pip install cntk-gpu
import cntk as C

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libopenmpi-dev is already the newest version (2.1.1-8).
openmpi-bin is already the newest version (2.1.1-8).
libopencv-dev is already the newest version (3.2.0+dfsg-4ubuntu0.1).
python-opencv is already the newest version (3.2.0+dfsg-4ubuntu0.1).
python3-opencv is already the newest version (3.2.0+dfsg-4ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 8 not upgraded.




In [0]:
import cntk as C

In [0]:
import math

import pickle



# Module-level state of this cell. Neither name is referenced by the code
# visible in this notebook; they are kept so other cells can still rely on them.
docIDFDict = {}
avgDocLength = 0


def GetCorpus(inputfile, trainfile, validfile, train_lines=3000000, valid_lines=50):
    """Split ``inputfile`` line by line into a training and a validation file.

    The first ``train_lines`` lines are copied to ``trainfile``; the next
    ``valid_lines`` lines are copied to ``validfile``; any remaining lines are
    ignored. Both output files are always created (possibly empty).

    The original condition for the validation branch was ``i < 50``, which can
    never be true once ``i < 3000000`` has failed, so the validation file was
    always empty. The validation range now starts where the training range ends.

    Args:
        inputfile: Path of the UTF-8 source file.
        trainfile: Path of the training split to write.
        validfile: Path of the validation split to write.
        train_lines: Number of leading lines that go to ``trainfile``.
        valid_lines: Number of following lines that go to ``validfile``.
    """
    valid_end = train_lines + valid_lines

    # `with` closes all three handles, including the input file the original leaked.
    with open(inputfile, "r", encoding="utf-8") as source, \
            open(trainfile, "w", encoding="utf-8") as train_out, \
            open(validfile, "w", encoding="utf-8") as valid_out:
        for index, line in enumerate(source):
            if index < train_lines:
                train_out.write(line)
            elif index < valid_end:
                valid_out.write(line)
            else:
                # Nothing past the validation range is used; stop reading.
                break



if __name__ == '__main__' :
    # Raw labelled data, one record per line:
    #   queryid \t query \t passage \t label \t passageid
    inputFileName = "Data.tsv"
    # Unlabelled evaluation data, one record per line:
    #   queryid \t query \t passage \t passageid
    testFileName = "eval1_unlabelled.tsv"
    # Output files produced by the split below.
    trainfile = 'traindata.tsv'
    validfile = 'validationdata.tsv'

    # Write the train / validation split files from the raw data file.
    GetCorpus(inputFileName, trainfile, validfile)

In [6]:
import re

# Module-level state shared by the conversion helpers in this cell.
GloveEmbeddings = {}        # word -> space-separated embedding values (string)
max_query_words = 12        # queries are padded / trimmed to this many words
max_passage_words = 50      # passages are padded / trimmed to this many words
emb_dim = 50                # number of values per embedding vector
#emb_dim = 300


def loadEmbeddings(embeddingfile):
    """Load a GloVe text file into the global ``GloveEmbeddings`` dictionary.

    Each non-empty line is split on whitespace; the first token is the word
    and the remaining tokens are stored as a single space-joined string.
    Lines without any vector values (including blank lines) are skipped
    instead of raising ``IndexError`` or storing an empty vector.

    After loading, the key ``"zerovec"`` is set to ``emb_dim`` zeros. It is
    used for padding and for out-of-vocabulary words; it is worth experimenting
    with non-zero padding constants as well.

    Args:
        embeddingfile: Path of the embedding file (read as UTF-8, undecodable
            bytes ignored).
    """
    global GloveEmbeddings, emb_dim

    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(embeddingfile, "r", encoding="utf-8", errors="ignore") as fe:
        for line in fe:
            tokens = line.split()
            if len(tokens) < 2:
                continue
            GloveEmbeddings[tokens[0]] = " ".join(tokens[1:])

    # Same formatting as the loaded vectors: single spaces, no trailing space.
    GloveEmbeddings["zerovec"] = " ".join(["0.0"] * emb_dim)


def _text_to_feature_vector(text, max_words, embeddings):
    """Return the space-joined embedding values for exactly ``max_words`` words of ``text``.

    The text is split on runs of non-word characters; extra words are trimmed,
    unknown words and missing positions use ``embeddings["zerovec"]``.
    """
    zero_vec = embeddings["zerovec"].strip()
    words = [w for w in re.split(r'\W+', text) if w][:max_words]
    vectors = [embeddings.get(w, zero_vec).strip() for w in words]
    vectors += [zero_vec] * (max_words - len(words))  # pad short texts
    return " ".join(vectors)


def TextDataToCTF(inputfile,outputfile,isEvaluation):
    """Convert a tab-separated query/passage file to CNTK Text Format (CTF).

    Input lines have the form ``query_id \\t query \\t passage \\t label ...``.
    For every non-blank line one output line is written:

    * training mode: ``|qfeatures <q> |pfeatures <p> |labels <one-hot>``,
      where the one-hot is ``1 0`` for label ``"0"`` and ``0 1`` otherwise;
    * evaluation mode: ``|qfeatures <q> |pfeatures <p> |qid <query_id>``
      (the fourth column is not read in this mode).

    ``<q>`` / ``<p>`` are the embeddings of the lower-cased query / passage,
    padded or trimmed to ``max_query_words`` / ``max_passage_words`` words.
    The query id is written unchanged (it is no longer lower-cased along with
    the text).

    Args:
        inputfile: Path of the TSV file to read (UTF-8, undecodable bytes ignored).
        outputfile: Path of the CTF file to write.
        isEvaluation: True to emit ``|qid`` instead of ``|labels``.

    Raises:
        IndexError: if a non-blank line has too few tab-separated columns.
    """
    # `with` closes both handles; the original relied on garbage collection
    # to flush the output file.
    with open(inputfile, "r", encoding="utf-8", errors="ignore") as source, \
            open(outputfile, "w", encoding="utf-8") as out:
        for line in source:
            fields = line.strip().split("\t")
            if fields == [""]:
                continue  # blank line: nothing to convert

            query_id = fields[0]
            query_vec = _text_to_feature_vector(
                fields[1].lower(), max_query_words, GloveEmbeddings)
            passage_vec = _text_to_feature_vector(
                fields[2].lower(), max_passage_words, GloveEmbeddings)
            features = "|qfeatures " + query_vec + " |pfeatures " + passage_vec

            if isEvaluation:
                out.write(features + " |qid " + str(query_id) + "\n")
            else:
                label_str = "1 0" if fields[3] == "0" else "0 1"
                out.write(features + " |labels " + label_str + "\n")



if __name__ == "__main__":

    # Text inputs produced / downloaded by the earlier cells.
    trainFileName = "traindata.tsv"
    validationFileName = "validationdata.tsv"
    EvaluationFileName = "eval1_unlabelled.tsv"

    # 50-dimensional GloVe vectors (switch to the 300d file together with emb_dim).
    embeddingFileName = "glove.6B.50d.txt"
    #embeddingFileName = "glove.6B.300d.txt"
    loadEmbeddings(embeddingFileName)

    # Convert each query/passage text file to CNTK Text Format (CTF):
    # (input file, output file, isEvaluation, progress message)
    for _source, _target, _is_eval, _message in (
        (trainFileName, "TrainData.ctf", False, "Train Data conversion is done"),
        (validationFileName, "ValidationData.ctf", False, "Validation Data conversion is done"),
        (EvaluationFileName, "EvaluationData.ctf", True, "Evaluation Data conversion is done"),
    ):
        TextDataToCTF(_source, _target, _is_eval)
        print(_message)


Train Data conversion is done
Validation Data conversion is done
Evaluation Data conversion is done


In [0]:
from __future__ import print_function
import numpy as np
import sys
import os
import cntk as C
from cntk.ops import combine, splice, sequence, reconcile_dynamic_axes
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs,INFINITELY_REPEAT
from cntk.learners import sgd, learning_parameter_schedule_per_sample
from cntk import input_variable, cross_entropy_with_softmax,classification_error, sequence
from cntk.logging import ProgressPrinter
from cntk.layers import Sequential, Embedding, Recurrence, LSTM, Dense
from cntk.layers import Sequential
from cntk.layers.typing import Tensor, Sequence
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
from sklearn.metrics import precision_recall_fscore_support
from sklearn.exceptions import UndefinedMetricWarning
import warnings
warnings.filterwarnings("ignore", category=UndefinedMetricWarning) 
from google.colab import files


# Module-level state of the training cell.
validation_query_vectors = []    # one flat list of floats per validation query
validation_passage_vectors = []  # one flat list of floats per validation passage
validation_labels = []           # 0 / 1 label per validation pair
q_max_words=12
p_max_words=50
emb_dim=50


def LoadValidationSet(validationfile):
    """Read a CTF validation file into the three ``validation_*`` global lists.

    Every non-blank line is expected to look like
    ``|qfeatures <floats> |pfeatures <floats> |labels <a> <b>``. The query and
    passage values are parsed to lists of floats; the label stored is the
    second one-hot entry (``"1 0"`` -> 0, ``"0 1"`` -> 1).

    Results are appended to the global lists, so calling this function more
    than once accumulates entries.

    Args:
        validationfile: Path of the CTF file to read (UTF-8).

    Raises:
        IndexError: if a non-blank line has fewer than three ``|`` fields.
    """
    # `with` closes the handle the original left open.
    with open(validationfile, 'r', encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue  # blank lines used to raise IndexError

            # tokens[0] is empty because every line starts with '|'
            tokens = stripped.split("|")
            x1 = tokens[1].replace("qfeatures", "").strip()  # query features
            x2 = tokens[2].replace("pfeatures", "").strip()  # passage features
            y = tokens[3].replace("labels", "").strip()      # one-hot label

            validation_query_vectors.append([float(v) for v in x1.split()])
            validation_passage_vectors.append([float(v) for v in x2.split()])
            # index 1 of the one-hot pair is the positive-class flag
            validation_labels.append([int(w) for w in y.split()][1])

    print("Validation Vectors are created")

def cnn_network(queryfeatures, passagefeatures,num_classes):
    """Build the query/passage matching network and return its output node.

    Despite the name, no convolution layers are used. The graph consists of
    three identical stacks (forward LSTM + backward LSTM -> splice ->
    batch normalisation -> dense -> dropout):

    1. one stack over ``queryfeatures`` (dense size 50),
    2. one stack over ``passagefeatures`` (dense size 50),
    3. one stack over the element-wise product of 1 and 2 (dense size 25),

    followed by a ``Dense(num_classes)`` layer with softmax named ``"overall"``.

    Args:
        queryfeatures: CNTK input node holding the query features.
        passagefeatures: CNTK input node holding the passage features.
        num_classes: Size of the final softmax layer.

    Returns:
        The CNTK function for the final ``"overall"`` layer.
    """
    lstm_dim = 50
    tower_dense_dim = 50
    merged_dense_dim = 25
    dropout_ratio = 0.2

    def bilstm_stack(features, fwd_name, bwd_name, dense_dim, dense_name, dropout_name):
        # Layers are created in the same order as the hand-written version:
        # forward LSTM, backward LSTM, splice(backward, forward), BN, dense, dropout.
        forward = C.layers.Recurrence(
            C.layers.LSTM(shape=lstm_dim), go_backwards=False, name=fwd_name)(features)
        backward = C.layers.Recurrence(
            C.layers.LSTM(shape=lstm_dim), go_backwards=True, name=bwd_name)(features)
        both_directions = splice(backward, forward)
        normalised = C.layers.BatchNormalization(map_rank=1)(both_directions)
        projected = C.layers.Dense(dense_dim, activation=C.elu, name=dense_name)(normalised)
        return C.layers.Dropout(dropout_ratio, name=dropout_name)(projected)

    with C.layers.default_options(initial_state=0.1, pad = False,activation=C.elu ):
        query_encoding = bilstm_stack(
            queryfeatures, 'r1', 'rr1', tower_dense_dim, 'd1', 'a1')
        passage_encoding = bilstm_stack(
            passagefeatures, 'r2', 'rr2', tower_dense_dim, 'd2', 'a2')

        # Combine the two encodings element-wise and run one more stack on top.
        merged = C.element_times(query_encoding, passage_encoding)
        merged_encoding = bilstm_stack(
            merged, 'r3', 'r4', merged_dense_dim, 'd3', 'a3')

        model = C.layers.Dense(
            num_classes, activation=C.softmax, name="overall")(merged_encoding)
        return model

def create_reader(path, is_training, query_total_dim, passage_total_dim, label_total_dim):
    """Create a CNTK ``MinibatchSource`` over a CTF file.

    Three dense streams are declared: ``queryfeatures`` (field ``qfeatures``),
    ``passagefeatures`` (field ``pfeatures``) and ``labels`` (field ``labels``).

    Args:
        path: Path of the CTF file.
        is_training: When true the data is randomised and repeated indefinitely;
            otherwise it is read in order for a single full sweep.
        query_total_dim: Shape of the ``qfeatures`` stream.
        passage_total_dim: Shape of the ``pfeatures`` stream.
        label_total_dim: Shape of the ``labels`` stream.

    Returns:
        The configured ``MinibatchSource``.
    """
    stream_definitions = StreamDefs(
        queryfeatures=StreamDef(field='qfeatures', shape=query_total_dim, is_sparse=False),
        passagefeatures=StreamDef(field='pfeatures', shape=passage_total_dim, is_sparse=False),
        labels=StreamDef(field='labels', shape=label_total_dim, is_sparse=False),
    )
    if is_training:
        sweeps = INFINITELY_REPEAT
    else:
        sweeps = FULL_DATA_SWEEP
    deserializer = CTFDeserializer(path, stream_definitions)
    return MinibatchSource(deserializer, randomize=is_training, max_sweeps=sweeps)

def TrainAndValidate(trainfile):
    """Train the network on a CTF file and return the trained model.

    The model built by ``cnn_network`` is trained with Adagrad on a binary
    cross-entropy loss. After each epoch the training progress is summarised
    and the model is saved as ``RNN_<epoch>.dnn``. Despite the function name,
    no metric is computed on a validation set here.

    Args:
        trainfile: Path of the training data in CTF format.

    Returns:
        The trained CNTK model (output node of ``cnn_network``).
    """
    # ----- hyper-parameters -----
    num_classes = 2
    q_max_words, p_max_words, emb_dim = 12, 50, 50  # emb_dim = 300 for the 300d vectors
    minibatch_size = 250
    epoch_size = 1500000    # samples consumed per epoch
    total_epochs = 500      # number of epochs to run
    query_total_dim = q_max_words * emb_dim
    passage_total_dim = p_max_words * emb_dim
    label_total_dim = num_classes

    # ----- input placeholders and their mapping to the reader streams -----
    query_input_var = C.sequence.input_variable(
        (1, q_max_words, emb_dim), np.float32, is_sparse=False)
    passage_input_var = C.sequence.input_variable(
        (1, p_max_words, emb_dim), np.float32, is_sparse=False)
    output_var = C.sequence.input_variable(num_classes, np.float32, is_sparse=False)
    print(type(query_input_var))

    train_reader = create_reader(
        trainfile, True, query_total_dim, passage_total_dim, label_total_dim)
    input_map = {
        query_input_var: train_reader.streams.queryfeatures,
        passage_input_var: train_reader.streams.passagefeatures,
        output_var: train_reader.streams.labels,
    }

    # ----- model, criterion and learner -----
    model_output = cnn_network(query_input_var, passage_input_var, num_classes)
    loss = C.binary_cross_entropy(model_output, output_var)
    pe = C.classification_error(model_output, output_var)
    learner = C.adagrad(
        model_output.parameters,
        lr=C.learning_rate_schedule(0.03, C.UnitType.minibatch))
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=total_epochs)

    trainer = C.Trainer(model_output, (loss, pe), learner, progress_printer)
    C.logging.log_number_of_parameters(model_output)
    print()

    # ----- minibatch training loop -----
    for epoch in range(total_epochs):
        print("Epoch : ",epoch)
        samples_seen = 0
        while samples_seen < epoch_size:
            # Never request more samples than are left in this epoch.
            batch_size = min(minibatch_size, epoch_size - samples_seen)
            batch = train_reader.next_minibatch(batch_size, input_map=input_map)
            trainer.train_minibatch(batch)
            samples_seen += batch[output_var].num_samples

        trainer.summarize_training_progress()
        # Checkpoint after every epoch.
        model_output.save("RNN_{}.dnn".format(epoch))

    return model_output

def GetPredictionOnEvalSet(model,testfile,submissionfile):
    """Score every query/passage pair of a CTF file and write a submission file.

    Every non-blank line of ``testfile`` is expected to look like
    ``|qfeatures <floats> |pfeatures <floats> |qid <query_id>``. The feature
    values are reshaped to ``(1, q_max_words, emb_dim)`` and
    ``(1, p_max_words, emb_dim)`` float32 arrays and passed to ``model``; the
    score kept is ``model(query, passage)[0][0][1]``.

    The submission file has one line per query id, in order of first
    appearance: ``query_id \\t score \\t score ...`` with scores in input order.

    Args:
        model: Callable taking the query and passage arrays.
        testfile: Path of the CTF evaluation file (UTF-8).
        submissionfile: Path of the tab-separated file to write.

    Raises:
        IndexError: if a non-blank line has fewer than three ``|`` fields.
        ValueError: if a feature vector cannot be reshaped to the expected size.
    """
    all_scores = {}  # query_id -> list of scores for its passages

    # `with` closes the input handle, which the original never closed.
    with open(testfile, 'r', encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue  # blank lines used to raise IndexError

            # tokens[0] is empty because every line starts with '|'
            tokens = stripped.split("|")
            x1 = tokens[1].replace("qfeatures", "").strip()        # query features
            x2 = tokens[2].replace("pfeatures", "").strip()        # passage features
            query_id = tokens[3].replace("qid", "").strip()

            queryVec = np.array(
                [float(v) for v in x1.split()], dtype="float32"
            ).reshape(1, q_max_words, emb_dim)
            passageVec = np.array(
                [float(v) for v in x2.split()], dtype="float32"
            ).reshape(1, p_max_words, emb_dim)

            score = model(queryVec, passageVec)[0][0][1]  # forward pass
            all_scores.setdefault(query_id, []).append(score)

    with open(submissionfile, "w", encoding="utf-8") as fw:
        for query_id, scores in all_scores.items():
            fw.write(query_id + "\t" + "\t".join(str(sc) for sc in scores) + "\n")

    
if __name__ == "__main__":

    # CTF inputs written by the conversion cell, and the file to submit.
    submissionFileName = "answer.tsv"
    testSetFileName = "EvaluationData.ctf"
    validationSetFileName = "ValidationData.ctf"
    trainSetFileName = "TrainData.ctf"
    print("done")

    # 1) load the validation vectors into the module-level lists
    LoadValidationSet(validationSetFileName)
    # 2) train the model
    model = TrainAndValidate(trainSetFileName)
    # 3) score the evaluation set and write the submission file
    GetPredictionOnEvalSet(model, testSetFileName, submissionFileName)
