# Initial setup
- TensorBoard setup
- Google Colab and Google Drive related functions.
- GPU and memory consumption setup
- NLTK install

## Tensorboard setup

In [0]:

"""Setup NGROK server for tensorboard
   Thanks to question at (https://stackoverflow.com/questions/47818822/can-i-use-tensorboard-with-google-colab) 
   
   Args:1
        LOG_DIR (string): Location where the files must be logged
"""
  
LOG_DIR = '/tmp/log'
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(LOG_DIR)
)

! wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
! unzip ngrok-stable-linux-amd64.zip

get_ipython().system_raw('./ngrok http 6006 &')

! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

## GPU and memory consumption setup

In [0]:
"""Check Google Collab GPU and CPU
   Thanks to the question at (https://stackoverflow.com/questions/48750199/google-colaboratory-misleading-information-about-its-gpu-only-5-ram-available)
   only one GPU on Colab and it isn't guaranteed. 
   Prints:
        GPU and RAM utilization. 
"""

!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU


In [0]:
GPUs = GPU.getGPUs()
# Colab offers at most one GPU and does not guarantee it, so tolerate an empty list
gpu = GPUs[0] if GPUs else None
def printm():
  """Print free system RAM, this process's resident size and, if a GPU is attached, its memory usage."""
  process = psutil.Process(os.getpid())
  print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
  if gpu is None:
    # Without this guard the cell died with IndexError on CPU-only runtimes
    print("GPU RAM Free: n/a (no GPU attached to this runtime)")
    return
  print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

## Google Drive Mount

In [0]:
# Mount Google Drive inside the Colab VM at /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Work from the project folder on Drive: later cells use relative paths (WikiQA_Corpus/, ./models/, ./cost/, ./experiments/)
cd /content/gdrive/My Drive/ABCNN/

In [0]:
import nltk
# The stop-word list is read by preprocess_data via nltk.corpus.stopwords
nltk.download('stopwords')
# beautifultable renders the summary tables printed throughout the notebook
!pip3 install beautifultable

In [0]:
ls

# Word2vec init

- Instantiates the class and creates an object. (3.6GB file, will take a while)


In [0]:
import gensim
import numpy as np
class Word2Vec():
  """Uses gensim to load Google's pretrained vectors and serves a fallback vector for unknown words.
  """
  def __init__(self, path='GoogleNews-vectors-negative300.bin', binary=True):
    """Loads the embeddings and draws the vector shared by all out-of-vocabulary words.
       Args:
        path: word2vec file to load; defaults to the Google News vectors used by this notebook.
        binary: forwarded to gensim's load_word2vec_format.
       Attr:
        model: the gensim KeyedVectors holding the embeddings for the words in its vocabulary. About 3GB size.
        unknowns: one float32 vector, as long as the loaded embeddings, sampled from a uniform
                  distribution from -0.01 to 0.01
    """
    self.model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=binary)
    # Size the fallback from the loaded model instead of hard-coding 300
    self.unknowns = np.random.uniform(-0.01, 0.01, self.model.vector_size).astype("float32")

  def get(self, word):
    """Returns the embedding for a word.
       Args:
        word: the token to look up.
       Returns:
        The model's embedding when the word is in its vocabulary, otherwise self.unknowns.
    """
    # `in` / `[]` are supported by both gensim 3.x and 4.x KeyedVectors;
    # the former `.vocab` / `.word_vec` accessors were removed in gensim 4.
    if word in self.model:
        return self.model[word]
    return self.unknowns


In [0]:
word2vec = Word2Vec() #Instantiates the class and creates an object. 

# Preprocessing for the WikiQA dataset

- List of all questions, their corresponding answers and their labels is obtained. 
- All the questions and the answers are converted into lowercases.
- All answers are truncated to at most 40 words, as per the paper.
- The number of words shared by the question and the answer is counted, excluding stop words. This is the word_cnt feature.
- The Inverse Document Frequency (IDF) values of those shared words are looked up and summed. This is the wgt_word_cnt feature.
- Max length is also calculated. This is 40 for our dataset. 
- Every question and answer is then padded to max_length. (40)

Returns the preprocessed questions, answers, labels, features, max_len

In [0]:
"""Preprocessing for the WikiQA dataset.
   Attr:
    max_len: maximum length of the question, answers are truncated to 40 characters. 
    questions: List of all questions.
    answers: List of all answers
    labels: 0 or 1 for each question/answer combination
    features: list of the following elements for each line: [len(question), len(answer), word_cnt, wgt_word_cnt]
      word_cnt: Common words between a question and a answer that are not in the stop words list
      wgt_word_cnt: sum of inverse document frequency of all the common words between a question and a answer
"""
def preprocess_data(mode):
  
  if mode == "train":
    file_path = "WikiQA_Corpus/WikiQA-train.txt"
  else: 
    file_path = "WikiQA_Corpus/WikiQA-test.txt"
    
  questions, answers, labels, features = [], [], [], []
  stopwords = nltk.corpus.stopwords.words("english") #Load the stopwords from nltk
  max_len = 0
  vocab = []
  idf = {}

  with open(file_path,'r',encoding='utf-8') as f:
    for line in f:

      sentence = line[:-1].split('\t')

      question = sentence[0].lower().split()
      answer = sentence[1].lower().split()[:40] #Only 40 words, as per the paper
      label = int(sentence[2])

      questions.append(question)
      answers.append(answer)
      labels.append(label)

      #Calculate number of common words between the question and answers that are not in the stopwords list.
      word_cnt = len([word for word in question if word not in stopwords and word in answer])
      features.append([len(question), len(answer), word_cnt])

    #Calculate max-length of a sentence in both question and answer
    max_len = max(len(max(questions, key=len)),len(max(answers, key=len)))

    #Flatten a list and build a vocab using the unique words
    vocab = list(set([y for x in questions for y in x]))

    #Calculate IDF for every word in vocab:
    for w in vocab:
      idf[w] = np.log( len(questions)/ len([1 for question in questions for word in question if word==w]) )

    #Obtain the common words, calculate the IDF for each word and sum them.
    for i in range(len(questions)):
      wgt_word_cnt = sum([idf[w] for word in questions[i] if word not in stopwords and word in answers[i]])
      features[i].append(wgt_word_cnt)
      
  return questions, answers, labels, features, max_len

In [0]:
questions, answers, labels, features, max_len = preprocess_data("train")

# Dataloader class.

- loads the preprocessed data and provides batch input to the train function



In [0]:
class DataLoader():
    """Serves the preprocessed question/answer pairs as padded embedding batches."""

    def __init__(self, word2vec, questions, answers, labels, features, max_len):
        """This class provides batch input for training and testing.
           Attr:
              word2vec: object whose get(word) returns the embedding vector of a word
              questions: List of all questions.
              answers: List of all answers
              labels: 0 or 1 for each question/answer combination
              features: list of the following elements for each line: [len(question), len(answer), word_cnt, wgt_word_cnt]
              max_len: maximum length of either the question or the answer. [40]
              index: Index to keep track of batches
              data_size: size of the data, i.e len(questions)
              num_features: number of features per pair (4 in this notebook, 0 when features is empty)
        """
        self.questions, self.answers, self.labels, self.features = questions, answers, labels, features
        self.index, self.max_len, self.word2vec = 0, max_len, word2vec
        self.data_size = len(self.questions)
        self.num_features = len(features[0]) if features else 0

    def is_available(self):
        """True while there are samples left after the current index."""
        return self.index < self.data_size

    def reset_index(self):
        """Rewinds the loader to the first sample."""
        self.index = 0

    def next(self):
        """Returns the next (question, answer, label, features) sample and advances the index.

           Returns None once the data is exhausted. (This method used to read a
           non-existent `self.data` attribute and always raised AttributeError.)
        """
        if not self.is_available():
            return None
        current = self.index
        self.index += 1
        return self.questions[current], self.answers[current], self.labels[current], self.features[current]

    def _sentence_matrix(self, words):
        """Embeds one tokenised sentence as an array of shape [1, embedding_dim, max_len].

           • The embedding of every word becomes one column: [embedding_dim, number_of_words]
           • The columns are zero-padded on the right up to max_len
           • A leading axis is added so the matrices can be concatenated into a batch
        """
        embedded = np.column_stack([self.word2vec.get(w) for w in words])
        padded = np.pad(embedded, [[0, 0], [0, self.max_len - len(words)]], "constant")
        return np.expand_dims(padded, axis=0)

    def next_batch(self, batch_size):
        """Returns the next batch and advances the index.

           Args:
              batch_size: maximum number of pairs; the last batch may be smaller.
           Returns:
              batch_questions, batch_answers: arrays of shape [batch, embedding_dim, max_len]
              batch_labels, batch_features: the matching slices of labels and features
        """
        batch_size = min(self.data_size - self.index, batch_size)
        start, stop = self.index, self.index + batch_size

        question_mats = [self._sentence_matrix(q) for q in self.questions[start:stop]]
        answer_mats = [self._sentence_matrix(a) for a in self.answers[start:stop]]

        batch_questions = np.concatenate(question_mats, axis=0) #Dimensions[64,300,40]
        batch_answers = np.concatenate(answer_mats, axis=0) #Dimensions[64,300,40]
        batch_labels = self.labels[start:stop] #Dimensions[64]
        batch_features = self.features[start:stop] #Dimensions[64,4]

        self.index = stop

        return batch_questions, batch_answers, batch_labels, batch_features

In [0]:
dataloader = DataLoader(word2vec, questions, answers, labels, features, max_len)

# Dataloader Results

In [14]:
from beautifultable import BeautifulTable

# Two-column overview of the split currently held by `dataloader`
dataloader_table = BeautifulTable()
dataloader_table.column_headers = ["Dataloader", "Values"]

for _row in (
    ["Maximum length of Data", dataloader.max_len],
    ["Number of questions", len(dataloader.questions)],
    ["Number of answers", len(dataloader.answers)],
    ["Number of features", dataloader.num_features],
):
    dataloader_table.append_row(_row)

print(dataloader_table)

+------------------------+--------+
|       Dataloader       | Values |
+------------------------+--------+
| Maximum length of Data |   40   |
+------------------------+--------+
|  Number of questions   | 20360  |
+------------------------+--------+
|   Number of answers    | 20360  |
+------------------------+--------+
|   Number of features   |   4    |
+------------------------+--------+


# BCNN

- Class definition
- Train
- Test

In [0]:
import tensorflow as tf
import numpy as np


class BCNN():
    def __init__(self, sentence_length, filter_width, l2_reg, num_features, embedding_dim=300, nb_filters=50, num_classes=2, num_layers=2):
        """Builds the BCNN graph.
           BCNN consists of two CNNs, each processing one of the two sentences, and a final layer that solves the sentence pair tasks.

           Each block applies a wide convolution followed by average pooling. The cosine
           similarity of the two sentences' all-pooled representations is collected for
           the raw input and after every block, and is fed together with the hand-crafted
           features to a fully connected output layer.

         Args:
            sentence_length: number of word columns per sentence (40 in this notebook)
            filter_width: width of the convolution and pooling windows (4 in this notebook)
            l2_reg: scale of the L2 weight regulariser (0.0004 in this notebook)
            num_features: number of hand-crafted features per pair (4 in this notebook)
            embedding_dim: number of rows of each input matrix, default 300
            nb_filters: number of convolution filters per block, default 50
            num_classes: number of units of the output layer, default 2
            num_layers: a second CNN block is added when this is greater than 1, default 2

         Attributes set on the instance:
            x1, x2, y, features: input placeholders
            output_features, estimation, prediction, cost, merged: output tensors
        """

        self.x1 = tf.placeholder(tf.float32, shape=[None, embedding_dim, sentence_length], name="x1") #[b,d,s] -> [64,300,40]
        self.x2 = tf.placeholder(tf.float32, shape=[None, embedding_dim, sentence_length], name="x2") #[b,d,s] -> [64,300,40]
        self.y = tf.placeholder(tf.int32, shape=[None], name="y") #[b] -> [64]
        self.features = tf.placeholder(tf.float32, shape=[None, num_features], name="features") #[b,num_features] -> [64,4]

        def pad_for_wide_conv(x):
          """Zero-pads the third axis of x with filter_width - 1 columns on each side.
            Args:
                x: input tensor, laid out as (b, d, s, c) e.g. [64, 300, 40, 1]
            Returns:
                padded tensor (b, d, s + 2*(filter_width - 1), c), e.g. [64, 300, 46, 1] for filter_width = 4
          """
          return tf.pad(x, np.array([[0, 0], [0, 0], [filter_width - 1, filter_width - 1], [0, 0]]), "CONSTANT", name="pad_wide_conv")

        def cos_sim(v1, v2):
            """Compute the row-wise cosine similarity between v1 and v2
               `cosine`: Defined as <x.y>/ |x|*|y|
               Args:
                v1: tensor whose axis 1 holds the vector components (the callers pass all_pool outputs, [b, d])
                v2: tensor of the same shape as v1
               Returns:
                one similarity value per row
               NOTE(review): there is no epsilon in the denominator, so an all-zero row would divide by zero.
            """

            norm1 = tf.sqrt(tf.reduce_sum(tf.square(v1), axis=1))
            norm2 = tf.sqrt(tf.reduce_sum(tf.square(v2), axis=1))
            dot_products = tf.reduce_sum(v1 * v2, axis=1, name="cos_sim")

            return dot_products / (norm1 * norm2)

        def convolution(name_scope, x, d, reuse):
            """Applies the tanh convolution of one block to an already padded sentence.
               Args:
                name_scope: prefix of the tf.name_scope wrapping the ops ("left" or "right")
                x: padded input tensor (b, d, s + 2*(filter_width - 1), 1)
                d: kernel height; CNN_layer passes embedding_dim for the first block and nb_filters for the second
                reuse: forwarded to conv2d; the right-hand call passes True so both sentences share the "conv" variables

               Returns:
                the conv2d output transposed with perm [0, 3, 2, 1].
                Assuming conv2d's default channels-last layout this is
                (b, nb_filters, s + filter_width - 1, 1), e.g. [64, 50, 43, 1] -- TODO confirm.
            """
            with tf.name_scope(name_scope + "-conv"):
                with tf.variable_scope("conv") as scope:
                    conv = tf.contrib.layers.conv2d(
                        inputs=x,
                        num_outputs=nb_filters,
                        kernel_size=(d, filter_width),
                        stride=1,
                        padding="VALID",
                        activation_fn=tf.nn.tanh,
                        weights_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                        weights_regularizer=tf.contrib.layers.l2_regularizer(scale=l2_reg),
                        biases_initializer=tf.constant_initializer(1e-04),
                        reuse=reuse,
                        trainable=True,
                        scope=scope
                    )

                    # Swap axes 1 and 3 so the filters take the place of the embedding rows
                    conv_trans = tf.transpose(conv, [0, 3, 2, 1], name="conv_trans")
                    return conv_trans

        def w_pool(x):
            """Window average pooling: (1, filter_width) window, stride 1, VALID padding.
               Args:
                x: convolution output of the current block, e.g. [64, 50, 43, 1]

               Returns:
                pooled tensor; with VALID padding the third axis shrinks by filter_width - 1, e.g. [64, 50, 40, 1]
            """

            w_ap = tf.layers.average_pooling2d(
                inputs=x,
                pool_size=(1, filter_width),
                strides=1,
                padding="VALID",
                name="w_ap"
            )

            return w_ap

        def all_pool(variable_scope, x):
            """Average-pools x over its whole width and flattens the result to [batch, d].
               Args:
                variable_scope: scope prefix. A name starting with "input" marks the raw embeddings
                                (pool width sentence_length, d = embedding_dim); any other name marks a
                                convolution output (pool width sentence_length + filter_width - 1, d = nb_filters)
                x: tensor to pool

               Returns:
                tensor reshaped to [-1, d]
            """
            with tf.variable_scope(variable_scope + "-all_pool"):

                if variable_scope.startswith("input"):

                    pool_width = sentence_length
                    d = embedding_dim
                else:
                    pool_width = sentence_length + filter_width - 1
                    d = nb_filters

                all_ap = tf.layers.average_pooling2d(
                    inputs=x,
                    pool_size=(1, pool_width),
                    strides=1,
                    padding="VALID",
                    name="all_ap")

                # [batch, d]
                all_ap_reshaped = tf.reshape(all_ap, [-1, d])
                return all_ap_reshaped

        def CNN_layer(variable_scope, x1, x2, d):
            """Each block contains input -> wide-conv -> w-pool (plus an all-pool of the conv output).
               Args:
                variable_scope: name of the block ("CNN-1", "CNN-2")
                x1, x2: the two sentence tensors entering the block
                d: kernel height handed to convolution
               Returns:
                left_wp, left_ap, right_wp, right_ap: for each sentence the window-pooled tensor
                (input of the next block) and the all-pooled [batch, nb_filters] representation
            """
            with tf.variable_scope(variable_scope):

                # reuse=False creates the conv variables, reuse=True makes the right side share them
                left_conv = convolution(name_scope="left", x=pad_for_wide_conv(x1), d=d, reuse=False)
                right_conv = convolution(name_scope="right", x=pad_for_wide_conv(x2), d=d, reuse=True)

                left_wp = w_pool(x=left_conv)
                left_ap = all_pool(variable_scope="left", x=left_conv)
                right_wp = w_pool(x=right_conv)
                right_ap = all_pool(variable_scope="right", x=right_conv)

                return left_wp, left_ap, right_wp, right_ap

        x1_expanded = tf.expand_dims(self.x1, -1) #[64, 300, 40, 1]
        x2_expanded = tf.expand_dims(self.x2, -1)  #[64, 300, 40, 1]

        LO_0 = all_pool(variable_scope="input-left", x=x1_expanded)  #[64, 300] after the reshape in all_pool
        RO_0 = all_pool(variable_scope="input-right", x=x2_expanded) #[64, 300] after the reshape in all_pool

        LI_1, LO_1, RI_1, RO_1 = CNN_layer(variable_scope="CNN-1", x1=x1_expanded, x2=x2_expanded, d=embedding_dim)
        sims = [cos_sim(LO_0, RO_0), cos_sim(LO_1, RO_1)] #Compute similarity scores and store them.

        if num_layers > 1:
            """ Create second CNN block if num_layers > 1
                Output from the first layer is given as input to the second
            """
            _, LO_2, _, RO_2 = CNN_layer(variable_scope="CNN-2", x1=LI_1, x2=RI_1, d=nb_filters)
            sims.append(cos_sim(LO_2, RO_2)) # Compute similarity scores for the second block too. 

        with tf.variable_scope("output-layer"):
            """ Final Output layer"""            
            # Hand-crafted features followed by one similarity score per entry of sims
            self.output_features = tf.concat([self.features, tf.stack(sims, axis=1)], axis=1, name="output_features")

            self.estimation = tf.contrib.layers.fully_connected(
                inputs=self.output_features,
                num_outputs=num_classes,
                activation_fn=None,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                weights_regularizer=tf.contrib.layers.l2_regularizer(scale=l2_reg),
                biases_initializer=tf.constant_initializer(1e-04),
                scope="FC"
            )

        # Softmax probability of class 1 for every pair
        self.prediction = tf.contrib.layers.softmax(self.estimation)[:, 1]

        """ Calculate cost by softmax_cross_entropy and add a regularizer term """
        self.cost = tf.add(
            tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.estimation, labels=self.y)),
            tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)),
            name="cost")

        # Scalar summary of the cost, merged with any other summaries registered in the graph
        tf.summary.scalar("cost", self.cost)
        self.merged = tf.summary.merge_all()

## Train

In [0]:
import tensorflow as tf
import numpy as np
import sys

from sklearn import linear_model
from sklearn.externals import joblib



def build_path(prefix, model_type, num_layers, postpix=""):
    """Joins prefix, model type and layer count with dashes and appends the optional suffix.

    Example: build_path("./cost/", "BCNN", 2, ".txt") -> "./cost/-BCNN-2.txt"
    """
    stem = "-".join((prefix, model_type, str(num_layers)))
    return stem + postpix


def train(learning_rate, filter_width, l2_reg, nb_epoch, batch_size, model_type, num_layers, word2vec, embedding_dim, nb_filters, num_classes=2):
    """Trains a BCNN/ABCNN model on the global `dataloader`, checkpointing after every epoch.

    After each epoch the function
      * appends the summed batch cost to the cost file under ./cost/,
      * saves a TensorFlow checkpoint under ./models/,
      * fits a logistic regression on the network's `output_features` and pickles it
        next to the checkpoint.

    Args:
        learning_rate: learning rate of the Adagrad optimizer.
        filter_width, l2_reg, num_layers, embedding_dim, nb_filters, num_classes:
            forwarded to the model constructor.
        nb_epoch: number of passes over the data.
        batch_size: maximum number of pairs per optimisation step.
        model_type: "BCNN" builds BCNN; any other value builds ABCNN.
        word2vec: not used inside this function; kept so existing calls keep working.
    """
    import os  # local import so this cell does not depend on the GPU-setup cell

    # Reset the default graph so the cell can be re-run and tested multiple times
    tf.reset_default_graph()

    # Only the selected class name is evaluated, so ABCNN need not be defined for a BCNN run
    model_cls = BCNN if model_type == "BCNN" else ABCNN
    model = model_cls(sentence_length=dataloader.max_len, filter_width=filter_width, l2_reg=l2_reg,
                      num_features=dataloader.num_features, num_classes=num_classes, num_layers=num_layers,
                      embedding_dim=embedding_dim, nb_filters=nb_filters)

    optimizer = tf.train.AdagradOptimizer(learning_rate, name="optimizer").minimize(model.cost)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver(max_to_keep=100)

    cost_path = build_path("./cost/", model_type, num_layers, ".txt")
    model_prefix = build_path("./models/", model_type, num_layers)
    # Neither open(..., 'a') nor Saver.save creates missing parent directories
    for out_path in (cost_path, model_prefix):
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

    with tf.device("/device:GPU:0"):
        with tf.Session() as sess:
            train_summary_writer = tf.summary.FileWriter("/tmp/log/", sess.graph)
            try:
                sess.run(init)

                print("=" * 70)
                for e in range(1, nb_epoch + 1):

                    epoch_table = BeautifulTable()
                    epoch_table.column_headers = ["Epoch " + str(e) + "/" + str(nb_epoch)]

                    dataloader.reset_index()
                    i = 0

                    LR = linear_model.LogisticRegression(solver='lbfgs')
                    clf_features = []

                    epoch_loss = 0

                    while dataloader.is_available():
                        i += 1

                        batch_x1, batch_x2, batch_y, batch_features = dataloader.next_batch(batch_size=batch_size)

                        merged, _, c, features = sess.run([model.merged, optimizer, model.cost, model.output_features],
                                                          feed_dict={model.x1: batch_x1,
                                                                     model.x2: batch_x2,
                                                                     model.y: batch_y,
                                                                     model.features: batch_features})

                        clf_features.append(features)
                        epoch_loss += c

                    epoch_table.append_row(["Cost: " + str(epoch_loss)])
                    print(epoch_table)

                    with open(cost_path, 'a') as f:
                        f.write(str(epoch_loss) + "\n")

                    # Scalar summaries are fetched but, as before, not written to the event file:
                    #train_summary_writer.add_summary(merged, i)

                    save_path = saver.save(sess, model_prefix, global_step=e)

                    # From the paper: performance increases if the output of the LR layer is not used
                    # as the final decision, but a linear SVM or a logistic regression with default
                    # parameters is trained directly on the input to the LR layer.
                    clf_features = np.concatenate(clf_features)
                    LR.fit(clf_features, dataloader.labels)

                    LR_path = build_path("./models/", model_type, num_layers, "-" + str(e) + "-LR.pkl")
                    joblib.dump(LR, LR_path)

                print("training finished!")
                print("=" * 50)
            finally:
                # Flush and release the event file even when training is interrupted
                train_summary_writer.close()


In [0]:
# Hyper-parameters of the BCNN training run
params = dict(
    learning_rate=0.08,
    filter_width=4,
    l2_reg=0.0004,
    nb_epoch=50,
    batch_size=64,
    model_type="BCNN",
    num_layers=2,
    word2vec=word2vec,
    embedding_dim=300,
    nb_filters=50,
)

# Show the configuration before the (long) training run starts
param_table = BeautifulTable()
param_table.column_headers = ["Parameter", "Value"]
for k, v in params.items():
    param_table.append_row([k, v])
print(param_table)

train(
    learning_rate=float(params["learning_rate"]),
    filter_width=int(params["filter_width"]),
    l2_reg=float(params["l2_reg"]),
    nb_epoch=int(params["nb_epoch"]),
    batch_size=int(params["batch_size"]),
    model_type=params["model_type"],
    num_layers=int(params["num_layers"]),
    word2vec=params["word2vec"],
    embedding_dim=int(params["embedding_dim"]),
    nb_filters=int(params["nb_filters"]),
)

## Process and load Test Dataset

In [0]:
questions, answers, labels, features, max_len = preprocess_data("test")

In [0]:
dataloader = DataLoader(word2vec, questions, answers, labels, features, max_len)

In [0]:
from beautifultable import BeautifulTable

# Same overview as for the training split, now for the test split held by `dataloader`
dataloader_table = BeautifulTable()
dataloader_table.column_headers = ["Dataloader", "Values"]

_summary = {
    "Maximum length of Data": dataloader.max_len,
    "Number of questions": len(dataloader.questions),
    "Number of answers": len(dataloader.answers),
    "Number of features": dataloader.num_features,
}
for _name in _summary:
    dataloader_table.append_row([_name, _summary[_name]])

print(dataloader_table)

## Run test

In [0]:
def _rank_metrics(QA_pairs):
    """Computes (MAP, MRR) from {question: [(answer, label, score), ...]}.

    Candidates are ranked by descending score. Questions without any correct answer
    have no defined average precision, so they are left out of both averages
    (they used to trigger a ZeroDivisionError). Returns (0.0, 0.0) when no question
    can be evaluated.
    """
    ap_total, rr_total, evaluated = 0.0, 0.0, 0
    for candidates in QA_pairs.values():
        ranked = sorted(candidates, key=lambda x: x[-1], reverse=True)
        hits, ap, first_hit_rank = 0, 0.0, None
        for rank, (_, label, _) in enumerate(ranked, start=1):
            if label == 1:
                if first_hit_rank is None:
                    first_hit_rank = rank
                hits += 1
                ap += hits / rank
        if hits == 0:
            continue
        ap_total += ap / hits
        rr_total += 1 / first_hit_rank
        evaluated += 1
    if evaluated == 0:
        return 0.0, 0.0
    return ap_total / evaluated, rr_total / evaluated


def test(filter_width, l2_reg, nb_epoch, model_type, num_layers, classifier, word2vec, num_classes=2, embedding_dim=300, nb_filters=50):
    """Evaluates every per-epoch checkpoint on the global `dataloader` and records MAP / MRR.

    For each epoch 1..nb_epoch the checkpoint written by train() is restored, every
    question/answer pair is scored, and MAP and MRR are computed per question. The
    per-epoch scores are written to a tab-separated file under ./experiments/.

    Args:
        filter_width, l2_reg, num_layers, num_classes, embedding_dim, nb_filters:
            forwarded to the model constructor; they must match the values used for training.
        nb_epoch: number of saved epochs to evaluate.
        model_type: "BCNN" builds BCNN; any other value builds ABCNN.
        classifier: "LR" scores with the pickled logistic regression of the same epoch;
            any other value uses the network's own prediction.
        word2vec: not used inside this function; kept so existing calls keep working.
    """
    import os  # local import so this cell does not depend on the GPU-setup cell

    # Reset the default graph so the cell can be re-run and tested multiple times
    tf.reset_default_graph()

    # Only the selected class name is evaluated, so ABCNN need not be defined for a BCNN run
    model_cls = BCNN if model_type == "BCNN" else ABCNN
    model = model_cls(sentence_length=dataloader.max_len, filter_width=filter_width, l2_reg=l2_reg,
                      num_features=dataloader.num_features, num_classes=num_classes, num_layers=num_layers,
                      embedding_dim=embedding_dim, nb_filters=nb_filters)

    # One Saver for all epochs: creating it inside the loop added new ops to the graph every epoch
    saver = tf.train.Saver()

    model_path = build_path("./models/", model_type, num_layers)
    MAPs, MRRs = [], []

    print("=" * 50)
    print("test data size:", dataloader.data_size)

    for e in range(1, nb_epoch + 1):

        epoch_table = BeautifulTable()
        epoch_table.column_headers = ["Epoch " + str(e) + "/" + str(nb_epoch)]
        dataloader.reset_index()

        QA_pairs = {}
        #with tf.device("/device:GPU:0"): #TF bug, can't assigned GPU but it runs on GPU verified
        with tf.Session() as sess:
            saver.restore(sess, model_path + "-" + str(e))
            print(model_path + "-" + str(e), "restored.")

            clf = None
            if classifier == "LR":
                clf_path = build_path("./models/", model_type, num_layers,
                                      "-" + str(e) + "-" + classifier + ".pkl")
                # NOTE: joblib.load unpickles the file; only load classifiers written by train()
                clf = joblib.load(clf_path)

            batch_q, batch_a, batch_labels, batch_feats = dataloader.next_batch(batch_size=dataloader.data_size)

            for i in range(dataloader.data_size):
                pred, clf_input = sess.run([model.prediction, model.output_features],
                                           feed_dict={model.x1: np.expand_dims(batch_q[i], axis=0),
                                                      model.x2: np.expand_dims(batch_a[i], axis=0),
                                                      model.y: np.expand_dims(batch_labels[i], axis=0),
                                                      model.features: np.expand_dims(batch_feats[i], axis=0)})

                if clf is not None:
                    pred = clf.predict_proba(clf_input)[:, 1]

                question = " ".join(dataloader.questions[i])
                answer = " ".join(dataloader.answers[i])

                # pred holds exactly one value; .item() replaces the removed np.asscalar
                QA_pairs.setdefault(question, []).append((answer, batch_labels[i], pred.item()))

        # MAP and MRR of this checkpoint
        MAP, MRR = _rank_metrics(QA_pairs)

        epoch_table.append_row(["MAP: " + str(MAP)])
        epoch_table.append_row(["MRR: " + str(MRR)])
        print(epoch_table)

        MAPs.append(MAP)
        MRRs.append(MRR)

    print("=" * 50)
    if MAPs:
        print("max MAP:", max(MAPs), "max MRR:", max(MRRs))
    print("=" * 50)

    # Write the MAP and MRR of every evaluated epoch to a file
    exp_path = build_path("./experiments/", model_type, num_layers, "-" + classifier + ".txt")
    os.makedirs(os.path.dirname(exp_path), exist_ok=True)
    with open(exp_path, "w", encoding="utf-8") as f:
        print("Epoch\tMAP\tMRR", file=f)
        for epoch, (epoch_map, epoch_mrr) in enumerate(zip(MAPs, MRRs), start=1):
            print(str(epoch) + "\t" + str(epoch_map) + "\t" + str(epoch_mrr), file=f)


In [0]:


# Evaluation settings; they mirror the values used for training
params = dict(
    filter_width=4,
    l2_reg=0.0004,
    nb_epoch=50,
    model_type="BCNN",
    num_layers=2,
    classifier="LR",
    word2vec=word2vec,
)

test(
    filter_width=int(params["filter_width"]),
    l2_reg=float(params["l2_reg"]),
    nb_epoch=int(params["nb_epoch"]),
    model_type=params["model_type"],
    num_layers=int(params["num_layers"]),
    classifier=params["classifier"],
    word2vec=params["word2vec"],
)

## Cost and MAP, MRR graph

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

plt.style.use('ggplot')

def plt_graphs(cost_file, score_file, position, output_file='BCNN-2-Trec.png'):
    """Plots MAP, MRR and training cost per epoch and saves the figure.

    Args:
        cost_file: text file with one cost value per line (as appended by train()).
        score_file: tab-separated file with a header line followed by
            `epoch<TAB>MAP<TAB>MRR` rows (as written by test()).
        position: (x, y) figure coordinates of the text box with the best scores.
        output_file: where the figure is saved; defaults to the original file name.

    Raises:
        ValueError: if score_file contains no score rows.
    """
    with open(cost_file, 'r') as f:
        cost = [float(line) for line in f.read().splitlines() if line.strip()]

    with open(score_file, 'r') as f:
        # [1:] drops the header row
        rows = [line.split('\t') for line in f.read().splitlines() if line.strip()][1:]
    if not rows:
        raise ValueError("no scores found in " + score_file)

    maps = [float(row[1]) for row in rows]
    mrrs = [float(row[2]) for row in rows]

    # The best epoch is the one with the highest MAP + MRR
    summed_scores = [m + r for m, r in zip(maps, mrrs)]
    max_score_index = summed_scores.index(max(summed_scores))

    # Derive the x axes from the data instead of assuming exactly 50 epochs
    score_epochs = np.arange(1, len(maps) + 1)
    cost_epochs = np.arange(1, len(cost) + 1)

    fig = plt.figure(figsize=(12,9))
    ax1 = fig.add_subplot(1,1,1)
    ax1.plot(score_epochs, maps, 'b', label="MAP score")
    ax1.plot(score_epochs, mrrs, 'g', label="MRR score")

    ax1.set_title("MAP, MRR and Loss over epochs")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("MRR and MAP scores")

    # The cost lives on a second y axis because its scale differs from the scores
    ax2 = ax1.twinx()
    ax2.set_ylabel("Cost function")
    ax2.plot(cost_epochs, cost, label="Loss")

    ax1.legend(loc="upper right",  prop={'size': 15}, bbox_to_anchor=(1, 0.9))
    ax2.legend(loc="upper right", prop={'size':15})

    # Epochs are 1-based while the list index is 0-based
    string_score ="Max MAPS = {:.4f}\nMax MRR = {:.4f}\nEpoch = {}".format(maps[max_score_index],mrrs[max_score_index],max_score_index + 1)
    fig.text(position[0],position[1],string_score,bbox={'boxstyle':'square',"color":"white"},fontdict={'color':'black','size':'15'})

    fig.savefig(output_file)



In [0]:
cost_file = "cost/-BCNN-2.txt"
score_file = "experiments/-BCNN-2-LR.txt"
position = [0.5,0.75]
plt_graphs(cost_file,score_file,position)

# ABCNN

Same class as BCNN but only one extra function added to compute the attention matrix

In [0]:
import tensorflow as tf
import numpy as np


class ABCNN():
    def __init__(self, sentence_length, filter_width, l2_reg, num_features, embedding_dim=300, nb_filters=50, num_classes=2, num_layers=2):
        """Build the ABCNN graph: placeholders, CNN blocks, prediction and cost.

        Shape notation used in the comments below: b = batch size,
        s = sentence_length, w = filter_width, d = embedding_dim for the
        first block or nb_filters for the second block.

        Args:
            sentence_length: number of token positions per sentence (e.g. 40).
            filter_width: width of the convolution / pooling window (e.g. 4).
            l2_reg: scale of the L2 regularizer on the conv, attention and
                fully-connected weights (e.g. 0.0004).
            num_features: number of extra per-example features fed through
                the `features` placeholder (e.g. 4).
            embedding_dim: size of each word vector (default 300).
            nb_filters: number of convolution filters per block (default 50).
            num_classes: number of output logits (default 2).
            num_layers: a second CNN block is stacked when this is > 1.

        Attributes set on the instance:
            x1, x2: float placeholders of shape (b, embedding_dim, s).
            y: int placeholder of shape (b,) holding the class labels.
            features: float placeholder of shape (b, num_features).
            output_features: extra features concatenated with the similarities.
            estimation: logits of the final fully-connected layer.
            prediction: softmax probability of class 1.
            cost: mean cross-entropy plus the collected regularization losses.
            merged: result of tf.summary.merge_all().
        """

        self.x1 = tf.placeholder(tf.float32, shape=[None, embedding_dim, sentence_length], name="x1")  # (b, d, s), e.g. [64, 300, 40]
        self.x2 = tf.placeholder(tf.float32, shape=[None, embedding_dim, sentence_length], name="x2")  # (b, d, s), e.g. [64, 300, 40]
        self.y = tf.placeholder(tf.int32, shape=[None], name="y")  # (b,)
        self.features = tf.placeholder(tf.float32, shape=[None, num_features], name="features")  # (b, num_features), e.g. [64, 4]

        def pad_for_wide_conv(x):
            """Zero-pad the sentence axis for a wide convolution.

            Adds filter_width - 1 zeros on BOTH sides of axis 2.

            Args:
                x: input tensor (b, d, s, c).
            Returns:
                Padded tensor (b, d, s + 2*(w-1), c), e.g. s=40, w=4 -> 46.
            """
            return tf.pad(x, np.array([[0, 0], [0, 0], [filter_width - 1, filter_width - 1], [0, 0]]), "CONSTANT", name="pad_wide_conv")

        def cos_sim(v1, v2):
            """Row-wise cosine similarity <v1, v2> / (|v1| * |v2|).

            Args:
                v1: tensor (b, d).
                v2: tensor (b, d).
            Returns:
                Tensor (b,) of similarities.
            """
            norm1 = tf.sqrt(tf.reduce_sum(tf.square(v1), axis=1))
            norm2 = tf.sqrt(tf.reduce_sum(tf.square(v2), axis=1))
            dot_products = tf.reduce_sum(v1 * v2, axis=1, name="cos_sim")

            # Clamp the denominator: an all-zero vector would otherwise give
            # 0/0 = NaN. Non-degenerate inputs are left unchanged.
            return dot_products / tf.maximum(norm1 * norm2, 1e-12)

        def make_attention_mat(x1, x2):
            """Attention matrix 1 / (1 + euclidean distance) between columns.

            Args:
                x1: tensor (b, d, s, 1).
                x2: tensor (b, d, s, 1).
            Returns:
                Tensor (b, s, s); entry [i, j] compares column i of x1 with
                column j of x2.
            """
            # x2 becomes (b, d, 1, s) so the subtraction broadcasts to (b, d, s, s).
            squared_dist = tf.reduce_sum(tf.square(x1 - tf.matrix_transpose(x2)), axis=1)
            # sqrt has an infinite derivative at 0, which becomes NaN gradients
            # as soon as two columns are identical. Clamping keeps the forward
            # value (up to 1e-6) and makes the gradient finite.
            euclidean = tf.sqrt(tf.maximum(squared_dist, 1e-12))
            return 1 / (1 + euclidean)

        def convolution(name_scope, x, d, reuse):
            """Wide convolution over the (already padded) sentence axis.

            Args:
                name_scope: prefix for the op name scope ("left" / "right").
                x: padded input tensor (b, d, s + 2*(w-1), c).
                d: height of the kernel; it spans the whole feature axis.
                reuse: whether to reuse the "conv" variables. True for the
                    right sentence, so both sides share weights.
            Returns:
                Tensor (b, nb_filters, s + w - 1, 1), e.g. [64, 50, 43, 1].
            """
            with tf.name_scope(name_scope + "-conv"):
                # The variable scope (not the name scope) is what makes the
                # left and right convolutions share their weights.
                with tf.variable_scope("conv") as scope:
                    conv = tf.contrib.layers.conv2d(
                        inputs=x,
                        num_outputs=nb_filters,
                        kernel_size=(d, filter_width),
                        stride=1,
                        padding="VALID",
                        activation_fn=tf.nn.tanh,
                        weights_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                        weights_regularizer=tf.contrib.layers.l2_regularizer(scale=l2_reg),
                        biases_initializer=tf.constant_initializer(1e-04),
                        reuse=reuse,
                        trainable=True,
                        scope=scope
                    )

                    # (b, 1, s+w-1, nb_filters) -> (b, nb_filters, s+w-1, 1)
                    conv_trans = tf.transpose(conv, [0, 3, 2, 1], name="conv_trans")
                    return conv_trans

        def w_pool(x):
            """Average pooling over windows of filter_width columns (stride 1).

            Args:
                x: conv output (b, nb_filters, s + w - 1, 1).
            Returns:
                Tensor (b, nb_filters, s, 1), e.g. [64, 50, 40, 1].
            """
            w_ap = tf.layers.average_pooling2d(
                inputs=x,
                pool_size=(1, filter_width),
                strides=1,
                padding="VALID",
                name="w_ap"
            )

            return w_ap

        def all_pool(variable_scope, x):
            """Average pooling over the whole sentence axis.

            Args:
                variable_scope: scope prefix. A name starting with "input"
                    marks the raw embeddings (width s, depth embedding_dim);
                    anything else is treated as a conv output (width
                    s + w - 1, depth nb_filters).
                x: tensor (b, d, width, 1).
            Returns:
                Tensor (b, d) with one averaged vector per sentence.
            """
            with tf.variable_scope(variable_scope + "-all_pool"):

                if variable_scope.startswith("input"):
                    pool_width = sentence_length
                    d = embedding_dim
                else:
                    pool_width = sentence_length + filter_width - 1
                    d = nb_filters

                all_ap = tf.layers.average_pooling2d(
                    inputs=x,
                    pool_size=(1, pool_width),
                    strides=1,
                    padding="VALID",
                    name="all_ap")

                # (b, d, 1, 1) -> (b, d)
                all_ap_reshaped = tf.reshape(all_ap, [-1, d])
                return all_ap_reshaped

        def CNN_layer(variable_scope, x1, x2, d):
            """One block: attention feature map -> wide conv -> w-pool / all-pool.

            Args:
                variable_scope: scope name of the block ("CNN-1", "CNN-2").
                x1: left input (b, d, s, 1).
                x2: right input (b, d, s, 1).
                d: size of the feature axis of x1 / x2.
            Returns:
                (left_wp, left_ap, right_wp, right_ap): the w-pooled maps
                (b, nb_filters, s, 1) that feed the next block, and the
                all-pooled vectors (b, nb_filters) used for the similarity.
            """
            with tf.variable_scope(variable_scope):

                with tf.name_scope("att_mat"):
                    aW = tf.get_variable(name="aW",
                                         shape=(sentence_length, d),
                                         initializer=tf.contrib.layers.xavier_initializer(),
                                         regularizer=tf.contrib.layers.l2_regularizer(scale=l2_reg))

                    # Compute the attention matrix, shape (b, s, s).
                    att_mat = make_attention_mat(x1, x2)

                    # Project it to the input's shape: (b, s, s) x (s, d)
                    # -> (b, s, d) -> transpose -> (b, d, s) -> (b, d, s, 1).
                    x1_a = tf.expand_dims(tf.matrix_transpose(tf.einsum("ijk,kl->ijl", att_mat, aW)), -1)
                    x2_a = tf.expand_dims(tf.matrix_transpose(
                        tf.einsum("ijk,kl->ijl", tf.matrix_transpose(att_mat), aW)), -1)

                    # Stack the attention feature map as a second input channel.
                    x1 = tf.concat([x1, x1_a], axis=3)
                    x2 = tf.concat([x2, x2_a], axis=3)

                left_conv = convolution(name_scope="left", x=pad_for_wide_conv(x1), d=d, reuse=False)
                right_conv = convolution(name_scope="right", x=pad_for_wide_conv(x2), d=d, reuse=True)

                left_wp = w_pool(x=left_conv)
                left_ap = all_pool(variable_scope="left", x=left_conv)
                right_wp = w_pool(x=right_conv)
                right_ap = all_pool(variable_scope="right", x=right_conv)

                return left_wp, left_ap, right_wp, right_ap

        x1_expanded = tf.expand_dims(self.x1, -1)  # (b, d, s, 1), e.g. [64, 300, 40, 1]
        x2_expanded = tf.expand_dims(self.x2, -1)  # (b, d, s, 1), e.g. [64, 300, 40, 1]

        LO_0 = all_pool(variable_scope="input-left", x=x1_expanded)   # (b, embedding_dim)
        RO_0 = all_pool(variable_scope="input-right", x=x2_expanded)  # (b, embedding_dim)

        LI_1, LO_1, RI_1, RO_1 = CNN_layer(variable_scope="CNN-1", x1=x1_expanded, x2=x2_expanded, d=embedding_dim)
        # One similarity score per representation level: raw input, then block 1.
        sims = [cos_sim(LO_0, RO_0), cos_sim(LO_1, RO_1)]

        if num_layers > 1:
            # Second CNN block: the w-pooled output of the first block is its input.
            _, LO_2, _, RO_2 = CNN_layer(variable_scope="CNN-2", x1=LI_1, x2=RI_1, d=nb_filters)
            sims.append(cos_sim(LO_2, RO_2))

        with tf.variable_scope("output-layer"):
            # Final output layer: extra features + similarity scores -> logits.
            self.output_features = tf.concat([self.features, tf.stack(sims, axis=1)], axis=1, name="output_features")

            self.estimation = tf.contrib.layers.fully_connected(
                inputs=self.output_features,
                num_outputs=num_classes,
                activation_fn=None,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                weights_regularizer=tf.contrib.layers.l2_regularizer(scale=l2_reg),
                biases_initializer=tf.constant_initializer(1e-04),
                scope="FC"
            )

        # Probability assigned to class index 1.
        self.prediction = tf.contrib.layers.softmax(self.estimation)[:, 1]

        # Cost = mean sparse softmax cross-entropy + sum of the regularization losses.
        self.cost = tf.add(
            tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.estimation, labels=self.y)),
            tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)),
            name="cost")

        tf.summary.scalar("cost", self.cost)
        self.merged = tf.summary.merge_all()

## Train

In [0]:
# Preprocess the "train" split; the unpacked values are passed to the
# DataLoader constructed in the next cell.
questions, answers, labels, features, max_len = preprocess_data("train")

In [0]:
# Wrap the preprocessed training data, together with `word2vec`, in a DataLoader.
dataloader = DataLoader(word2vec, questions, answers, labels, features, max_len)

In [0]:
from beautifultable import BeautifulTable

# Two-column summary of what the DataLoader currently holds.
dataloader_table = BeautifulTable()
dataloader_table.column_headers = ["Dataloader", "Values"]

for row_label, row_value in (
    ("Maximum length of Data", dataloader.max_len),
    ("Number of questions", len(dataloader.questions)),
    ("Number of answers", len(dataloader.answers)),
    ("Number of features", dataloader.num_features),
):
    dataloader_table.append_row([row_label, row_value])

print(dataloader_table)

In [0]:
# Table used to echo the hyper-parameters before training starts.
param_table = BeautifulTable()
param_table.column_headers = ["Parameter", "Value"]

# Settings of the ABCNN1 training run.
params = dict(
    learning_rate=0.085,
    filter_width=4,
    l2_reg=0.0006,
    nb_epoch=50,
    batch_size=64,
    model_type="ABCNN1",
    num_layers=2,
    word2vec=word2vec,
    embedding_dim=300,
    nb_filters=50,
)

for k, v in params.items():
    param_table.append_row([k, v])
print(param_table)


# Every numeric setting is cast explicitly before it is handed to train().
train(
    learning_rate=float(params["learning_rate"]),
    filter_width=int(params["filter_width"]),
    l2_reg=float(params["l2_reg"]),
    nb_epoch=int(params["nb_epoch"]),
    batch_size=int(params["batch_size"]),
    model_type=params["model_type"],
    num_layers=int(params["num_layers"]),
    word2vec=params["word2vec"],
    embedding_dim=int(params["embedding_dim"]),
    nb_filters=int(params["nb_filters"]),
)

## Process and load Test Dataset

In [0]:
# Preprocess the "test" split.
# NOTE: this rebinds the same names that held the training data, so cells
# above this point must not be re-run without re-running the "train" preprocessing.
questions, answers, labels, features, max_len = preprocess_data("test")

In [0]:
# Rebuild `dataloader` from the test data; this replaces the training DataLoader.
dataloader = DataLoader(word2vec, questions, answers, labels, features, max_len)

In [0]:
from beautifultable import BeautifulTable

# Two-column summary of what the DataLoader currently holds.
dataloader_table = BeautifulTable()
dataloader_table.column_headers = ["Dataloader", "Values"]

for row_label, row_value in (
    ("Maximum length of Data", dataloader.max_len),
    ("Number of questions", len(dataloader.questions)),
    ("Number of answers", len(dataloader.answers)),
    ("Number of features", dataloader.num_features),
):
    dataloader_table.append_row([row_label, row_value])

print(dataloader_table)

## Run test

In [0]:

# Default settings for evaluating the ABCNN1 model.
params = dict(
    filter_width=4,
    l2_reg=0.0006,
    nb_epoch=50,
    model_type="ABCNN1",
    num_layers=2,
    classifier="LR",
    word2vec=word2vec,
)


# Every numeric setting is cast explicitly before it is handed to test().
test(
    filter_width=int(params["filter_width"]),
    l2_reg=float(params["l2_reg"]),
    nb_epoch=int(params["nb_epoch"]),
    model_type=params["model_type"],
    num_layers=int(params["num_layers"]),
    classifier=params["classifier"],
    word2vec=params["word2vec"],
)

## Cost and MAP, MMR graph

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

plt.style.use('ggplot')

def plt_graphs(cost_file, score_file, position, output_file='ABCNN1-2.png'):
    """Plot MAP / MMR scores and the training cost per epoch, then save the figure.

    Args:
        cost_file: path to a text file with one float cost value per line.
        score_file: path to a tab-separated text file. The first row is
            skipped (presumably a header); in every other row column 1 is
            read as the MAP score and column 2 as the MMR score.
        position: two-element sequence, the x and y passed to ``fig.text``
            for the "best epoch" annotation box.
        output_file: where the figure is saved. Defaults to the file name
            this function originally hard-coded.

    Raises:
        ValueError: if the score file contains no data rows, or a value
            cannot be parsed as a float.
    """
    # splitlines() + blank-line filter: a file without a trailing newline no
    # longer loses its last record, and stray empty lines do not break float().
    with open(cost_file, 'r') as f:
        cost = [float(line) for line in f.read().splitlines() if line.strip()]

    with open(score_file, 'r') as f:
        score_rows = [line.split('\t') for line in f.read().splitlines() if line.strip()]
    score_rows = score_rows[1:]  # drop the first row, as the original did

    maps = [float(row[1]) for row in score_rows]
    mmrs = [float(row[2]) for row in score_rows]
    if not maps:
        raise ValueError("no score rows found in {!r}".format(score_file))

    # Best epoch = the one with the highest MAP + MMR sum (first one on ties).
    summed_maps_mmrs = [m + r for m, r in zip(maps, mmrs)]
    max_score_index = summed_maps_mmrs.index(max(summed_maps_mmrs))

    # Epoch axes are derived from the data instead of a hard-coded 1..50, so
    # runs with a different number of epochs plot correctly.
    score_epochs = np.arange(1, len(maps) + 1, 1)
    cost_epochs = np.arange(1, len(cost) + 1, 1)

    fig = plt.figure(figsize=(12,9))
    ax1 = fig.add_subplot(1,1,1)
    ax1.plot(score_epochs, maps, 'b', label="MAP score")
    ax1.plot(score_epochs, mmrs, 'g', label="MMR score")

    ax1.set_title("MAP, MMR and Loss over epochs")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("MMR and MAP scores")

    # The cost lives on a second y-axis because its scale differs from the scores.
    ax2 = ax1.twinx()
    ax2.set_ylabel("Cost function")
    ax2.plot(cost_epochs, cost, label="Loss")

    ax1.legend(loc="upper right",  prop={'size': 15}, bbox_to_anchor=(1, 0.9))
    ax2.legend(loc="upper right", prop={'size':15})

    # The x-axis is 1-based, so the list index is shifted by one when reported.
    string_score ="Max MAPS = {:.4f}\nMax MMR = {:.4f}\nEpoch = {}".format(maps[max_score_index],mmrs[max_score_index],max_score_index + 1)
    fig.text(position[0],position[1],string_score,bbox={'boxstyle':'square',"color":"white"},fontdict={'color':'black','size':'15'})

    fig.savefig(output_file)



In [0]:
# Log files of the 2-layer ABCNN1 run, and the x/y passed to fig.text for the
# annotation box.
cost_file, score_file = "cost/-ABCNN1-2.txt", "experiments/-ABCNN1-2-LR.txt"
position = [0.5, 0.75]
plt_graphs(cost_file, score_file, position)