In [23]:
import sklearn as sk
import numpy as np
import pandas as pd
import json
from torch.utils.data import Dataset, DataLoader

class BQDataset():
    def __init__(self, path):
        self.dataset = open(path,encoding="utf-8")

        self.dataset = [json.loads(instance) for instance in self.dataset ]


        self.passages = []
        self.questions = []
        self.answers = []
        self.titles = []

        for inst in self.dataset:
            self.passages.append(inst["passage"])
            self.questions.append(inst["question"])
            self.answers.append(inst["answer"])
            self.titles.append(inst["title"])

    def get_dataset(self):
        return self.dataset

    def get_split(self):

        return self.passages,self.questions, self.answers


bqd = BQDataset("datasets/train.jsonl")
dataset = bqd.get_dataset()


### Preprocessing

In [24]:
def clean(text, stem_words=True):
    import re    # for regular expressions
    from string import punctuation
    from nltk.stem import SnowballStemmer    #if you are brave enough to do stemming
    from nltk.corpus import stopwords      #if you want to remove stopwords

    if type(text) != str or text=='':
        return ''

    text = re.sub("\'s", " ", text) # we have cases like "Sam is" or "Sam's" (i.e. his) these two cases aren't separable, I choose to compromise are kill "'s" directly
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub("\'ve", " have ", text)

    ### YOUR CODE HERE
    text = re.sub("can't", "cannot", text, flags=re.IGNORECASE)
    text = re.sub("don't", "do not", text, flags=re.IGNORECASE)
    text = re.sub("won't", "will not", text, flags=re.IGNORECASE)
    text = re.sub("shouldn't", "should not", text, flags=re.IGNORECASE)
    text = re.sub("couldn't", "could not", text, flags=re.IGNORECASE)
    text = re.sub("isn't", "is not", text, flags=re.IGNORECASE)
    text = re.sub("wasn't", "was not", text, flags=re.IGNORECASE)
    text = re.sub("weren't", "were not", text, flags=re.IGNORECASE)
    text = re.sub("haven't", "have not", text, flags=re.IGNORECASE)
    text = re.sub("hasn't", "has not", text, flags=re.IGNORECASE)

    text = re.sub(r"[0-9]-[0-9]", " minus ", text)
    text = re.sub("-", " ", text)

    digit_letters = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
    for i in range(len(digit_letters)):
        regex = rf"(?<=\b){str(i)}(?=\b)"
        text = re.sub(regex, digit_letters[i], text)

    # remove comma between numbers, i.e. 15,000 -> 15000
    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)

    # Return a list of words
    return text


In [25]:
def tokenize(text):

    # Split by whitespace
    text = text.split(" ")

    return text

In [26]:
# Apply pre-processing to data
p,q,answers = bqd.get_split()

questions = []
passages = []
for i in range(len(p)):

    p_ = clean(p[i])
    p_ = tokenize(p_)

    q_ = clean(q[i])
    q_ = tokenize(q_)

    passages.append(p_)
    questions.append(q_)

### Baseline Model



In [27]:
from sklearn.model_selection import *
from sklearn.metrics import *
class BaselineModel:


    def __init__(self, w2v_model,embedding_size, P,Q,A,seed = 0):

        self.w2v = w2v_model
        self.embedding_size = embedding_size
        self.P = P
        self.Q = Q
        self.A = A

        self.X, self.y = self.get_X_y(self.P,self.Q,self.A,embedding_size,self.w2v)

        self.X_train, self.X_dev, self.y_train, self.y_dev = sk.model_selection.train_test_split(self.X,self.y,random_state=seed,shuffle=True,train_size=0.7)

    def get_X_y(self,P,Q,A, embedding_size, w2v):

        X = []
        y = []
        for i in range(len(P)):

            p_ = P[i]
            q_ = Q[i]

            p_vect = np.zeros(embedding_size)
            q_vect = np.zeros(embedding_size)

            for word in p_:
                if word in w2v.wv.key_to_index:

                    p_vect += w2v.wv.get_vector(word)

            for word in q_:
                if word in w2v.wv.key_to_index:
                    q_vect += w2v.wv.get_vector(word)


            p_vect /= len(p_)
            q_vect /= len(q_)

            X.append(np.concatenate([p_vect,q_vect]))
            y.append(0 if A[i] == False else 1)

        return X,y


    def evaluate(self, classifier):

        classifier.fit(self.X_train,self.y_train)

        train_score = f1_score(self.y_train,classifier.predict(self.X_train))
        dev_score = f1_score(self.y_dev,classifier.predict(self.X_dev))


        print(f"f1-score for train set: {train_score}")
        print(f"f1-score for dev set: {dev_score}")






In [28]:
from gensim.models import Word2Vec
from sklearn.linear_model import *

embedding_size = 100


sentences = []
for i in range(len(passages)):
    temp = passages[i]
    temp.extend(questions[i])
    sentences.append(temp)



In [126]:
w2v_model = Word2Vec(sentences=sentences,vector_size=embedding_size, window= 5, min_count= 1, workers= 4)
classifier = LogisticRegression(penalty="l2",max_iter=10000)

w2v_model.train(sentences,total_examples=len(sentences),epochs=10)


(7749230, 9804740)

In [None]:
k = BaselineModel(w2v_model,embedding_size,passages,questions,answers)
k.evaluate(classifier)

f1-score for train set: 0.7723086694188632
f1-score for dev set: 0.7466800300676523


### LSTM Model ###


In [43]:
import numpy as np
import tensorflow as tf
import sklearn as sk
import pandas as pd
from sklearn.model_selection import *
from sklearn.metrics import *

BUFFER_SIZE = 10000
BATCH_SIZE = 64

passages = []
questions = []
answers = []

for line in dataset:
    passages.append(line["passage"])
    questions.append(line["question"])
    answers.append(line["answer"])


In [127]:
# Splitting the data into train and development sets
X = []
for i in range(len(passages)):
    X.append(passages[i] + " <SEP> " +  questions[i])

seed = 42
X_train, X_dev, y_train, y_dev = sk.model_selection.train_test_split(X,answers,random_state=seed,shuffle=True,train_size=0.7)

print(X_train[1]) # For testing

# Here we are putting the data into a tensorflow dataset so that we can use it with the enconder to vectorize it.
X_train_tf = tf.data.Dataset.from_tensor_slices(X_train) # To encode the vectors for the training set
# X_dev_tf = tf.data.Dataset.from_tensor_slices(X_dev) # To encode the vectors for the development set


Within the ADIZ is an even more sensitive zone designated the Washington, D.C. Metropolitan Area Flight Restricted Zone (DC FRZ). The DC FRZ extends approximately 13--15 nmi (15--17 mi; 24--28 km) around the DCA VOR/DME. Flight within the FRZ is restricted to governmental, certain scheduled commercial and a limited set of waivered flights. Three general aviation airports (known as the ``Maryland 3'' or the ``DC 3'') are located inside the DC FRZ: College Park Airport (CGS), Washington Executive/Hyde Field (W32), and Potomac Airport (VKX). <SEP> are planes allowed to fly over washington dc


In [128]:
# Encoding the X_train data only to encode the word vectors then we use X_train on the model
VOCAB_SIZE = 1000

print(type(X)) # For testing
print(type(y_train)) # For testing

encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE) # Splits on whitespace by default
encoder.adapt(X_train_tf.batch(64))

<class 'list'>
<class 'list'>


In [129]:
# Retrieving the vocabulary from the encoder
vocab = np.array(encoder.get_vocabulary())

vocab[:20] # For testing

array(['', '[UNK]', 'the', 'of', 'and', 'a', 'in', 'to', 'is', 'sep',
       'as', 'on', 'for', 'by', 'with', 'was', 'it', 'are', 'or', 'that'],
      dtype='<U13')

In [130]:
encoded_example = encoder(X_train[0]).numpy()
encoded_example # For testing

array([ 12,   1,  51,  86, 303, 326,   1,  54,   2, 261,   1,  13,   2,
         1,   1,   1, 100,   1, 553,  84,   1,   1, 228, 553,   4,   1,
       553,   2, 553,  92, 199,  85,   8,  32,   3,   2,  63,   1,  92,
       261,  23,   2, 326,  51,  86, 481, 129,   1,   1,   1,   4,   1,
         1, 129, 271, 110,   4,  32,   1,   1,  79,  74,   1,   1, 169,
        18,   1,  12,   5, 481,   1,   1, 370,   1,   6, 665,   1,   1,
       104,   7,  23, 423, 100,  65,  67,  73,  45, 837,   6, 524, 553,
        17,   2,  54,  85,  26,  27,   1,  11,   2,   1,   1, 169,  18,
         1, 338,   1,  35,  15,   5, 839,   1,   1,   1,   1,   1,   1,
         1,   4,   1, 263,  14,   1,   1,   4,   1,  34,  17,  32,   3,
         2, 129,  92, 261,   7, 302, 459,  33,   1,   1,  14,   2, 330,
         3,   2, 217, 326,  51,  86,   6, 117, 140,   2,  85,  15, 635,
         6, 665, 106,   3,   2, 398,   1,   2, 101,  63,   1,   4,  54,
       113,  28, 904,   2,   1, 849,   6,   1,   4,  93,  14,  4

In [132]:
# Building the model
# We are using the X_train data to train the model, X_train_tf is only used to encode the word vectors.
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(vocab),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

# Training the model on the training set
model.fit(X_train, y_train, epochs=10)



Epoch 1/10

KeyboardInterrupt: 