In [32]:
import sklearn as sk
import numpy as np
import pandas as pd
import json
from torch.utils.data import Dataset, DataLoader

class BQDataset():
    def __init__(self, path):
        self.dataset = open(path,encoding="utf-8")

        self.dataset = [json.loads(instance) for instance in self.dataset ]


        self.passages = []
        self.questions = []
        self.answers = []
        self.titles = []

        for inst in self.dataset:
            self.passages.append(inst["passage"])
            self.questions.append(inst["question"])
            self.answers.append(inst["answer"])
            self.titles.append(inst["title"])

    def get_dataset(self):
        return self.dataset

    def get_split(self):

        return self.passages,self.questions, self.answers


bqd = BQDataset("datasets/train.jsonl")
dataset = bqd.get_dataset()


### Preprocessing

In [None]:
def clean(text, stem_words=True):
    import re    # for regular expressions
    from string import punctuation
    from nltk.stem import SnowballStemmer    #if you are brave enough to do stemming
    from nltk.corpus import stopwords      #if you want to remove stopwords

    if type(text) != str or text=='':
        return ''

    text = re.sub("\'s", " ", text) # we have cases like "Sam is" or "Sam's" (i.e. his) these two cases aren't separable, I choose to compromise are kill "'s" directly
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub("\'ve", " have ", text)

    ### YOUR CODE HERE
    text = re.sub("can't", "cannot", text, flags=re.IGNORECASE)
    text = re.sub("don't", "do not", text, flags=re.IGNORECASE)
    text = re.sub("won't", "will not", text, flags=re.IGNORECASE)
    text = re.sub("shouldn't", "should not", text, flags=re.IGNORECASE)
    text = re.sub("couldn't", "could not", text, flags=re.IGNORECASE)
    text = re.sub("isn't", "is not", text, flags=re.IGNORECASE)
    text = re.sub("wasn't", "was not", text, flags=re.IGNORECASE)
    text = re.sub("weren't", "were not", text, flags=re.IGNORECASE)
    text = re.sub("haven't", "have not", text, flags=re.IGNORECASE)
    text = re.sub("hasn't", "has not", text, flags=re.IGNORECASE)

    text = re.sub(r"[0-9]-[0-9]", " minus ", text)
    text = re.sub("-", " ", text)

    digit_letters = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
    for i in range(len(digit_letters)):
        regex = rf"(?<=\b){str(i)}(?=\b)"
        text = re.sub(regex, digit_letters[i], text)

    # remove comma between numbers, i.e. 15,000 -> 15000
    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)

    # Return a list of words
    return text


In [None]:
def tokenize(text):

    # Split by whitespace
    text = text.split(" ")

    return text

In [None]:
# Apply pre-processing to data
p,q,answers = bqd.get_split()

questions = []
passages = []
for i in range(len(p)):

    p_ = clean(p[i])
    p_ = tokenize(p_)

    q_ = clean(q[i])
    q_ = tokenize(q_)

    passages.append(p_)
    questions.append(q_)

### Baseline Model



In [53]:
from sklearn.model_selection import *
from sklearn.metrics import *
class BaselineModel:


    def __init__(self, w2v_model,embedding_size, P,Q,A,seed = 0):

        self.w2v = w2v_model
        self.embedding_size = embedding_size
        self.P = P
        self.Q = Q
        self.A = A

        self.X, self.y = self.get_X_y(self.P,self.Q,self.A,embedding_size,self.w2v)

        self.X_train, self.X_dev, self.y_train, self.y_dev = sk.model_selection.train_test_split(self.X,self.y,random_state=seed,shuffle=True,train_size=0.7)

    def get_X_y(self,P,Q,A, embedding_size, w2v):

        X = []
        y = []
        for i in range(len(P)):

            p_ = P[i]
            q_ = Q[i]

            p_vect = np.zeros(embedding_size)
            q_vect = np.zeros(embedding_size)

            for word in p_:
                if word in w2v.wv.key_to_index:

                    p_vect += w2v.wv.get_vector(word)

            for word in q_:
                if word in w2v.wv.key_to_index:
                    q_vect += w2v.wv.get_vector(word)


            p_vect /= len(p_)
            q_vect /= len(q_)

            X.append(np.concatenate([p_vect,q_vect]))
            y.append(0 if A[i] == False else 1)

        return X,y


    def evaluate(self, classifier):

        classifier.fit(self.X_train,self.y_train)



        train_score = f1_score(self.y_train,classifier.predict(self.X_train))
        dev_score = f1_score(self.y_dev,classifier.predict(self.X_dev))


        print(f"f1-score for train set: {train_score}")
        print(f"f1-score for dev set: {dev_score}")






In [72]:
from gensim.models import Word2Vec
from sklearn.linear_model import *

embedding_size = 100


sentences = []
for i in range(len(passages)):
    temp = passages[i]
    temp.extend(questions[i])
    sentences.append(temp)



In [73]:
w2v_model = Word2Vec(sentences=sentences,vector_size=embedding_size, window= 5, min_count= 1, workers= 4)
classifier = LogisticRegression(penalty="l2",max_iter=10000)

w2v_model.train(sentences,total_examples=len(sentences),epochs=10)


(8972415, 11474780)

In [74]:
k = BaselineModel(w2v_model,embedding_size,passages,questions,answers)
k.evaluate(classifier)

f1-score for train set: 0.769654587836406
f1-score for dev set: 0.7432701894317048
