In [32]:
import sklearn as sk
import numpy as np
import pandas as pd
import json
from torch.utils.data import Dataset, DataLoader

class BQDataset():
    """In-memory view of a JSON Lines question-answering file.

    Every non-blank line of the file is parsed as one JSON object that must
    provide the keys "passage", "question", "answer" and "title".  The parsed
    records are kept in ``self.dataset`` and the four fields are additionally
    exposed as parallel lists (``passages``, ``questions``, ``answers``,
    ``titles``) that share the file's line order.
    """

    def __init__(self, path):
        """Read and parse the JSON Lines file at ``path``.

        Args:
            path: Location of a UTF-8 encoded file with one JSON object per line.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks one of the four expected keys.
        """
        # The context manager closes the handle even if parsing fails part-way.
        with open(path, encoding="utf-8") as handle:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and would make json.loads raise, so they are skipped.
            self.dataset = [json.loads(line) for line in handle if line.strip()]

        self.passages = []
        self.questions = []
        self.answers = []
        self.titles = []

        for inst in self.dataset:
            self.passages.append(inst["passage"])
            self.questions.append(inst["question"])
            self.answers.append(inst["answer"])
            self.titles.append(inst["title"])

    def get_dataset(self):
        """Return the list of parsed records (one dict per line of the file)."""
        return self.dataset

    def get_split(self):
        """Return ``(passages, questions, answers)`` as three parallel lists."""
        return self.passages, self.questions, self.answers


# Parse the training JSON Lines file and keep the list of parsed records at hand.
bqd = BQDataset("datasets/train.jsonl")
dataset = bqd.get_dataset()


9427

### Baseline Model



In [53]:
from sklearn.model_selection import *
from sklearn.metrics import *
class BaselineModel:
    """Averaged-word-embedding baseline for passage/question classification.

    Each example is represented by the concatenation of two vectors: the
    average embedding of the passage tokens and the average embedding of the
    question tokens.  The resulting features are split once into a train and
    a dev portion, on which any estimator exposing ``fit``/``predict`` can be
    scored with :meth:`evaluate`.
    """

    def __init__(self, w2v_model,embedding_size, P,Q,A,seed = 0):
        """Build the feature matrix and the train/dev split.

        Args:
            w2v_model: Object exposing ``wv.key_to_index`` and
                ``wv.get_vector(word)`` (e.g. a trained gensim Word2Vec model).
            embedding_size: Length of the vectors returned by ``w2v_model``.
            P: Sequence of tokenised passages (each a sequence of words).
            Q: Sequence of tokenised questions, aligned with ``P``.
            A: Sequence of answers, aligned with ``P``.
            seed: ``random_state`` forwarded to ``train_test_split``.
        """
        self.w2v = w2v_model
        self.embedding_size = embedding_size
        self.P = P
        self.Q = Q
        self.A = A

        self.X, self.y = self.get_X_y(self.P, self.Q, self.A, embedding_size, self.w2v)

        # Shuffled 70% train / 30% dev split, reproducible through ``seed``.
        self.X_train, self.X_dev, self.y_train, self.y_dev = sk.model_selection.train_test_split(
            self.X, self.y, random_state=seed, shuffle=True, train_size=0.7
        )

    @staticmethod
    def _mean_embedding(tokens, embedding_size, w2v):
        """Return the sum of the known tokens' vectors divided by ``len(tokens)``.

        Tokens missing from ``w2v.wv.key_to_index`` add nothing to the sum but
        still count in the denominator.  An empty token sequence yields the
        zero vector instead of dividing by zero (which would give NaN).
        """
        total = np.zeros(embedding_size)
        for word in tokens:
            if word in w2v.wv.key_to_index:
                total += w2v.wv.get_vector(word)

        if len(tokens) > 0:
            total /= len(tokens)
        return total

    def get_X_y(self,P,Q,A, embedding_size, w2v):
        """Turn tokenised passages/questions and answers into features and labels.

        Args:
            P: Sequence of tokenised passages.
            Q: Sequence of tokenised questions, indexed in parallel with ``P``.
            A: Sequence of answers, indexed in parallel with ``P``.
            embedding_size: Length of each word vector.
            w2v: Object exposing ``wv.key_to_index`` and ``wv.get_vector``.

        Returns:
            ``(X, y)`` where ``X`` is a list of arrays of length
            ``2 * embedding_size`` (passage average followed by question
            average) and ``y`` is a list of ints: 0 where the answer compares
            equal to ``False``, 1 otherwise.
        """
        X = []
        y = []
        for i, passage_tokens in enumerate(P):
            p_vect = self._mean_embedding(passage_tokens, embedding_size, w2v)
            q_vect = self._mean_embedding(Q[i], embedding_size, w2v)

            X.append(np.concatenate([p_vect, q_vect]))
            y.append(0 if A[i] == False else 1)

        return X, y

    def evaluate(self, classifier):
        """Fit ``classifier`` on the train split and print train/dev F1 scores.

        The classifier is fitted in place; nothing is returned.
        """
        classifier.fit(self.X_train, self.y_train)

        train_score = f1_score(self.y_train, classifier.predict(self.X_train))
        dev_score = f1_score(self.y_dev, classifier.predict(self.X_dev))

        print(f"f1-score for train set: {train_score}")
        print(f"f1-score for dev set: {dev_score}")






In [42]:
from gensim.models import Word2Vec
from sklearn.linear_model import *

# Length of every word vector learned by Word2Vec.
embedding_size = 100

passages, questions, answers = bqd.get_split()

# Tokenise on single spaces.  split(" ") is kept on purpose: unlike split()
# it yields empty-string tokens for consecutive spaces, and changing it would
# alter the vocabulary the model is trained on.
p_ = [passage.split(" ") for passage in passages]
q_ = [question.split(" ") for question in questions]

# One training "sentence" per example: passage tokens followed by the tokens
# of the question that belongs to it.
s_ = [passage_tokens + question_tokens for passage_tokens, question_tokens in zip(p_, q_)]
sentences = s_

# NOTE(review): supplying `sentences` to the constructor should already build
# the vocabulary and run an initial training pass (see the gensim docs); the
# explicit train() call at the end of this cell then adds 10 more epochs.
# Kept as-is to preserve the reported scores -- confirm this is intended.
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=embedding_size,
    window=5,
    min_count=1,
    workers=4,
)
classifier = LogisticRegression(penalty="l2", max_iter=10000)

w2v_model.train(sentences, total_examples=len(sentences), epochs=10)


(7601333, 9613800)

In [56]:
# Build the averaged-embedding features and the train/dev split, then fit the
# classifier and print its F1 score on both portions.
k = BaselineModel(w2v_model,embedding_size,p_,q_,answers)
k.evaluate(classifier)

f1-score for train set: 0.7709947977492304
f1-score for dev set: 0.7500625469101827


In [59]:
# Show the first training feature vector (passage average followed by question average).
k.X_train[0]

array([-3.10293568e-01,  3.04078892e-01,  7.99123367e-01, -5.96259765e-01,
       -6.04488989e-02, -5.27231544e-01,  5.47070851e-01,  1.01012521e-01,
       -4.57029295e-02,  3.23070695e-01,  5.59191915e-02, -2.08565610e-01,
       -4.24085065e-01,  3.67575266e-02, -5.26636049e-01, -2.36994107e-01,
        1.89451844e-01,  1.01375778e-01, -5.83698628e-02, -8.13871985e-01,
        2.32357427e-01,  1.17835589e-02, -3.27943195e-01, -2.88370005e-01,
       -5.62386448e-01, -2.93313111e-02,  6.44427421e-02,  3.29379867e-01,
       -8.37064570e-01, -1.79148597e-01,  6.24268914e-01, -5.51920648e-01,
        2.92520956e-01,  7.84171486e-02, -2.21777943e-01,  8.71184795e-01,
        4.31599037e-01,  9.97845171e-02, -1.03633612e+00, -5.68279108e-01,
       -5.08828799e-01, -4.11109750e-01, -1.50489201e+00, -5.53760649e-01,
        7.97723324e-01, -2.78819563e-01,  6.26712295e-02,  1.56732191e-03,
        3.04071621e-01,  3.26310693e-01,  3.10146469e-02,  7.11179911e-02,
       -1.47398058e-01,  