In [1]:
# Fetch the dataset repository; later cells read the JSON splits from the
# `lqad-pl/` directory that this clone creates in the working directory.
!git clone https://github.com/brodzik/lqad-pl

Cloning into 'lqad-pl'...
remote: Enumerating objects: 24, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 24 (delta 13), reused 12 (delta 6), pack-reused 0[K
Unpacking objects: 100% (24/24), done.


In [2]:
import json
import numpy as np
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [3]:
def read_json(path):
    """Flatten a nested question-answering JSON file into three parallel lists.

    The file is expected to hold ``{"data": [{"paragraphs": [{"context": ...,
    "qas": [{"question": ..., "answers": [...]}]}]}]}``. One row is produced
    per entry of every ``answers`` list, so a question with several answers
    yields several rows and a question with an empty ``answers`` list yields
    none.

    Args:
        path: Location of the JSON file to read.

    Returns:
        A tuple ``(contexts, questions, answers)`` of equally long lists, where
        ``answers[k]`` is the raw answer object stored in the file and
        ``contexts[k]`` / ``questions[k]`` are the context and question it
        belongs to.
    """
    with open(path, "rb") as handle:
        payload = json.load(handle)

    # Collect (context, question, answer) triples first, then split them apart.
    triples = []
    for article in payload["data"]:
        for paragraph in article["paragraphs"]:
            ctx_text = paragraph["context"]
            for item in paragraph["qas"]:
                q_text = item["question"]
                triples.extend((ctx_text, q_text, ans) for ans in item["answers"])

    contexts = [triple[0] for triple in triples]
    questions = [triple[1] for triple in triples]
    answers = [triple[2] for triple in triples]
    return contexts, questions, answers

In [4]:
# Load every split as three parallel lists: contexts, questions, answers.
(train_contexts, train_questions, train_answers) = read_json(
    "lqad-pl/lqad-pl-pretty-train.json"
)
(dev_contexts, dev_questions, dev_answers) = read_json(
    "lqad-pl/lqad-pl-pretty-dev.json"
)
(test_contexts, test_questions, test_answers) = read_json(
    "lqad-pl/lqad-pl-pretty-test.json"
)

In [5]:
# Polish stop words passed to the TF-IDF vectorizer.
# Every entry must be a single whitespace-free word: an entry containing a tab
# or space can never equal a token and is therefore silently ineffective.
stop_words = [
    "ach", "aj", "albo", "bardzo", "bez", "bo", "być", "ci", "cię", "ciebie",
    "co", "czy", "daleko", "dla", "dlaczego", "dlatego", "do", "dobrze",
    "dokąd", "dość", "dużo", "dwa", "dwaj", "dwie", "dwoje", "dziś",
    "dzisiaj", "gdyby", "gdzie", "go", "ich", "ile", "im", "inny", "ja", "ją",
    "jak", "jakby", "jaki", "je", "jeden", "jedna", "jedno", "jego", "jej",
    "jemu", "jeśli", "jest", "jestem", "jeżeli", "już", "każdy", "kiedy",
    "kierunku", "kto", "ku", "lub", "ma", "mają", "mam", "mi", "mną", "mnie",
    "moi", "mój", "moja", "moje", "może", "mu", "my", "na", "nam", "nami",
    "nas", "nasi", "nasz", "nasza", "nasze", "natychmiast", "nią", "nic",
    "nich", "nie", "niego", "niej", "niemu", "nigdy", "nim", "nimi", "niż",
    "obok", "od", "około", "on", "ona", "one", "oni", "ono", "owszem", "po",
    "pod", "ponieważ", "przed", "przedtem", "są", "sam", "sama", "się",
    "skąd", "tak", "taki", "tam", "ten", "to", "tobą", "tobie", "tu", "tutaj",
    "twoi", "twój", "twoja", "twoje", "ty", "wam", "wami", "was", "wasi",
    "wasz", "wasza", "wasze", "we", "więc", "wszystko", "wtedy", "wy",
    "żaden", "zawsze", "że",
]

In [6]:
# Fit the TF-IDF vocabulary and weights on every context and question from all
# three splits; `add_negative_samples` uses this vectorizer to score candidates.
tfidf = TfidfVectorizer(stop_words=stop_words)
tfidf.fit(
    train_contexts + train_questions
    + dev_contexts + dev_questions
    + test_contexts + test_questions
)

In [7]:
def add_negative_samples(contexts, questions, answers):
    """Append one hard negative example per existing example, in place.

    For each index ``i`` of the lists as they are on entry, every context is
    scored against ``questions[i]`` by the dot product of their TF-IDF vectors
    (computed with the module-level ``tfidf`` vectorizer). The highest-scoring
    context whose text differs from ``contexts[i]`` is appended to ``contexts``
    together with the same question and the empty answer
    ``{"answer_start": 0, "text": ""}``.

    If no context with a different text exists, nothing is appended for that
    example. Empty input is a no-op.

    Note: the three lists are mutated and nothing is returned. Calling the
    function again on the same lists also generates negatives for the
    previously appended negatives, so call it once per split.

    Args:
        contexts: List of context strings; extended in place.
        questions: List of question strings parallel to ``contexts``; extended
            in place.
        answers: List of answer objects parallel to ``contexts``; extended in
            place.

    Raises:
        ValueError: If the three lists do not have the same length.
    """
    length = len(contexts)
    if not (length == len(questions) == len(answers)):
        raise ValueError(
            "contexts, questions and answers must have the same length, got "
            f"{length}, {len(questions)} and {len(answers)}"
        )
    if length == 0:
        # Nothing to pair up; also avoids handing an empty corpus to the vectorizer.
        return

    contexts_vec = tfidf.transform(contexts)
    questions_vec = tfidf.transform(questions)

    # Row i holds the score of question i against every original context.
    # `@` is used so the product is a matrix product regardless of whether the
    # vectorizer returns a sparse matrix or a sparse array.
    sim = questions_vec @ contexts_vec.T

    for i in tqdm(range(length)):
        original_ctx = contexts[i]
        original_q = questions[i]

        scores = sim[i].toarray()[0]
        # Candidate indices from most to least similar.
        ranked = scores.argsort()[::-1]

        for j in ranked:
            # Skip the example's own context (and textual duplicates of it).
            if contexts[j] != original_ctx:
                contexts.append(contexts[j])
                questions.append(original_q)
                answers.append({"answer_start": 0, "text": ""})
                break

In [8]:
# Extend each split in place with its hard negatives (train, then dev, then test).
for _split in (
    (train_contexts, train_questions, train_answers),
    (dev_contexts, dev_questions, dev_answers),
    (test_contexts, test_questions, test_answers),
):
    add_negative_samples(*_split)

HBox(children=(FloatProgress(value=0.0, max=2319.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=302.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=295.0), HTML(value='')))




In [9]:
# Persist the training split as one pickled list: [contexts, questions, answers].
with open("train.pkl", "wb") as out_file:
    pickle.dump(
        [train_contexts, train_questions, train_answers],
        out_file,
    )

In [10]:
# Persist the dev split as one pickled list: [contexts, questions, answers].
with open("dev.pkl", "wb") as out_file:
    pickle.dump(
        [dev_contexts, dev_questions, dev_answers],
        out_file,
    )

In [11]:
# Persist the test split as one pickled list: [contexts, questions, answers].
with open("test.pkl", "wb") as out_file:
    pickle.dump(
        [test_contexts, test_questions, test_answers],
        out_file,
    )