In [None]:
!pip install simpletransformers pandas sentence_transformers

# Load the dataset

In [None]:
import pandas as pd

# Relative parquet paths of the two SQuAD splits inside the Hugging Face dataset repo.
splits = {
    "train": "plain_text/train-00000-of-00001.parquet",
    "validation": "plain_text/validation-00000-of-00001.parquet",
}

# Common URL prefix of the dataset repository.
squad_root = "hf://datasets/rajpurkar/squad/"

# `df` holds the training split, `df_test` the validation split.
df = pd.read_parquet(f"{squad_root}{splits['train']}")
df_test = pd.read_parquet(f"{squad_root}{splits['validation']}")

# prepare data
* for fine-tuning the pre-trained model
* for creating the vector database

# Note
* Due to limited local GPU/CPU resources, only the first 20 records of each split are prepared for fine-tuning the pre-trained model

In [None]:
# Prepare train and test data for fine-tuning the pre-trained model
import json

# Number of leading rows taken from each split (kept small for local hardware).
NUM_RECORDS = 20


def _to_squad_records(frame, limit):
    """Convert the first `limit` rows of a SQuAD dataframe into QA records.

    Each row becomes one entry of the form
    ``{"context": ..., "qas": [{"id", "is_impossible", "question", "answers"}]}``.

    Every reference answer of a row is kept as its own
    ``{"text": str, "answer_start": int}`` dict. Rows with several reference
    answers are therefore no longer collapsed into one concatenated string,
    and ``answer_start`` is always a plain int, never a list.

    Args:
        frame: dataframe with ``id``, ``context``, ``question`` and ``answers``
            columns, where ``answers`` holds parallel ``text`` and
            ``answer_start`` sequences.
        limit: number of leading rows to convert.

    Returns:
        A list of JSON-serialisable record dicts.
    """
    records = []
    for _, row in frame.head(limit).iterrows():
        ans = row["answers"]
        # str()/int() turn numpy scalars into JSON-serialisable Python values.
        answers = [
            {"text": str(text), "answer_start": int(start)}
            for text, start in zip(ans["text"], ans["answer_start"])
        ]
        records.append(
            {
                "context": row["context"],
                "qas": [
                    {
                        "id": row["id"],
                        # A question is only marked impossible when it has no answer at all.
                        "is_impossible": len(answers) == 0,
                        "question": row["question"],
                        "answers": answers,
                    }
                ],
            }
        )
    return records


train_data = _to_squad_records(df, NUM_RECORDS)
test_data = _to_squad_records(df_test, NUM_RECORDS)

# Persist both sets; later cells read these files back.
with open('train.json', 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=4)

with open('test.json', 'w', encoding='utf-8') as f:
    json.dump(test_data, f, ensure_ascii=False, indent=4)

    

## Build Vector database
+ Generate an embedding vector for each question and store the vectors, together with their contexts, as the vector database

In [None]:
# Build the vector database: one question embedding stored next to each context

from sentence_transformers import SentenceTransformer
import json

model_path = "TencentBAC/Conan-embedding-v1"

# The embedding model is loaded on the CPU.
model = SentenceTransformer(model_path, device="cpu")

# Read the prepared training records back from disk.
with open("train.json", "r", encoding="utf-8") as src:
    train = json.load(src)

# Pair every context with the embedding of its first question.
embedding_data = [
    {
        "context": record["context"],
        "embedding": model.encode(record["qas"][0]["question"]).tolist(),
    }
    for record in train
]

# Store the context/embedding pairs as the JSON "vector database".
with open("embedding.json", "w", encoding="utf-8") as dst:
    json.dump(embedding_data, dst, indent=4)


# Fine-tuning the pre-trained model

In [None]:
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs
import json, os

import torch

# Switch off parallelism in the tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "False"

# Load the train / evaluation records written by the data-preparation cell.
with open("train.json", 'r', encoding='utf-8') as f:
    train = json.load(f)

with open("test.json", 'r', encoding='utf-8') as f:
    test = json.load(f)

train_args = {
    'overwrite_output_dir': True,
    "evaluate_during_training": True,
    "max_seq_length": 256,
    "num_train_epochs": 2,  # author's note: 25, after experimentation
    "evaluate_during_training_steps": 500,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "n_best_size": 16,
    # Batch sizes are another important setting to tune.
    "train_batch_size": 16,
    "eval_batch_size": 16,
    "output_dir": "./output"
}

# Use the GPU when one is available for faster training; otherwise fall back to
# the CPU instead of failing on machines without CUDA.
use_gpu = torch.cuda.is_available()

# NOTE: this rebinds `model`, which the vector-database cell bound to the
# SentenceTransformer embedding model.
model = QuestionAnsweringModel("bert",
                               "bert-large-cased",
                               args=train_args,
                               use_cuda=use_gpu)

# Fine-tune on the training records, evaluating against the test records.
model.train_model(train, eval_data=test, output_dir="./output")