In [None]:
# External imports
import numpy as np
import pandas as pd
import torch
from typing import Tuple, List, Dict

# Project imports
from data_loading import Document
from data_loading.parser import SquadFileParser
from data_loading.utils import build_mappers_and_dataframe

In [None]:
# Location of the training-set JSON consumed by the parser below.
TRAINING_SET_PATH = "squad_data/training_set.json"

# Build the parser for that file, then keep whatever `parse_documents()`
# returns in `data` for the following cells.
parser = SquadFileParser(TRAINING_SET_PATH)
data = parser.parse_documents()

In [None]:
# Derive the two lookup objects and the dataframe from the parsed data.
paragraphs_mapper, questions_mapper, df = build_mappers_and_dataframe(data)

# Quick sanity check: print the entry stored under the first key yielded by
# each mapper (questions first, then paragraphs).
for mapper in (questions_mapper, paragraphs_mapper):
    first_key = next(iter(mapper))
    print(mapper[first_key])

# Last expression of the cell, so the notebook renders the first rows.
df.head()

In [None]:
from utils.embedding_utils import EmbeddingDownloader

# Arguments handed to EmbeddingDownloader.
# NOTE(review): the first two look like a local folder and a file name for the
# stored model - confirm against EmbeddingDownloader's signature.
EMBEDDING_FOLDER = "embedding_models"
EMBEDDING_FILENAME = "embedding_model.kv"
EMBEDDING_MODEL_NAME = "fasttext-wiki-news-subwords-300"

embedding_downloader = EmbeddingDownloader(
    EMBEDDING_FOLDER,
    EMBEDDING_FILENAME,
    model_name=EMBEDDING_MODEL_NAME,
)

# `load()` yields the embedding model object used by the cells that follow.
embedding_model = embedding_downloader.load()

In [None]:
from data_loading.utils import DataConverter, padder_collate_fn
from data_loading.qa_dataset import CustomQADataset

# Wrap the embedding model in a converter and build the QA dataset from the
# dataframe and the two mappers.
data_converter = DataConverter(embedding_model)
datasetQA = CustomQADataset(
    data_converter,
    df,
    paragraphs_mapper,
    questions_mapper,
)

# Shuffled batches of 10 samples, assembled by the project's collate function.
data_loader = torch.utils.data.DataLoader(
    datasetQA,
    batch_size=10,
    shuffle=True,
    collate_fn=padder_collate_fn,
)

# Draw the first batch from two independent iterators and print the shape of
# its first element each time. A fresh iterator is created on every pass, so
# with shuffling enabled the two batches generally differ.
for draw in range(2):
    first_batch = next(iter(data_loader))
    print(first_batch[0].shape)