In [None]:
import requests
import tarfile
import numpy as np
import pandas as pd
from pathlib import Path, PurePath
from tqdm import tqdm

In [None]:
# Make sure the raw-dataset directory exists. With `exist_ok=True` the call is
# a no-op when the directory is already there, so no separate check is needed.
raw_dataset_dir = Path(RAW_DATASET_DIR)
raw_dataset_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Download the source-labels file unless a copy is already present on disk.
labels_path = Path(SOURCE_LABELS_PATH)
if not labels_path.is_file():
    logger.info("Downloading source labels file...")
    # Stream into a temporary sibling file and rename it only after the whole
    # body has been written; otherwise an interrupted download would leave a
    # truncated file that the `is_file()` check above accepts on the next run.
    labels_tmp_path = labels_path.with_name(labels_path.name + '.part')
    with requests.get(SOURCE_LABELS_URL, stream=True) as r:
        # Fail loudly on HTTP errors instead of saving an error page as data.
        r.raise_for_status()
        total_size = int(r.headers.get('content-length', 0))
        # Context managers close the file and the progress bar even if the
        # transfer raises part-way through.
        with open(labels_tmp_path, 'wb') as f, \
                tqdm(total=total_size, unit='iB', unit_scale=True, disable=SILENT) as progress_bar:
            for data in r.iter_content(1024):
                progress_bar.update(len(data))
                f.write(data)
    labels_tmp_path.replace(labels_path)

In [None]:
# Load the per-source labels and keep only the sources whose label is exactly
# the negative or the positive value and whose name is not in UNWANTED_SOURCES.
df_sources_labels = pd.read_csv(SOURCE_LABELS_PATH, sep=SOURCE_LABELS_SEP)
logger.info(f"Total sources before dropping mixed and unwanted: \t {len(df_sources_labels)}")
# Vectorised boolean masks replace the row-by-row `iterrows()` scan.
# NOTE(review): assumes UNWANTED_SOURCES is a list-like collection (list, set,
# tuple) of source names — confirm against the configuration cell.
has_binary_label = df_sources_labels[source_label_col].isin([SOURCE_NEG_LABEL_VAL, SOURCE_POS_LABEL_VAL])
is_unwanted = df_sources_labels[source_name_col].isin(UNWANTED_SOURCES)
df_sources_labels = df_sources_labels[has_binary_label & ~is_unwanted]
logger.info(f"Total sources after dropping mixed and unwanted:\t {len(df_sources_labels)}")
used_sources = df_sources_labels[[source_name_col, source_label_col]].to_numpy()
# Map each retained source name to True when it carries the positive label,
# False otherwise; `bool()` keeps the values plain Python booleans.
sources_dict = {source: bool(label == SOURCE_POS_LABEL_VAL) for source, label in used_sources}

In [None]:
# Download (when not already on disk) and unpack the per-source JSON files.
# The whole step is skipped when the extraction directory already exists.
if not Path(SOURCE_FILES_DIR).is_dir():
    archive_path = Path(SOURCE_FILES_ARCHIVE_PATH)
    if not archive_path.is_file():
        logger.info("Downloading source files archive...")
        # Stream into a temporary sibling file and rename it only once the
        # download completed, so a truncated archive is never picked up by the
        # `is_file()` check on a later run.
        archive_tmp_path = archive_path.with_name(archive_path.name + '.part')
        with requests.get(SOURCE_ARCHIVE_URL, stream=True) as r:
            # Fail loudly on HTTP errors instead of saving an error page.
            r.raise_for_status()
            total_size = int(r.headers.get('content-length', 0))
            with open(archive_tmp_path, 'wb') as f, \
                    tqdm(total=total_size, unit='iB', unit_scale=True, disable=SILENT) as progress_bar:
                for data in r.iter_content(1024):
                    progress_bar.update(len(data))
                    f.write(data)
        archive_tmp_path.replace(archive_path)
    logger.info("Extracting archive...")
    # `with` guarantees the archive handle is closed even if extraction fails.
    with tarfile.open(archive_path, 'r') as tar:
        to_extract = []
        for member in tar:
            member_path = Path(member.path)
            # Keep only `<source>.json` members that belong to a retained source.
            if member_path.stem in sources_dict and member_path.suffix == '.json':
                # Flatten the member to its base name so every file lands
                # directly in SOURCE_FILES_DIR; unlike splitting on '/', this
                # also works for members stored at the archive root.
                member.path = member_path.name
                to_extract.append(member)
        # The archive comes from the network: use the safe 'data' extraction
        # filter where the running Python version provides it.
        extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}
        tar.extractall(members=to_extract, path=SOURCE_FILES_DIR, **extract_kwargs)
    logger.info("Archive extracted.")

In [None]:
# Load every extracted source file, tag its rows with the source name and the
# source-level label, and merge everything into one de-duplicated dataset.
# Sorting makes the load order (and therefore which duplicate is kept and the
# outcome of any later seeded shuffle) independent of the filesystem's
# directory-listing order.
source_files = sorted(f for f in Path(SOURCE_FILES_DIR).iterdir()
                      if f.is_file() and f.stem in sources_dict)
if not source_files:
    # Without this guard pd.concat() fails with an opaque
    # "No objects to concatenate" error.
    raise ValueError(f"No source files for the selected sources found in {SOURCE_FILES_DIR}")
logger.info("Loading source files...")
source_dfs = []
for f in tqdm(source_files, disable=SILENT):
    source_df = pd.read_json(f)
    # A scalar is broadcast to every row, so no per-row list is needed.
    source_df[source_name_col] = f.stem
    source_df[label_col] = sources_dict[f.stem]
    source_dfs.append(source_df)
dataset_df = pd.concat(source_dfs, ignore_index=True)
# Keep only the first occurrence of each document and renumber the rows.
dataset_df = dataset_df.drop_duplicates(subset=[doc_col], keep='first', ignore_index=True)

In [None]:
# Shuffle the dataset with a fixed seed and write it to DATASET_PATH as CSV,
# split into 1000 chunks with a progress bar over the chunks.
logger.info("Storing files as a unified dataset...")
dataset_df = dataset_df.sample(frac=1, ignore_index=True, random_state=RANDOM_SEED)
indices_chunks = np.array_split(dataset_df.index, 1000)
for chunk_no, indices_chunk in enumerate(tqdm(indices_chunks, disable=SILENT)):
    dataset_df_chunk = dataset_df.loc[indices_chunk]
    # The first chunk always (re)creates the file and writes the header; the
    # remaining chunks are appended. Deciding this from the chunk position
    # rather than from whether the file exists prevents a leftover file from
    # an earlier run from having the whole dataset appended to it a second
    # time without a header.
    is_first_chunk = chunk_no == 0
    dataset_df_chunk.to_csv(DATASET_PATH, mode='w' if is_first_chunk else 'a',
                            index=False, header=is_first_chunk)
logger.info("Dataset stored to disk.")