In [None]:
import logging
import pandas as pd
from pathlib import Path
from gensim.corpora import Dictionary

In [None]:
# Registry of the available source datasets, keyed first by year and then by
# subset name. Every entry holds two download URLs (labels file and archive)
# and the names of the source-name, source-label and document columns.
SOURCE_DATASET_META = {
    2019: {
        'whole': dict(
            labels_url='https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/O7FWPO/QV9KD9',
            archive_url='https://dataverse.harvard.edu/api/access/datafile/4417513',
            source_name_col='source',
            source_label_col='aggregated_label',
            doc_col='content',
        ),
    },
    # The 2020 subsets share one labels URL and one column layout; only the
    # archive URL differs, so the entries are generated from (subset, URL) pairs.
    2020: {
        subset: dict(
            labels_url='https://dataverse.harvard.edu/api/access/datafile/4366864',
            archive_url=archive_url,
            source_name_col='source',
            source_label_col='label',
            doc_col='content',
        )
        for subset, archive_url in (
            ('whole', 'https://dataverse.harvard.edu/api/access/datafile/4417502'),
            ('covid', 'https://dataverse.harvard.edu/api/access/datafile/4417503'),
            ('elect', 'https://dataverse.harvard.edu/api/access/datafile/4417504'),
        )
    },
}

In [None]:
# Locations are relative to the working directory the notebook is run from.

# Dataset directory and the CSV file inside it.
DATASET_DIR = './../../resources/dataset'
DATASET_PATH = '{}/dataset.csv'.format(DATASET_DIR)

# Processed-data directory with the pickled dataset and the dictionary file.
PROC_DATA_DIR = './../../resources/processed'
PROC_DATASET_PATH = '{}/proc_dataset.pkl'.format(PROC_DATA_DIR)
DICTIONARY_PATH = '{}/dictionary.dict'.format(PROC_DATA_DIR)

In [None]:
# Raw source material lives in a `raw` sub-directory of the dataset directory.
RAW_DATASET_DIR = '{}/raw'.format(DATASET_DIR)

# Names of the entries expected inside the raw directory.
SOURCE_FILES_ARCHIVE_NAME = 'raw.tar.bz2'
SOURCE_FILES_DIR_NAME = 'newsdata'
SOURCE_LABELS_FILE_NAME = 'labels.tsv'

# Paths obtained by joining the raw directory with each name above.
SOURCE_FILES_ARCHIVE_PATH = '{}/{}'.format(RAW_DATASET_DIR, SOURCE_FILES_ARCHIVE_NAME)
SOURCE_FILES_DIR = '{}/{}'.format(RAW_DATASET_DIR, SOURCE_FILES_DIR_NAME)
SOURCE_LABELS_PATH = '{}/{}'.format(RAW_DATASET_DIR, SOURCE_LABELS_FILE_NAME)

In [None]:
# Dataset variant used for this run: the 'whole' entry of the 2020 collection.
SELECTED_DATASET = SOURCE_DATASET_META[2020]['whole']

# Download locations taken from the selected entry.
SOURCE_LABELS_URL = SELECTED_DATASET['labels_url']
SOURCE_ARCHIVE_URL = SELECTED_DATASET['archive_url']

# Filtering thresholds — presumably consumed by the generation/preprocessing
# notebooks; confirm their exact meaning there.
DROP_BELOW_LENGTH = 300
DROP_BELOW_UNIQUE = 50
# (sic) spelling kept as-is: other notebooks may reference this exact name.
DROP_BELOW_OCCURENCES = 10

# Sampling parameters (underscores are digit separators only).
N_SAMPLES_TO_LOAD = 250_000
N_MAX_SAMPLES = 50_000
POS_FRAC = 0.5

# When False, the logging cell sets the logger and its handler to DEBUG.
SILENT = False
RANDOM_SEED = 0

In [None]:
# Column names declared by the selected dataset entry.
source_name_col = SELECTED_DATASET['source_name_col']
source_label_col = SELECTED_DATASET['source_label_col']
doc_col = SELECTED_DATASET['doc_col']

# Derived names: the label column, and the document column name prefixed with `proc_`.
label_col = 'label'
proc_doc_col = 'proc_{}'.format(doc_col)

In [None]:
# Notebook logger: timestamped records written through a single stream handler.
LOG_HANDLER_NAME = 'notebook-stream-handler'

logger = logging.getLogger(__name__)

# `logging.getLogger` returns the same logger object on every call, so without
# this clean-up each re-run of the cell would stack one more handler and every
# record would be printed once per run. Only the handler attached by this cell
# (recognised by its name) is detached; handlers added elsewhere are left alone.
for stale_handler in [h for h in logger.handlers if h.get_name() == LOG_HANDLER_NAME]:
    logger.removeHandler(stale_handler)

handler = logging.StreamHandler()
handler.set_name(LOG_HANDLER_NAME)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
if not SILENT:
    # Verbose mode: let DEBUG and above through both the logger and the handler.
    logger.setLevel(logging.DEBUG)
    handler.setLevel(logging.DEBUG)
logger.addHandler(handler)

In [None]:
if not Path(PROC_DATASET_PATH).is_file() or not Path(DICTIONARY_PATH).is_file():
    if not Path(DATASET_PATH).is_file():
        %run ./../data/_generate-dataset.ipynb
    %run ./../data/_preprocess-dataset.ipynb
else:
    df, dictionary = pd.read_pickle(PROC_DATASET_PATH), Dictionary.load(DICTIONARY_PATH)