In [None]:
import logging
import pandas as pd
from pathlib import Path
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import lower_to_unicode, strip_tags, strip_punctuation, \
                                         strip_non_alphanum, split_alphanum, strip_numeric, strip_short, \
                                         remove_stopwords, stem_text, strip_multiple_whitespaces

In [None]:
# Catalogue of the selectable dataset releases, keyed by release year and then
# by subset name. Every entry exposes the same seven fields:
#   labels_url / updated_labels_url  URL of the label table (None when that
#                                    variant is not listed for the release)
#   labels_sep                       field separator of the label table
#   archive_url                      URL of the data archive
#   source_name_col, source_label_col, doc_col
#                                    column names used downstream
SOURCE_DATASET_META = {
    2019: {
        'whole': dict(
            labels_url='https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/O7FWPO/QV9KD9',
            updated_labels_url=None,
            labels_sep='\t',
            archive_url='https://dataverse.harvard.edu/api/access/datafile/4417513',
            source_name_col='source',
            source_label_col='aggregated_label',
            doc_col='content',
        ),
    },
    # The 2020 subsets share label URLs and column names; only the archive differs.
    2020: {
        subset: dict(
            labels_url='https://dataverse.harvard.edu/api/access/datafile/4366864',
            updated_labels_url='https://gist.githubusercontent.com/alex-massa/8f2fac7c18b23d2093e44c324976d17b/raw/a92d84b71907c3550f0251baa0f30c53b862c5d0/nela-gt-2020-updated-labels.tsv',
            labels_sep='\t',
            archive_url=archive_url,
            source_name_col='source',
            source_label_col='label',
            doc_col='content',
        )
        for subset, archive_url in (
            ('whole', 'https://dataverse.harvard.edu/api/access/datafile/4417502'),
            ('covid', 'https://dataverse.harvard.edu/api/access/datafile/4417503'),
            ('elect', 'https://dataverse.harvard.edu/api/access/datafile/4417504'),
        )
    },
    2021: {
        'whole': dict(
            labels_url=None,
            updated_labels_url='https://gist.githubusercontent.com/alex-massa/c38f3e0bfe2e23e6bc60687f775318f6/raw/3d30279b53ceeb98ff6c8546426c610c76bb0e1d/nela-gt-2021-updated-labels.tsv',
            labels_sep='\t',
            archive_url='https://dataverse.harvard.edu/api/access/datafile/6078140',
            source_name_col='source',
            source_label_col='label',
            doc_col='content',
        ),
    },
}

In [None]:
# Base directories; both are relative paths, so they resolve against the
# working directory of the running kernel.
DATASET_DIR = './../../resources/dataset'
PROC_DATA_DIR = './../../resources/processed'

# Files stored inside those directories.
DATASET_PATH = '/'.join((DATASET_DIR, 'dataset.csv'))
PROC_DATASET_PATH = '/'.join((PROC_DATA_DIR, 'proc_dataset.pkl'))
DICTIONARY_PATH = '/'.join((PROC_DATA_DIR, 'dictionary.dict'))

In [None]:
# Directory holding the raw inputs, nested under the dataset directory.
RAW_DATASET_DIR = '/'.join((DATASET_DIR, 'raw'))

# Names of the individual raw artefacts ...
SOURCE_FILES_ARCHIVE_NAME = 'raw.tar'
SOURCE_FILES_DIR_NAME = 'newsdata'
SOURCE_LABELS_FILE_NAME = 'labels.tsv'

# ... and their full locations inside the raw directory.
SOURCE_FILES_ARCHIVE_PATH = '/'.join((RAW_DATASET_DIR, SOURCE_FILES_ARCHIVE_NAME))
SOURCE_FILES_DIR = '/'.join((RAW_DATASET_DIR, SOURCE_FILES_DIR_NAME))
SOURCE_LABELS_PATH = '/'.join((RAW_DATASET_DIR, SOURCE_LABELS_FILE_NAME))

In [None]:
# --- Dataset selection -------------------------------------------------------
# UPDATED_LABELS picks the 'updated_labels_url' field of the selected entry
# instead of 'labels_url'.
# NOTE(review): some entries set one of those two fields to None, so
# SOURCE_LABELS_URL can be None for certain year/flag combinations - confirm
# that the code consuming it copes with that.
UPDATED_LABELS = True
SELECTED_DATASET = SOURCE_DATASET_META[2021]['whole']
SOURCE_LABELS_URL = SELECTED_DATASET['updated_labels_url' if UPDATED_LABELS else 'labels_url']
SOURCE_LABELS_SEP = SELECTED_DATASET['labels_sep']
SOURCE_ARCHIVE_URL = SELECTED_DATASET['archive_url']

# --- Preprocessing parameters ------------------------------------------------
# Presumably consumed by the notebooks that generate/preprocess the dataset;
# their exact meaning is defined there - TODO confirm.
# (The spelling of MIN_OCCURENCES is kept unchanged because code outside this
# cell may refer to the name.)
MIN_LENGTH = 300
MIN_UNIQUE = 50
MIN_OCCURENCES = 10
DOCUMENT_FILTERS = (
    lower_to_unicode,
    strip_tags,
    strip_punctuation,
    strip_non_alphanum,
    split_alphanum,
    strip_numeric,
    strip_short,
    remove_stopwords,
    strip_multiple_whitespaces,
)

# --- Sampling parameters -----------------------------------------------------
N_SAMPLES_TO_LOAD = 250_000
N_MAX_SAMPLES = 50_000
POS_FRAC = 0.5

# --- Miscellaneous -----------------------------------------------------------
SILENT = False
RANDOM_SEED = 0

In [None]:
# Column names taken from the selected dataset entry.
source_name_col = SELECTED_DATASET['source_name_col']
source_label_col = SELECTED_DATASET['source_label_col']
doc_col = SELECTED_DATASET['doc_col']

# Derived / fixed column names: the 'proc_'-prefixed counterpart of the
# document column, and the label column.
proc_doc_col = f'proc_{doc_col}'
label_col = 'label'

In [None]:
# Logger for this notebook, writing timestamped records through a stream handler.
logger = logging.getLogger(__name__)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# logging.getLogger returns the same object for the same name, so handlers
# attached by an earlier execution of this cell are still present. Reuse the
# plain StreamHandler if there is one instead of stacking a new one on every
# run, which would emit every record several times.
handler = next((h for h in logger.handlers if type(h) is logging.StreamHandler), None)
if handler is None:
    handler = logging.StreamHandler()
    logger.addHandler(handler)
handler.setFormatter(formatter)

# Unless silenced, let everything from DEBUG upwards through both the logger
# and the handler; otherwise the levels are left untouched.
if not SILENT:
    logger.setLevel(logging.DEBUG)
    handler.setLevel(logging.DEBUG)

In [None]:
if not Path(PROC_DATASET_PATH).is_file() or not Path(DICTIONARY_PATH).is_file():
    if not Path(DATASET_PATH).is_file():
        %run ./../data/_generate-dataset.ipynb
    %run ./../data/_preprocess-dataset.ipynb
else:
    df, dictionary = pd.read_pickle(PROC_DATASET_PATH), Dictionary.load(DICTIONARY_PATH)