In [1]:
import json
import os
import subprocess
import time
import warnings
import zipfile

from random import sample

import numpy as np
import pandas as pd
import yaml

from tqdm.auto import tqdm

warnings.filterwarnings("ignore")

In [2]:
# Number of metadata rows drawn when building the sample
N_SAMPLES = 50_000

In [3]:
# Load the project configuration and split it into its per-stage sections
config_path = '../config/params.yml'
# The context manager closes the file handle, which a bare open() call leaks
with open(config_path) as config_file:
    # NOTE(review): FullLoader can build more than plain data types; if the
    # config only holds plain mappings/lists/scalars, yaml.safe_load is safer.
    config = yaml.load(config_file, Loader=yaml.FullLoader)

preproc = config['preprocessing']
training = config['train']
evaluate = config['evaluate']

In [4]:
# Access to the Kaggle data through the API: read the credentials JSON file
# whose path is given in the preprocessing config
with open(preproc['kaggle_creds']) as creds_file:
    kaggle_creds = json.loads(creds_file.read())

In [5]:
# Publish the loaded credentials as KAGGLE_* environment variables
for env_name, creds_field in (('KAGGLE_USERNAME', 'username'),
                              ('KAGGLE_KEY', 'key')):
    os.environ[env_name] = kaggle_creds[creds_field]

In [6]:
# Merge the file descriptions of the preprocessing and evaluation stages;
# on a duplicate key the evaluation entry wins
file_dirs = {**preproc['file_dirs'], **evaluate['file_dirs']}

In [7]:
def file_download(filename: str, local_filepath: str,
                  kaggle_filepath: str) -> None:
    """
    Download a file from the Kaggle competition unless it already exists locally.

    The competition name is read from ``preproc['kaggle_competition']``. If the
    download produces ``<filename>.zip``, the archive is extracted into
    ``local_filepath`` and then removed.
    :param filename: file name with extension
    :param local_filepath: project directory the file should be placed in
    :param kaggle_filepath: directory of the Kaggle resource that holds the file
    :raises subprocess.CalledProcessError: if the kaggle CLI exits with an error
    :raises FileNotFoundError: if the kaggle executable cannot be found or the
        file is still missing once the download has finished
    """
    # os.path.join works whether or not local_filepath ends with a separator
    target_path = os.path.join(local_filepath, filename)
    if os.path.isfile(target_path):
        print(f'File {filename} exists in {local_filepath}. Skipping download')
        return

    print(f'File {filename} download operation from {kaggle_filepath}')
    if local_filepath:
        os.makedirs(local_filepath, exist_ok=True)

    # An argument list with cwd avoids building a shell string from config
    # values, and check=True surfaces a failed download instead of hiding it.
    subprocess.run(
        ['kaggle', 'competitions', 'download',
         '-c', preproc['kaggle_competition'],
         '-f', kaggle_filepath + filename],
        cwd=local_filepath or None,
        check=True,
    )

    archive_path = target_path + '.zip'
    if os.path.isfile(archive_path):
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(local_filepath or None)
        os.remove(archive_path)

    # Only report success when the requested file is really there
    if not os.path.isfile(target_path):
        raise FileNotFoundError(
            f'File {filename} was not found in {local_filepath} after download')
    print(f'File {filename} has been downloaded and unzipped')

In [8]:
def file_delete(filename: str, path: str) -> None:
    """
    Delete a file if it exists; otherwise report that there is nothing to delete.

    :param filename: file name with extension
    :param path: project directory where the file is located
    :raises OSError: if the file exists but cannot be removed
    """
    # os.path.join works whether or not path ends with a separator
    target_path = os.path.join(path, filename)
    if os.path.isfile(target_path):
        print(f'Deleting file {target_path}')
        # os.remove needs no shell, so names with spaces or shell
        # metacharacters are handled, and a failure raises instead of
        # being reported as success
        os.remove(target_path)
        print(f'File {target_path} has been removed')
    else:
        print(f'File {filename} does not exist in {path}. Skipping deletion')

In [9]:
# Download the training and test files described in the config
for dir_params in file_dirs.values():
    # Only entries that declare a Kaggle location can be downloaded. The check
    # has to look inside the entry: testing the key name itself (for example
    # 'train_meta') for the substring 'kaggle_dir' never matches, so nothing
    # would be downloaded.
    if not isinstance(dir_params, dict) or 'kaggle_dir' not in dir_params:
        continue

    if 'ids' in dir_params:
        # 'ids' is read as an inclusive [first, last] pair of batch ids that
        # is substituted into the file name template
        first_id = int(dir_params['ids'][0])
        last_id = int(dir_params['ids'][1])
        filenames = [dir_params['filename'].format(batch_id=batch_id)
                     for batch_id in range(first_id, last_id + 1)]
    else:
        filenames = [dir_params['filename']]

    for filename in filenames:
        file_download(filename=filename,
                      local_filepath=dir_params['local_dir'],
                      kaggle_filepath=dir_params['kaggle_dir'])

In [11]:
# Read the training metadata table from the location given in the config
train_meta_cfg = preproc['file_dirs']['train_meta']
# os.path.join works whether or not local_dir ends with a separator
train_meta = pd.read_parquet(
    os.path.join(train_meta_cfg['local_dir'], train_meta_cfg['filename']))

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/train_meta.parquet'

In [None]:
def dtypes_convert(df: pd.DataFrame) -> pd.DataFrame:
    """
    Downcast the numeric columns of a dataframe to smaller dtypes to save
    computing resources.

    Float columns are converted with ``downcast='float'``. Integer columns are
    converted to the smallest unsigned dtype; signed columns that the unsigned
    downcast leaves unchanged (typically because they hold negative values)
    fall back to the smallest signed dtype.

    The dataframe is modified in place and the same object is returned.
    :param df: dataframe
    :return: the input dataframe with downcast numeric columns
    """
    def _downcast_integer(column: pd.Series) -> pd.Series:
        # Try unsigned first; retry as signed only for signed columns, so
        # unsigned columns are never flipped to a signed dtype.
        shrunk = pd.to_numeric(column, downcast='unsigned')
        if column.dtype.kind == 'i' and shrunk.dtype == column.dtype:
            shrunk = pd.to_numeric(column, downcast='integer')
        return shrunk

    fcols = df.select_dtypes('float').columns
    icols = df.select_dtypes('integer').columns

    # Skip empty selections so frames without such columns are left untouched
    if len(fcols):
        df[fcols] = df[fcols].apply(pd.to_numeric, downcast='float')
    if len(icols):
        df[icols] = df[icols].apply(_downcast_integer)

    return df

In [None]:
# Downcast the numeric columns of the metadata table to smaller dtypes
train_meta = dtypes_convert(train_meta)

In [None]:
# Batch ids 1 through 10 (inclusive) that the sample is restricted to
batch_ids = list(range(1, 11))

In [None]:
# Keep only the metadata rows whose batch_id is one of the selected batches
train_meta = train_meta[train_meta['batch_id'].isin(batch_ids)]

In [None]:
# Draw a random sample of metadata rows. The fixed seed makes the sample the
# same on every Restart & Run All.
SAMPLE_SEED = 42
# Cap the size at the number of available rows: sample() raises ValueError
# when asked for more rows than exist (sampling is without replacement).
train_meta_sample = train_meta.sample(
    n=min(N_SAMPLES, len(train_meta)), random_state=SAMPLE_SEED)

In [None]:
# Load every selected batch file, downcast its numeric columns and stack the
# batches into a single frame.
train_batches_cfg = preproc['file_dirs']['train_batches']
# Iterating over batch_ids (rather than repeating the range literal) keeps
# the loaded batches in step with the batches the metadata was filtered to.
# os.path.join works whether or not local_dir ends with a separator.
batches = pd.concat(
    [
        dtypes_convert(pd.read_parquet(os.path.join(
            train_batches_cfg['local_dir'],
            train_batches_cfg['filename'].format(batch_id=batch_id))))
        for batch_id in batch_ids
    ],
    axis=0,
)

In [None]:
# Sorted list of the event ids present in the sampled metadata
event_ids = sorted(train_meta_sample['event_id'].tolist())

In [None]:
# Keep the batch rows whose index value is one of the sampled event ids
# (presumably the batch frames are indexed by event_id; verify against the data)
batches_sample = batches.loc[batches.index.isin(event_ids)]

In [None]:
# Display the sampled metadata rows
train_meta_sample

In [None]:
# Display the batch rows that belong to the sampled events
batches_sample

In [None]:
# Downcast the numeric columns of the sampled batch rows again after filtering
batches_sample = dtypes_convert(batches_sample)

In [None]:
# Save the sampled batch rows as a parquet file
batches_sample.to_parquet('../data/raw/batches/batches_sample.parquet')

In [None]:
# Save the sampled metadata rows as a parquet file
train_meta_sample.to_parquet('../data/raw/train_meta_sample.parquet')