In [1]:
import os
import gc
from glob import glob
from itertools import combinations
import pandas as pd

In [2]:
# HDF file that is read with pd.read_hdf to obtain the frame that gets partitioned.
input_file = "../data/raw/train_test.hdf.compress"
# Root directory for the output; every partition_by* function writes its files
# into a sub-directory of this path named after the partitioning column(s).
output_dir = "../data/interim/partitioned_all"

def partition_by(df, column, npartitions):
    """Split ``df`` into hash buckets of ``column`` and write one HDF file per bucket.

    A row lands in bucket ``hash(value) % npartitions``, so all rows that share
    a value of ``column`` are written to the same file,
    ``{output_dir}/{column}/train_{bucket}.hdf.compress``.  Only buckets that
    receive at least one row produce a file; an existing file with the same
    name is overwritten (``mode='w'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to partition.  It is not modified by this function.
    column : str
        Name of the column whose values select the bucket.
    npartitions : int
        Number of hash buckets.

    Returns
    -------
    None
    """
    directory = f"{output_dir}/{column}"
    # exist_ok=True replaces the separate os.path.exists() test and its
    # check-then-create race.
    os.makedirs(directory, exist_ok=True)

    # The built-in hash() is kept so the bucket layout is unchanged.
    # NOTE(review): hash() is salted per process for str/bytes values, so this
    # is only reproducible across kernels if `column` holds numbers -- confirm.
    bucket_ids = (df[column].apply(hash) % npartitions).to_numpy()

    # Grouping by a plain array (matched to rows by position) keeps the bucket
    # ids off `df`: nothing has to be added to / deleted from the caller's
    # frame, so a failed write cannot leave a stray helper column behind and an
    # existing column named 'npartition' is no longer clobbered.
    for bucket, chunk in df.groupby(bucket_ids):
        chunk.to_hdf(f"{directory}/train_{bucket}.hdf.compress", key='train', mode='w', complib='blosc', fletcher32=True, complevel=9)

def partition_by2(df, t, npartitions):
    """Partition ``df`` by the combination of two columns and write one HDF file per bucket.

    A composite string key ``"<t[0] value>_<t[1] value>"`` is built for every
    row and stored in an extra column named ``f"{t[0]}_{t[1]}"``.  Rows are
    assigned to ``npartitions`` buckets by a hash of that key, so all rows
    sharing a key are written to the same file,
    ``{output_dir}/{t[0]}_{t[1]}/train_{bucket}.hdf.compress``.  The written
    frames contain the columns of ``df`` plus the composite key column.  Only
    non-empty buckets produce a file; existing files are overwritten
    (``mode='w'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to partition.  The composite key column is added temporarily and
        removed again before returning, also when writing fails.
    t : sequence of str
        The two column names to combine (``t[0]`` and ``t[1]``).
    npartitions : int
        Number of hash buckets.

    Returns
    -------
    None
    """
    column = f"{t[0]}_{t[1]}"
    directory = f"{output_dir}/{column}"
    # exist_ok=True replaces the separate os.path.exists() test and its
    # check-then-create race.
    os.makedirs(directory, exist_ok=True)

    # The key is stored on `df` itself (as before) so that no second copy of
    # the whole frame is needed; the finally-block guarantees it is removed.
    df[column] = df[t[0]].astype(str) + "_" + df[t[1]].astype(str)
    try:
        # The key is a string, and the built-in hash() of a str is salted per
        # interpreter process, which made the bucket layout change on every
        # kernel restart.  hash_pandas_object is deterministic, so re-runs
        # reproduce the same files.
        hashed = pd.util.hash_pandas_object(df[column], index=False)
        bucket_ids = (hashed % npartitions).astype('int64').to_numpy()

        # Grouping by a plain array (matched to rows by position) avoids adding
        # a helper 'npartition' column to the caller's frame.
        for bucket, chunk in df.groupby(bucket_ids):
            chunk.to_hdf(f"{directory}/train_{bucket}.hdf.compress", key='train', mode='w', complib='blosc', fletcher32=True, complevel=9)
    finally:
        del df[column]

def partition_by3(df, t, npartitions):
    """Partition ``df`` by the combination of three columns and write one HDF file per bucket.

    A composite string key ``"<t[0] value>_<t[1] value>_<t[2] value>"`` is
    built for every row and stored in an extra column named
    ``f"{t[0]}_{t[1]}_{t[2]}"``.  Rows are assigned to ``npartitions`` buckets
    by a hash of that key, so all rows sharing a key are written to the same
    file, ``{output_dir}/{t[0]}_{t[1]}_{t[2]}/train_{bucket}.hdf.compress``.
    The written frames contain the columns of ``df`` plus the composite key
    column.  Only non-empty buckets produce a file; existing files are
    overwritten (``mode='w'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to partition.  The composite key column is added temporarily and
        removed again before returning, also when writing fails.
    t : sequence of str
        The three column names to combine (``t[0]``, ``t[1]`` and ``t[2]``).
    npartitions : int
        Number of hash buckets.

    Returns
    -------
    None
    """
    column = f"{t[0]}_{t[1]}_{t[2]}"
    directory = f"{output_dir}/{column}"
    # exist_ok=True replaces the separate os.path.exists() test and its
    # check-then-create race.
    os.makedirs(directory, exist_ok=True)

    # The key is stored on `df` itself (as before) so that no second copy of
    # the whole frame is needed; the finally-block guarantees it is removed.
    df[column] = df[t[0]].astype(str) + "_" + df[t[1]].astype(str) + "_" + df[t[2]].astype(str)
    try:
        # The key is a string, and the built-in hash() of a str is salted per
        # interpreter process, which made the bucket layout change on every
        # kernel restart.  hash_pandas_object is deterministic, so re-runs
        # reproduce the same files.
        hashed = pd.util.hash_pandas_object(df[column], index=False)
        bucket_ids = (hashed % npartitions).astype('int64').to_numpy()

        # Grouping by a plain array (matched to rows by position) avoids adding
        # a helper 'npartition' column to the caller's frame.
        for bucket, chunk in df.groupby(bucket_ids):
            chunk.to_hdf(f"{directory}/train_{bucket}.hdf.compress", key='train', mode='w', complib='blosc', fletcher32=True, complevel=9)
    finally:
        del df[column]

In [3]:
npartitions = 128
target_entities = ['app', 'device', 'os', 'channel', 'ip', 'hour']

# Load the full frame and derive the hour of each click from its timestamp.
df = pd.read_hdf(input_file)
df['hour'] = df['click_time'].dt.hour

# One entry per combination size: (size, entity whose combinations are skipped,
# function that writes the partitions).  Single columns skip 'device'; pairs
# and triples skip everything that involves 'ip'.
partition_plan = [
    (1, 'device', lambda frame, combo, n: partition_by(frame, combo[0], n)),
    (2, 'ip', partition_by2),
    (3, 'ip', partition_by3),
]

for size, skipped_entity, write_partitions in partition_plan:
    for t in combinations(target_entities, size):
        if skipped_entity in t:
            continue
        print(f"Processing {t}")
        write_partitions(df, t, npartitions)
        # Release temporaries of the previous pass before starting the next one.
        gc.collect()

Processing ('app',)
Processing ('os',)
Processing ('channel',)
Processing ('ip',)
Processing ('hour',)
Processing ('app', 'device')
Processing ('app', 'os')
Processing ('app', 'channel')
Processing ('app', 'hour')
Processing ('device', 'os')
Processing ('device', 'channel')
Processing ('device', 'hour')
Processing ('os', 'channel')
Processing ('os', 'hour')
Processing ('channel', 'hour')
Processing ('app', 'device', 'os')
Processing ('app', 'device', 'channel')
Processing ('app', 'device', 'hour')
Processing ('app', 'os', 'channel')
Processing ('app', 'os', 'hour')
Processing ('app', 'channel', 'hour')
Processing ('device', 'os', 'channel')
Processing ('device', 'os', 'hour')
Processing ('device', 'channel', 'hour')
Processing ('os', 'channel', 'hour')


In [6]:
# Sanity check: number of rows stored in bucket 0 of a 'channel_ip' partitioning.
# NOTE(review): this path points at "partitioned2", not at `output_dir`
# ("partitioned_all"), and the pair/triple loops in this notebook skip every
# combination containing 'ip' -- so this file appears to come from a different
# run.  Confirm the path is the one intended.
len(pd.read_hdf('../data/interim/partitioned2/channel_ip/train_0.hdf.compress'))

8