In [39]:
import pandas as pd
import numpy as np
import os

# Default input directory for make_instacart_small: the full-dataset CSVs are read from here.
# The trailing slash matters, because file names are appended to this string directly.
IC_DATA_DIR = './data/instacart_2017_05_01/'
# Default output directory for make_instacart_small: the reduced CSVs are written here.
SMALL_IC_DATA_DIR = './data/instacart_2017_05_01_small/'

In [46]:
def _print_shapes(heading, named_frames):
    """Print ``heading`` followed by one ``<name>:  <shape>`` line per dataframe."""
    print(heading)
    for label, frame in named_frames:
        print(label + ": ", frame.shape)


def make_instacart_small(user_frac=0.01, data_dir=IC_DATA_DIR,
                         output_dir=SMALL_IC_DATA_DIR, verbose=0, seed=None):
    """
    Creates a smaller version of the instacart dataset for faster testing.

    A random subset of users is drawn (without replacement) and every table is
    filtered down to the rows reachable from those users: their orders, the
    prior/train order lines of those orders, the products appearing in those
    lines, and the aisles/departments of those products. The six filtered
    tables are written as CSV files (same file names as the input) into
    ``output_dir``.

    :param user_frac: fraction of users to keep, must lie in [0, 1]; the number
        of users kept is ``int(n_users * user_frac)``, so a very small fraction
        can select zero users and produce header-only CSV files
    :param data_dir: input data directory (with or without trailing separator)
    :param output_dir: output directory for small dataset; created, including
        missing parent directories, if it does not exist
    :param verbose: 0 prints nothing, 1 prints the shapes of the reduced
        tables, 2 additionally prints the shapes of the original tables
    :param seed: optional integer seed for the user sample; when None (default)
        numpy's global random state is used, as before

    :raises ValueError: if ``user_frac`` is outside [0, 1]

    :return: None
    """

    # fail fast: checking the fraction before the (large) CSV reads avoids
    # waiting for the load only to have the sampling step reject the size
    if not 0 <= user_frac <= 1:
        raise ValueError("user_frac must be between 0 and 1, got %r" % (user_frac,))

    # load dataframes for full dataset; os.path.join makes the trailing
    # separator on data_dir optional
    priors = pd.read_csv(os.path.join(data_dir, 'order_products__prior.csv'))
    train = pd.read_csv(os.path.join(data_dir, 'order_products__train.csv'))
    orders = pd.read_csv(os.path.join(data_dir, 'orders.csv'))
    products = pd.read_csv(os.path.join(data_dir, 'products.csv'))
    aisles = pd.read_csv(os.path.join(data_dir, 'aisles.csv'))
    departments = pd.read_csv(os.path.join(data_dir, 'departments.csv'))

    if verbose > 1:
        _print_shapes("\noriginal sizes:", [
            ("priors", priors),
            ("train", train),
            ("orders", orders),
            ("products", products),
            ("aisles", aisles),
            ("departments", departments),
        ])

    # get list of user ids and select subset; a dedicated RandomState is used
    # only when a seed is given so unseeded calls keep drawing from the global state
    all_user_ids = orders.user_id.unique()
    n_keep = int(len(all_user_ids) * user_frac)
    sampler = np.random if seed is None else np.random.RandomState(seed)
    user_ids = sampler.choice(all_user_ids, n_keep, replace=False)

    # get list of order ids corresponding to selected users, create small orders dataframe
    orders_small = orders[orders.user_id.isin(user_ids)]
    order_ids = orders_small.order_id.unique()

    # create small prior and train dataframes
    priors_small = priors[priors.order_id.isin(order_ids)]
    train_small = train[train.order_id.isin(order_ids)]

    # get list of all products occurring in selected orders, create small product dataframe
    product_ids = pd.concat([priors_small.product_id, train_small.product_id]).unique()
    products_small = products[products.product_id.isin(product_ids)]

    # get list of aisle and department ids occurring in products, create corresponding small dataframes
    aisle_ids = products_small.aisle_id.unique()
    aisles_small = aisles[aisles.aisle_id.isin(aisle_ids)]
    department_ids = products_small.department_id.unique()
    departments_small = departments[departments.department_id.isin(department_ids)]

    if verbose > 0:
        _print_shapes("\nnew sizes:", [
            ("priors", priors_small),
            ("train", train_small),
            ("orders", orders_small),
            ("products", products_small),
            ("aisles", aisles_small),
            ("departments", departments_small),
        ])

    # makedirs also creates missing parents and tolerates an existing directory
    os.makedirs(output_dir, exist_ok=True)

    # write each reduced table under the same file name it was read from
    outputs = [
        ('order_products__prior.csv', priors_small),
        ('order_products__train.csv', train_small),
        ('orders.csv', orders_small),
        ('products.csv', products_small),
        ('aisles.csv', aisles_small),
        ('departments.csv', departments_small),
    ]
    for filename, frame in outputs:
        frame.to_csv(os.path.join(output_dir, filename), index=False)

In [47]:
# "micro" sample: keep 0.01% of users; verbose=2 also prints the original table sizes
make_instacart_small(
    user_frac=0.0001,
    output_dir='./data/instacart_2017_05_01_micro/',
    verbose=2,
)


original sizes:
priors:  (32434489, 4)
train:  (1384617, 4)
orders:  (3421083, 7)
products:  (49688, 4)
aisles:  (134, 2)
departments:  (21, 2)

new sizes:
priors:  (2052, 4)
train:  (122, 4)
orders:  (225, 7)
products:  (972, 4)
aisles:  (115, 2)
departments:  (20, 2)


In [48]:
# "tiny" sample: keep 0.1% of users; verbose=1 prints only the reduced table sizes
make_instacart_small(
    user_frac=0.001,
    output_dir='./data/instacart_2017_05_01_tiny/',
    verbose=1,
)


new sizes:
priors:  (29653, 4)
train:  (1379, 4)
orders:  (3068, 7)
products:  (6126, 4)
aisles:  (134, 2)
departments:  (21, 2)


In [49]:
# "small" sample: keep 1% of users; verbose=1 prints only the reduced table sizes
make_instacart_small(
    user_frac=0.01,
    output_dir='./data/instacart_2017_05_01_small/',
    verbose=1,
)


new sizes:
priors:  (320909, 4)
train:  (13182, 4)
orders:  (34773, 7)
products:  (21726, 4)
aisles:  (134, 2)
departments:  (21, 2)


In [50]:
# "medium" sample: keep 10% of users; verbose=1 prints only the reduced table sizes
make_instacart_small(
    user_frac=0.1,
    output_dir='./data/instacart_2017_05_01_medium/',
    verbose=1,
)


new sizes:
priors:  (3268549, 4)
train:  (140223, 4)
orders:  (343116, 7)
products:  (41851, 4)
aisles:  (134, 2)
departments:  (21, 2)
