# Todos
1. Create Empty HDF5 File
2. Read in Previous Orders
    - Split Between Train and Test
    - For Train Write out the Xs
    - For Test Write out the Xs
3. Read in the Dependent Variables
    - Split Between Train and Test
    - For Train Write out the order_month and the Ys
    - For Test Write out the order_month and the Ys

In [1]:
import numpy as np
import h5py
import pandas as pd
import matplotlib.pyplot as plt

# Widen pandas' display limits so long/wide frames print in full.
# Both keys are fully qualified on purpose: the bare pattern 'max_columns'
# also matches 'styler.render.max_columns' on pandas >= 1.4, which makes
# set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)

# Create Empty HDF5 File

In [2]:
# One HDF5 output file per split.  Mode 'w' creates the file or truncates an
# existing one.  Both handles stay open until the explicit close() calls at
# the end of the notebook.
ord_hist_train = h5py.File('data/order_history_train.h5', mode='w')
ord_hist_test = h5py.File('data/order_history_test.h5', mode='w')

# Save Previous Orders

In [4]:
# Load the pickled previous-order history and discard the 'ordno_current',
# 'months_before' and 'days_before' columns in one expression.
final_pd = pd.read_pickle('data/final_pd_previous_orders.pkl').drop(
    columns=['ordno_current', 'months_before', 'days_before'],
)

#### Training

In [5]:
# Training split: rows whose orderdate is on or before 2018-09-01.  The
# explicit copy keeps the column deletions that follow from touching final_pd.
train_pd = final_pd.loc[
    final_pd['orderdate'].le(pd.to_datetime('2018-09-1'))
].copy()

In [6]:
# Remove the date and order-number columns from the training frame in place.
# final_pd itself is deliberately kept: the test split below still reads it.
train_pd.drop(columns=['orderdate', 'ordno'], inplace=True)

In [7]:
# Build the training order-history array: drop 'ordermonth' and the six
# P1:* columns, then fold the flat rows into shape (-1, 14, 253).
# NOTE(review): assumes each order contributes 14 consecutive rows of 253
# remaining columns -- confirm against the upstream pickle.
train_X_orders = (
    train_pd
    .drop(columns=['ordermonth', 'P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0'])
    .values
    .reshape(-1, 14, 253)
)

In [8]:
# Persist the training order-history array under the key 'train_X_orders'.
ord_hist_train.create_dataset('train_X_orders', data=train_X_orders)

<HDF5 dataset "train_X_orders": shape (135889, 14, 253), type "<f8">

#### Test

In [9]:
# Test split: rows whose orderdate is strictly after 2018-09-01 (the
# complement of the training split).  Copied so later deletions stay local.
test_pd = final_pd.loc[
    final_pd['orderdate'].gt(pd.to_datetime('2018-09-1'))
].copy()

In [10]:
# Remove the date and order-number columns from the test frame in place.
test_pd.drop(columns=['orderdate', 'ordno'], inplace=True)

In [11]:
# Build the test order-history array the same way as the training one: drop
# 'ordermonth' and the six P1:* columns, then reshape to (-1, 14, 253).
# NOTE(review): assumes each order contributes 14 consecutive rows of 253
# remaining columns -- confirm against the upstream pickle.
test_X_orders = (
    test_pd
    .drop(columns=['ordermonth', 'P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0'])
    .values
    .reshape(-1, 14, 253)
)

In [12]:
# Persist the test order-history array under the key 'test_X_orders'.
ord_hist_test.create_dataset('test_X_orders', data=test_X_orders)

<HDF5 dataset "test_X_orders": shape (13828, 14, 253), type "<f8">

# Save Dependent Variables

In [13]:
# Load the pickled dependent variables.  The cells below read its 'ordno',
# 'orderdate', 'ordermonth' and six 'P1:*' columns.
dependent_vars = pd.read_pickle('data/dependent_vars_previous_orders.pkl')

In [14]:
# Sanity check: the dependent variables must cover exactly the same orders as
# the order history.  The sets themselves are compared, not just their sizes,
# so two frames holding equally many but *different* order numbers no longer
# slip through.
assert set(dependent_vars['ordno']) == set(final_pd['ordno'])

#### Convert Orders From Counts to Binary

In [15]:
# Collapse the six P1:* order counts to 0/1 flags: 1 where the count is
# positive, 0 otherwise.
dependent_vars[
    ['P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0']
] = dependent_vars[
    ['P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0']
].gt(0).astype(int)

#### Training

In [16]:
# Training rows of the dependent variables: orderdate on or before 2018-09-01,
# the same cutoff used for the order-history split.
train_dependent_pd = dependent_vars[dependent_vars['orderdate']<= pd.to_datetime('2018-09-1')]

In [17]:
# Order month reshaped into a single-column 2-D array.
train_X_order_month = train_dependent_pd['ordermonth'].values.reshape((-1,1))

In [18]:
# The six binarized P1:* columns are the training targets.
train_Y_order = train_dependent_pd[
    ['P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0']
].values

In [19]:
# NOTE(review): these rows are presumably in the same order as train_X_orders;
# nothing in this notebook verifies that alignment -- confirm upstream.
ord_hist_train.create_dataset('train_X_order_month', data=train_X_order_month)

<HDF5 dataset "train_X_order_month": shape (135889, 1), type "<i8">

In [20]:
ord_hist_train.create_dataset('train_Y_order', data=train_Y_order)

<HDF5 dataset "train_Y_order": shape (135889, 6), type "<i8">

#### Test

In [21]:
# Test rows of the dependent variables: orderdate strictly after 2018-09-01.
test_dependent_pd = dependent_vars[dependent_vars['orderdate']> pd.to_datetime('2018-09-1')]

In [22]:
# Order month reshaped into a single-column 2-D array.
test_X_order_month = test_dependent_pd['ordermonth'].values.reshape((-1,1))

In [23]:
# The six binarized P1:* columns are the test targets.
test_Y_order = test_dependent_pd[['P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0']].values

In [24]:
# NOTE(review): these rows are presumably in the same order as test_X_orders;
# nothing in this notebook verifies that alignment -- confirm upstream.
ord_hist_test.create_dataset('test_X_order_month', data=test_X_order_month)

<HDF5 dataset "test_X_order_month": shape (13828, 1), type "<i8">

In [25]:
ord_hist_test.create_dataset('test_Y_order', data=test_Y_order)

<HDF5 dataset "test_Y_order": shape (13828, 6), type "<i8">

# Close the HDF5 File

In [26]:
# Close the training file handle opened at the top of the notebook.
ord_hist_train.close()

In [27]:
# Close the test file handle as well.
ord_hist_test.close()

# Junk Below

## Split Between Train and Test
- X_Orders.shape = (-1, 14, 254)
- X_Month.shape = (-1,1)
- Y.shape = (-1, 6)
- X_Events.shape = ()


# Preprocessing

## Create Training Dataset

In [None]:
# Scratch draft of the training split.  Unlike the pipeline cells above it
# takes no .copy() and removes no columns from train_pd.
train_pd = final_pd[final_pd['orderdate']<= pd.to_datetime('2018-09-1')]
train_dependent_pd = dependent_vars[dependent_vars['orderdate']<= pd.to_datetime('2018-09-1')]

In [None]:
train_pd.shape

In [None]:
# Only the six P1:* columns are dropped here; 'ordermonth', 'orderdate' and
# 'ordno' are still part of the array (the pipeline version drops them).
train_X_orders = train_pd.drop(
    ['P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0'],
    axis=1
).values

In [None]:
# NOTE(review): width 254 differs from the 253 used by the pipeline cells --
# confirm which column set this draft expected.
train_X_orders = train_X_orders.reshape((-1,14, 254))

In [None]:
train_X_order_month = train_dependent_pd['ordermonth'].values.reshape((-1,1))

In [None]:
# NOTE(review): targets are read from train_pd here, whereas the pipeline
# cells read them from the dependent-variables frame.
train_Y_order = train_pd[['P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0']].values

## Create Test Dataset

In [None]:
# Scratch draft of the test split (no .copy(), so test_pd is a filtered
# selection of final_pd).
test_pd = final_pd[final_pd['orderdate']> pd.to_datetime('2018-09-1')]
test_dependent_pd = dependent_vars[dependent_vars['orderdate']> pd.to_datetime('2018-09-1')]

In [None]:
# NOTE(review): deleting a column from an un-copied selection may trigger
# pandas' SettingWithCopyWarning; the pipeline version copies first.
del(test_pd['orderdate'])

In [None]:
del(test_pd['ordno'])

In [None]:
# drop() returns a new frame, so test_pd keeps its P1:* columns for the
# target extraction further down.
test_X_orders = test_pd.drop(
    ['ordermonth','P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0'],
    axis=1
).values

test_X_orders = test_X_orders.reshape((-1,14, 253))

In [None]:
test_X_order_month = test_dependent_pd['ordermonth'].values.reshape((-1,1))

In [None]:
# NOTE(review): targets are read from test_pd here, whereas the pipeline
# cells read them from the dependent-variables frame.
test_Y_order = test_pd[['P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0']].values

In [None]:
test_Y_order

#### Get Train Data

In [None]:
# Scratch: another draft of the training extraction.
train_pd = final_pd[final_pd['orderdate']<= pd.to_datetime('2018-09-1')]

In [None]:
# Only the P1:* columns are dropped before the reshape; 'orderdate' is
# deleted from train_pd in the next cell, i.e. after this array is built.
train_X = train_pd.drop(
    ['P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0'],
    axis=1
).values

train_X = train_X.reshape((-1,14, 254))

In [None]:
del(train_pd['orderdate'])

In [None]:
X = final_pd.drop(
    ['P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0'],
    axis=1
).values

In [None]:
# NOTE(review): this binds the *shape tuple* to Y, not the values; X above
# uses .values, so .values was presumably intended here -- confirm.
Y = final_pd[['P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0']].shape

In [None]:
# modeling_dataset is not defined anywhere in this view.
ordno_df = modeling_dataset[['ordno']].copy()
ordno_df['key'] = 1

In [None]:
# all_prev_orders_clean is not defined anywhere in this view.
all_prev_orders_clean.head()

# Junk Below

#### Previous Order Count

In [None]:
# Display cell; full_prev_order_count is not defined anywhere in this view.
full_prev_order_count

#### Get Purchase Counts

In [None]:
def get_order_count_sum(all_previous_orders, min_b4, max_b4):
    """Sum previous-order columns per current order inside a look-back window.

    Rows whose ``days_before`` lies in the half-open interval
    ``(min_b4, max_b4]`` are kept, the ``days_before`` column is removed and
    every remaining column is summed per ``ordno_current``.

    The upper bound is inclusive so that adjacent windows such as (0, 30) and
    (30, 350) tile the timeline without gaps.  With both bounds exclusive, a
    row sitting exactly on a shared edge (``days_before == 30``) was counted
    in no window at all.

    Args:
        all_previous_orders: DataFrame with an ``ordno_current`` column, a
            ``days_before`` column and the columns to be summed.  It is not
            modified.
        min_b4: Exclusive lower bound on ``days_before``.
        max_b4: Inclusive upper bound on ``days_before``.

    Returns:
        DataFrame with one row per ``ordno_current`` present in the window.
        The first column is ``ordno_current``; every other column is renamed
        to ``<original>_<min_b4>_<max_b4>``.
    """
    days_before = all_previous_orders['days_before']
    in_window = (days_before > min_b4) & (days_before <= max_b4)

    # .drop() returns a new frame, so nothing is deleted from a filtered
    # selection of the caller's DataFrame.
    relevant_previous_orders = all_previous_orders.loc[in_window].drop(
        columns='days_before'
    )

    order_count_sum = relevant_previous_orders.groupby(
        'ordno_current', as_index=False
    ).sum()

    # With as_index=False the grouping key is the first column; tag every
    # other column with the window it was aggregated over.
    window_suffix = f'_{min_b4}_{max_b4}'
    order_count_sum.columns = ['ordno_current'] + [
        f'{col}{window_suffix}' for col in order_count_sum.columns[1:]
    ]

    return order_count_sum

In [None]:
# Aggregate previous orders over the three look-back windows (in days):
# 0-30, 30-350 and 350-380.
order_count_sum_0_30, order_count_sum_30_350, order_count_sum_350_380 = (
    get_order_count_sum(all_prev_orders_clean, lower, upper)
    for lower, upper in ((0, 30), (30, 350), (350, 380))
)

In [None]:
order_count_sum_0_30.head()

In [None]:
# Each window table must hold at most one row per order number: the number of
# distinct keys has to equal the row count.
assert len(set(order_count_sum_0_30['ordno_current'])) == order_count_sum_0_30.shape[0]
assert len(set(order_count_sum_30_350['ordno_current'])) == order_count_sum_30_350.shape[0]
assert len(set(order_count_sum_350_380['ordno_current'])) == order_count_sum_350_380.shape[0]

# Same uniqueness requirement for the base table the windows are joined onto.
assert len(set(modeling_dataset['ordno'])) == modeling_dataset.shape[0]

In [None]:
modeling_dataset.head()

In [None]:
# Left-join the three order-count windows onto the modeling dataset, innermost
# merge first.  The merge order is significant: the repeated 'ordno_current'
# key comes out as 'ordno_current_x', 'ordno_current_y' and 'ordno_current',
# which are the names the clean-up cell later drops.
modeling_dataset_orders = pd.merge(
    pd.merge(
        pd.merge(
            modeling_dataset,
            order_count_sum_0_30,
            how='left', left_on='ordno', right_on='ordno_current',
        ),
        order_count_sum_30_350,
        how='left', left_on='ordno', right_on='ordno_current',
    ),
    order_count_sum_350_380,
    how='left', left_on='ordno', right_on='ordno_current',
)

In [None]:
# Ensure all Orders are stored
assert modeling_dataset.shape[0]==modeling_dataset_orders.shape[0]

In [None]:
assert len(set(modeling_dataset_orders['ordno'])) == modeling_dataset_orders.shape[0]

In [None]:
modeling_dataset_orders.head()

# Clean Online Data
- Aggregate to Session Level

#### Read in Data

In [None]:
# Read in Data
online = pd.read_csv('data/online.csv')

# Convert to Datetime
online['dt'] = pd.to_datetime(online['dt'])

# Fill in Missing Values with Dummie Value
online['event1'] = online['event1'].fillna(-1)

# Give clear category name for when it's split out to all the columns
online['event1'] = online[['event1']].apply(lambda row: rename_categories('E1',row[0]), axis=1)
online['event2'] = online[['event2']].apply(lambda row: rename_categories('E2',row[0]), axis=1)
online['category'] = online[['category']].apply(lambda row: rename_categories('Cat',row[0]), axis=1)

#### Split Out Events

In [None]:
# One indicator column per distinct value of each renamed categorical column.
e1_dummies = pd.get_dummies(online['event1'])
e2_dummies = pd.get_dummies(online['event2'])
cat_dummies = pd.get_dummies(online['category'])

In [None]:
# Names of all indicator columns; used to build the aggregation dictionary.
event_vars = list(e1_dummies.columns) + list(e2_dummies.columns) + list(cat_dummies.columns)

In [None]:
# Append the indicator columns to the original event rows (column-wise).
online_expanded = pd.concat([online, e1_dummies, e2_dummies, cat_dummies], axis=1)

In [None]:
online_expanded.head()

In [None]:
# Create Dictionary for Aggregation
agg_dict = {}
for event_var in event_vars:
    agg_dict[event_var] = 'sum'
agg_dict['dt'] = 'min'

In [None]:
# Collapse the event rows to one row per (custno, session) using agg_dict,
# then expose the aggregated 'dt' column under the name 'start_time'.
# 'dt' is the last key of agg_dict, so renaming it in place leaves the column
# order the same as appending 'start_time' and deleting 'dt'.
online_sessions = (
    online_expanded
    .groupby(['custno', 'session'], as_index=False)
    .agg(agg_dict)
    .copy()
    .rename(columns={'dt': 'start_time'})
)

In [None]:
online_sessions.head().T

## Create Features

In [None]:
# Keep only the columns needed to pair each order with its customer's sessions.
orders_and_dates = modeling_dataset_orders[['ordno','custno','orderdate']]

In [None]:
# Pair every order with every session of the same customer (inner join on
# custno), then keep only the sessions that started before the order date.
orders_and_sessions = orders_and_dates.merge(online_sessions, on='custno').copy()

orders_and_sessions = orders_and_sessions.loc[
    orders_and_sessions['start_time'] < orders_and_sessions['orderdate']
]

In [None]:
# Age of each session relative to its order, in fractional days.
# Dividing the timedelta by a one-day Timedelta yields plain floats on every
# pandas version.  The previous `.astype('timedelta64[s]') / 3600 / 24` only
# did so before pandas 2.0; from 2.0 on the cast keeps a timedelta dtype, so
# days_before would not be numeric for the window comparisons downstream.
# (Sub-second precision is now kept rather than truncated to whole seconds.)
# .assign() builds a new frame instead of writing a column into the filtered
# selection, which avoids SettingWithCopyWarning.
orders_and_sessions = orders_and_sessions.assign(
    days_before=(
        orders_and_sessions['orderdate'] - orders_and_sessions['start_time']
    ) / pd.Timedelta(days=1)
)

In [None]:
orders_and_sessions.columns.values

In [None]:
orders_and_sessions.head(5).T

In [None]:
# Keep the order key, the session age and a hard-coded list of event indicator
# columns.  NOTE(review): the names below must match what get_dummies produced
# for the current data -- a missing label raises KeyError.
orders_and_sessions_clean = orders_and_sessions[
    [
        'ordno','days_before',
        'E1:-1.0', 'E1:1.0', 'E1:2.0', 'E1:4.0', 'E1:5.0', 'E1:6.0',
        'E1:7.0', 'E1:8.0', 'E1:9.0', 'E1:10.0', 'E1:11.0',
        
        'E2:1', 'E2:2', 'E2:3','E2:4', 'E2:5',
        'E2:6', 'E2:7', 'E2:8', 'E2:9', 'E2:10', 
        
        'Cat:1', 'Cat:2', 'Cat:3', 
    ]
]

In [None]:
def get_session_event_sum(all_previous_sessions, min_b4, max_b4):
    """Sum session event columns per order inside a look-back window.

    Rows whose ``days_before`` lies in the half-open interval
    ``(min_b4, max_b4]`` are kept, the ``days_before`` column is removed and
    every remaining column is summed per ``ordno``.

    The upper bound is inclusive so that adjacent windows such as (0, 30) and
    (30, 350) tile the timeline without gaps.  With both bounds exclusive, a
    session sitting exactly on a shared edge was counted in no window at all.

    Args:
        all_previous_sessions: DataFrame with an ``ordno`` column, a
            ``days_before`` column and the event columns to be summed.  It is
            not modified.
        min_b4: Exclusive lower bound on ``days_before``.
        max_b4: Inclusive upper bound on ``days_before``.

    Returns:
        DataFrame with one row per ``ordno`` present in the window.  The first
        column is ``ordno``; every other column is renamed to
        ``<original>_<min_b4>_<max_b4>``.
    """
    days_before = all_previous_sessions['days_before']
    in_window = (days_before > min_b4) & (days_before <= max_b4)

    # .drop() returns a new frame, so nothing is deleted from a filtered
    # selection of the caller's DataFrame.
    relevant_previous_sessions = all_previous_sessions.loc[in_window].drop(
        columns='days_before'
    )

    session_event_sum = relevant_previous_sessions.groupby(
        'ordno', as_index=False
    ).sum()

    # With as_index=False the grouping key is the first column; tag every
    # other column with the window it was aggregated over.
    window_suffix = f'_{min_b4}_{max_b4}'
    session_event_sum.columns = ['ordno'] + [
        f'{col}{window_suffix}' for col in session_event_sum.columns[1:]
    ]

    return session_event_sum

In [None]:
# Aggregate session events over the same three look-back windows (in days)
# used for the order counts: 0-30, 30-350 and 350-380.
session_event_sum_0_30, session_event_sum_30_350, session_event_sum_350_380 = (
    get_session_event_sum(orders_and_sessions_clean, lower, upper)
    for lower, upper in ((0, 30), (30, 350), (350, 380))
)

In [None]:
session_event_sum_0_30

In [None]:
# Each window table must hold at most one row per order number: the number of
# distinct keys has to equal the row count.
assert len(set(session_event_sum_0_30['ordno'])) == session_event_sum_0_30.shape[0]
assert len(set(session_event_sum_30_350['ordno'])) == session_event_sum_30_350.shape[0]
assert len(set(session_event_sum_350_380['ordno'])) == session_event_sum_350_380.shape[0]

# Same uniqueness requirement for the base modeling table.
assert len(set(modeling_dataset['ordno'])) == modeling_dataset.shape[0]

In [None]:
# Left-join the three session-event windows onto the order-level dataset,
# innermost merge first, always keyed on 'ordno'.
modeling_dataset_full = pd.merge(
    pd.merge(
        pd.merge(
            modeling_dataset_orders,
            session_event_sum_0_30,
            how='left', on='ordno',
        ),
        session_event_sum_30_350,
        how='left', on='ordno',
    ),
    session_event_sum_350_380,
    how='left', on='ordno',
)

In [None]:
modeling_dataset_full.shape[0]

In [None]:
modeling_dataset.shape[0]

In [None]:
# The left joins must neither drop nor duplicate orders.
assert modeling_dataset_full.shape[0] == modeling_dataset.shape[0]

In [None]:
# order_totals is not defined anywhere in this view.
assert modeling_dataset_full.shape[0] == order_totals.shape[0]

In [None]:
# Order numbers must still be unique after all joins.
assert len(set(modeling_dataset_full['ordno'])) == modeling_dataset_full.shape[0]

# Clean Up Final Modeling Dataset

#### Drop Redundant Variables

In [None]:
# Join-key copies left behind by the three order-count merges (the repeated
# 'ordno_current' right key picks up the _x / _y suffixes on the second merge).
drop_vars = [
    'ordno_current_x', 'ordno_current_y', 'ordno_current'
]

In [None]:
modeling_dataset_final = modeling_dataset_full.drop(drop_vars, axis=1)

#### Filling Missing Counts with 0s

In [None]:
# Orders with no match in a window got NaN from the left joins; treat those as
# zero counts.  Note fillna(0) is applied to every column of the frame.
modeling_dataset_final = modeling_dataset_final.fillna(0).copy()

# Split into Training and Test

In [None]:
# Share of orders dated after 2018-12-01.  The mean of the boolean mask is the
# same fraction as sum(mask) / row count, but is computed vectorized instead
# of iterating the Series element by element in Python.
(modeling_dataset_final['orderdate'] > pd.to_datetime('2018-12-01')).mean()

# Create Features

In [None]:
online.shape

In [None]:
# NOTE(review): custno_join is only created in the last cell below, so this
# cell depends on out-of-order notebook execution.
# Counts order/event pairs where the event precedes the order date.
custno_join[
    (custno_join['orderdate']>custno_join['dt'])
].shape

In [None]:
# Re-read the raw online events and parse the timestamp column.
online = pd.read_csv('data/online.csv')
online['dt'] = pd.to_datetime(online['dt'])

In [None]:
# Read the raw orders and parse the order date column.
order = pd.read_csv('data/order.csv')
order['orderdate'] = pd.to_datetime(order['orderdate'])

In [None]:
online.head()

In [None]:
order.head()

In [None]:
# Left join: every order paired with every online event of the same customer;
# orders without events are kept with missing event columns.
custno_join = pd.merge(order,online,how='left',on=['custno'])