# _Create Raw Modeling File:_ 
In this section I created a raw version of the modeling file that was eventually used to predict the category of product a user would purchase next. It resulted in the following datasets:
 - final_previous_order_df - Matches each order with the counts of all previous orders made by that customer over the last 14 months.
 - dependent_vars - Contains the raw dependent variable (prodcat1 purchases) as well as the month of purchase.
 - previous_online_sessions_by_week - Contains the counts of all previous online session events made by that customer over the last 53 weeks.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Widen the notebook display limits so long/wide frames are not truncated.
pd.set_option('display.max_rows', 1000)
# Use the fully-qualified option key: the bare 'max_columns' pattern also
# matches 'styler.render.max_columns' in pandas >= 1.4, which makes
# set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 100)

In [2]:
from dataset_creation import (
    get_order_total_at_order_level, get_previous_orders_by_month, 
    get_previous_online_sessions_by_week, get_online_sessions_df
)

# Get Order Data

In [3]:
# Load the raw order data; the path is relative to the working directory.
order = pd.read_csv('data/order.csv')

#### Aggregate purchase counts to order level

In [4]:
# Aggregate the raw order rows to the order level with the project helper.
# Later cells read 'ordno', 'orderdate', 'ordermonth' and 'P1:*' columns from
# the result. NOTE(review): the full output schema lives in dataset_creation.
order_totals = get_order_total_at_order_level(order)

#### Filtering to orders that occurred during the period for which we have online data

In [5]:
# Keep only orders dated in the half-open window [2016-01-01, 2018-01-01),
# i.e. the period for which online data is available.
window_start = pd.to_datetime('2016-01-01')
window_end = pd.to_datetime('2018-01-01')
in_window = (
    (order_totals['orderdate'] >= window_start)
    & (order_totals['orderdate'] < window_end)
)
# .copy() detaches the filtered frame from the original.
order_totals = order_totals.loc[in_window].copy()

In [6]:
# Inspect (rows, columns) of the date-filtered order-level frame.
order_totals.shape

(102915, 263)

#### Create new dataset unique at the order and purchase level

In [7]:
# Build the previous-order history table with the project helper.
# 14 is the look-back window in months (see the notebook introduction).
final_previous_order_df = get_previous_orders_by_month(order_totals, 14)

In [8]:
# Preview the first rows of the previous-order history table.
final_previous_order_df.head()

Unnamed: 0,ordno,orderdate,months_before,ordno_current,days_before,P2:-1.0,P2:-7.0,P2:10.0,P2:100.0,P2:101.0,P2:102.0,P2:103.0,P2:104.0,P2:105.0,P2:106.0,P2:107.0,P2:108.0,P2:109.0,P2:11.0,P2:110.0,P2:111.0,P2:112.0,P2:113.0,P2:114.0,P2:115.0,P2:116.0,P2:117.0,P2:118.0,P2:119.0,P2:12.0,P2:120.0,P2:121.0,P2:122.0,P2:123.0,P2:124.0,P2:125.0,P2:126.0,P2:127.0,P2:128.0,P2:129.0,P2:13.0,P2:130.0,P2:131.0,P2:132.0,P2:133.0,P2:134.0,P2:135.0,P2:136.0,P2:137.0,P2:138.0,...,P2:51.0,P2:52.0,P2:53.0,P2:54.0,P2:55.0,P2:56.0,P2:57.0,P2:58.0,P2:59.0,P2:6.0,P2:60.0,P2:61.0,P2:62.0,P2:63.0,P2:64.0,P2:65.0,P2:66.0,P2:67.0,P2:69.0,P2:7.0,P2:70.0,P2:71.0,P2:72.0,P2:73.0,P2:74.0,P2:75.0,P2:76.0,P2:77.0,P2:78.0,P2:79.0,P2:8.0,P2:80.0,P2:81.0,P2:82.0,P2:83.0,P2:85.0,P2:86.0,P2:88.0,P2:89.0,P2:9.0,P2:90.0,P2:91.0,P2:92.0,P2:93.0,P2:94.0,P2:95.0,P2:96.0,P2:97.0,P2:98.0,P2:99.0
0,1,2017-06-12 08:27:59,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2017-06-12 08:27:59,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2017-06-12 08:27:59,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,2017-06-12 08:27:59,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,2017-06-12 08:27:59,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Clean up data before saving

In [9]:
# Raw dependent variables: order identifiers/dates plus the 'P1:*' columns,
# ordered by order number.
id_and_date_cols = ['ordno', 'orderdate', 'ordermonth']
prodcat1_cols = ['P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0']
dependent_vars = (
    order_totals.loc[:, id_and_date_cols + prodcat1_cols]
    .copy()
    .sort_values(by='ordno')
)

In [10]:
# Inspect (rows, columns) of the previous-order history table before export.
final_previous_order_df.shape

(1440810, 258)

In [11]:
# Each (ordno, months_before) pair must appear exactly once in the table.
order_month_keys = final_previous_order_df[['ordno', 'months_before']]
assert not order_month_keys.duplicated().any()

# Get Online Data

In [12]:
# Load the raw online data; the path is relative to the working directory.
online = pd.read_csv('data/online.csv')

In [13]:
# Build the online sessions frame with the project helper. A copy is passed,
# so the helper receives its own frame rather than `online` itself.
online_sessions_df = get_online_sessions_df(online.copy())

In [14]:
# Build the previous-online-session history for the filtered orders with the
# project helper. 53 is the look-back window in weeks (see the notebook
# introduction); the row-count check in the next cell relies on this value.
previous_online_sessions_by_week = get_previous_online_sessions_by_week(
    online_sessions_df, order_totals, 53
)

In [15]:
# Every order number in the session history must also exist in order_totals.
assert set(previous_online_sessions_by_week['ordno']) <= set(order_totals['ordno'])
# The row count must be a whole multiple of the 53-week look-back window.
assert len(previous_online_sessions_by_week) % 53 == 0

# Export Datasets

In [16]:
# Quality check: the order numbers of the three datasets must be nested,
# session history ⊆ order history ⊆ filtered orders.
ord_ids = set(order_totals['ordno'])
prev_ord_ids = set(final_previous_order_df['ordno'])
prev_sess_ids = set(previous_online_sessions_by_week['ordno'])

assert prev_sess_ids <= prev_ord_ids
assert prev_ord_ids <= ord_ids
assert prev_sess_ids <= ord_ids

In [17]:
import os

# DataFrame.to_pickle does not create missing parent directories; make sure
# the output folder exists so the export cannot fail at the very end of the
# pipeline on a fresh checkout.
os.makedirs('data/online_and_order', exist_ok=True)
# Persist the previous-order history table.
final_previous_order_df.to_pickle('data/online_and_order/final_previous_order_df.pkl')

In [18]:
# Persist the raw dependent variables as a pickle file.
dependent_vars.to_pickle('data/online_and_order/dependent_vars.pkl')

In [19]:
# Persist the previous-online-session history as a pickle file.
previous_online_sessions_by_week.to_pickle('data/online_and_order/previous_online_sessions_by_week.pkl')