# TODO
1. Clean Dataset
2. Create Modeling Dataset
3. Split into Training and Test
4. Save Datasets

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import networkx as nx

# Widen the display limits so the very wide one-hot frames in this notebook can be
# inspected. Both options use their fully-qualified 'display.' key: a bare
# 'max_columns' pattern is ambiguous on pandas versions that also define
# 'styler.render.max_columns', and set_option then raises OptionError.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)

# Clean Order Data

In [2]:
def rename_categories(prefix, current_name):
    """Return ``current_name`` tagged with ``prefix``, joined by a colon.

    Both arguments are rendered with their default string formatting, e.g.
    ``rename_categories('P1', 1.0)`` gives ``'P1:1.0'``.
    """
    return '{}:{}'.format(prefix, current_name)

def test_rename_categories():
    """Smoke-test rename_categories: print observed and expected, then compare."""
    expected = '0:It_Worked!'
    observed = rename_categories(0, 'It_Worked!')
    print(observed)
    print(expected)
    assert observed == expected

# Run the check immediately so a broken helper stops the notebook here.
test_rename_categories()

0:It_Worked!
0:It_Worked!


In [3]:
# Load the raw order lines and parse the order timestamp into datetimes.
order = pd.read_csv('data/order.csv').assign(
    orderdate=lambda raw: pd.to_datetime(raw['orderdate'])
)

In [4]:
# Row and column counts of the loaded order table.
order.shape

(263278, 6)

#### Clean Order Data

In [5]:
def clean_96_overlap(prd_cat1, prd_cat2):
    """Return the level-1 category, forced to 7 whenever the level-2 category is 96.

    Any other level-2 value leaves ``prd_cat1`` unchanged.
    """
    return 7 if prd_cat2 == 96 else prd_cat1

def test_clean_96_overlap():
    """Check clean_96_overlap on (prodcat1, prodcat2) pairs with and without 96."""
    cases = [
        (0, 1),
        (3, 96),
        (7, 96),
    ]
    expected = [0, 7, 7]
    observed = [clean_96_overlap(cat1, cat2) for cat1, cat2 in cases]
    assert observed == expected

# Run the check immediately so a broken helper stops the notebook here.
test_clean_96_overlap()

def prod_cat2_nan_fill(prd_cat1, prd_cat2):
    """Return the level-2 category, substituting ``-prd_cat1`` when it is NaN.

    The negated level-1 code acts as a placeholder level-2 code for rows that
    have no level-2 category.
    """
    if not np.isnan(prd_cat2):
        return prd_cat2
    return -1 * prd_cat1

def test_prod_cat2_nan_fill():
    """Check prod_cat2_nan_fill on rows with missing and present level-2 codes."""
    cases = [
        (1, np.nan),
        (2, np.nan),
        (3, 199),
        (1, 2),
    ]
    exp = [-1, -2, 199, 2]
    obs = [prod_cat2_nan_fill(cat1, cat2) for cat1, cat2 in cases]
    assert obs == exp, f'obs:{obs}, exp:{exp}'

# Run the check immediately so a broken helper stops the notebook here.
test_prod_cat2_nan_fill()

In [6]:
# Give rows without a level-2 category a placeholder code (the negated level-1 code).
order['prodcat2'] = order[['prodcat1', 'prodcat2']].apply(
    lambda cats: prod_cat2_nan_fill(cats['prodcat1'], cats['prodcat2']),
    axis='columns',
)

In [7]:
# Level-2 category 96 is always filed under level-1 category 7.
order['prodcat1'] = order[['prodcat1', 'prodcat2']].apply(
    lambda cats: clean_96_overlap(cats['prodcat1'], cats['prodcat2']),
    axis='columns',
)

#### Get Dummies for prodcat2 and prodcat1

In [8]:
# Prefix every category code so the one-hot columns built from the two levels stay
# distinguishable ('P1:<code>' vs 'P2:<code>'). Series.map passes each value straight
# to rename_categories; the previous row[0] lookup relied on positional fallback
# indexing of a label-indexed Series, which pandas has deprecated.
# NOTE: not idempotent - re-running this cell prefixes the already prefixed labels.
order['prodcat1'] = order['prodcat1'].map(lambda code: rename_categories('P1', code))
order['prodcat2'] = order['prodcat2'].map(lambda code: rename_categories('P2', code))

In [9]:
# One-hot encode both category levels. The dtype is pinned to uint8 because the
# default dummy dtype depends on the pandas version (bool on pandas 2.x), and the
# aggregation and previews below expect 0/1 integer indicators.
p1_dummies = pd.get_dummies(order['prodcat1'], dtype='uint8')
p2_dummies = pd.get_dummies(order['prodcat2'], dtype='uint8')

In [10]:
# Names of all indicator columns: level-1 categories first, then level-2.
pcat_columns = [*p1_dummies.columns, *p2_dummies.columns]

In [11]:
# Attach the indicator columns to the order lines side by side (aligned on the index).
order_expanded = pd.concat(
    objs=[order, p1_dummies, p2_dummies],
    axis='columns',
)

In [12]:
# Preview the order lines with their one-hot category columns appended.
order_expanded.head()

Unnamed: 0,custno,ordno,orderdate,prodcat2,prodcat1,revenue,P1:1.0,P1:2.0,P1:3.0,P1:4.0,P1:5.0,P1:7.0,P2:-1.0,P2:-7.0,P2:10.0,P2:100.0,P2:101.0,P2:102.0,P2:103.0,P2:104.0,P2:105.0,P2:106.0,P2:107.0,P2:108.0,P2:109.0,P2:11.0,P2:110.0,P2:111.0,P2:112.0,P2:113.0,P2:114.0,P2:115.0,P2:116.0,P2:117.0,P2:118.0,P2:119.0,P2:12.0,P2:120.0,P2:121.0,P2:122.0,P2:123.0,P2:124.0,P2:125.0,P2:126.0,P2:127.0,P2:128.0,P2:129.0,P2:13.0,P2:130.0,P2:131.0,...,P2:51.0,P2:52.0,P2:53.0,P2:54.0,P2:55.0,P2:56.0,P2:57.0,P2:58.0,P2:59.0,P2:6.0,P2:60.0,P2:61.0,P2:62.0,P2:63.0,P2:64.0,P2:65.0,P2:66.0,P2:67.0,P2:69.0,P2:7.0,P2:70.0,P2:71.0,P2:72.0,P2:73.0,P2:74.0,P2:75.0,P2:76.0,P2:77.0,P2:78.0,P2:79.0,P2:8.0,P2:80.0,P2:81.0,P2:82.0,P2:83.0,P2:85.0,P2:86.0,P2:88.0,P2:89.0,P2:9.0,P2:90.0,P2:91.0,P2:92.0,P2:93.0,P2:94.0,P2:95.0,P2:96.0,P2:97.0,P2:98.0,P2:99.0
0,18944,64694,2016-11-27 20:57:20,P2:-1.0,P1:1.0,53.3,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,18944,114405,2017-04-29 20:18:04,P2:-1.0,P1:1.0,0.1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,18944,28906,2017-04-23 21:31:03,P2:-1.0,P1:1.0,141.66,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,36096,62681,2016-02-25 07:16:33,P2:-1.0,P1:1.0,36.82,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,2017-06-12 08:27:59,P2:-1.0,P1:1.0,8.35,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# Aggregation spec for collapsing order lines into orders:
# sum every category indicator and keep the earliest timestamp.
order_agg_dict = dict.fromkeys(pcat_columns, 'sum')
order_agg_dict['orderdate'] = 'min'

In [14]:
# Collapse to one row per (order, customer) using the aggregation spec above.
order_totals = (
    order_expanded
    .groupby(['ordno', 'custno'], as_index=False)
    .agg(order_agg_dict)
)

In [15]:
# Calendar month (1-12) of each order's timestamp.
order_totals['ordermonth'] = order_totals['orderdate'].dt.month

# Create Modeling Datasets

## Approach #3 - Fully Connected Neural Network
    Unique at Order Level
    Input:
        Previous Purchase History(1008):
            Replicate(4X) for: (Last 30 Days) (12 Months Ago) (Rest of Year) (Every Time Else)
            Variables:
                # of Purchases in each product category 252
        Session Count Columns (92):
            Replicate(4X) for: (Last 30 Days) (12 Months Ago) (Rest of Year) (Every Time Else)
            Variables:
                # Event 1 for all 10 categories
                # Event 2 for all 10 categories
                # # of category obs for all 3 categories
    Output: Product Category Purchased

In [16]:
# Base table for modeling: order identifiers, timing, and the six level-1
# category counts that later become the prediction targets.
modeling_dataset = order_totals.loc[
    :,
    [
        'ordno', 'custno', 'orderdate', 'ordermonth',
        'P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0',
    ],
].copy()

In [17]:
# Confirm Unique at Order Level: no order number may appear under two customers.
n_order_customer_pairs = len(modeling_dataset[['ordno', 'custno']].drop_duplicates())
n_orders = len(modeling_dataset[['ordno']].drop_duplicates())
assert n_order_customer_pairs == n_orders

In [18]:
# Pair every order with every order of the same customer (self-join on custno).
# Columns present on both sides get a '_current' / '_previous' suffix.
merged_orders = pd.merge(
    modeling_dataset,
    order_totals,
    how='left',
    on='custno',
    suffixes=('_current', '_previous'),
)

In [19]:
# Keep only pairs where the paired order was placed strictly before the current one.
is_earlier_order = merged_orders['orderdate_previous'] < merged_orders['orderdate_current']
all_prev_orders = merged_orders[is_earlier_order].copy()

In [20]:
# Gap between the current order and the earlier order, as a float number of days.
# total_seconds() always yields floats; the previous .astype('timedelta64[s]')
# returns a timedelta column (not a number) on pandas >= 2.0, which breaks the
# numeric window comparisons made on days_before further down.
all_prev_orders['days_before'] = (
    all_prev_orders['orderdate_current'] - all_prev_orders['orderdate_previous']
).dt.total_seconds() / (24 * 3600)

In [21]:
# Transposed preview: one output column per (current order, earlier order) pair.
all_prev_orders.head().T

Unnamed: 0,3,5,14,15,16
ordno_current,2,2,4,4,4
custno,2,2,4,4,4
orderdate_current,2016-03-29 13:04:27,2016-03-29 13:04:27,2017-08-14 21:22:10,2017-08-14 21:22:10,2017-08-14 21:22:10
ordermonth_current,3,3,8,8,8
P1:1.0_current,1,1,1,1,1
P1:2.0_current,0,0,0,0,0
P1:3.0_current,0,0,0,0,0
P1:4.0_current,0,0,0,0,0
P1:5.0_current,0,0,0,0,0
P1:7.0_current,0,0,0,0,0


In [22]:
# Reduce the order pairs to what the window aggregation needs: the current order id,
# the gap in days, and the level-2 category counts of the earlier order.
# NOTE(review): the P2 names are hard-coded; presumably they were copied from the
# level-2 dummy columns - confirm, and consider deriving them from the frame instead.
all_prev_orders_clean = all_prev_orders[
    [
        'ordno_current', 'days_before',
        'P2:-1.0', 'P2:-7.0', 'P2:10.0', 'P2:100.0', 'P2:101.0', 'P2:102.0', 'P2:103.0', 'P2:104.0', 'P2:105.0',
        'P2:106.0', 'P2:107.0', 'P2:108.0', 'P2:109.0', 'P2:11.0', 'P2:110.0', 'P2:111.0', 'P2:112.0', 'P2:113.0',
        'P2:114.0', 'P2:115.0', 'P2:116.0', 'P2:117.0', 'P2:118.0', 'P2:119.0', 'P2:12.0', 'P2:120.0', 'P2:121.0',
        'P2:122.0', 'P2:123.0', 'P2:124.0', 'P2:125.0', 'P2:126.0', 'P2:127.0', 'P2:128.0', 'P2:129.0', 'P2:13.0',
        'P2:130.0', 'P2:131.0', 'P2:132.0', 'P2:133.0', 'P2:134.0', 'P2:135.0', 'P2:136.0', 'P2:137.0', 'P2:138.0',
        'P2:139.0', 'P2:14.0', 'P2:140.0', 'P2:141.0', 'P2:142.0', 'P2:143.0', 'P2:144.0', 'P2:145.0', 'P2:146.0',
        'P2:147.0', 'P2:148.0', 'P2:149.0', 'P2:15.0', 'P2:150.0', 'P2:151.0', 'P2:152.0', 'P2:153.0', 'P2:154.0',
        'P2:155.0', 'P2:156.0', 'P2:157.0', 'P2:158.0', 'P2:159.0', 'P2:16.0', 'P2:160.0', 'P2:161.0', 'P2:162.0',
        'P2:164.0', 'P2:165.0', 'P2:166.0', 'P2:167.0', 'P2:168.0', 'P2:169.0', 'P2:17.0', 'P2:170.0', 'P2:171.0',
        'P2:172.0', 'P2:173.0', 'P2:174.0', 'P2:175.0', 'P2:176.0', 'P2:177.0', 'P2:178.0', 'P2:179.0', 'P2:18.0',
        'P2:180.0', 'P2:181.0', 'P2:182.0', 'P2:183.0', 'P2:184.0', 'P2:185.0', 'P2:186.0', 'P2:187.0', 'P2:188.0',
        'P2:189.0', 'P2:19.0', 'P2:190.0', 'P2:191.0', 'P2:192.0', 'P2:193.0', 'P2:194.0', 'P2:195.0', 'P2:196.0',
        'P2:197.0', 'P2:198.0', 'P2:199.0', 'P2:2.0', 'P2:20.0', 'P2:200.0', 'P2:201.0', 'P2:202.0', 'P2:203.0',
        'P2:204.0', 'P2:205.0', 'P2:206.0', 'P2:207.0', 'P2:208.0', 'P2:209.0', 'P2:21.0', 'P2:210.0', 'P2:211.0',
        'P2:212.0', 'P2:213.0', 'P2:214.0', 'P2:215.0', 'P2:216.0', 'P2:217.0', 'P2:218.0', 'P2:219.0', 'P2:220.0',
        'P2:221.0', 'P2:222.0', 'P2:223.0', 'P2:224.0', 'P2:225.0', 'P2:226.0', 'P2:227.0', 'P2:228.0', 'P2:229.0',
        'P2:23.0', 'P2:230.0', 'P2:231.0', 'P2:232.0', 'P2:233.0', 'P2:234.0', 'P2:235.0', 'P2:236.0', 'P2:237.0',
        'P2:238.0', 'P2:239.0', 'P2:24.0', 'P2:240.0', 'P2:241.0', 'P2:243.0', 'P2:244.0', 'P2:245.0', 'P2:246.0',
        'P2:247.0', 'P2:248.0', 'P2:249.0', 'P2:25.0', 'P2:250.0', 'P2:251.0', 'P2:252.0', 'P2:253.0', 'P2:255.0',
        'P2:256.0', 'P2:257.0', 'P2:258.0', 'P2:259.0', 'P2:26.0', 'P2:260.0', 'P2:261.0', 'P2:262.0', 'P2:263.0',
        'P2:27.0', 'P2:28.0', 'P2:3.0', 'P2:30.0', 'P2:32.0', 'P2:33.0', 'P2:34.0', 'P2:35.0', 'P2:38.0',
        'P2:39.0', 'P2:4.0', 'P2:40.0', 'P2:41.0', 'P2:42.0', 'P2:43.0', 'P2:44.0', 'P2:45.0', 'P2:46.0',
        'P2:47.0', 'P2:48.0', 'P2:49.0', 'P2:5.0', 'P2:50.0', 'P2:51.0', 'P2:52.0', 'P2:53.0', 'P2:54.0',
        'P2:55.0', 'P2:56.0', 'P2:57.0', 'P2:58.0', 'P2:59.0', 'P2:6.0', 'P2:60.0', 'P2:61.0', 'P2:62.0',
        'P2:63.0', 'P2:64.0', 'P2:65.0', 'P2:66.0', 'P2:67.0', 'P2:69.0', 'P2:7.0', 'P2:70.0', 'P2:71.0',
        'P2:72.0', 'P2:73.0', 'P2:74.0', 'P2:75.0', 'P2:76.0', 'P2:77.0', 'P2:78.0', 'P2:79.0', 'P2:8.0',
        'P2:80.0', 'P2:81.0', 'P2:82.0', 'P2:83.0', 'P2:85.0', 'P2:86.0', 'P2:88.0', 'P2:89.0', 'P2:9.0',
        'P2:90.0', 'P2:91.0', 'P2:92.0', 'P2:93.0', 'P2:94.0', 'P2:95.0', 'P2:96.0', 'P2:97.0', 'P2:98.0',
        'P2:99.0' 
]

]

#### Get Purchase Counts

In [23]:
def get_order_count_sum(all_previous_orders, min_b4, max_b4):
    """Sum the earlier-order columns per current order within a days-before window.

    Rows are kept when ``min_b4 < days_before <= max_b4``. The window is half-open
    so that adjacent windows such as (0, 30] and (30, 350] tile the timeline:
    with two strict bounds, an order placed exactly ``max_b4`` days earlier
    would be counted in neither window.

    Parameters
    ----------
    all_previous_orders : pandas.DataFrame
        One row per (current order, earlier order) pair. Must contain
        'ordno_current' and 'days_before'; every other column is summed.
        The frame is not modified.
    min_b4, max_b4 : number
        Exclusive lower and inclusive upper bound on 'days_before'.

    Returns
    -------
    pandas.DataFrame
        One row per 'ordno_current' that has at least one row in the window.
        Every summed column is renamed to '<column>_<min_b4>_<max_b4>'.
    """
    days_before = all_previous_orders['days_before']
    in_window = (days_before > min_b4) & (days_before <= max_b4)

    # drop() returns a new frame, so the caller's data is never touched and no
    # column is deleted from a filtered slice.
    relevant_previous_orders = all_previous_orders.loc[in_window].drop(columns='days_before')

    order_count_sum = relevant_previous_orders.groupby('ordno_current', as_index=False).sum()

    # Tag every aggregated column with its window so several windows can be merged
    # onto the same table without name clashes; the key keeps its name.
    suffix = f'_{min_b4}_{max_b4}'
    return order_count_sum.rename(
        columns={
            col: col + suffix
            for col in order_count_sum.columns
            if col != 'ordno_current'
        }
    )

In [24]:
# Purchase-history features for three look-back windows (in days before the order):
# the last month, the rest of the past year, and the same period one year earlier.
order_count_sum_0_30 = get_order_count_sum(all_prev_orders_clean, min_b4=0, max_b4=30)
order_count_sum_30_350 = get_order_count_sum(all_prev_orders_clean, min_b4=30, max_b4=350)
order_count_sum_350_380 = get_order_count_sum(all_prev_orders_clean, min_b4=350, max_b4=380)

In [25]:
# Preview the per-order category counts for the 0-30 day window.
order_count_sum_0_30.head()

Unnamed: 0,ordno_current,P2:-1.0_0_30,P2:-7.0_0_30,P2:10.0_0_30,P2:100.0_0_30,P2:101.0_0_30,P2:102.0_0_30,P2:103.0_0_30,P2:104.0_0_30,P2:105.0_0_30,P2:106.0_0_30,P2:107.0_0_30,P2:108.0_0_30,P2:109.0_0_30,P2:11.0_0_30,P2:110.0_0_30,P2:111.0_0_30,P2:112.0_0_30,P2:113.0_0_30,P2:114.0_0_30,P2:115.0_0_30,P2:116.0_0_30,P2:117.0_0_30,P2:118.0_0_30,P2:119.0_0_30,P2:12.0_0_30,P2:120.0_0_30,P2:121.0_0_30,P2:122.0_0_30,P2:123.0_0_30,P2:124.0_0_30,P2:125.0_0_30,P2:126.0_0_30,P2:127.0_0_30,P2:128.0_0_30,P2:129.0_0_30,P2:13.0_0_30,P2:130.0_0_30,P2:131.0_0_30,P2:132.0_0_30,P2:133.0_0_30,P2:134.0_0_30,P2:135.0_0_30,P2:136.0_0_30,P2:137.0_0_30,P2:138.0_0_30,P2:139.0_0_30,P2:14.0_0_30,P2:140.0_0_30,P2:141.0_0_30,...,P2:51.0_0_30,P2:52.0_0_30,P2:53.0_0_30,P2:54.0_0_30,P2:55.0_0_30,P2:56.0_0_30,P2:57.0_0_30,P2:58.0_0_30,P2:59.0_0_30,P2:6.0_0_30,P2:60.0_0_30,P2:61.0_0_30,P2:62.0_0_30,P2:63.0_0_30,P2:64.0_0_30,P2:65.0_0_30,P2:66.0_0_30,P2:67.0_0_30,P2:69.0_0_30,P2:7.0_0_30,P2:70.0_0_30,P2:71.0_0_30,P2:72.0_0_30,P2:73.0_0_30,P2:74.0_0_30,P2:75.0_0_30,P2:76.0_0_30,P2:77.0_0_30,P2:78.0_0_30,P2:79.0_0_30,P2:8.0_0_30,P2:80.0_0_30,P2:81.0_0_30,P2:82.0_0_30,P2:83.0_0_30,P2:85.0_0_30,P2:86.0_0_30,P2:88.0_0_30,P2:89.0_0_30,P2:9.0_0_30,P2:90.0_0_30,P2:91.0_0_30,P2:92.0_0_30,P2:93.0_0_30,P2:94.0_0_30,P2:95.0_0_30,P2:96.0_0_30,P2:97.0_0_30,P2:98.0_0_30,P2:99.0_0_30
0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,12,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
# Every window table must hold at most one row per current order, and the base
# table one row per order; otherwise the left merges below would duplicate rows.
for window_counts in (order_count_sum_0_30, order_count_sum_30_350, order_count_sum_350_380):
    assert window_counts['ordno_current'].is_unique

assert modeling_dataset['ordno'].is_unique

In [27]:
# Preview the order-level base table before the history features are attached.
modeling_dataset.head()

Unnamed: 0,ordno,custno,orderdate,ordermonth,P1:1.0,P1:2.0,P1:3.0,P1:4.0,P1:5.0,P1:7.0
0,1,1,2017-06-12 08:27:59,6,1,0,0,0,0,0
1,2,2,2016-03-29 13:04:27,3,1,0,0,0,0,0
2,3,3,2017-01-31 19:16:49,1,1,0,0,0,0,0
3,4,4,2017-08-14 21:22:10,8,1,0,0,0,0,0
4,5,5,2016-01-16 16:59:06,1,1,0,0,0,0,0


In [28]:
# Left-join each window's counts onto the base table, one window after another,
# so that orders without any history in a window are kept (with NaN counts).
modeling_dataset_orders = modeling_dataset
for window_counts in (order_count_sum_0_30, order_count_sum_30_350, order_count_sum_350_380):
    modeling_dataset_orders = modeling_dataset_orders.merge(
        window_counts,
        how='left',
        left_on='ordno',
        right_on='ordno_current',
    )

In [29]:
# Ensure all Orders are stored: the left merges must neither add nor drop rows.
assert len(modeling_dataset) == len(modeling_dataset_orders)

In [30]:
# The merged table must still hold exactly one row per order.
assert modeling_dataset_orders['ordno'].is_unique

In [31]:
# Preview the base table with all three history windows attached.
modeling_dataset_orders.head()

Unnamed: 0,ordno,custno,orderdate,ordermonth,P1:1.0,P1:2.0,P1:3.0,P1:4.0,P1:5.0,P1:7.0,ordno_current_x,P2:-1.0_0_30,P2:-7.0_0_30,P2:10.0_0_30,P2:100.0_0_30,P2:101.0_0_30,P2:102.0_0_30,P2:103.0_0_30,P2:104.0_0_30,P2:105.0_0_30,P2:106.0_0_30,P2:107.0_0_30,P2:108.0_0_30,P2:109.0_0_30,P2:11.0_0_30,P2:110.0_0_30,P2:111.0_0_30,P2:112.0_0_30,P2:113.0_0_30,P2:114.0_0_30,P2:115.0_0_30,P2:116.0_0_30,P2:117.0_0_30,P2:118.0_0_30,P2:119.0_0_30,P2:12.0_0_30,P2:120.0_0_30,P2:121.0_0_30,P2:122.0_0_30,P2:123.0_0_30,P2:124.0_0_30,P2:125.0_0_30,P2:126.0_0_30,P2:127.0_0_30,P2:128.0_0_30,P2:129.0_0_30,P2:13.0_0_30,P2:130.0_0_30,P2:131.0_0_30,P2:132.0_0_30,...,P2:51.0_350_380,P2:52.0_350_380,P2:53.0_350_380,P2:54.0_350_380,P2:55.0_350_380,P2:56.0_350_380,P2:57.0_350_380,P2:58.0_350_380,P2:59.0_350_380,P2:6.0_350_380,P2:60.0_350_380,P2:61.0_350_380,P2:62.0_350_380,P2:63.0_350_380,P2:64.0_350_380,P2:65.0_350_380,P2:66.0_350_380,P2:67.0_350_380,P2:69.0_350_380,P2:7.0_350_380,P2:70.0_350_380,P2:71.0_350_380,P2:72.0_350_380,P2:73.0_350_380,P2:74.0_350_380,P2:75.0_350_380,P2:76.0_350_380,P2:77.0_350_380,P2:78.0_350_380,P2:79.0_350_380,P2:8.0_350_380,P2:80.0_350_380,P2:81.0_350_380,P2:82.0_350_380,P2:83.0_350_380,P2:85.0_350_380,P2:86.0_350_380,P2:88.0_350_380,P2:89.0_350_380,P2:9.0_350_380,P2:90.0_350_380,P2:91.0_350_380,P2:92.0_350_380,P2:93.0_350_380,P2:94.0_350_380,P2:95.0_350_380,P2:96.0_350_380,P2:97.0_350_380,P2:98.0_350_380,P2:99.0_350_380
0,1,1,2017-06-12 08:27:59,6,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2,2,2016-03-29 13:04:27,3,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,3,3,2017-01-31 19:16:49,1,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,4,4,2017-08-14 21:22:10,8,1,0,0,0,0,0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,5,5,2016-01-16 16:59:06,1,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Clean Up Final Modeling Dataset

#### Drop Redundant Variables

In [32]:
# Right-hand join keys left behind by the three window merges. They duplicate
# 'ordno'; the '_x' / '_y' variants are pandas' default suffixes for clashing names.
drop_vars = [
    'ordno_current_x', 'ordno_current_y', 'ordno_current'
]

In [33]:
# Remove the leftover join-key columns listed in drop_vars.
modeling_dataset_final = modeling_dataset_orders.drop(columns=drop_vars)

#### Filling Missing Counts with 0s

In [34]:
# An order with no earlier purchases in a window has NaN counts after the left
# merges; treat those as zero purchases. fillna already returns a new frame.
modeling_dataset_final = modeling_dataset_final.fillna(value=0)

#### Make Dependent Variable Binary

In [74]:
def make_binary(value):
    """Return 1 when ``value`` is strictly positive, otherwise 0."""
    return 1 if value > 0 else 0

In [76]:
# Target columns: one per level-1 product category.
dep_vars = ['P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0']

In [78]:
# Turn each target from a per-order line count into a 0/1 "bought in this category" flag.
for dep_var in dep_vars:
    modeling_dataset_final[dep_var] = modeling_dataset_final[dep_var].map(make_binary)

In [80]:
# Preview the final table: identifiers, binary targets, and zero-filled history counts.
modeling_dataset_final.head()

Unnamed: 0,ordno,custno,orderdate,ordermonth,P1:1.0,P1:2.0,P1:3.0,P1:4.0,P1:5.0,P1:7.0,P2:-1.0_0_30,P2:-7.0_0_30,P2:10.0_0_30,P2:100.0_0_30,P2:101.0_0_30,P2:102.0_0_30,P2:103.0_0_30,P2:104.0_0_30,P2:105.0_0_30,P2:106.0_0_30,P2:107.0_0_30,P2:108.0_0_30,P2:109.0_0_30,P2:11.0_0_30,P2:110.0_0_30,P2:111.0_0_30,P2:112.0_0_30,P2:113.0_0_30,P2:114.0_0_30,P2:115.0_0_30,P2:116.0_0_30,P2:117.0_0_30,P2:118.0_0_30,P2:119.0_0_30,P2:12.0_0_30,P2:120.0_0_30,P2:121.0_0_30,P2:122.0_0_30,P2:123.0_0_30,P2:124.0_0_30,P2:125.0_0_30,P2:126.0_0_30,P2:127.0_0_30,P2:128.0_0_30,P2:129.0_0_30,P2:13.0_0_30,P2:130.0_0_30,P2:131.0_0_30,P2:132.0_0_30,P2:133.0_0_30,...,P2:51.0_350_380,P2:52.0_350_380,P2:53.0_350_380,P2:54.0_350_380,P2:55.0_350_380,P2:56.0_350_380,P2:57.0_350_380,P2:58.0_350_380,P2:59.0_350_380,P2:6.0_350_380,P2:60.0_350_380,P2:61.0_350_380,P2:62.0_350_380,P2:63.0_350_380,P2:64.0_350_380,P2:65.0_350_380,P2:66.0_350_380,P2:67.0_350_380,P2:69.0_350_380,P2:7.0_350_380,P2:70.0_350_380,P2:71.0_350_380,P2:72.0_350_380,P2:73.0_350_380,P2:74.0_350_380,P2:75.0_350_380,P2:76.0_350_380,P2:77.0_350_380,P2:78.0_350_380,P2:79.0_350_380,P2:8.0_350_380,P2:80.0_350_380,P2:81.0_350_380,P2:82.0_350_380,P2:83.0_350_380,P2:85.0_350_380,P2:86.0_350_380,P2:88.0_350_380,P2:89.0_350_380,P2:9.0_350_380,P2:90.0_350_380,P2:91.0_350_380,P2:92.0_350_380,P2:93.0_350_380,P2:94.0_350_380,P2:95.0_350_380,P2:96.0_350_380,P2:97.0_350_380,P2:98.0_350_380,P2:99.0_350_380
0,1,1,2017-06-12 08:27:59,6,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,2,2016-03-29 13:04:27,3,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,3,2017-01-31 19:16:49,1,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,4,2017-08-14 21:22:10,8,1,0,0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,5,2016-01-16 16:59:06,1,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Split into Training and Test

In [89]:
# Feature columns = everything after the first ten columns (ordno, custno, orderdate,
# ordermonth and the six P1 targets). The names are read from modeling_dataset_final
# because train_data is only created in a later cell; referencing it here fails with
# NameError on Restart & Run All. train_data is a row subset of this frame, so the
# column list is the same.
variables = list(modeling_dataset_final.columns)[10:]

In [98]:
# Show the selected feature column names.
variables

['P2:-1.0_0_30',
 'P2:-7.0_0_30',
 'P2:10.0_0_30',
 'P2:100.0_0_30',
 'P2:101.0_0_30',
 'P2:102.0_0_30',
 'P2:103.0_0_30',
 'P2:104.0_0_30',
 'P2:105.0_0_30',
 'P2:106.0_0_30',
 'P2:107.0_0_30',
 'P2:108.0_0_30',
 'P2:109.0_0_30',
 'P2:11.0_0_30',
 'P2:110.0_0_30',
 'P2:111.0_0_30',
 'P2:112.0_0_30',
 'P2:113.0_0_30',
 'P2:114.0_0_30',
 'P2:115.0_0_30',
 'P2:116.0_0_30',
 'P2:117.0_0_30',
 'P2:118.0_0_30',
 'P2:119.0_0_30',
 'P2:12.0_0_30',
 'P2:120.0_0_30',
 'P2:121.0_0_30',
 'P2:122.0_0_30',
 'P2:123.0_0_30',
 'P2:124.0_0_30',
 'P2:125.0_0_30',
 'P2:126.0_0_30',
 'P2:127.0_0_30',
 'P2:128.0_0_30',
 'P2:129.0_0_30',
 'P2:13.0_0_30',
 'P2:130.0_0_30',
 'P2:131.0_0_30',
 'P2:132.0_0_30',
 'P2:133.0_0_30',
 'P2:134.0_0_30',
 'P2:135.0_0_30',
 'P2:136.0_0_30',
 'P2:137.0_0_30',
 'P2:138.0_0_30',
 'P2:139.0_0_30',
 'P2:14.0_0_30',
 'P2:140.0_0_30',
 'P2:141.0_0_30',
 'P2:142.0_0_30',
 'P2:143.0_0_30',
 'P2:144.0_0_30',
 'P2:145.0_0_30',
 'P2:146.0_0_30',
 'P2:147.0_0_30',
 'P2:148.0_0_30',

In [90]:
# Time-based hold-out: orders placed after the cutoff date form the test set.
is_test_period = modeling_dataset_final['orderdate'] > pd.to_datetime('2018-9-01')
test_data = modeling_dataset_final[is_test_period]

# Targets are the six level-1 category flags; features are the history counts.
test_x = test_data[variables]
test_y = test_data[['P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0']]

In [91]:
# Rows and target columns in the test set.
test_y.shape

(13828, 6)

In [92]:
# Rows and feature columns in the test set.
test_x.shape

(13828, 759)

In [93]:
# Persist the test features and targets as pickle files under data/.
test_x.to_pickle('data/test_x_order_only.pkl')
test_y.to_pickle('data/test_y_order_only.pkl')

In [94]:
# Orders placed on or before the cutoff date form the training set.
is_train_period = modeling_dataset_final['orderdate'] <= pd.to_datetime('2018-9-01')
train_data = modeling_dataset_final[is_train_period]

# Targets are the six level-1 category flags; features are the history counts.
train_x = train_data[variables]
train_y = train_data[['P1:1.0', 'P1:2.0', 'P1:3.0', 'P1:4.0', 'P1:5.0', 'P1:7.0']]

In [95]:
# Rows and feature columns in the training set.
train_x.shape

(135889, 759)

In [96]:
# Rows and target columns in the training set.
train_y.shape

(135889, 6)

In [97]:
# Persist the training features and targets as pickle files under data/.
train_x.to_pickle('data/train_x_order_only.pkl')
train_y.to_pickle('data/train_y_order_only.pkl')