In [76]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import KFold

import nltk

from functools import partial

In [11]:
order_products_train_df = pd.read_csv('order_products__train.csv',
                                     dtype={'order_id': np.int32, 'product_id': np.int32,
                                           'add_to_cart_order': np.int16, 'reordered': np.int8})
order_products_prior_df = pd.read_csv('order_products__prior.csv')
orders_df = pd.read_csv('orders.csv')
products_df = pd.read_csv('products.csv')
aisles_df = pd.read_csv('aisles.csv')
departments_df = pd.read_csv('departments.csv')

In [12]:
# Merge DF's
items = pd.merge(products_df, departments_df, how='left')

In [13]:
items = pd.merge(items, aisles_df, how='left')

In [14]:
items.product_name = items.product_name.str.replace(' ', '_').str.lower()

In [16]:
items.dtypes

product_id        int64
product_name     object
aisle_id          int64
department_id     int64
department       object
aisle            object
dtype: object

In [17]:
# Memory Reduction
items.product_id = items.product_id.astype(np.int32)
items.aisle_id = items.aisle_id.astype(np.int32)
items.department_id = items.department_id.astype(np.int32)

In [20]:
op_train = pd.read_csv('order_products__train.csv',
                                     dtype={'order_id': np.int32, 'product_id': np.int32,
                                           'add_to_cart_order': np.int16, 'reordered': np.int8})

In [21]:
op_prior = pd.read_csv('order_products__prior.csv',
                                     dtype={'order_id': np.int32, 'product_id': np.int32,
                                           'add_to_cart_order': np.int16, 'reordered': np.int8})

In [22]:
orders = pd.read_csv('orders.csv', dtype={'order_id': np.int32, 
                                                           'user_id': np.int32, 
                                                           'order_number': np.int16,  # max 100, could use int8
                                                           'order_dow': np.int8, 
                                                           'order_hour_of_day': np.int8, 
                                                           'days_since_prior_order': np.float16})


In [23]:
orders.eval_set = orders.eval_set.replace({'prior': 0, 'train': 1, 'test':2}).astype(np.int8)
orders.days_since_prior_order = orders.days_since_prior_order.fillna(-1).astype(np.int8)

In [24]:
indexes = np.linspace(0, len(op_prior), num=10, dtype=np.int32)

train_details = pd.merge(op_train, orders, how='left', on='order_id').apply(partial(pd.to_numeric, errors='ignore'
                                                                                   ,downcast='integer'))

In [26]:
train_details = pd.merge(train_details, items[['product_id', 'aisle_id', 'department_id']].apply(partial(pd.to_numeric,
                                                                                                        errors='ignore',
                                                                                                        downcast='integer')),
                         how='left', on='product_id')

In [27]:
print(train_details.shape, op_train.shape)

(1384617, 12) (1384617, 4)


In [34]:
order_details = pd.merge(left=pd.merge(op_prior, items[['product_id', 'aisle_id', 'department_id' ]],
                                how='left', on='product_id'), right=orders, how='left', on='order_id')

In [28]:
#  Set eval_set to 1 because it is in the training set
train_orders = orders[orders.eval_set == 1]

In [29]:
train_orders.index.name = 'raw_order'
train_orders.reset_index(inplace=True)

In [72]:
order_details.head(50)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,aisle_id,department_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,86,16,202279,0,3,5,9,8
1,2,28985,2,1,83,4,202279,0,3,5,9,8
2,2,9327,3,0,104,13,202279,0,3,5,9,8
3,2,45918,4,1,19,13,202279,0,3,5,9,8
4,2,30035,5,0,17,13,202279,0,3,5,9,8
5,2,17794,6,1,83,4,202279,0,3,5,9,8
6,2,40141,7,1,105,13,202279,0,3,5,9,8
7,2,1819,8,1,88,13,202279,0,3,5,9,8
8,2,43668,9,0,123,4,202279,0,3,5,9,8
9,3,33754,1,1,120,16,205970,0,16,5,17,12


In [69]:
df = order_details.groupby('user_id')['order_number'].max()

In [70]:
df

user_id
1         10
2         14
3         12
4          5
5          4
6          3
7         20
8          3
9          3
10         5
11         7
12         5
13        12
14        13
15        22
16         6
17        40
18         6
19         9
20         4
21        33
22        15
23         4
24        18
25         3
26        12
27        81
28        24
29        18
30         8
          ..
206180    18
206181    14
206182    10
206183    14
206184     4
206185    10
206186     3
206187    34
206188     7
206189     6
206190     8
206191     5
206192    14
206193    41
206194    11
206195    19
206196     4
206197    23
206198     7
206199    19
206200    23
206201    32
206202    22
206203     5
206204     4
206205     3
206206    67
206207    16
206208    49
206209    13
Name: order_number, Length: 206209, dtype: int16

In [75]:
def get_last_orders_reordered(test_orders):
    #     Match the user_id in training set to the user_id in test set
    test_history = order_details[(order_details.user_id.isin(test_orders.user_id))]
    #     Order by each user's biggest order (these will have the best learning for predicitons)
    last_orders = test_history.groupby('user_id')['order_number'].max()
    
    #   merge the last_orders to test history where the user reordered 
    order_history = pd.merge(last_orders.reset_index(), test_history[test_history.reordered == 1], how='left', 
                             on=['user_id', 'order_number'])

    #     merge order history (above) with the test orders to create a predicting model.. fill the null with -1. This indicates if the item predicted did not match the purchased
    test_history = pd.merge(order_history[['user_id', 'product_id']], test_orders[['user_id', 'order_id']], how='left', 
                 on='user_id').fillna(-1).groupby('order_id')['product_id'].apply(lambda x: ' '.join([str(int(e)) for e in set(x)])).reset_index().replace(to_replace='-1', value='None')
    test_history.columns = ['order_id', 'products']
    
    # occasionally there is a bug where a line with order_id == -1 makes it through. doesn't *seem* to effect things
    return test_history[test_history.order_id > 0].set_index('order_id')

In [85]:
cvpreds = []

kf = KFold(3, shuffle=False, random_state=3)
for train_index, test_index in kf.split(train_orders.index):
    cvpreds.append(get_last_orders_reordered(train_orders.iloc[test_index]))
    
df_cvpreds = pd.concat(cvpreds).sort_index()
df_cvpreds.head()

Unnamed: 0_level_0,products
order_id,Unnamed: 1_level_1
1,43633 30881 5707 14947
36,35939 24964 26629 581 44359 47734 16759
38,33731 8012
96,24489 27966
98,4357 43654 34065 19731 1939 45204 33686 40986 ...


# Generate a CSV Submission Format F1 Score

In [38]:
test = pd.read_csv('sample_submission.csv')

In [86]:
try:
    df_train_gt = pd.read_csv('train.csv', index_col='order_id')
except:
    train_gtl = []

    for uid, subset in train_details.groupby('user_id'):
        subset1 = subset[subset.reordered == 1]
        oid = subset.order_id.values[0]

        if len(subset1) == 0:
            train_gtl.append((oid, 'None'))
            continue

        ostr = ' '.join([str(int(e)) for e in subset1.product_id.values])
        # .strip is needed because join can have a padding space at the end
        train_gtl.append((oid, ostr.strip()))

    df_train_gt = pd.DataFrame(train_gtl)

    df_train_gt.columns = ['order_id', 'products']
    df_train_gt.set_index('order_id', inplace=True)
    df_train_gt.sort_index(inplace=True)
    
#     df_train_gt.to_csv('train.csv')

In [87]:
f1 = []
for gt, pred in zip(df_train_gt.sort_index().products, df_cvpreds.sort_index().products):
    lgt = gt.replace("None", "-1").split(' ')
    lpred = pred.replace("None", "-1").split(' ')
    
    rr = (np.intersect1d(lgt, lpred))
    precision = np.float(len(rr)) / len(lpred)
    recall = np.float(len(rr)) / len(lgt)

    denom = precision + recall
    f1.append(((2 * precision * recall) / denom) if denom > 0 else 0)

print(np.mean(f1))

0.32558651711
