In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

In [2]:
# Locations of the raw input CSVs. The shared directory lives in one constant
# so the data can be relocated by editing a single line.
RAW_DIR = "data/raw"
fname_aisles = RAW_DIR + "/aisles.csv"
fname_products = RAW_DIR + "/products.csv"
fname_userorders = RAW_DIR + "/orders.csv"
fname_productorders = RAW_DIR + "/order_products__prior.csv"

In [3]:
# Load the aisle lookup table, indexed by aisle_id, and preview it.
df_aisles = pd.read_csv(fname_aisles, index_col='aisle_id')
df_aisles.head()

Unnamed: 0_level_0,aisle
aisle_id,Unnamed: 1_level_1
1,prepared soups salads
2,specialty cheeses
3,energy granola bars
4,instant foods
5,marinades meat preparation


In [9]:
# Load the products table, keep only the product_id -> aisle_id mapping
# (product_id becomes the index), and preview it.
df_products = (
    pd.read_csv(fname_products)
    .loc[:, ['product_id', 'aisle_id']]
    .set_index('product_id')
)
df_products.head()

Unnamed: 0_level_0,aisle_id
product_id,Unnamed: 1_level_1
1,61
2,104
3,94
4,38
5,5


In [10]:
# order_id -> user_id lookup, restricted to rows whose eval_set is 'prior';
# order_id becomes the index.
df_userorders = (
    pd.read_csv(fname_userorders)
    .loc[lambda orders: orders['eval_set'] == 'prior', ['order_id', 'user_id']]
    .set_index('order_id')
)
df_userorders.head()

Unnamed: 0_level_0,user_id
order_id,Unnamed: 1_level_1
2539329,1
2398795,1
473747,1
2254736,1
431534,1


In [11]:
# product to order mapping
# Only order_id and product_id are kept by the following cell, so parse just
# those two columns instead of materialising every column of the file.
df_productorders = pd.read_csv(fname_productorders, usecols=['order_id', 'product_id'])

In [12]:
# Restrict the table to the order_id / product_id pair and preview it.
df_productorders = df_productorders.loc[:, ['order_id', 'product_id']]
df_productorders.head()

Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035


In [13]:
# creating full dataset
# df_userorders carries order_id as its index, so reset_index() turns it back
# into a column and the join can be expressed with a plain column label rather
# than a raw array of index values. validate= makes the merge raise if an
# order_id ever appears more than once in df_userorders, which would otherwise
# silently duplicate product rows.
df_full = df_productorders.merge(
    df_userorders.reset_index(),
    on='order_id',
    how='inner',
    validate='many_to_one',
)

In [15]:
# Attach each product's aisle_id to the order/user rows.
# This cell rebinds df_full from itself, so any aisle_id column left over from
# an earlier run is dropped first; without that, a second run would produce
# aisle_id_x / aisle_id_y suffix columns instead of a single aisle_id.
# validate= makes the merge raise if product_id is not unique in df_products.
df_full = (
    df_full
    .drop(columns='aisle_id', errors='ignore')
    .merge(
        df_products.reset_index(),
        on='product_id',
        how='inner',
        validate='many_to_one',
    )
)

In [16]:
# Preview the first rows of the merged table (order_id, product_id, user_id, aisle_id).
df_full.head()

Unnamed: 0,order_id,product_id,user_id,aisle_id
0,2,33120,202279,86
1,26,33120,153404,86
2,120,33120,23750,86
3,327,33120,58707,86
4,390,33120,166654,86


In [18]:
# Number of df_full rows for every (user_id, aisle_id) pair, as a Series with a
# two-level index.
user_aisle_count = df_full.groupby(['user_id', 'aisle_id']).size()

In [21]:
# Display the long-format counts indexed by (user_id, aisle_id).
user_aisle_count

user_id  aisle_id
1        21           8
         23          12
         24           5
         45           1
         53           2
         54           2
         77          13
         88           1
         91           2
         117          9
         120          1
         121          3
2        1            1
         3            6
         14           1
         17           2
         20           1
         21           5
         23          10
         24          33
         31           5
         38          12
         42           4
         48           2
         49           1
         57           1
         58           1
         66           3
         67           9
         72           5
                     ..
206209   59           2
         60           1
         61           1
         66           1
         67           4
         74           2
         75           8
         77          10
         78           4
         81           

In [22]:
# Pivot the long (user_id, aisle_id) -> count Series into a wide table with one
# row per user_id and one column per aisle_id; missing pairs become 0.
# The variable is rebound from itself, so only pivot while it still holds the
# long Series: calling unstack() again on the wide DataFrame would reshape it a
# second time and change what the cells below write out.
if isinstance(user_aisle_count, pd.Series):
    user_aisle_count = user_aisle_count.unstack(level='aisle_id', fill_value=0)

In [23]:
# Display the wide table: rows are user_id, columns are aisle_id.
user_aisle_count

aisle_id,1,2,3,4,5,6,7,8,9,10,...,125,126,127,128,129,130,131,132,133,134
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
10,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Persist only the matrix cells: no header row and no index column.
user_aisle_count.to_csv(
    "data/interim/user_aisle_matrix_values.csv",
    header=False,
    index=False,
)

In [25]:
# Persist the column labels (the aisle_id values) as a single CSV row.
column_labels = pd.DataFrame(user_aisle_count.columns)
column_labels.T.to_csv(
    "data/interim/user_aisle_matrix_headers.csv",
    header=False,
    index=False,
)

In [26]:
# Persist the row labels (the user_id values) as a single CSV row.
row_labels = pd.DataFrame(user_aisle_count.index)
row_labels.T.to_csv(
    "data/interim/user_aisle_matrix_rowIDs.csv",
    header=False,
    index=False,
)