In [1]:
import numpy as np
import pandas as pd 
import scipy.sparse as ssp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

In [2]:
#Input data
data_path = "/Users/BharathiSrinivasan/Documents/HU-MEMS-Sem3/Info_Systems/repo/InstaCart/input/"
orders = pd.read_csv(data_path + "orders.csv")
train_orders = pd.read_csv(data_path + "order_products__train.csv")
prior_orders = pd.read_csv(data_path + "order_products__prior.csv")
products = pd.read_csv(data_path + "products.csv").set_index('product_id')

In [8]:
#Only keep orders which have been reordered
prior_orders = prior_orders[prior_orders.reordered==1]

#Merge with orders dataset to get user data
prior_ord = pd.merge(prior_orders,orders,on='order_id',how='left')
products = products.reset_index()
prior_ord.head()

In [9]:
#Merge with products dataset
prior_ord = pd.merge(prior_ord,products,on='product_id',how='left')
prior_ord[['user_id','order_id','product_id','product_name','reordered']].head()

Unnamed: 0,user_id,order_id,product_id,product_name,reordered
0,202279,2,33120,Organic Egg Whites,1
1,202279,2,28985,Michigan Organic Kale,1
2,202279,2,45918,Coconut Butter,1
3,202279,2,17794,Carrots,1
4,202279,2,40141,Original Unflavored Gelatine Mix,1


In [10]:
prior_ord["product_name"] = prior_ord["product_name"].astype(str)
prior_ord = prior_ord.groupby("user_id").apply(lambda order: order['product_name'].tolist())
prior_ord = prior_ord.reset_index()
prior_ord.columns = ['user_id','product_set']
prior_ord.product_set = prior_ord.product_set.astype(str)
prior_ord.head()

Unnamed: 0,user_id,product_set
0,1,"['Soda', 'Original Beef Jerky', 'Pistachios', ..."
1,2,"['Chipotle Beef & Pork Realstick', 'Chipotle B..."
2,3,"['Organic Baby Spinach', 'Unsweetened Chocolat..."
3,4,['Enchilada Black Bean Vegetable']
4,5,"['Plain Whole Milk Yogurt', 'Whole Vitamin D M..."


In [15]:
tfidf = TfidfVectorizer(min_df=5, max_features=1000
                        , strip_accents='unicode',lowercase =True,
analyzer='word', token_pattern=r'\w+', use_idf=True, 
smooth_idf=True, sublinear_tf=True, stop_words = 'english')
tfidf.fit(prior_ord['product_set'])

print(tfidf.vocabulary_)


{'soda': 839, 'original': 632, 'beef': 81, 'jerky': 480, 'pistachios': 682, 'organic': 630, 'string': 887, 'cheese': 189, 'bag': 57, 'bananas': 68, 'cinnamon': 216, 'toast': 921, 'crunch': 272, 'xl': 984, 'size': 817, 'paper': 643, 'towel': 936, 'rolls': 760, 'half': 433, 'zero': 998, 'calorie': 164, 'cola': 232, 'aged': 23, 'white': 979, 'cheddar': 187, 'popcorn': 693, 'chipotle': 205, 'pork': 695, 'banana': 67, 'good': 406, 'juice': 481, 'drink': 316, 'blueberry': 107, 'acai': 20, 'flavor': 358, 'stackers': 876, 'brown': 132, 'rice': 750, 'lightly': 524, 'salted': 777, 'hass': 441, 'avocado': 52, 'cake': 160, 'salt': 776, 'free': 370, 'strawberry': 885, 'rhubarb': 749, 'yoghurt': 991, 'raspberry': 736, 'chips': 206, 'cherry': 194, 'pomegranate': 691, 'greek': 424, 'yogurt': 992, 'fat': 341, 'total': 934, '2': 5, 'natural': 596, 'strained': 883, 'honey': 460, 'lowfat': 537, 'sparkling': 852, 'lemon': 515, 'water': 973, 'sticks': 881, 'low': 536, 'moisture': 579, 'skim': 818, 'mozzarel

In [16]:
text = tfidf.transform(prior_ord['product_set'])
print(text.todense())

svd = TruncatedSVD(n_components=2)
text = svd.fit_transform(text)
text = pd.DataFrame(text)
text.columns = ['pf_0','pf_1']
text['user_id'] = prior_ord.user_id
text.head()

[[0.         0.         0.         ... 0.         0.18440683 0.        ]
 [0.12561484 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.11617723 0.         ... 0.         0.         0.0977696 ]
 [0.         0.         0.12298043 ... 0.         0.         0.06689542]
 [0.         0.05980777 0.         ... 0.         0.         0.        ]]


Unnamed: 0,pf_0,pf_1,user_id
0,0.163496,-0.104835,1
1,0.406453,-0.149489,2
2,0.398428,0.083896,3
3,0.060366,-0.013478,4
4,0.263798,0.155411,5


In [17]:
print(tfidf.idf_)
idf = tfidf.idf_

[3.47231187 3.32551223 5.1449477  2.33289859 4.36668076 2.40837576
 4.60981597 3.32995399 3.59661216 4.41880028 5.18520033 4.48560567
 4.75777037 5.38658998 4.85377685 4.91406777 4.34063212 5.26740146
 5.40656667 3.50262978 4.35279755 3.46250303 5.24689684 3.67553272
 4.4497833  5.16407954 4.39445228 5.32969531 4.53652696 2.32320276
 3.91876303 3.62794124 5.43231917 5.73460004 5.38421937 4.29592055
 5.16028687 4.20830019 2.04283959 5.38303618 2.87174311 4.97089259
 5.02506133 4.68693711 5.15180559 5.25796578 5.00190899 3.59121101
 5.32484908 3.46342757 5.15965615 4.85703267 2.19572953 4.11668277
 1.91345225 4.95381143 3.14437489 2.21637933 5.79582008 3.79253883
 4.96257437 3.81982176 4.91530158 5.23560888 2.94328028 4.44329378
 5.0269921  2.12213892 2.46329655 2.88158084 5.2110822  5.28977058
 3.11898461 3.83412435 3.14122846 5.03364032 4.52593239 4.28652832
 5.24038215 3.31309966 2.55745699 2.97154542 4.41235879 5.0070379
 4.52844475 5.54344738 2.9211221  5.16471305 4.3584409  3.17435

In [None]:
#Continuous columns - TO DO
idf = tf.feature_column.numeric_column('idf')
dow = tf.feature_column.numeric_column('order_dow')
hour = tf.feature_column.numeric_column('order_hour_of_day')
days_prior = tf.feature_column.numeric_column('days_since_prior_order')
cart_order = tf.feature_column.numeric_column('add_to_cart_order')
