# Import Libraries

In [1]:
from sklearn.externals import joblib
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

  from numpy.core.umath_tests import inner1d


# Import Pickle

In [2]:
from sagemaker import get_execution_role
import boto3
from s3fs.core import S3FileSystem
role = get_execution_role()
bucket='instacartresearchsymposium'
data_key = 'userProductTotal.pickle'
s3 = S3FileSystem(anon=False)
userProductTotal = joblib.load(s3.open('{}/{}'.format(bucket, data_key), mode='rb'))
userProductTotal.head()

Unnamed: 0,ProductID,UserID,UserTotalProductOrder
0,1,138,2
1,1,709,1
2,1,751,1
3,1,764,2
4,1,777,1


In [3]:
data_key = 'popularProducts.pickle'
popularProducts = joblib.load(s3.open('{}/{}'.format(bucket, data_key), mode='rb'))

data_key = 'popularUsers.pickle'
popularUsers = joblib.load(s3.open('{}/{}'.format(bucket, data_key), mode='rb'))


# Make User x Product Matrix 

In [4]:
popularUsers.head()
popularUsers.count()

UserID                           52368
UserTotalUniqueProductOrdered    52368
dtype: int64

In [5]:
popularProducts.head()

Unnamed: 0,ProductID,ProductTotalOrdered
0,1,1928
2,3,283
3,4,351
9,10,2691
22,23,1138


In [6]:
userProductTotal.count()

ProductID                13863746
UserID                   13863746
UserTotalProductOrder    13863746
dtype: int64

## Dropping Unpopular Product

In [7]:
userProductTotal_droppedUnpopularProducts = userProductTotal[userProductTotal.ProductID.isin(popularProducts.ProductID)]
userProductTotal_droppedUnpopularProducts.count()

ProductID                12522195
UserID                   12522195
UserTotalProductOrder    12522195
dtype: int64

## Dropping Unpopular User

In [8]:
userProductTotal_droppedUnpopularUser = userProductTotal_droppedUnpopularProducts[userProductTotal_droppedUnpopularProducts.UserID.isin(popularUsers.UserID)]
userProductTotal_droppedUnpopularUser.count()

ProductID                6846961
UserID                   6846961
UserTotalProductOrder    6846961
dtype: int64

## Make Pivot Table

In [9]:
userProductTotal_pivot = userProductTotal_droppedUnpopularUser.pivot(index='ProductID', columns='UserID', values='UserTotalProductOrder').fillna(0)

In [10]:
#userProductTotal_pivot = userProductTotal.groupby(['ProductID', 'UserID'])['UserTotalProductOrder'].max().unstack()

In [11]:
userProductTotal_pivot.head()

UserID,2,10,14,19,21,27,28,31,35,37,...,206165,206166,206174,206193,206199,206200,206201,206206,206207,206208
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# transform matrix to scipy sparse matrix

In [12]:
from scipy.sparse import csr_matrix
userProductTotal_pivot_sparce = csr_matrix(userProductTotal_pivot.values)

# Make KNN Model

In [None]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)

In [14]:
knn.fit(userProductTotal_pivot_sparce)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=20, p=2, radius=1.0)

In [15]:
data_key = 'orig_products.pickle'
products = joblib.load(s3.open('{}/{}'.format(bucket, data_key), mode='rb'))
products.head()

Unnamed: 0,ProductID,ProductName,AisleID,DepartmentID
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [18]:
# create mapper from movie title to index
mapper = userProductTotal_droppedUnpopularUser.groupby("ProductID",as_index = False).count().drop_duplicates()[["ProductID"]]
mapper = mapper.reset_index()

def getIndexForProductID(productID): 
    return mapper.loc[mapper.ProductID == productID]["index"].values[0]
getIndexForProductID(23)

4

In [19]:
def makeProductRecommendation(model, data, productID, numRecommendations):
    model.fit(data)
    print("Product ID inserted:", productID, products[products.ProductID == productID]["ProductName"].to_string(index=False))
    print("Recommending....")
    distances,indices = model.kneighbors(data[getIndexForProductID(productID)], n_neighbors = numRecommendations + 1)
    raw_recommends = \
        sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    # print recommendations
    #print(raw_recommends)
    for i, (productID, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i+1, products[products.ProductID == productID]["ProductName"].to_string(index=False), dist))

In [20]:
makeProductRecommendation(knn,userProductTotal_pivot_sparce, 21137, 5)

Product ID inserted: 21137 Organic Strawberries
Recommending....
1: Unsweetened Strawberry Kiwi Water, with distance of 0.5410525998524192
2: Garlic Asiago Brazilian Cheese Bread, with distance of 0.5248754978423431
3: Chicken & vegetable dumplings with rice in an ..., with distance of 0.5009665165843328
4: Original English Muffins, with distance of 0.4858102812322225
5: Mung Bean Pasta, with distance of 0.4801123950004902


In [21]:
products[products.ProductID == 21137]["ProductName"].to_string(index=False)

'Organic Strawberries'

In [22]:
joblib.dump(userProductTotal_pivot_sparce, "userProductTotal_pivot_sparce")
joblib.dump(mapper, "userProductTotal_pivot_sparce_mapper")

['userProductTotal_pivot_sparce_mapper']