# User Collaborative Recommender Systems

Reference: GA class notebook by Riley Dallas<br>
**This note book can answer this question: Given a user in the receipe review dataset, can I find other users who have similar preferences?**

In [1]:
# Import library
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'    # with this can handle more merories, avoid kernel dead error.
import time      # track the program running time

## Load `recipes.csv` and `reviews.csv`

In [2]:
t0=time.time()
recipes = pd.read_csv('organized_recipes.csv')
recipes.head(3)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,calories,total_fat,sugar,sodium,protein,sat_fat,carbs
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,269.8,22.0,32.0,48.0,39.0,27.0,5.0


In [3]:
ratings = pd.read_csv('cleaned_reviews.csv')
ratings.head(3)

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...


## Drop unnecessary columns
---

We won't need the `date` or `review` column from `ratings`. We only  `name` and `id` columns from `recipes`. 

In [4]:
ratings = ratings[['user_id', 'recipe_id', 'rating']]

In [5]:
recipes = recipes[['id','name']]

In [6]:
recipes.shape,  ratings.shape

((191481, 2), (1071351, 3))

In [7]:
df = pd.merge(ratings, recipes, how='inner', left_on='recipe_id', right_on='id').drop(columns='id')

In [8]:
print(df.shape)
df.head()

(885798, 4)


Unnamed: 0,user_id,recipe_id,rating,name
0,76535,134728,4,kfc honey bbq strips
1,273745,134728,5,kfc honey bbq strips
2,353911,134728,5,kfc honey bbq strips
3,190375,134728,5,kfc honey bbq strips
4,255338,134728,5,kfc honey bbq strips


In [9]:
review_count = df.groupby('recipe_id').count()
review_count

Unnamed: 0_level_0,user_id,rating,name
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
40,9,9,9
45,2,2,2
46,2,2,2
49,18,18,18
58,7,7,7
...,...,...,...
537319,1,1,1
537458,1,1,1
537459,1,1,1
537485,1,1,1


In [10]:
# Due to limitation of the computer memory (AND google colab), 
# we only select recipes with number of ratings between 5 and 8 to avoid "kernal dead error"
selected_recipes = review_count[(review_count['rating'] > 4) & (review_count['rating'] < 9)].index
selected_recipes

Int64Index([    58,     91,     92,     93,    136,    139,    170,    210,
               224,    240,
            ...
            526222, 530478, 531253, 532736, 532740, 533699, 534900, 535779,
            536119, 536678],
           dtype='int64', name='recipe_id', length=23095)

In [11]:
# select the recipes subset.
df = df.set_index('recipe_id').loc[selected_recipes,:]

In [12]:
df.reset_index(inplace=True)

In [13]:
# display new size of the dataset, it becomes much smaller.
print(df.shape)
df.head()

(141512, 4)


Unnamed: 0,recipe_id,user_id,rating,name
0,58,437767,3,low fat burgundy beef vegetable stew
1,58,162826,5,low fat burgundy beef vegetable stew
2,58,5060,5,low fat burgundy beef vegetable stew
3,58,1060485,3,low fat burgundy beef vegetable stew
4,58,1279229,5,low fat burgundy beef vegetable stew


In [14]:
# clear up the memeories
del ratings
ratings = pd.DataFrame() 

del recipes
recipes = pd.DataFrame()

In [15]:
t1=time.time()-t0
t1

8.313942909240723

## Create pivot table
---

Because we're creating an user-based collaborative recommender, we'll set up our pivot table as follows:
1. The `user_id` will be the index
2. The `name` will be the column
3. The `rating` will be the value


In [16]:
pivot = pd.pivot_table(df, index='user_id', columns='name', values='rating')

pivot.head()

name,1 000 artichoke hearts,1 2 3 jambalaya,1 asian noodle salad,1 favorite chinese steamed whole fish by sy,1 gram fat pumpkin spice muffins low fat,1 hour smoky ham and lentil soup,1 minute stromboli,1 squash dressing,10 bean soup,10 layer poor man s lasagna casserole,...,zucchini with bacon cheese,zucchini with chickpea and mushroom stuffing,zucchini with salsa,zucchini yellow squash stir fry,zuccuash bake from nimz territory,zuke soup,zulu cabbage,zuppa di broccoli broccoli soup,zwiebelkuchen southwest german onion cake,zydeco ya ya deviled eggs
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1533,,,,,,,,,,,...,,,,,,,,,,
1535,,,,,,,,,,,...,,,,,,,,,,
1634,,,,,,,,,,,...,,,,,,,,,,
1676,,,,,,,,,,,...,,,,,,,,,,
1792,,,,,,,,,,,...,,,,,,,,,,


In [17]:
pivot.shape

(37737, 23079)

In [18]:
t2=time.time()-t0
t2

64.92830204963684

## Create sparse matrix
---

Calculate the cosine similarity for each user_id using the `pairwise_distances` function. Before that, we need to create a sparse matrix (datatype) using `scipy`'s `sparse` module like so:


In [19]:
sparse_pivot = sparse.csr_matrix(pivot.fillna(0))
print(sparse_pivot)

  (0, 2996)	5.0
  (0, 3778)	5.0
  (0, 3779)	5.0
  (0, 5325)	5.0
  (0, 8176)	5.0
  (0, 8961)	5.0
  (0, 9277)	5.0
  (0, 10757)	5.0
  (0, 10951)	5.0
  (0, 11706)	5.0
  (0, 11730)	5.0
  (0, 12407)	5.0
  (0, 16513)	5.0
  (0, 16515)	5.0
  (0, 18148)	5.0
  (0, 21652)	4.0
  (0, 21661)	5.0
  (0, 21799)	5.0
  (0, 22717)	5.0
  (0, 23019)	4.0
  (1, 124)	4.0
  (1, 126)	5.0
  (1, 179)	3.0
  (1, 449)	5.0
  (1, 451)	5.0
  :	:
  (37712, 4069)	1.0
  (37713, 8099)	5.0
  (37714, 2122)	5.0
  (37715, 5697)	5.0
  (37716, 14441)	4.0
  (37717, 21584)	5.0
  (37718, 10430)	5.0
  (37719, 20517)	5.0
  (37720, 17490)	5.0
  (37721, 8657)	5.0
  (37722, 3158)	4.0
  (37723, 411)	3.0
  (37724, 21523)	4.0
  (37725, 9688)	5.0
  (37726, 12566)	5.0
  (37727, 10532)	5.0
  (37728, 20190)	3.0
  (37729, 12056)	5.0
  (37730, 17423)	5.0
  (37731, 10532)	1.0
  (37732, 2386)	5.0
  (37733, 3796)	5.0
  (37734, 8098)	5.0
  (37735, 20960)	5.0
  (37736, 2464)	5.0


## Calculate cosine similarity
---

`sklearn` has a built-in `pairwise_distances` function that we can use for our recommender. It will return a square matrix, comparing every recipe with every other resipe in the dataset.

In [20]:
# Note that a distance of 1 is a similarity of 0.
dists = pairwise_distances(sparse_pivot, metric='cosine')
# dists = cosine_distances(sparse_pivot)                         # Identical but more concise

dists

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [21]:
np.round(dists,3)

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [22]:
# Here, similarity is 1 - distance.
similarities = cosine_similarity(sparse_pivot)

In [23]:
#This can not run, kernal dead error. My computer(neither google colab) has enough memory to run this.
#Verify they are the same

# np.all(np.isclose((1.0 - dists), similarities))

## Create distances DataFrame
---

At this point, we essentially have a recommender. We'll load it into a `pandas` DataFrame for readability. 

You'll notice that each movie has a "distance" of 0 with itself (along the diagonal).

In [24]:
recommender_df = pd.DataFrame(similarities, 
                              columns=pivot.index, 
                              index=pivot.index)
recommender_df.head()

user_id,1533,1535,1634,1676,1792,1891,1962,2046,2054,2059,...,2002361642,2002363091,2002363779,2002364091,2002364382,2002368192,2002368412,2002368953,2002369279,2002371843
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1533,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1535,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1634,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1676,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1792,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
recommender_df.shape

(37737, 37737)

In [26]:
t3=time.time()-t0
t3

474.9731650352478

In [27]:
#recommender_df.to_csv('recommender_df.csv', index=False)  # Save this file for quick access

## Ealuate recommender performance
---

Now comes the fun part! Let's check out a few recipes to see if the recommender aligns with our intuition. In the cell below we'll do the following:
1. Create a search for users with user_id
2. Use that to find all user_id matching the search query
3. For each user_id, we'll list off the the ten most similar movies

In [28]:
query = '18338'
user_ids = recommender_df[recommender_df.index.map(lambda id: str(id)).str.contains(query)].index

for user_id in user_ids:
    print(user_id)
#     print('Average rating', recommender_df.loc[name, :].mean())
#     print('Number of ratings', recommender_df.T[name].count())
#     print('')
#     print('10 closest recipes')
    print(recommender_df[user_id].sort_values(ascending=False)[1:11])
    print('')
    print('*******************************************************************************************')
    print('')

183381
user_id
749803        0.329690
181116        0.329690
125728        0.329690
120896        0.329690
130093        0.263752
553402        0.263752
2002168950    0.263752
146348        0.263752
130080        0.263752
251608        0.233126
Name: 183381, dtype: float64

*******************************************************************************************

183386
user_id
42661      0.577350
276663     0.577350
1168939    0.270295
925054     0.243108
94355      0.224055
315055     0.177123
804851     0.169224
353131     0.145803
987323     0.141027
392230     0.119660
Name: 183386, dtype: float64

*******************************************************************************************

2118338
user_id
2118338    1.000000
1580557    0.514496
796254     0.478913
88378      0.145371
360080     0.000000
360437     0.000000
360372     0.000000
360369     0.000000
360194     0.000000
360127     0.000000
Name: 2118338, dtype: float64

***********************************************

In [29]:
pd.DataFrame(recommender_df[[user_id]].sort_values(by=user_id, ascending=False).head(6))

user_id,2118338
user_id,Unnamed: 1_level_1
547623,1.0
2118338,1.0
1580557,0.514496
796254,0.478913
88378,0.145371
1271905,0.0


In [30]:
recommender_df[[user_id]].sort_values(by=user_id, ascending=False).head(6)
# .set_axis(['Value'], axis=1)

user_id,2118338
user_id,Unnamed: 1_level_1
547623,1.0
2118338,1.0
1580557,0.514496
796254,0.478913
88378,0.145371
1271905,0.0


In [31]:
chosen_user = pd.DataFrame()
for user_id in user_ids:
    chosen_user = pd.concat([chosen_user, recommender_df[[user_id]].sort_values(by=user_id, ascending=False).head(6)])

In [32]:
chosen_user

user_id,183381,183386,2118338
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
183381,1.0,,
749803,0.32969,,
120896,0.32969,,
181116,0.32969,,
125728,0.32969,,
146348,0.263752,,
183386,,1.0,
42661,,0.57735,
276663,,0.57735,
1168939,,0.270295,
