# Imports

In [1]:
from sqlalchemy import create_engine
from config import USERNAME, PASSWORD, HOST_PORT, DB_NAME
import psycopg2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# create sqlalchemy engine for reading in data from postgresql db
engine = create_engine(f"postgresql+psycopg2://{USERNAME}:{PASSWORD}@localhost:{HOST_PORT}/{DB_NAME}")

# Looking at Recipes Tags and Ratings

In [3]:
# define query to pull data from postgresql
query = """SELECT recipes.recipe_id, recipes.title, tags.tag_id, tags.tag, recipes.total_rating, recipes.review_count
                FROM food.recipes
                INNER JOIN food.recipes_tags ON recipes.recipe_id = recipes_tags.recipe_id
                INNER JOIN food.tags ON recipes_tags.tag_id = tags.tag_id;"""

In [4]:
# pull data in from postgres sql
data = pd.read_sql_query(query, con=engine)

In [5]:
data.shape

(18803, 6)

In [6]:
data.head()

Unnamed: 0,recipe_id,title,tag_id,tag,total_rating,review_count
0,1,Baked French Toast Casserole with Maple Syrup,1,american,4.5,2513
1,1,Baked French Toast Casserole with Maple Syrup,28,american,4.5,2513
2,2,Not Yo' Mama's Banana Pudding,1,american,4.9,2171
3,2,Not Yo' Mama's Banana Pudding,28,american,4.9,2171
4,3,Pumpkin Gooey Butter Cakes,1,american,4.8,2157


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18803 entries, 0 to 18802
Data columns (total 6 columns):
recipe_id       18803 non-null int64
title           18803 non-null object
tag_id          18803 non-null int64
tag             18803 non-null object
total_rating    18803 non-null float64
review_count    18803 non-null int64
dtypes: float64(1), int64(3), object(2)
memory usage: 881.5+ KB


In [8]:
# looking at the first recipe
data[data.recipe_id == 1]

Unnamed: 0,recipe_id,title,tag_id,tag,total_rating,review_count
0,1,Baked French Toast Casserole with Maple Syrup,1,american,4.5,2513
1,1,Baked French Toast Casserole with Maple Syrup,28,american,4.5,2513
13285,1,Baked French Toast Casserole with Maple Syrup,24,dessert,4.5,2513


In [9]:
# split out part of the data to look at only recipe id, tag id and tag
data_tags = data[["recipe_id", "tag_id", "tag"]]

In [11]:
# convert the tag id into a string for next step
data_tags.tag_id = data_tags.tag_id.astype(str)

In [12]:
# group the tags to recipe level
data_tags_grouped = data_tags.groupby("recipe_id").agg({"tag_id": " ".join,
                                                                            "tag": " ".join})

In [13]:
data_tags_grouped.head()

Unnamed: 0_level_0,tag_id,tag
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1 28 24,american american dessert
2,1 28 24,american american dessert
3,1 28 24,american american dessert
4,1 28 25,american american side-dish
5,2 39 26,european european main-dish


In [14]:
# split out the data to look at only recipe id, recipe rating, and review count
data_ratings = data[["recipe_id", "total_rating", "review_count"]]

In [15]:
data_ratings.head()

Unnamed: 0,recipe_id,total_rating,review_count
0,1,4.5,2513
1,1,4.5,2513
2,2,4.9,2171
3,2,4.9,2171
4,3,4.8,2157


In [16]:
# group ratings and review counts to recipe level
data_avg_ratings = data_ratings.groupby("recipe_id").agg({"total_rating":"mean",
                                                                                    "review_count":"mean"})

In [17]:
# merge the dataframe with tags and the dataframe with ratings
data_tags_ratings = data_avg_ratings.merge(data_tags_grouped, how="left", on="recipe_id").reset_index()

In [18]:
data_tags_ratings.head()

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag
0,1,4.5,2513,1 28 24,american american dessert
1,2,4.9,2171,1 28 24,american american dessert
2,3,4.8,2157,1 28 24,american american dessert
3,4,4.7,1689,1 28 25,american american side-dish
4,5,4.6,1306,2 39 26,european european main-dish


In [19]:
# check shape
data_tags_ratings.shape

(6652, 5)

In [20]:
# check null values
data_tags_ratings.isnull().sum()

recipe_id       0
total_rating    0
review_count    0
tag_id          0
tag             0
dtype: int64

In [21]:
# check the recipes with uncategorized labels
data_tags_ratings[data_tags_ratings.tag.str.contains("uncategorized")]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag
151,152,4.5,229,6 33 24,uncategorized uncategorized dessert
163,164,4.8,217,6 33 26,uncategorized uncategorized main-dish
207,208,4.8,179,6 33 24,uncategorized uncategorized dessert
261,262,4.6,148,6 33 27,uncategorized uncategorized appetizer
293,294,4.7,134,6 33 26,uncategorized uncategorized main-dish
323,324,4.6,119,6 33 25,uncategorized uncategorized side-dish
343,344,4.6,112,6 33 26,uncategorized uncategorized main-dish
349,350,4.6,111,6 33 27,uncategorized uncategorized appetizer
352,353,4.7,110,6 33 24,uncategorized uncategorized dessert
365,366,4.9,107,6 33 26,uncategorized uncategorized main-dish


# TF-IDF Vectors

In [22]:
# instantiate the TF IDF vectorizer from sklearn for vectorizing the tags
tf_idf = TfidfVectorizer()

In [23]:
# fit and transform the tags into vectors
data_tags_ratings_tfidf = tf_idf.fit_transform(data_tags_ratings.tag_id)

In [24]:
data_tags_ratings_tfidf

<6652x36 sparse matrix of type '<class 'numpy.float64'>'
	with 12837 stored elements in Compressed Sparse Row format>

In [25]:
# calculate cosine similarity of the tag vectors in the sparse matrix
r2r = cosine_similarity(data_tags_ratings_tfidf)

In [26]:
r2r

array([[1.        , 1.        , 1.        , ..., 0.45528307, 0.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 0.45528307, 0.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 0.45528307, 0.        ,
        1.        ],
       ...,
       [0.45528307, 0.45528307, 0.45528307, ..., 1.        , 0.55286517,
        0.45528307],
       [0.        , 0.        , 0.        , ..., 0.55286517, 1.        ,
        0.        ],
       [1.        , 1.        , 1.        , ..., 0.45528307, 0.        ,
        1.        ]])

In [27]:
# create a dataframe of the cosine similarity scores of the vectorized tags
data_tfidf_r2r = pd.DataFrame(r2r)

In [28]:
data_tfidf_r2r.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6642,6643,6644,6645,6646,6647,6648,6649,6650,6651
0,1.0,1.0,1.0,0.44465,0.0,0.548466,1.0,0.548466,0.548466,1.0,...,0.0,0.0,0.44465,0.0,0.0,0.0,0.0,0.455283,0.0,1.0
1,1.0,1.0,1.0,0.44465,0.0,0.548466,1.0,0.548466,0.548466,1.0,...,0.0,0.0,0.44465,0.0,0.0,0.0,0.0,0.455283,0.0,1.0
2,1.0,1.0,1.0,0.44465,0.0,0.548466,1.0,0.548466,0.548466,1.0,...,0.0,0.0,0.44465,0.0,0.0,0.0,0.0,0.455283,0.0,1.0
3,0.44465,0.44465,0.44465,1.0,0.0,0.59036,0.44465,0.59036,0.59036,0.44465,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.490059,0.0,0.44465
4,0.0,0.0,0.0,0.0,1.0,0.230891,0.0,0.230891,0.230891,0.0,...,0.270737,0.270737,0.0,0.270737,0.197761,1.0,0.270737,0.0,0.0,0.0


In [29]:
data_tags_ratings.columns

Index(['recipe_id', 'total_rating', 'review_count', 'tag_id', 'tag'], dtype='object')

In [30]:
index_to_recipe_id = data_tags_ratings.recipe_id

In [32]:
index_to_recipe_id[:10]

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
Name: recipe_id, dtype: int64

In [33]:
# set the tf_idf cosine similarity table's columns to reflect which scores correspond to which recipe
data_tfidf_r2r.columns = [str(index_to_recipe_id[int(col)]) for col in data_tfidf_r2r.columns]

In [34]:
data_tfidf_r2r.columns

Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       ...
       '6643', '6644', '6645', '6646', '6647', '6648', '6649', '6650', '6651',
       '6652'],
      dtype='object', length=6652)

In [35]:
# set the tf_idf cosine similarity table's index to reflect which scores correspond to which recipe
data_tfidf_r2r.index = [index_to_recipe_id[idx] for idx in data_tfidf_r2r.index]

In [36]:
data_tfidf_r2r.index

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            6643, 6644, 6645, 6646, 6647, 6648, 6649, 6650, 6651, 6652],
           dtype='int64', length=6652)

In [37]:
data_tfidf_r2r.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,6643,6644,6645,6646,6647,6648,6649,6650,6651,6652
1,1.0,1.0,1.0,0.44465,0.0,0.548466,1.0,0.548466,0.548466,1.0,...,0.0,0.0,0.44465,0.0,0.0,0.0,0.0,0.455283,0.0,1.0
2,1.0,1.0,1.0,0.44465,0.0,0.548466,1.0,0.548466,0.548466,1.0,...,0.0,0.0,0.44465,0.0,0.0,0.0,0.0,0.455283,0.0,1.0
3,1.0,1.0,1.0,0.44465,0.0,0.548466,1.0,0.548466,0.548466,1.0,...,0.0,0.0,0.44465,0.0,0.0,0.0,0.0,0.455283,0.0,1.0
4,0.44465,0.44465,0.44465,1.0,0.0,0.59036,0.44465,0.59036,0.59036,0.44465,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.490059,0.0,0.44465
5,0.0,0.0,0.0,0.0,1.0,0.230891,0.0,0.230891,0.230891,0.0,...,0.270737,0.270737,0.0,0.270737,0.197761,1.0,0.270737,0.0,0.0,0.0


# Most Similar Recipes

In [38]:
# looking at top ten similar recipes based on cuisine or category
data_tfidf_r2r.iloc[0].sort_values(ascending=False)[:10]

6652    1.0
1324    1.0
379     1.0
382     1.0
394     1.0
397     1.0
399     1.0
401     1.0
424     1.0
435     1.0
Name: 1, dtype: float64

In [39]:
# look at first recipe
data_tags_ratings[data_tags_ratings.recipe_id == 6652]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag
6651,6652,4.9,13,1 28 24,american american dessert


In [40]:
# grab the title from main dataframe with recipe_id
data[data.recipe_id == 6652]

Unnamed: 0,recipe_id,title,tag_id,tag,total_rating,review_count
13283,6652,Fruit Cobbler,1,american,4.9,13
13284,6652,Fruit Cobbler,28,american,4.9,13
18802,6652,Fruit Cobbler,24,dessert,4.9,13


In [41]:
# look at second recipe
data_tags_ratings[data_tags_ratings.recipe_id == 1324]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag
1323,1324,3.0,2,1 28 24,american american dessert


In [42]:
# grab the recipe name from main dataframe with recipe _id
data[data.recipe_id == 1324]

Unnamed: 0,recipe_id,title,tag_id,tag,total_rating,review_count
2645,1324,Mango Flavored Syrup for Soft Drinks,1,american,3.0,2
2646,1324,Mango Flavored Syrup for Soft Drinks,28,american,3.0,2
14440,1324,Mango Flavored Syrup for Soft Drinks,24,dessert,3.0,2


In [43]:
# look at third recipe
data_tags_ratings[data_tags_ratings.recipe_id == 379]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag
378,379,4.7,103,1 28 24,american american dessert


In [44]:
# grab the recipe name from main dataframe with recipe_id
data[data.recipe_id == 379]

Unnamed: 0,recipe_id,title,tag_id,tag,total_rating,review_count
756,379,Slow Berry Cobbler,1,american,4.7,103
757,379,Slow Berry Cobbler,28,american,4.7,103
13635,379,Slow Berry Cobbler,24,dessert,4.7,103


In [45]:
# looking at the recipes when the similarity scores differ from 1 - sharp drop
data_tfidf_r2r.iloc[0].sort_values(ascending=False)[160:170]

155     1.000000
6344    1.000000
1       1.000000
3546    0.642726
6154    0.642726
5086    0.642726
4208    0.642726
5110    0.642726
5113    0.642726
2389    0.642726
Name: 1, dtype: float64

In [46]:
# look at first recipe after similarity score drops from 1 to 0.64
data_tags_ratings[data_tags_ratings.recipe_id == 3546]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag
3545,3546,5.0,3,1 28,american american


In [47]:
# grab recipe name from main dataframe with recipe id
data[data.recipe_id == 3546]

Unnamed: 0,recipe_id,title,tag_id,tag,total_rating,review_count
7088,3546,Furikake Salmon,1,american,5.0,3
7089,3546,Furikake Salmon,28,american,5.0,3


In [48]:
# look at second recipe after similarity score drops from 1 to 0.64
data_tags_ratings[data_tags_ratings.recipe_id == 6154]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag
6153,6154,2.0,14,1 28,american american


In [49]:
# grab recipe name from main dataframe with recipe id
data[data.recipe_id == 6154]

Unnamed: 0,recipe_id,title,tag_id,tag,total_rating,review_count
12287,6154,Chocolate-Hazelnut Sandwich Cookies,1,american,2.0,14
12288,6154,Chocolate-Hazelnut Sandwich Cookies,28,american,2.0,14


# User Profile for User #1

In [50]:
# define query to pull data from postgresql
query2 = """SELECT reviews.recipe_id, recipes.title, reviews.user_id, reviews.rating, tags.tag_id, tags.tag FROM food.reviews
                INNER JOIN food.recipes ON recipes.recipe_id = reviews.recipe_id
                INNER JOIN food.recipes_tags ON recipes_tags.recipe_id = recipes.recipe_id
                INNER JOIN food.tags ON tags.tag_id = recipes_tags.tag_id;"""

In [51]:
# pull data in from postgres sql
user_data = pd.read_sql_query(query2, con=engine)

In [52]:
user_data.shape

(188039, 6)

In [53]:
user_data.head()

Unnamed: 0,recipe_id,title,user_id,rating,tag_id,tag
0,1,Baked French Toast Casserole with Maple Syrup,112,5.0,24,dessert
1,1,Baked French Toast Casserole with Maple Syrup,112,5.0,28,american
2,1,Baked French Toast Casserole with Maple Syrup,112,5.0,1,american
3,2,Not Yo' Mama's Banana Pudding,113,5.0,24,dessert
4,2,Not Yo' Mama's Banana Pudding,113,5.0,28,american


In [54]:
user_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188039 entries, 0 to 188038
Data columns (total 6 columns):
recipe_id    188039 non-null int64
title        188039 non-null object
user_id      188039 non-null int64
rating       188039 non-null float64
tag_id       188039 non-null int64
tag          188039 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 8.6+ MB


In [56]:
# convert tag_id to string from integer
user_data.tag_id = user_data.tag_id.apply(str)

In [60]:
user_data_grouped = user_data.groupby(["recipe_id", "title", "user_id"]).agg({"rating":"mean", "tag_id":" ".join, "tag":" ".join}).reset_index()

In [61]:
user_data_grouped.head()

Unnamed: 0,recipe_id,title,user_id,rating,tag_id,tag
0,1,Baked French Toast Casserole with Maple Syrup,112,5.0,24 28 1,dessert american american
1,1,Baked French Toast Casserole with Maple Syrup,114,5.0,24 28 1 24 28 1 24 28 1,dessert american american dessert american ame...
2,1,Baked French Toast Casserole with Maple Syrup,4251,5.0,24 28 1,dessert american american
3,1,Baked French Toast Casserole with Maple Syrup,7643,4.0,24 28 1,dessert american american
4,1,Baked French Toast Casserole with Maple Syrup,10606,5.0,24 28 1,dessert american american


In [64]:
# set the information on user #1 to a variable
user_data1 = user_data_grouped[user_data_grouped.user_id == 112]

In [65]:
user_data1.head()

Unnamed: 0,recipe_id,title,user_id,rating,tag_id,tag
0,1,Baked French Toast Casserole with Maple Syrup,112,5.0,24 28 1,dessert american american
13402,1956,Baked Chinese Rice with Peas and Ginger,112,5.0,24 28 1,dessert american american


In [67]:
user_data1["weight"] = user_data1.rating/5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [68]:
user_data1.head()

Unnamed: 0,recipe_id,title,user_id,rating,tag_id,tag,weight
0,1,Baked French Toast Casserole with Maple Syrup,112,5.0,24 28 1,dessert american american,1.0
13402,1956,Baked Chinese Rice with Peas and Ginger,112,5.0,24 28 1,dessert american american,1.0


In [73]:
data_tags_ratings_tfidf[user_data1.user_id.values].toarray().T

array([[0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.5213493 , 0.5213493 ],
       [0.        , 0.        ],
       [0.85334337, 0.85334337],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.

In [74]:
# calculate the dot product of the 
user1_profile = np.dot(data_tags_ratings_tfidf[user_data1.user_id.values].toarray().T, user_data1.weight.values)

In [75]:
user1_profile

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.04269861, 0.        , 1.70668673, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [76]:
# find the cosine similarity between the recipes based on tag vectors
C = cosine_similarity(user1_profile.reshape(1, -1), data_tags_ratings_tfidf)

In [77]:
C

array([[0.54846592, 0.54846592, 0.54846592, ..., 0.6044766 , 0.        ,
        0.54846592]])

In [78]:
# sort the similarity scores in descending order
R = np.argsort(C)[:, ::-1]

In [79]:
R

array([[5869, 2121,  284, ..., 3291, 3290, 3325]])

In [81]:
# separate out the similarity scores from the user id values to be used to generate recommendations
recommendations =  [i for i in R[0] if i not in user_data1["user_id"].values]

In [82]:
# merge in the recipe names to the dataframe with tags, average ratings, average review counts
data_tags_ratings_titles = data_tags_ratings.merge(data[["recipe_id", "title"]], how="left", on="recipe_id")

In [83]:
data_tags_ratings_titles.shape

(18803, 6)

In [84]:
# drop duplicates
data_tags_ratings_titles.drop_duplicates(subset=["recipe_id", "title"], inplace=True)

In [85]:
data_tags_ratings_titles.shape

(6652, 6)

In [86]:
data_tags_ratings_titles.head()

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag,title
0,1,4.5,2513,1 28 24,american american dessert,Baked French Toast Casserole with Maple Syrup
3,2,4.9,2171,1 28 24,american american dessert,Not Yo' Mama's Banana Pudding
6,3,4.8,2157,1 28 24,american american dessert,Pumpkin Gooey Butter Cakes
9,4,4.7,1689,1 28 25,american american side-dish,Corn Casserole
12,5,4.6,1306,2 39 26,european european main-dish,Roman-Style Chicken


In [87]:
data_tags_ratings_titles.tail()

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag,title
18788,6648,4.7,13,2 39 26,european european main-dish,"Pat's Potato Pierogis ""Elegante"""
18791,6649,5.0,12,6 33 26,uncategorized uncategorized main-dish,Egg Baked in Acorn Squash
18794,6650,4.6,13,1 28 27,american american appetizer,Grilled French Bread Pizza with Mushroom Pesto...
18797,6651,4.9,13,6 33 27,uncategorized uncategorized appetizer,The Perfect Boiled Eggs
18800,6652,4.9,13,1 28 24,american american dessert,Fruit Cobbler


In [88]:
data_tags_ratings_titles.isnull().sum()

recipe_id       0
total_rating    0
review_count    0
tag_id          0
tag             0
title           0
dtype: int64

In [89]:
data_tags_ratings_titles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6652 entries, 0 to 18800
Data columns (total 6 columns):
recipe_id       6652 non-null int64
total_rating    6652 non-null float64
review_count    6652 non-null int64
tag_id          6652 non-null object
tag             6652 non-null object
title           6652 non-null object
dtypes: float64(1), int64(2), object(3)
memory usage: 363.8+ KB


In [90]:
# look at the top couple of recommendations for user 1
data_tags_ratings_titles.title[recommendations].head(10)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


5869                                                  NaN
2121                                                  NaN
284                                                   NaN
282     Momma Callie's Banana Nut Bread with Honey Butter
281                                                   NaN
280                                                   NaN
279                         Lemon-Garlic Shrimp and Grits
2105                Slow-Cooker Chinese Beef and Bok Choy
275                                                   NaN
273                               Spaghetti and Meatballs
Name: title, dtype: object

In [91]:
data_tags_ratings_titles[data_tags_ratings_titles.recipe_id==5869]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag,title
16543,5869,5.0,1,7 32 27,asian asian appetizer,"Grilled Shrimp ""Lollipop"" with Spicy Almond Sauce"
