# Imports

In [1]:
from sqlalchemy import create_engine
from config import USERNAME, PASSWORD, HOST_PORT, DB_NAME
import psycopg2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# create sqlalchemy engine for reading in data from postgresql db
engine = create_engine(f"postgresql+psycopg2://{USERNAME}:{PASSWORD}@localhost:{HOST_PORT}/{DB_NAME}")

# Looking at Recipes Tags and Ratings

In [3]:
# define query to pull data from postgresql
query = """SELECT recipes.recipe_id, recipes.title, tags.tag_id, tags.tag, recipes.total_rating, recipes.review_count
                FROM food.recipes
                INNER JOIN food.recipes_tags ON recipes.recipe_id = recipes_tags.recipe_id
                INNER JOIN food.tags ON recipes_tags.tag_id = tags.tag_id;"""

In [4]:
# pull data in from postgres sql
data = pd.read_sql_query(query, con=engine)

In [5]:
data.shape

(87937, 6)

In [6]:
data.head()

Unnamed: 0,recipe_id,title,tag_id,tag,total_rating,review_count
0,1,Baked French Toast Casserole with Maple Syrup,1,american,4.5,2513
1,1,Baked French Toast Casserole with Maple Syrup,31,Baking,4.5,2513
2,2,Not Yo' Mama's Banana Pudding,1,american,4.9,2171
3,1,Baked French Toast Casserole with Maple Syrup,32,Dessert,4.5,2513
4,3,Pumpkin Gooey Butter Cakes,1,american,4.8,2157


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87937 entries, 0 to 87936
Data columns (total 6 columns):
recipe_id       87937 non-null int64
title           87937 non-null object
tag_id          87937 non-null int64
tag             87937 non-null object
total_rating    87937 non-null float64
review_count    87937 non-null int64
dtypes: float64(1), int64(3), object(2)
memory usage: 4.0+ MB


In [8]:
# looking at the first recipe
data[data.recipe_id == 1]

Unnamed: 0,recipe_id,title,tag_id,tag,total_rating,review_count
0,1,Baked French Toast Casserole with Maple Syrup,1,american,4.5,2513
1,1,Baked French Toast Casserole with Maple Syrup,31,Baking,4.5,2513
3,1,Baked French Toast Casserole with Maple Syrup,32,Dessert,4.5,2513
5,1,Baked French Toast Casserole with Maple Syrup,33,Make Ahead,4.5,2513
7,1,Baked French Toast Casserole with Maple Syrup,34,American,4.5,2513
9,1,Baked French Toast Casserole with Maple Syrup,35,French Toast Recipes,4.5,2513
11,1,Baked French Toast Casserole with Maple Syrup,36,Egg Recipes,4.5,2513
13,1,Baked French Toast Casserole with Maple Syrup,37,Dairy Recipes,4.5,2513
15,1,Baked French Toast Casserole with Maple Syrup,38,Nut Recipes,4.5,2513
17,1,Baked French Toast Casserole with Maple Syrup,39,Brunch,4.5,2513


In [9]:
# split out part of the data to look at only recipe id, tag id and tag
data_tags = data[["recipe_id", "tag_id", "tag"]]

In [10]:
# convert the tag id into a string for next step
data_tags.loc[:, "tag_id"] = data_tags.loc[:, "tag_id"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [11]:
# group the tags to recipe level
data_tags_grouped = data_tags.groupby("recipe_id").agg({"tag_id": " ".join,
                                                                            "tag": " ".join})

In [12]:
data_tags_grouped.head()

Unnamed: 0_level_0,tag_id,tag
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1 31 32 33 34 35 36 37 38 39 40 41 24,american Baking Dessert Make Ahead American Fr...
2,1 42 32 43 44 45 46 47 48 49 50 34 51 52 37 53...,american Easy Dessert Recipes Dessert Easy Ban...
3,1 42 32 43 56 57 58 59 31 50 34 60 61 52 53 62...,american Easy Dessert Recipes Dessert Easy Eas...
4,1 66 67 43 68 69 56 34 51 70 52 71 63 37 72 25,american Easy Casserole Recipes Casserole Easy...
5,2 73 74 43 75 76 77 78 79 33 80 81 34 82 83 84...,european Easy Chicken Chicken Easy Poultry Eas...


In [13]:
# split out the data to look at only recipe id, recipe rating, and review count
data_ratings = data[["recipe_id", "total_rating", "review_count"]]

In [14]:
data_ratings.head()

Unnamed: 0,recipe_id,total_rating,review_count
0,1,4.5,2513
1,1,4.5,2513
2,2,4.9,2171
3,1,4.5,2513
4,3,4.8,2157


In [15]:
# group ratings and review counts to recipe level
data_avg_ratings = data_ratings.groupby("recipe_id").agg({"total_rating":"mean",
                                                                                    "review_count":"mean"})

In [16]:
# merge the dataframe with tags and the dataframe with ratings
data_tags_ratings = data_avg_ratings.merge(data_tags_grouped, how="left", on="recipe_id").reset_index()

In [17]:
data_tags_ratings.head()

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag
0,1,4.5,2513,1 31 32 33 34 35 36 37 38 39 40 41 24,american Baking Dessert Make Ahead American Fr...
1,2,4.9,2171,1 42 32 43 44 45 46 47 48 49 50 34 51 52 37 53...,american Easy Dessert Recipes Dessert Easy Ban...
2,3,4.8,2157,1 42 32 43 56 57 58 59 31 50 34 60 61 52 53 62...,american Easy Dessert Recipes Dessert Easy Eas...
3,4,4.7,1689,1 66 67 43 68 69 56 34 51 70 52 71 63 37 72 25,american Easy Casserole Recipes Casserole Easy...
4,5,4.6,1306,2 73 74 43 75 76 77 78 79 33 80 81 34 82 83 84...,european Easy Chicken Chicken Easy Poultry Eas...


In [18]:
# check shape
data_tags_ratings.shape

(6652, 5)

In [19]:
# check null values
data_tags_ratings.isnull().sum()

recipe_id       0
total_rating    0
review_count    0
tag_id          0
tag             0
dtype: int64

In [20]:
# check the recipes with uncategorized labels
data_tags_ratings[data_tags_ratings.tag.str.contains("uncategorized")]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag
151,152,4.5,229,6 31 32 147 58 59 41 24,uncategorized Baking Dessert Bread Holiday Tha...
163,164,4.8,217,6 197 43 198 199 76 77 200 112 337 105 233 192...,uncategorized Easy Shrimp Recipes Easy Shellfi...
207,208,4.8,179,6 48 32 46 31 115 365 314 40 144 24,uncategorized Fruit Dessert Recipes Dessert Fr...
261,262,4.6,148,6 125 108 43 76 77 200 112 396 177 241 63 397 ...,uncategorized Easy Appetizer Appetizer Easy Ea...
293,294,4.7,134,6 79 412 387 74 75 77 84 83 26,uncategorized Healthy Food Processor Oven Frie...
323,324,4.6,119,6 132 63 93 231 82 97 122 69 77 112 41 72 103 25,uncategorized Celery Vegetable Beans and Legum...
343,344,4.6,112,6 76 43 77 217 218 96 82 97 112 152 55 41 26,uncategorized Easy Main Dish Easy Main Dish Ea...
349,350,4.6,111,6 162 105 101 63 108 27,uncategorized Vegetarian Soup Tomato Vegetable...
352,353,4.7,110,6 286 156 46 115 157 48 32 31 390 434 37 172 3...,uncategorized Apple Pie Apple Fruit Pie Recipe...
365,366,4.9,107,6 73 74 43 75 76 77 200 112 217 218 221 72 133 26,uncategorized Easy Chicken Chicken Easy Poultr...


# TF-IDF Vectors

In [21]:
# instantiate the TF IDF vectorizer from sklearn for vectorizing the tags
tf_idf = TfidfVectorizer()

In [22]:
# fit and transform the tags into vectors
data_tags_ratings_tfidf = tf_idf.fit_transform(data_tags_ratings.tag_id)

In [23]:
data_tags_ratings_tfidf

<6652x711 sparse matrix of type '<class 'numpy.float64'>'
	with 81971 stored elements in Compressed Sparse Row format>

In [24]:
# calculate cosine similarity of the tag vectors in the sparse matrix
r2r = cosine_similarity(data_tags_ratings_tfidf)

In [25]:
r2r

array([[1.        , 0.17692319, 0.17125275, ..., 0.1057307 , 0.2342178 ,
        0.2687431 ],
       [0.17692319, 1.        , 0.29259322, ..., 0.07763473, 0.        ,
        0.2054009 ],
       [0.17125275, 0.29259322, 1.        , ..., 0.08649271, 0.        ,
        0.19213963],
       ...,
       [0.1057307 , 0.07763473, 0.08649271, ..., 1.        , 0.19309455,
        0.06827919],
       [0.2342178 , 0.        , 0.        , ..., 0.19309455, 1.        ,
        0.03343333],
       [0.2687431 , 0.2054009 , 0.19213963, ..., 0.06827919, 0.03343333,
        1.        ]])

In [26]:
# create a dataframe of the cosine similarity scores of the vectorized tags
data_tfidf_r2r = pd.DataFrame(r2r)

In [27]:
data_tfidf_r2r.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6642,6643,6644,6645,6646,6647,6648,6649,6650,6651
0,1.0,0.176923,0.171253,0.080878,0.179542,0.038247,0.261367,0.095541,0.057133,0.193726,...,0.0,0.0,0.050554,0.232994,0.0,0.046034,0.180928,0.105731,0.234218,0.268743
1,0.176923,1.0,0.292593,0.16782,0.043248,0.043277,0.164113,0.067735,0.153336,0.232599,...,0.0,0.02787,0.070376,0.0,0.0,0.057691,0.0,0.077635,0.0,0.205401
2,0.171253,0.292593,1.0,0.129214,0.042698,0.042727,0.304931,0.039402,0.039991,0.737091,...,0.013589,0.036904,0.081115,0.012189,0.0,0.033175,0.091499,0.086493,0.0,0.19214
3,0.080878,0.16782,0.129214,1.0,0.076341,0.076392,0.038313,0.104927,0.207311,0.123049,...,0.04622,0.050536,0.220734,0.041458,0.0,0.239304,0.032194,0.118443,0.024897,0.111172
4,0.179542,0.043248,0.042698,0.076341,1.0,0.533622,0.040831,0.168862,0.144216,0.040661,...,0.077309,0.033078,0.051643,0.069344,0.093019,0.080406,0.053848,0.069751,0.04686,0.037003


In [28]:
data_tags_ratings.columns

Index(['recipe_id', 'total_rating', 'review_count', 'tag_id', 'tag'], dtype='object')

In [29]:
index_to_recipe_id = data_tags_ratings.recipe_id

In [30]:
index_to_recipe_id[:10]

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
Name: recipe_id, dtype: int64

In [31]:
# set the tf_idf cosine similarity table's columns to reflect which scores correspond to which recipe
data_tfidf_r2r.columns = [str(index_to_recipe_id[int(col)]) for col in data_tfidf_r2r.columns]

In [32]:
data_tfidf_r2r.columns

Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       ...
       '6643', '6644', '6645', '6646', '6647', '6648', '6649', '6650', '6651',
       '6652'],
      dtype='object', length=6652)

In [33]:
# set the tf_idf cosine similarity table's index to reflect which scores correspond to which recipe
data_tfidf_r2r.index = [index_to_recipe_id[idx] for idx in data_tfidf_r2r.index]

In [34]:
data_tfidf_r2r.index

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            6643, 6644, 6645, 6646, 6647, 6648, 6649, 6650, 6651, 6652],
           dtype='int64', length=6652)

In [35]:
data_tfidf_r2r.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,6643,6644,6645,6646,6647,6648,6649,6650,6651,6652
1,1.0,0.176923,0.171253,0.080878,0.179542,0.038247,0.261367,0.095541,0.057133,0.193726,...,0.0,0.0,0.050554,0.232994,0.0,0.046034,0.180928,0.105731,0.234218,0.268743
2,0.176923,1.0,0.292593,0.16782,0.043248,0.043277,0.164113,0.067735,0.153336,0.232599,...,0.0,0.02787,0.070376,0.0,0.0,0.057691,0.0,0.077635,0.0,0.205401
3,0.171253,0.292593,1.0,0.129214,0.042698,0.042727,0.304931,0.039402,0.039991,0.737091,...,0.013589,0.036904,0.081115,0.012189,0.0,0.033175,0.091499,0.086493,0.0,0.19214
4,0.080878,0.16782,0.129214,1.0,0.076341,0.076392,0.038313,0.104927,0.207311,0.123049,...,0.04622,0.050536,0.220734,0.041458,0.0,0.239304,0.032194,0.118443,0.024897,0.111172
5,0.179542,0.043248,0.042698,0.076341,1.0,0.533622,0.040831,0.168862,0.144216,0.040661,...,0.077309,0.033078,0.051643,0.069344,0.093019,0.080406,0.053848,0.069751,0.04686,0.037003


# Most Similar Recipes

In [36]:
# looking at top ten similar recipes based on cuisine or category
data_tfidf_r2r.iloc[0].sort_values(ascending=False)[:10]

1       1.000000
1958    0.913991
2389    0.611845
2212    0.546665
105     0.541332
1983    0.541332
2410    0.541202
225     0.533641
1179    0.532549
2290    0.510045
Name: 1, dtype: float64

In [37]:
# look at first recipe
data_tags_ratings[data_tags_ratings.recipe_id == 6652]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag
6651,6652,4.9,13,1 24 42 32 43 56 627 48 417 46 31 162 34 282 2...,american dessert Easy Dessert Recipes Dessert ...


In [38]:
# grab the title from main dataframe with recipe_id
data[data.recipe_id == 6652]

Unnamed: 0,recipe_id,title,tag_id,tag,total_rating,review_count
13283,6652,Fruit Cobbler,1,american,4.9,13
18802,6652,Fruit Cobbler,24,dessert,4.9,13
87917,6652,Fruit Cobbler,42,Easy Dessert Recipes,4.9,13
87918,6652,Fruit Cobbler,32,Dessert,4.9,13
87919,6652,Fruit Cobbler,43,Easy,4.9,13
87920,6652,Fruit Cobbler,56,Easy Baking,4.9,13
87921,6652,Fruit Cobbler,627,Blackberry Dessert,4.9,13
87922,6652,Fruit Cobbler,48,Fruit Dessert Recipes,4.9,13
87923,6652,Fruit Cobbler,417,Blackberry,4.9,13
87924,6652,Fruit Cobbler,46,Fruit,4.9,13


In [39]:
# look at second recipe
data_tags_ratings[data_tags_ratings.recipe_id == 1324]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag
1323,1324,3.0,2,1 24 42 32 43 48 46 34 507 117 436 128 72 84 129,american dessert Easy Dessert Recipes Dessert ...


In [40]:
# grab the recipe name from main dataframe with recipe _id
data[data.recipe_id == 1324]

Unnamed: 0,recipe_id,title,tag_id,tag,total_rating,review_count
2645,1324,Mango Flavored Syrup for Soft Drinks,1,american,3.0,2
14440,1324,Mango Flavored Syrup for Soft Drinks,24,dessert,3.0,2
28549,1324,Mango Flavored Syrup for Soft Drinks,42,Easy Dessert Recipes,3.0,2
28550,1324,Mango Flavored Syrup for Soft Drinks,32,Dessert,3.0,2
28551,1324,Mango Flavored Syrup for Soft Drinks,43,Easy,3.0,2
28552,1324,Mango Flavored Syrup for Soft Drinks,48,Fruit Dessert Recipes,3.0,2
28553,1324,Mango Flavored Syrup for Soft Drinks,46,Fruit,3.0,2
28554,1324,Mango Flavored Syrup for Soft Drinks,34,American,3.0,2
28555,1324,Mango Flavored Syrup for Soft Drinks,507,Caribbean,3.0,2
28556,1324,Mango Flavored Syrup for Soft Drinks,117,Sugar,3.0,2


In [41]:
# look at third recipe
data_tags_ratings[data_tags_ratings.recipe_id == 379]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag
378,379,4.7,103,1 450 282 46 34 117 263 37 314 141 32 111 41 24,american Berry Cobbler Cobbler Recipes Fruit A...


In [42]:
# grab the recipe name from main dataframe with recipe_id
data[data.recipe_id == 379]

Unnamed: 0,recipe_id,title,tag_id,tag,total_rating,review_count
756,379,Slow Berry Cobbler,1,american,4.7,103
10469,379,Slow Berry Cobbler,450,Berry Cobbler,4.7,103
10471,379,Slow Berry Cobbler,282,Cobbler Recipes,4.7,103
10473,379,Slow Berry Cobbler,46,Fruit,4.7,103
10475,379,Slow Berry Cobbler,34,American,4.7,103
10477,379,Slow Berry Cobbler,117,Sugar,4.7,103
10479,379,Slow Berry Cobbler,263,Raspberry Recipes,4.7,103
10481,379,Slow Berry Cobbler,37,Dairy Recipes,4.7,103
10483,379,Slow Berry Cobbler,314,Strawberry,4.7,103
10485,379,Slow Berry Cobbler,141,Blueberry,4.7,103


In [43]:
# looking at the recipes when the similarity scores differ from 1 - sharp drop
data_tfidf_r2r.iloc[0].sort_values(ascending=False)[160:170]

3299    0.298825
3087    0.297237
181     0.297237
83      0.296339
5010    0.295529
159     0.294181
4133    0.294103
4339    0.292646
3269    0.291609
2425    0.291256
Name: 1, dtype: float64

In [44]:
# look at first recipe after similarity score drops from 1 to 0.64
data_tags_ratings[data_tags_ratings.recipe_id == 3546]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag
3545,3546,5.0,3,1 34 502 304 305,american American Hawaiian Recipes Fish Salmon


In [45]:
# grab recipe name from main dataframe with recipe id
data[data.recipe_id == 3546]

Unnamed: 0,recipe_id,title,tag_id,tag,total_rating,review_count
7088,3546,Furikake Salmon,1,american,5.0,3
52705,3546,Furikake Salmon,34,American,5.0,3
52706,3546,Furikake Salmon,502,Hawaiian Recipes,5.0,3
52707,3546,Furikake Salmon,304,Fish,5.0,3
52708,3546,Furikake Salmon,305,Salmon,5.0,3


In [46]:
# look at second recipe after similarity score drops from 1 to 0.64
data_tags_ratings[data_tags_ratings.recipe_id == 6154]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag
6153,6154,2.0,14,1 200 43 112 312 208 209 90 186 34 149 38 65 144,american Easy Lunch Recipes Easy Lunch Easy Sn...


In [47]:
# grab recipe name from main dataframe with recipe id
data[data.recipe_id == 6154]

Unnamed: 0,recipe_id,title,tag_id,tag,total_rating,review_count
12287,6154,Chocolate-Hazelnut Sandwich Cookies,1,american,2.0,14
81726,6154,Chocolate-Hazelnut Sandwich Cookies,200,Easy Lunch Recipes,2.0,14
81727,6154,Chocolate-Hazelnut Sandwich Cookies,43,Easy,2.0,14
81728,6154,Chocolate-Hazelnut Sandwich Cookies,112,Lunch,2.0,14
81729,6154,Chocolate-Hazelnut Sandwich Cookies,312,Easy Snack Recipes,2.0,14
81730,6154,Chocolate-Hazelnut Sandwich Cookies,208,Snack,2.0,14
81731,6154,Chocolate-Hazelnut Sandwich Cookies,209,Chocolate Cookie Recipes,2.0,14
81732,6154,Chocolate-Hazelnut Sandwich Cookies,90,Chocolate,2.0,14
81733,6154,Chocolate-Hazelnut Sandwich Cookies,186,Cookie,2.0,14
81734,6154,Chocolate-Hazelnut Sandwich Cookies,34,American,2.0,14


# User Profile for User #1

In [48]:
# define query to pull data from postgresql
query2 = """SELECT reviews.recipe_id, recipes.title, reviews.user_id, reviews.rating, tags.tag_id, tags.tag FROM food.reviews
                INNER JOIN food.recipes ON recipes.recipe_id = reviews.recipe_id
                INNER JOIN food.recipes_tags ON recipes_tags.recipe_id = recipes.recipe_id
                INNER JOIN food.tags ON tags.tag_id = recipes_tags.tag_id;"""

In [49]:
# pull data in from postgres sql
user_data = pd.read_sql_query(query2, con=engine)

In [50]:
user_data.shape

(879403, 6)

In [51]:
user_data.head()

Unnamed: 0,recipe_id,title,user_id,rating,tag_id,tag
0,2,Not Yo' Mama's Banana Pudding,114,4.9,1,american
1,2,Not Yo' Mama's Banana Pudding,114,5.0,1,american
2,2,Not Yo' Mama's Banana Pudding,114,4.9,1,american
3,2,Not Yo' Mama's Banana Pudding,114,4.9,1,american
4,2,Not Yo' Mama's Banana Pudding,15714,5.0,1,american


In [52]:
user_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879403 entries, 0 to 879402
Data columns (total 6 columns):
recipe_id    879403 non-null int64
title        879403 non-null object
user_id      879403 non-null int64
rating       879403 non-null float64
tag_id       879403 non-null int64
tag          879403 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 40.3+ MB


In [53]:
# convert tag_id to string from integer
user_data.tag_id = user_data.tag_id.apply(str)

In [54]:
user_data_grouped = user_data.groupby(["recipe_id", "title", "user_id"]).agg({"rating":"mean", "tag_id":" ".join, "tag":" ".join}).reset_index()

In [55]:
user_data_grouped.head()

Unnamed: 0,recipe_id,title,user_id,rating,tag_id,tag
0,1,Baked French Toast Casserole with Maple Syrup,112,5.0,1 31 32 33 34 35 36 37 38 39 40 41 24,american Baking Dessert Make Ahead American Fr...
1,1,Baked French Toast Casserole with Maple Syrup,114,5.0,1 1 1 31 31 31 32 32 32 33 33 33 34 34 34 35 3...,american american american Baking Baking Bakin...
2,1,Baked French Toast Casserole with Maple Syrup,4251,5.0,1 31 32 33 34 35 36 37 38 39 40 41 24,american Baking Dessert Make Ahead American Fr...
3,1,Baked French Toast Casserole with Maple Syrup,7643,4.0,1 31 32 33 34 35 36 37 38 39 40 41 24,american Baking Dessert Make Ahead American Fr...
4,1,Baked French Toast Casserole with Maple Syrup,10606,5.0,1 31 32 33 34 35 36 37 38 39 40 41 24,american Baking Dessert Make Ahead American Fr...


In [56]:
# set the information on user #1 to a variable
user_data1 = user_data_grouped[user_data_grouped.user_id == 112]

In [57]:
user_data1.head()

Unnamed: 0,recipe_id,title,user_id,rating,tag_id,tag
0,1,Baked French Toast Casserole with Maple Syrup,112,5.0,1 31 32 33 34 35 36 37 38 39 40 41 24,american Baking Dessert Make Ahead American Fr...
13402,1956,Baked Chinese Rice with Peas and Ginger,112,5.0,1 24 426 427 122 93 94 69 72 84 129,american dessert Asian Chinese Recipes Rice Re...


In [58]:
user_data1["weight"] = user_data1.rating/5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [59]:
user_data1.head()

Unnamed: 0,recipe_id,title,user_id,rating,tag_id,tag,weight
0,1,Baked French Toast Casserole with Maple Syrup,112,5.0,1 31 32 33 34 35 36 37 38 39 40 41 24,american Baking Dessert Make Ahead American Fr...,1.0
13402,1956,Baked Chinese Rice with Peas and Ginger,112,5.0,1 24 426 427 122 93 94 69 72 84 129,american dessert Asian Chinese Recipes Rice Re...,1.0


In [60]:
data_tags_ratings_tfidf[user_data1.user_id.values].toarray().T

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [61]:
# calculate the dot product of the 
user1_profile = np.dot(data_tags_ratings_tfidf[user_data1.user_id.values].toarray().T, user_data1.weight.values)

In [62]:
user1_profile

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.46554544, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [63]:
# find the cosine similarity between the recipes based on tag vectors
C = cosine_similarity(user1_profile.reshape(1, -1), data_tags_ratings_tfidf)

In [64]:
C

array([[0.05771158, 0.0518568 , 0.15604356, ..., 0.04725515, 0.01377098,
        0.02506907]])

In [65]:
# sort the similarity scores in descending order
R = np.argsort(C)[:, ::-1]

In [66]:
R

array([[ 112,  442,   66, ..., 3674, 3681, 4173]])

In [67]:
# separate out the similarity scores from the user id values to be used to generate recommendations
recommendations =  [i for i in R[0] if i not in user_data1["user_id"].values]

In [68]:
# merge in the recipe names to the dataframe with tags, average ratings, average review counts
data_tags_ratings_titles = data_tags_ratings.merge(data[["recipe_id", "title"]], how="left", on="recipe_id")

In [69]:
data_tags_ratings_titles.shape

(87937, 6)

In [70]:
# drop duplicates
data_tags_ratings_titles.drop_duplicates(subset=["recipe_id", "title"], inplace=True)

In [71]:
data_tags_ratings_titles.shape

(6652, 6)

In [72]:
data_tags_ratings_titles.head()

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag,title
0,1,4.5,2513,1 31 32 33 34 35 36 37 38 39 40 41 24,american Baking Dessert Make Ahead American Fr...,Baked French Toast Casserole with Maple Syrup
13,2,4.9,2171,1 42 32 43 44 45 46 47 48 49 50 34 51 52 37 53...,american Easy Dessert Recipes Dessert Easy Ban...,Not Yo' Mama's Banana Pudding
33,3,4.8,2157,1 42 32 43 56 57 58 59 31 50 34 60 61 52 53 62...,american Easy Dessert Recipes Dessert Easy Eas...,Pumpkin Gooey Butter Cakes
53,4,4.7,1689,1 66 67 43 68 69 56 34 51 70 52 71 63 37 72 25,american Easy Casserole Recipes Casserole Easy...,Corn Casserole
69,5,4.6,1306,2 73 74 43 75 76 77 78 79 33 80 81 34 82 83 84...,european Easy Chicken Chicken Easy Poultry Eas...,Roman-Style Chicken


In [73]:
data_tags_ratings_titles.tail()

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag,title
87866,6648,4.7,13,2 26 407 52 231 63 80 721 70 37 77 69 162,european main-dish Cheesy Potatoes Cheese Pota...,"Pat's Potato Pierogis ""Elegante"""
87879,6649,5.0,12,6 26 694 63 36 77 143 39 64 72 162,uncategorized main-dish Acorn Squash Vegetable...,Egg Baked in Acorn Squash
87890,6650,4.6,13,1 27 125 108 43 217 218 162 34 147 177 63 52 3...,american appetizer Easy Appetizer Appetizer Ea...,Grilled French Bread Pizza with Mushroom Pesto...
87905,6651,4.9,13,6 27 36 108 77 39 143 72 133 162,uncategorized appetizer Egg Recipes Appetizer ...,The Perfect Boiled Eggs
87915,6652,4.9,13,1 24 42 32 43 56 627 48 417 46 31 162 34 282 2...,american dessert Easy Dessert Recipes Dessert ...,Fruit Cobbler


In [74]:
data_tags_ratings_titles.isnull().sum()

recipe_id       0
total_rating    0
review_count    0
tag_id          0
tag             0
title           0
dtype: int64

In [75]:
data_tags_ratings_titles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6652 entries, 0 to 87915
Data columns (total 6 columns):
recipe_id       6652 non-null int64
total_rating    6652 non-null float64
review_count    6652 non-null int64
tag_id          6652 non-null object
tag             6652 non-null object
title           6652 non-null object
dtypes: float64(1), int64(2), object(3)
memory usage: 363.8+ KB


In [76]:
# look at the top couple of recommendations for user 1
data_tags_ratings_titles.title[recommendations].head(10)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


442     NaN
66      NaN
5211    NaN
409     NaN
5190    NaN
145     NaN
188     NaN
5285    NaN
483     NaN
2370    NaN
Name: title, dtype: object

In [78]:
data_tags_ratings_titles[data_tags_ratings_titles.recipe_id==442]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag,title
6921,442,4.8,89,1 73 74 43 75 76 77 34 46 408 325 83 144 84 85...,american Easy Chicken Chicken Easy Poultry Eas...,Apricot Glazed Chicken with Dried Plums and Sage


In [79]:
data_tags_ratings_titles[data_tags_ratings_titles.recipe_id==66]

Unnamed: 0,recipe_id,total_rating,review_count,tag_id,tag,title
1023,66,4.6,398,1 31 32 33 50 34 99 141 46 52 53 58 55 41 24,american Baking Dessert Make Ahead Mixer Recip...,The Ultimate Cheesecake
