In [1]:
import pandas as pd
import numpy as np

### Join data for user descriptive analysis user_recipes

In [2]:
df_recipes_in = pd.read_parquet('../01_DataCleansing/recipes_in.parquet', engine='pyarrow')
df_recipes_in.head(3)

Unnamed: 0_level_0,nutrition,minutes,techniques,cuisine,meal_of_day,ingredients
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
137739,"{'calories': 51.5, 'carbohydrates': 4.0, 'prot...",55,"{'Bake': 1, 'Barbecue': 0, 'Blanch': 0, 'Blend...",North American,Side Dishes,"[winter squash, mexican seasoning, mixed spice..."
31490,"{'calories': 173.4, 'carbohydrates': 1.0, 'pro...",30,"{'Bake': 1, 'Barbecue': 0, 'Blanch': 0, 'Blend...",North American,Breakfast,"[prepared pizza crust, sausage patty, eggs, mi..."
112140,"{'calories': 269.8, 'carbohydrates': 5.0, 'pro...",130,"{'Bake': 0, 'Barbecue': 0, 'Blanch': 0, 'Blend...",Uncategorized,Main Dish,"[ground beef, yellow onions, diced tomatoes, t..."


In [31]:
df_users_in = pd.read_csv('../01_DataCleansing/users_in.csv')
df_users_in.head(3)

Unnamed: 0,user_id,recipe_id,ratings
0,0,1118,5.0
1,0,27680,5.0
2,0,32541,5.0


In [4]:
df_users_in = df_users_in[df_users_in['ratings'] > 3.0]

In [5]:
df_users_in.head(3)

Unnamed: 0,user_id,recipe_id,ratings
0,0,1118,5.0
1,0,27680,5.0
2,0,32541,5.0


In [6]:
df_user_recipes = pd.merge(df_recipes_in, df_users_in, on='recipe_id', how='right')
df_user_recipes = df_user_recipes.drop(['nutrition', 'minutes', 'meal_of_day'], axis=1)
df_user_recipes.head(3)

Unnamed: 0,recipe_id,techniques,cuisine,ingredients,user_id,ratings
0,1118,,,,0,5.0
1,27680,"{'Bake': 0, 'Barbecue': 0, 'Blanch': 0, 'Blend...",North American,"[boiling water, tea bags, sugar, frozen limead...",0,5.0
2,32541,"{'Bake': 1, 'Barbecue': 0, 'Blanch': 0, 'Blend...",North American,"[center-cut pork chops, egg, evaporated milk, ...",0,5.0


In [7]:
# user_recipes table without Na value
df_user_recipes_without_na = df_user_recipes.dropna()

### Prepare user ingredient counts

In [8]:
df_user_num_ings = df_user_recipes_without_na[['user_id', 'ingredients']].explode('ingredients')
df_user_num_ings_gb = df_user_num_ings.groupby('user_id', as_index=False).nunique() \
                                        .rename({'ingredients':'num_ingredients'}, axis=1)
df_user_num_ings_gb.to_parquet('../05_RecipeExplorationTool/data/user_ing_count.parquet', engine='pyarrow')

### Prepare user analysis user_techniques

In [9]:
df_user_recipes_without_na_copy = df_user_recipes_without_na.copy()

In [10]:
df_user_recipes_without_na_copy['techniques'] = df_user_recipes_without_na_copy.apply(
        lambda x: dict(filter(lambda elem: elem[1] == 1,x['techniques'].items())), axis=1)
df_user_recipes_without_na_copy.head(3)

Unnamed: 0,recipe_id,techniques,cuisine,ingredients,user_id,ratings
1,27680,"{'Boil': 1, 'Pour': 1}",North American,"[boiling water, tea bags, sugar, frozen limead...",0,5.0
2,32541,"{'Bake': 1, 'Crush': 1}",North American,"[center-cut pork chops, egg, evaporated milk, ...",0,5.0
3,137353,"{'Melt': 1, 'Skillet': 1}",Uncategorized,"[butter, mushrooms, dijon mustard, whole wheat...",0,5.0


In [11]:
df_user_techniques = df_user_recipes_without_na_copy.explode('techniques')
df_user_techniques = df_user_techniques.drop(['cuisine','ingredients', 'ratings', 'recipe_id'], axis=1)
df_user_techniques = df_user_techniques.groupby(['user_id', 'techniques']).size().reset_index()

df_user_techniques['count'] = df_user_techniques[0]

In [12]:
df_user_techniques = df_user_techniques.drop(0,1)
df_user_techniques

  df_user_techniques = df_user_techniques.drop(0,1)


Unnamed: 0,user_id,techniques,count
0,0,Bake,5
1,0,Blend,1
2,0,Boil,4
3,0,Combine,3
4,0,Crush,2
...,...,...,...
292879,25074,Combine,1
292880,25074,Shred,1
292881,25074,Skillet,1
292882,25074,Toast,1


In [13]:
df_user_techniques['user_id'].unique().size

23031

In [14]:
df_0 = df_user_techniques.groupby(['user_id']) \
    .apply(lambda x: x.sort_values(['count'], ascending=False)) \
    .reset_index(drop=True)
df_1 = df_0.groupby('user_id').head(15)

In [15]:
df_1

Unnamed: 0,user_id,techniques,count
0,0,Bake,5
1,0,Pour,5
2,0,Drain,4
3,0,Boil,4
4,0,Skillet,3
...,...,...,...
292879,25074,Combine,1
292880,25074,Shred,1
292881,25074,Skillet,1
292882,25074,Toast,1


In [16]:
df_1.to_parquet('../05_RecipeExplorationTool/data/user_techniques.parquet', engine='pyarrow')

### Prepare user analysis user_ingredients

In [17]:
df_user_ingredients = df_user_recipes_without_na.explode('ingredients')
df_user_ingredients = df_user_ingredients.drop(['cuisine', 'techniques', 'ratings', 'recipe_id'], axis=1)
df_user_ingredients.head(3)

Unnamed: 0,ingredients,user_id
1,boiling water,0
1,tea bags,0
1,sugar,0


In [18]:
df_user_ingredients = df_user_ingredients.groupby(['user_id', 'ingredients']).size().reset_index()
df_user_ingredients['count'] = df_user_ingredients[0]
df_user_ingredients = df_user_ingredients.drop([0], axis=1)
df_user_ingredients

Unnamed: 0,user_id,ingredients,count
0,0,apple,1
1,0,apple cider,1
2,0,artificial sweetener,1
3,0,baking cocoa,1
4,0,baking powder,2
...,...,...,...
1719706,25074,salt,1
1719707,25074,sugar-free applesauce,1
1719708,25074,vanilla extract,1
1719709,25074,wheat germ,1


In [19]:
df_user_ingredients.loc[(df_user_ingredients['count']>=10)]


Unnamed: 0,user_id,ingredients,count
219,1,salt,10
347,3,active dry yeast,10
349,3,all-purpose flour,56
399,3,bacon,27
409,3,baking powder,66
...,...,...,...
1550219,16411,salt,10
1558320,16690,salt,12
1565720,16966,salt,10
1591045,17982,salt,10


In [20]:
df_user_ingredients['user_id'].unique().size

23180

In [21]:
df_11 = df_user_ingredients.groupby(['user_id']) \
    .apply(lambda x: x.sort_values(['count'], ascending=False)) \
    .reset_index(drop=True)
df_22 = df_11.groupby('user_id').head(15)

In [22]:
df_22.head(18)

Unnamed: 0,user_id,ingredients,count
0,0,salt,7
1,0,butter,5
2,0,garlic,4
3,0,sugar,4
4,0,egg,3
5,0,paprika,3
6,0,black pepper,3
7,0,onions,3
8,0,garlic cloves,3
9,0,garlic powder,2


In [23]:
df_22.to_parquet('../05_RecipeExplorationTool/data/user_ingredients.parquet', engine='pyarrow')

### Prepare user analysis user_cuisine

In [24]:
df_user_cuisine = df_user_recipes_without_na[['user_id', 'cuisine']]
df_user_cuisine = df_user_cuisine.groupby(['user_id', 'cuisine']).size().reset_index()
df_user_cuisine['count'] = df_user_cuisine[0]
df_user_cuisine = df_user_cuisine.drop([0], axis=1)
df_user_cuisine

Unnamed: 0,user_id,cuisine,count
0,0,Asian,2
1,0,North American,5
2,0,South West Pacific,1
3,0,Uncategorized,7
4,1,European,3
...,...,...,...
70005,25071,North American,1
70006,25072,North American,1
70007,25073,Comfort Food,1
70008,25073,Uncategorized,1


In [25]:
df_user_cuisine['user_id'].unique().size

23180

In [26]:
df_3 = df_user_cuisine.groupby(['user_id']) \
    .apply(lambda x: x.sort_values(['count'], ascending=False)) \
    .reset_index(drop=True)
df_4 = df_3.groupby('user_id').head(15)
df_4

Unnamed: 0,user_id,cuisine,count
0,0,Uncategorized,7
1,0,North American,5
2,0,Asian,2
3,0,South West Pacific,1
4,1,Uncategorized,12
...,...,...,...
70005,25071,North American,1
70006,25072,North American,1
70007,25073,Comfort Food,1
70008,25073,Uncategorized,1


In [27]:
df_4.to_parquet('../05_RecipeExplorationTool/data/user_cuisines.parquet', engine='pyarrow')