# Exploratory Data Analysis pour explorer les modes de consommation
### Par Alexandre Rocchi (pas toucher les loulous)



In [65]:
import pandas 
import ast
import matplotlib.pyplot as plt
import scipy



### Importation des Datasets et création de la table 

In [66]:

# Load the raw recipes and interactions CSV files, in that order, from the dataset folder.
recipes, interactions = map(
    pandas.read_csv,
    ("../dataset/RAW_recipes.csv", "../dataset/RAW_interactions.csv"),
)

In [67]:
# Convert the list-like columns (read from the CSV as strings) into real Python lists.
# Non-string values (lists already parsed by a previous run of this cell, or NaN) are
# left untouched: ast.literal_eval raises ValueError on them, which made the cell
# impossible to execute twice on the same kernel.
for list_column in ('nutrition', 'tags', 'steps', 'ingredients'):
    recipes[list_column] = recipes[list_column].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )


In [68]:
# Parse the date columns of both tables into pandas datetime values.
for frame, date_column in ((recipes, 'submitted'), (interactions, 'date')):
    frame[date_column] = pandas.to_datetime(frame[date_column])

In [69]:
# Preview the first three rows of the recipes table.
recipes.head(3)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"[time-to-make, course, preparation, main-dish,...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13


In [70]:
# Preview the first three rows of the interactions table.
interactions.head(3)

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...


df = pandas.merge(recipes, interactions, left_on='id', right_on='recipe_id')

Travaillons sur une masse de données plus faible pour la prise en main des données et des différentes tables

In [71]:
# Join every interaction to its recipe: recipes.id matches interactions.recipe_id.
df = recipes.merge(interactions, left_on='id', right_on='recipe_id')


In [72]:
# Preview the first three rows of the merged recipes/interactions table.
df.head(3)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,user_id,recipe_id,date,rating,review
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7,4470,137739,2006-02-18,5,I used an acorn squash and recipe#137681 Swee...
1,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7,593927,137739,2010-08-21,5,This was a nice change. I used butternut squas...
2,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7,178427,137739,2011-12-05,5,Excellent recipe! I used butternut squash and ...


In [73]:

# Collapse the merged table to one row per recipe.
# Recipe-level columns keep the first value of their group.
recipe_level_columns = [
    'name', 'minutes', 'contributor_id', 'submitted', 'tags', 'nutrition',
    'steps', 'n_steps', 'description', 'ingredients', 'n_ingredients',
]
# Interaction-level columns are gathered into one Python list per recipe.
interaction_level_columns = ['review', 'date', 'user_id', 'rating']

aggregation_spec = {
    **dict.fromkeys(recipe_level_columns, 'first'),
    **dict.fromkeys(interaction_level_columns, list),
}

grouped_by_recipe = df.groupby(['recipe_id'])
df_g = grouped_by_recipe.agg(aggregation_spec).reset_index()

print(df_g.head(1))


   recipe_id                               name  minutes  contributor_id  \
0         38  low fat berry blue frozen dessert     1485            1533   

   submitted                                               tags  \
0 1999-08-09  [weeknight, time-to-make, course, preparation,...   

                                  nutrition  \
0  [170.9, 3.0, 120.0, 1.0, 6.0, 6.0, 12.0]   

                                               steps  n_steps  \
0  [toss 2 cups berries with sugar, let stand for...       13   

                                         description  \
0  this is yummy and low-fat, it always turns out...   

                                         ingredients  n_ingredients  \
0  [blueberries, granulated sugar, vanilla yogurt...              4   

                                              review  \
0  [Yummy, yummy, yummy! I am a big fan of fruit ...   

                                                date  \
0  [2008-02-13 00:00:00, 2009-01-15 00:00:00, 201...   

    

In [74]:
# Inspect the dtype of every column of the per-recipe table.
df_g.dtypes

recipe_id                  int64
name                      object
minutes                    int64
contributor_id             int64
submitted         datetime64[ns]
tags                      object
nutrition                 object
steps                     object
n_steps                    int64
description               object
ingredients               object
n_ingredients              int64
review                    object
date                      object
user_id                   object
rating                    object
dtype: object

In [75]:
# Total number of missing cells: per-column NaN counts, then summed over all columns.
missing_per_column = df_g.isna().sum()
print("Nombre de valeurs manquantes :", missing_per_column.sum())

Nombre de valeurs manquantes : 4980


Liste des variables et de leur type :

| recipe_id | minutes | contributor_id | submitted | tags | steps | n_steps | description | ingredients | n_ingredients | review | date | user_id | rating |
| :-------: | :-----: | :------------: | :-------: | :--: | :---: | :-----: | :---------: | :---------: | :-----------: | :----: | :--: | :-----: | :----: |
| qualitative nominale | quantitative discrète | qualitative nominale | quantitative continue | qualitative nominale | qualitative nominale | quantitative discrète | qualitative nominale | qualitative nominale | quantitative discrète | qualitative nominale | qualitative nominale | qualitative nominale |  qualitative ordinale |

### Analyse univariée

Commençons par l'analyse univariée de chaque variable, en débutant par les variables quantitatives discrètes.

Les différentes colonnes quantitatives discrètes : *minutes, n_steps, n_ingredients*

In [76]:
# Identifiers are labels, not quantities: store them as categoricals.
df_g = df_g.astype({'contributor_id': 'category', 'recipe_id': 'category'})

In [77]:
# Summary statistics of the per-recipe table.
summary_stats = df_g.describe()
print(summary_stats)

# Number of non-null recipe identifiers.
recipe_count = df_g['recipe_id'].count()
print("\nNombre de recettes : ", recipe_count)

            minutes                      submitted        n_steps  \
count  2.316370e+05                         231637  231637.000000   
mean   9.398546e+03  2006-11-14 01:58:34.925508608       9.765499   
min    0.000000e+00            1999-08-06 00:00:00       0.000000   
25%    2.000000e+01            2004-09-16 00:00:00       6.000000   
50%    4.000000e+01            2007-01-23 00:00:00       9.000000   
75%    6.500000e+01            2008-10-29 00:00:00      12.000000   
max    2.147484e+09            2018-12-04 00:00:00     145.000000   
std    4.461963e+06                            NaN       5.995128   

       n_ingredients  
count  231637.000000  
mean        9.051153  
min         1.000000  
25%         6.000000  
50%         9.000000  
75%        11.000000  
max        43.000000  
std         3.734796  

Nombre de recettes :  231637


##### Colonne : *minutes*

In [78]:
# Order the recipes from longest to shortest preparation time and show the five longest.
df_g = df_g.sort_values(by='minutes', ascending=False)
df_g.loc[:, ['name', 'minutes']].head(5)


Unnamed: 0,name,minutes
141904,no bake granola balls,2147483647
213797,how to preserve a husband,1051200
155593,homemade fruit liquers,288000
120304,celtic druid s honey mead meade metheglin,259260
207668,homemade vanilla,259205


Les deux premières durées semblent être des outliers (recettes 'trolls') : nous les retirons donc du dataset.

In [79]:
# Remove the "troll" recipes whose preparation time is implausibly long.
# An explicit threshold (one year, expressed in minutes) replaces "the two largest values":
# the cell is now idempotent, so re-running it no longer drops two more legitimate recipes,
# and it no longer depends on how many extreme rows are present.
MAX_PLAUSIBLE_MINUTES = 365 * 24 * 60
implausible_index = df_g.index[df_g['minutes'] > MAX_PLAUSIBLE_MINUTES]
df_g = df_g.drop(index=implausible_index)

df_g[['name','minutes']].head(5)

Unnamed: 0,name,minutes
155593,homemade fruit liquers,288000
120304,celtic druid s honey mead meade metheglin,259260
207668,homemade vanilla,259205
57271,peach brandy,216015
40504,angelica liqueur,201610


In [80]:
# Summary statistics of df_g, recomputed after the extreme `minutes` rows were dropped.
df_g.describe()

Unnamed: 0,minutes,submitted,n_steps,n_ingredients
count,231635.0,231635,231635.0,231635.0
mean,123.108144,2006-11-14 01:46:51.507760384,9.765506,9.051188
min,0.0,1999-08-06 00:00:00,0.0,1.0
25%,20.0,2004-09-16 00:00:00,6.0,6.0
50%,40.0,2007-01-23 00:00:00,9.0,9.0
75%,65.0,2008-10-29 00:00:00,12.0,11.0
max,288000.0,2018-12-04 00:00:00,145.0,43.0
std,1977.767905,,5.995153,3.734782
