In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import sklearn.metrics as metrics
import seaborn as sns
import sys
from AFSA import AFSA
from scipy import stats

# Raise NumPy's summarization threshold to the largest possible value so that
# arrays are printed in full instead of being abbreviated with "...".
# NOTE(review): printing a very large array in full can flood the notebook output.
np.set_printoptions(threshold=sys.maxsize)

# Import interactions and food data

In [2]:
'''Read the raw interactions file into a DataFrame (user id, recipe id and rating only).'''
# Only the three listed columns are read; "recipe_id" is renamed to "food_id"
# right after loading.
ratings1 = (
    pd.read_csv(
        'Raw Data/RAW_interactions.csv',
        usecols=['user_id', 'recipe_id', 'rating'],
        engine='python',
    )
    .rename(columns={"recipe_id": "food_id"})
)
ratings1

Unnamed: 0,user_id,food_id,rating
0,38094,40893,4
1,1293707,40893,5
2,8937,44394,4
3,126440,85009,5
4,57222,85009,5
...,...,...,...
1132362,116593,72730,0
1132363,583662,386618,5
1132364,157126,78003,5
1132365,53932,78003,4


In [3]:
# Read the per-food nutrition values. Only the id and the seven nutrition
# columns are loaded, and "id" is renamed to "food_id".
# NOTE(review): the "IQR15" in the file name presumably refers to an outlier
# filter applied upstream - confirm.
items = (
    pd.read_csv(
        'food_nutrition_IQR15.csv',
        usecols=[
            'id',
            'calories',
            'total_fat',
            'sugar',
            'sodium',
            'protein',
            'saturated_fat',
            'carbohydrates',
        ],
        engine='python',
    )
    .rename(columns={"id": "food_id"})
)
items

Unnamed: 0,food_id,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,38,170.9,3,120,1,6,6,12
1,41,536.1,36,128,64,58,19,21
2,45,228.0,10,98,11,8,8,12
3,46,4.3,0,0,0,0,0,0
4,50,89.4,4,29,0,4,1,4
...,...,...,...,...,...,...,...,...
166538,537458,390.8,37,12,12,11,73,12
166539,537459,220.7,15,49,2,3,30,4
166540,537485,52.8,3,0,4,1,1,2
166541,537671,207.9,12,93,10,6,8,10


In [4]:
'''Keep only the interactions whose food_id appears in the nutrition table `items`.'''
# The inner join drops every interaction that refers to a food missing from
# `items` (per the original note, this removes interactions with outlier recipes).
#
# The join key is stated explicitly instead of relying on "all columns the two
# frames happen to share", and only the key column of `items` takes part in the
# merge, so the nutrition columns are never attached and then dropped again.
# The result has the same rows and columns (user_id, food_id, rating) as before.
ratings = pd.merge(ratings1, items[['food_id']], on='food_id', how='inner')
ratings

Unnamed: 0,user_id,food_id,rating
0,38094,40893,4
1,1293707,40893,5
2,8937,44394,4
3,76535,134728,4
4,273745,134728,5
...,...,...,...
816845,169430,470995,5
816846,417131,241491,4
816847,648540,241491,5
816848,1269180,257796,4


# Remove users and items with few interactions

In [5]:
# Annotate every interaction with the total number of interactions made by
# its user (the size of that user's group in `ratings`).
ratings['users_frequency'] = (
    ratings
    .groupby('user_id')['user_id']
    .transform('size')
)
ratings

Unnamed: 0,user_id,food_id,rating,users_frequency
0,38094,40893,4,32
1,1293707,40893,5,39
2,8937,44394,4,21
3,76535,134728,4,237
4,273745,134728,5,1
...,...,...,...,...
816845,169430,470995,5,3277
816846,417131,241491,4,5
816847,648540,241491,5,33
816848,1269180,257796,4,95


In [7]:
# Annotate every interaction with the total number of interactions received
# by its food item (the size of that item's group in `ratings`).
ratings['items_frequency'] = (
    ratings
    .groupby('food_id')['food_id']
    .transform('size')
)
ratings

Unnamed: 0,user_id,food_id,rating,users_frequency,items_frequency
0,38094,40893,4,32,2
1,1293707,40893,5,39,2
2,8937,44394,4,21,1
3,76535,134728,4,237,7
4,273745,134728,5,1,7
...,...,...,...,...,...
816845,169430,470995,5,3277,1
816846,417131,241491,4,5,2
816847,648540,241491,5,33,2
816848,1269180,257796,4,95,1


In [8]:
# Keep only the interactions for which BOTH the user and the food item have at
# least MIN_INTERACTIONS ratings (the filter applies to items as well as users).
#
# Caveat: users_frequency / items_frequency are counted on the unfiltered
# `ratings` frame, so after this joint filter a user (or item) may be left with
# fewer than MIN_INTERACTIONS rows in the result.
MIN_INTERACTIONS = 20

is_frequent_user = ratings['users_frequency'] >= MIN_INTERACTIONS
is_frequent_item = ratings['items_frequency'] >= MIN_INTERACTIONS

ratings_users_20 = ratings[is_frequent_user & is_frequent_item]
ratings_users_20

Unnamed: 0,user_id,food_id,rating,users_frequency,items_frequency
22,56680,79222,5,144,25
24,101823,79222,5,1064,25
26,446143,79222,4,541,25
27,226989,79222,4,59,25
28,868654,79222,5,51,25
...,...,...,...,...,...
816666,8526,34620,5,54,20
816667,41809,34620,5,228,20
816668,240958,34620,5,26,20
816669,52125,34620,5,229,20


In [15]:
#ratings_users_20.to_csv('ratings_items_users.csv')

# Convert user-item to user-feature interactions

In [9]:
'''Attach the nutrition values of each rated food to the filtered interactions.'''
# Inner join on food_id: every interaction row gains the nutrition columns of
# its food from `items`. The key is named explicitly so the join cannot silently
# change if the two frames ever come to share another column name.
user_nutrition = pd.merge(ratings_users_20, items, on='food_id', how='inner')
user_nutrition

Unnamed: 0,user_id,food_id,rating,users_frequency,items_frequency,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,56680,79222,5,144,25,274.2,12,15,18,32,22,12
1,101823,79222,5,1064,25,274.2,12,15,18,32,22,12
2,446143,79222,4,541,25,274.2,12,15,18,32,22,12
3,226989,79222,4,59,25,274.2,12,15,18,32,22,12
4,868654,79222,5,51,25,274.2,12,15,18,32,22,12
...,...,...,...,...,...,...,...,...,...,...,...,...
157338,8526,34620,5,54,20,147.2,3,3,6,7,6,9
157339,41809,34620,5,228,20,147.2,3,3,6,7,6,9
157340,240958,34620,5,26,20,147.2,3,3,6,7,6,9
157341,52125,34620,5,229,20,147.2,3,3,6,7,6,9


In [10]:
'''Sort by user and remove the unwanted columns, leaving user_id plus the nutrition values.'''
# The index is rebuilt as 0..n-1 after sorting.
ratings_nutrition = (
    user_nutrition
    .sort_values(by='user_id')
    .drop(columns=['food_id', 'rating', 'users_frequency', 'items_frequency'])
    .reset_index(drop=True)
)
ratings_nutrition

Unnamed: 0,user_id,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,1533,380.0,15,134,11,52,8,15
1,1533,110.8,11,14,2,1,24,3
2,1533,484.9,22,4,12,47,25,22
3,1533,196.9,4,155,1,1,9,14
4,1533,30.3,0,13,2,1,0,2
...,...,...,...,...,...,...,...,...
157338,2001453193,159.5,1,7,36,31,0,6
157339,2001453193,186.7,4,12,6,14,5,10
157340,2001453193,41.9,3,4,45,3,1,1
157341,2001453193,398.9,9,34,16,27,13,23


In [11]:
'''Compute the average of every nutrition column per user.'''
# One row per user_id (which becomes the index); each value is the mean over
# that user's rows in `ratings_nutrition`.
ratings_nutrition_AV = (
    ratings_nutrition
    .groupby(by='user_id')
    .mean()
)
ratings_nutrition_AV

Unnamed: 0_level_0,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1533,239.983333,16.333333,43.333333,9.000000,21.250000,18.666667,8.000000
1535,222.534359,15.328205,40.830769,11.861538,14.348718,20.707692,8.133333
1634,238.914286,17.857143,9.428571,13.428571,23.857143,24.428571,6.571429
1676,299.000000,20.500000,56.500000,25.500000,39.000000,12.250000,8.500000
1891,250.127273,17.454545,47.545455,15.727273,10.909091,26.636364,9.818182
...,...,...,...,...,...,...,...
2001330613,217.821429,16.428571,35.928571,12.642857,17.071429,21.571429,6.785714
2001356926,259.564286,21.142857,13.357143,21.142857,38.642857,33.857143,4.285714
2001362355,303.845946,20.162162,36.675676,23.648649,30.540541,27.351351,9.837838
2001436530,320.133333,18.000000,24.000000,31.000000,29.666667,17.333333,12.000000


In [12]:
# Persist the per-user nutrition averages; the user_id index is written as the
# first column of the CSV.
ratings_nutrition_AV.to_csv(
    path_or_buf='ratings_nutrition_AV_20_20.csv',
)