In [45]:
import pandas as pd
import numpy as np

In [46]:
df = pd.read_csv('recipes_info.csv')
df.head()

Unnamed: 0,recipe_id,country,country_secondary,dish_category,dish_type,diet_type,carbohydrate_base,carbohydrate_category,protein,protein_cut,protein_type,family_friendly,spice_level,prep_time
0,2,india,india,stove top / bowl food,curry,fish,basmati,rice,shellfish,prawns,fish & seafood,no,no spice,25
1,5,italy,italy,protein&veg,meat & side veg,meat,white potatoes,potato,beans,butter beans,pulses,no,mild,35
2,9,thailand,thailand,stove top / bowl food,curry,vegan,basmati,rice,tofu,protein not found,vegetarian,no,spicy,25
3,16,united kingdom,united kingdom,protein&veg,fish & side veg,fish,,,oily fish,trout,fish & seafood,no,no spice,20
4,19,tunisia,tunisia,protein&veg,egg dish,vegetarian,millet,grains,eggs,eggs,dairy & eggs,no,no spice,30


In [47]:
cols =  ['country', 'dish_category',
         'dish_type', 'diet_type', 'carbohydrate_base', 'carbohydrate_category',
        'protein', 'protein_cut', 'protein_type', 'family_friendly',
        'spice_level', 'prep_time']

In [173]:
base = df.copy()

In [174]:
# Build `base` in one chain straight from `df` so this cell can be re-run safely
# (the previous `del` statements raised KeyError on a second run).
# Column-level `.replace(..., inplace=True)` is avoided: under pandas
# Copy-on-Write it modifies a temporary and leaves the frame unchanged.
base = (
    df.set_index('recipe_id')
      .drop(columns=['country_secondary'])
      # normalise the one inconsistent category label
      .assign(dish_category=lambda frame: frame['dish_category'].replace(
          {'protein&veg': 'protein & veg'}))
      # treat missing content as information
      .fillna('missing')
)
base.head(10)

Unnamed: 0_level_0,country,dish_category,dish_type,diet_type,carbohydrate_base,carbohydrate_category,protein,protein_cut,protein_type,family_friendly,spice_level,prep_time
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2,india,stove top / bowl food,curry,fish,basmati,rice,shellfish,prawns,fish & seafood,no,no spice,25
5,italy,protein & veg,meat & side veg,meat,white potatoes,potato,beans,butter beans,pulses,no,mild,35
9,thailand,stove top / bowl food,curry,vegan,basmati,rice,tofu,protein not found,vegetarian,no,spicy,25
16,united kingdom,protein & veg,fish & side veg,fish,missing,missing,oily fish,trout,fish & seafood,no,no spice,20
19,tunisia,protein & veg,egg dish,vegetarian,millet,grains,eggs,eggs,dairy & eggs,no,no spice,30
20,tunisia,protein & veg,cheese & side veg,vegetarian,cous cous,grains,cheese,feta,dairy & eggs,no,no spice,40
30,italy,stove top / bowl food,pasta,vegetarian,white pasta,pasta,cheese,cheddar,dairy & eggs,no,no spice,40
53,lebanon,salads,tabbouleh,vegetarian,bulgur wheat,grains,cheese,halloumi,dairy & eggs,no,no spice,30
54,italy,stove top / bowl food,risotto,vegetarian,risotto rice,rice,missing,missing,missing,no,no spice,40
57,japan,soups,ramen,meat,wholewheat noodle nests,noodles,chicken,skin off thigh chicken,poultry & meat,yes,no spice,35


In [73]:
def prep_time_class(field, fast_max=20, medium_max=40):
    """Bucket a preparation time into 'fast', 'medium' or 'slow'.

    Parameters
    ----------
    field : number or str
        Preparation time (presumably minutes -- confirm against the data
        source). Strings are returned unchanged so that the 'missing'
        sentinel written by ``fillna('missing')`` and labels produced by an
        earlier run of the classification cell do not raise ``TypeError``.
    fast_max : number, default 20
        Largest value still classed as 'fast'.
    medium_max : number, default 40
        Largest value still classed as 'medium'.

    Returns
    -------
    str
        'fast' if ``field <= fast_max``, 'medium' if
        ``fast_max < field <= medium_max``, 'slow' above that, 'missing' for
        NaN/None, or the input itself when it is already a string.
    """
    # nb highly subjective, could ideally be validated by understanding customer perception
    if isinstance(field, str):
        # already a label (or the 'missing' sentinel): keeps re-runs idempotent
        return field
    if pd.isna(field):
        # NaN compares False against everything and used to fall through to None
        return 'missing'
    if field <= fast_max:
        return 'fast'
    if field <= medium_max:
        return 'medium'
    return 'slow'

base['prep_time'] = base['prep_time'].apply(prep_time_class)

In [80]:
base.head()

Unnamed: 0_level_0,country,country_secondary,dish_category,dish_type,diet_type,carbohydrate_base,carbohydrate_category,protein,protein_cut,protein_type,family_friendly,spice_level,prep_time
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2,india,india,stove top / bowl food,curry,fish,basmati,rice,shellfish,prawns,fish & seafood,no,no spice,medium
5,italy,italy,protein&veg,meat & side veg,meat,white potatoes,potato,beans,butter beans,pulses,no,mild,medium
9,thailand,thailand,stove top / bowl food,curry,vegan,basmati,rice,tofu,protein not found,vegetarian,no,spicy,medium
16,united kingdom,united kingdom,protein&veg,fish & side veg,fish,missing,missing,oily fish,trout,fish & seafood,no,no spice,fast
19,tunisia,tunisia,protein&veg,egg dish,vegetarian,millet,grains,eggs,eggs,dairy & eggs,no,no spice,medium


In [152]:
def jaccard_similarity(matrix_df):
    """Pairwise Jaccard similarity between the rows of a categorical frame.

    Each row is treated as a set of (column, value) pairs, so two rows share
    an element exactly when they hold the same value in the same column.
    With k columns, for rows A and B:

        |A & B| = number of columns where the values are equal
        |A | B| = 2 * (number of differing columns) + |A & B|

    Parameters
    ----------
    matrix_df : pandas.DataFrame
        One row per item, one column per attribute.

    Returns
    -------
    pandas.DataFrame
        Square frame of similarities in [0, 1]; both axes are labelled with
        ``matrix_df.index``. The diagonal is 1.0.

    Notes
    -----
    An (n, n, k) boolean array is materialised, so memory grows with the
    square of the number of rows.
    """
    values = matrix_df.values

    # utilise numpys matrix operations for fast computation:
    # broadcasting (1, n, k) against (n, 1, k) compares every row with every
    # other row without first copying the data n times.
    matches = values[np.newaxis, :, :] == values[:, np.newaxis, :]

    intersect = matches.sum(axis=2)
    mismatches = matches.shape[2] - intersect
    union = mismatches * 2 + intersect
    jaccard_sim = intersect / union

    # Label with the *input's* index; the previous version read the global
    # `base`, which broke the function for any other frame.
    jaccard_sim_df = pd.DataFrame(jaccard_sim,
                                  index=matrix_df.index,
                                  columns=matrix_df.index)

    return jaccard_sim_df

In [165]:
jsim = jaccard_similarity(base)
jsim.head(10)

recipe_id,2,5,9,16,19,20,30,53,54,57,...,1270,1276,1277,1278,1279,1280,1281,1282,1283,1284
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1.0,0.083333,0.3,0.181818,0.130435,0.130435,0.181818,0.130435,0.238095,0.083333,...,0.130435,0.130435,0.083333,0.368421,0.130435,0.083333,0.3,0.083333,0.238095,0.130435
5,0.083333,1.0,0.083333,0.083333,0.130435,0.130435,0.181818,0.083333,0.181818,0.083333,...,0.130435,0.181818,0.181818,0.083333,0.04,0.083333,0.181818,0.04,0.130435,0.238095
9,0.3,0.083333,1.0,0.04,0.083333,0.083333,0.130435,0.083333,0.181818,0.04,...,0.083333,0.083333,0.04,0.083333,0.083333,0.04,0.083333,0.04,0.130435,0.181818
16,0.181818,0.083333,0.04,1.0,0.130435,0.130435,0.083333,0.083333,0.083333,0.04,...,0.3,0.130435,0.130435,0.368421,0.181818,0.083333,0.181818,0.04,0.130435,0.04
19,0.130435,0.130435,0.083333,0.130435,1.0,0.529412,0.238095,0.3,0.181818,0.083333,...,0.130435,0.181818,0.083333,0.130435,0.04,0.083333,0.04,0.181818,0.130435,0.083333
20,0.130435,0.130435,0.083333,0.130435,0.529412,1.0,0.3,0.368421,0.181818,0.083333,...,0.130435,0.238095,0.083333,0.130435,0.04,0.083333,0.04,0.238095,0.181818,0.083333
30,0.181818,0.181818,0.130435,0.083333,0.238095,0.3,1.0,0.3,0.368421,0.083333,...,0.130435,0.3,0.181818,0.130435,0.04,0.083333,0.368421,0.444444,0.444444,0.130435
53,0.130435,0.083333,0.083333,0.083333,0.3,0.368421,0.3,1.0,0.181818,0.083333,...,0.130435,0.238095,0.083333,0.181818,0.083333,0.083333,0.04,0.238095,0.181818,0.083333
54,0.238095,0.181818,0.181818,0.083333,0.181818,0.181818,0.368421,0.181818,1.0,0.083333,...,0.130435,0.181818,0.181818,0.130435,0.04,0.083333,0.181818,0.130435,0.444444,0.130435
57,0.083333,0.083333,0.04,0.04,0.083333,0.083333,0.083333,0.083333,0.083333,1.0,...,0.238095,0.130435,0.181818,0.083333,0.083333,0.130435,0.0,0.130435,0.04,0.083333


In [166]:
jsim[57].sort_values(ascending=False).head()

recipe_id
57      1.000000
285     0.625000
553     0.529412
395     0.444444
1173    0.444444
Name: 57, dtype: float64

In [167]:
base.loc[57]

country                                    japan
country_secondary                          japan
dish_category                              soups
dish_type                                  ramen
diet_type                                   meat
carbohydrate_base        wholewheat noodle nests
carbohydrate_category                    noodles
protein                                  chicken
protein_cut               skin off thigh chicken
protein_type                      poultry & meat
family_friendly                              yes
spice_level                             no spice
prep_time                                 medium
Name: 57, dtype: object

In [168]:
base.loc[285]

country                                    japan
country_secondary                          japan
dish_category                              soups
dish_type                                  ramen
diet_type                                   meat
carbohydrate_base        wholewheat noodle nests
carbohydrate_category                    noodles
protein                                  chicken
protein_cut                       breast chicken
protein_type                      poultry & meat
family_friendly                               no
spice_level                                 mild
prep_time                                 medium
Name: 285, dtype: object

In [230]:
base.loc[553]

country                                    japan
dish_category              stove top / bowl food
dish_type                               stir fry
diet_type                                   meat
carbohydrate_base        wholewheat noodle nests
carbohydrate_category                    noodles
protein                                  chicken
protein_cut                       breast chicken
protein_type                      poultry & meat
family_friendly                               no
spice_level                             no spice
prep_time                                     30
Name: 553, dtype: object

### Cosine Similarity

In [219]:
# Build `cos_base` in one chain from `df` so the cell is idempotent, and
# avoid column-level `.replace(..., inplace=True)`, which is a no-op under
# pandas Copy-on-Write.
cos_base = (
    df.set_index('recipe_id')
      .drop(columns=['country_secondary'])
      # treat missing content as information
      .fillna('missing')
      .assign(
          # presumably relabelled so the bare 'yes'/'no' values become
          # self-describing tokens in the TF-IDF vocabulary -- confirm intent
          family_friendly=lambda frame: frame['family_friendly'].replace(
              {'no': 'family unfriendly', 'yes': 'family friendly'}),
          # normalise the one inconsistent category label
          dish_category=lambda frame: frame['dish_category'].replace(
              {'protein&veg': 'protein & veg'}),
      )
)

def prep_time_class(field, fast_max=20, medium_max=40):
    """Bucket a preparation time into 'fast', 'medium' or 'slow'.

    Parameters
    ----------
    field : number or str
        Preparation time (presumably minutes -- confirm against the data
        source). Strings are returned unchanged so that the 'missing'
        sentinel written by ``fillna('missing')`` and labels produced by an
        earlier run of the classification cell do not raise ``TypeError``.
    fast_max : number, default 20
        Largest value still classed as 'fast'.
    medium_max : number, default 40
        Largest value still classed as 'medium'.

    Returns
    -------
    str
        'fast' if ``field <= fast_max``, 'medium' if
        ``fast_max < field <= medium_max``, 'slow' above that, 'missing' for
        NaN/None, or the input itself when it is already a string.
    """
    # nb highly subjective, could ideally be validated by understanding customer perception
    if isinstance(field, str):
        # already a label (or the 'missing' sentinel): keeps re-runs idempotent
        return field
    if pd.isna(field):
        # NaN compares False against everything and used to fall through to None
        return 'missing'
    if field <= fast_max:
        return 'fast'
    if field <= medium_max:
        return 'medium'
    return 'slow'

cos_base['prep_time'] = cos_base['prep_time'].apply(prep_time_class)
cos_base.head()

Unnamed: 0_level_0,country,dish_category,dish_type,diet_type,carbohydrate_base,carbohydrate_category,protein,protein_cut,protein_type,family_friendly,spice_level,prep_time
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2,india,stove top / bowl food,curry,fish,basmati,rice,shellfish,prawns,fish & seafood,family unfriendly,no spice,medium
5,italy,protein & veg,meat & side veg,meat,white potatoes,potato,beans,butter beans,pulses,family unfriendly,mild,medium
9,thailand,stove top / bowl food,curry,vegan,basmati,rice,tofu,protein not found,vegetarian,family unfriendly,spicy,medium
16,united kingdom,protein & veg,fish & side veg,fish,missing,missing,oily fish,trout,fish & seafood,family unfriendly,no spice,fast
19,tunisia,protein & veg,egg dish,vegetarian,millet,grains,eggs,eggs,dairy & eggs,family unfriendly,no spice,medium


In [220]:
def columns_to_blob(fields):
    """Join every value of one row into a single '||'-delimited string.

    Parameters
    ----------
    fields : pandas.Series
        One row of the frame (as passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The row's values in column order, separated by '||'.
    """
    # str() each value: str.join raises TypeError on non-string cells
    # (e.g. a numeric prep_time that has not been classified yet).
    return "||".join(str(value) for value in fields.tolist())

# Exclude any existing 'blob' column so re-running this cell does not fold the
# previous blob into the new one.
cos_base['blob'] = (cos_base.drop(columns=['blob'], errors='ignore')
                            .apply(columns_to_blob, axis=1))

In [221]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse.csr import csr_matrix #need this if you want to save tfidf_matrix

def custom_tokeniser(doc):
    """Split a '||'-delimited blob back into its individual field values."""
    tokens = doc.split("||")
    return tokens

# Each whole field value is one token (see custom_tokeniser), so the vocabulary
# is the set of distinct attribute values across all recipes.
# min_df=1 keeps every term, exactly as the integer 0 did, but integer 0 is
# rejected by parameter validation in current scikit-learn releases.
# token_pattern=None silences the "token_pattern will not be used" warning
# raised when a custom tokenizer is supplied.
tf = TfidfVectorizer(analyzer='word',
                     min_df=1,
                     tokenizer=custom_tokeniser,
                     token_pattern=None)
tfidf_matrix = tf.fit_transform(cos_base['blob'])
tfidf_matrix

<260x176 sparse matrix of type '<class 'numpy.float64'>'
	with 3009 stored elements in Compressed Sparse Row format>

In [222]:
cos_base['blob'].apply(custom_tokeniser).iloc[0]

['india',
 'stove top / bowl food',
 'curry',
 'fish',
 'basmati',
 'rice',
 'shellfish',
 'prawns',
 'fish & seafood',
 'family unfriendly',
 'no spice',
 'medium']

In [224]:
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(tf, 'get_feature_names_out'):
    tokens = tf.get_feature_names_out()
else:
    tokens = tf.get_feature_names()

# Dense view of the TF-IDF weights: one row per recipe (positional index),
# one column per vocabulary token.
tf_idf_df = pd.DataFrame(data=tfidf_matrix.toarray(),
                         columns=tokens)
tf_idf_df.head()

Unnamed: 0,armenia,austria,bacon pork,baguette,basa,basmati,bbq,beans,beef,belly pork,...,vegetarian,vermicelli,vietnam,waxy potatoes,wheatberries,white pasta,white potatoes,whitefish,wholewheat noodle nests,wraps
0,0.0,0.0,0.0,0.0,0.0,0.250424,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.350186,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.31144,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.227184,0.0,0.0,0.0,0.0,...,0.21262,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.17619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [226]:
tf_idf_df.iloc[0].sort_values().tail(14)

malaysia                 0.000000
lamb                     0.000000
family unfriendly        0.124767
medium                   0.134036
no spice                 0.137329
stove top / bowl food    0.193608
rice                     0.218007
basmati                  0.250424
fish                     0.273901
fish & seafood           0.293342
india                    0.307761
curry                    0.307761
prawns                   0.474015
shellfish                0.474015
Name: 0, dtype: float64

In [228]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity of every recipe's TF-IDF vector against every other.
cos_sim = cosine_similarity(X=tfidf_matrix, Y=tfidf_matrix)

# Label both axes with recipe ids so a recipe's neighbours can be looked up by id.
cos_sim_df = pd.DataFrame(
    data=cos_sim,
    index=cos_base.index,
    columns=cos_base.index,
)
cos_sim_df.head()

recipe_id,2,5,9,16,19,20,30,53,54,57,...,1270,1276,1277,1278,1279,1280,1281,1282,1283,1284
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1.0,0.033589,0.250362,0.171415,0.039386,0.047208,0.088349,0.045545,0.13421,0.032379,...,0.054124,0.049034,0.036484,0.555551,0.152048,0.028098,0.549035,0.034144,0.198361,0.062831
5,0.033589,1.0,0.030472,0.05447,0.06023,0.072191,0.10458,0.0292,0.103915,0.037725,...,0.060433,0.122881,0.120075,0.028152,0.013422,0.033057,0.139595,0.016686,0.090164,0.2514
9,0.250362,0.030472,1.0,0.012383,0.06033,0.072312,0.112317,0.069764,0.153718,0.014331,...,0.031426,0.028471,0.014967,0.025497,0.0717,0.011526,0.039825,0.061315,0.093075,0.118201
16,0.171415,0.05447,0.012383,1.0,0.05331,0.063898,0.029673,0.02624,0.276553,0.01454,...,0.298739,0.028251,0.096479,0.285004,0.185698,0.024637,0.178512,0.015332,0.097891,0.012076
19,0.039386,0.06023,0.06033,0.05331,1.0,0.407995,0.124394,0.170681,0.078805,0.024341,...,0.040688,0.079786,0.027427,0.033011,0.010073,0.021123,0.009684,0.106479,0.073183,0.022303


In [229]:
cos_sim_df[57].sort_values(ascending=False).head()

recipe_id
57      1.000000
285     0.803306
58      0.643302
553     0.584656
1173    0.555756
Name: 57, dtype: float64

In [232]:
cos_base.loc[57]

country                                                              japan
dish_category                                                        soups
dish_type                                                            ramen
diet_type                                                             meat
carbohydrate_base                                  wholewheat noodle nests
carbohydrate_category                                              noodles
protein                                                            chicken
protein_cut                                         skin off thigh chicken
protein_type                                                poultry & meat
family_friendly                                            family friendly
spice_level                                                       no spice
prep_time                                                           medium
blob                     japan||soups||ramen||meat||wholewheat noodle n...
Name: 57, dtype: object

In [233]:
cos_base.loc[285]

country                                                              japan
dish_category                                                        soups
dish_type                                                            ramen
diet_type                                                             meat
carbohydrate_base                                  wholewheat noodle nests
carbohydrate_category                                              noodles
protein                                                            chicken
protein_cut                                                 breast chicken
protein_type                                                poultry & meat
family_friendly                                          family unfriendly
spice_level                                                           mild
prep_time                                                           medium
blob                     japan||soups||ramen||meat||wholewheat noodle n...
Name: 285, dtype: object

In [234]:
cos_base.loc[58]

country                                                              japan
dish_category                                                        soups
dish_type                                                            ramen
diet_type                                                            vegan
carbohydrate_base                                  wholewheat noodle nests
carbohydrate_category                                              noodles
protein                                                               tofu
protein_cut                                                        missing
protein_type                                                    vegetarian
family_friendly                                          family unfriendly
spice_level                                                       no spice
prep_time                                                           medium
blob                     japan||soups||ramen||vegan||wholewheat noodle ...
Name: 58, dtype: object