# Recommender System - Content-Based Filtering

In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np


In [2]:
# Load the fruit nutrition dataset
# NOTE: this is a machine-specific absolute path; point it at your own copy of the CSV.
fruit_csv_path = "C:/Users/ariji/OneDrive/Desktop/Data science & ML/Unsupervised Machine Learning/Data/fruit_nutrition.csv"
fruit_nutrition = pd.read_csv(fruit_csv_path)
fruit_nutrition.head()

Unnamed: 0,fruit_100g,energy_kcal,water_g,protein_g,totalfat_g,fiber_g,sugars_g,vitaminc_mg
0,Banana,89,74.9,1.09,0.33,2.6,12.2,8.7
1,Lemon,29,89.0,1.1,0.3,2.8,2.5,53.0
2,Lime,30,88.3,0.7,0.2,2.8,1.69,29.1
3,Mango,46,88.3,0.91,0.27,1.5,8.39,4.1
4,Peach,60,83.5,0.82,0.38,1.6,13.7,36.4


In [4]:
# Use the fruit name as the row index (leaving only the numeric nutrition
# columns) and drop the index label so it does not show up as a header.
nutrition = fruit_nutrition.set_index('fruit_100g').rename_axis(None)
nutrition

Unnamed: 0,energy_kcal,water_g,protein_g,totalfat_g,fiber_g,sugars_g,vitaminc_mg
Banana,89,74.9,1.09,0.33,2.6,12.2,8.7
Lemon,29,89.0,1.1,0.3,2.8,2.5,53.0
Lime,30,88.3,0.7,0.2,2.8,1.69,29.1
Mango,46,88.3,0.91,0.27,1.5,8.39,4.1
Peach,60,83.5,0.82,0.38,1.6,13.7,36.4
Pineapple,50,86.0,0.54,0.12,1.4,9.85,47.8


### a. Cosine similarity with 'sugars_g', 'vitaminc_mg'

In [6]:
# Select the two feature columns by name rather than by position, so the
# selection stays correct even if the column order of `nutrition` changes.
df = nutrition[['sugars_g', 'vitaminc_mg']]
df.head()

Unnamed: 0,sugars_g,vitaminc_mg
Banana,12.2,8.7
Lemon,2.5,53.0
Lime,1.69,29.1
Mango,8.39,4.1
Peach,13.7,36.4


In [19]:
# Confirm which feature columns were selected
df.columns

Index(['sugars_g', 'vitaminc_mg'], dtype='object')

In [10]:
# Calculate the cosine similarity between two fruits
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity returns a pairwise matrix for the selected rows;
# [0][1] picks the Mango-vs-Banana entry.
cosine_similarity(df.loc[['Mango' , 'Banana']])[0][1]

0.9864305980007304

In [11]:
# Compute the pairwise cosine similarity between every pair of fruits (rows of df)

cosine_matrix = cosine_similarity(df)
cosine_matrix

array([[1.        , 0.61832414, 0.62683477, 0.9864306 , 0.83018982,
        0.73298243],
       [0.61832414, 1.        , 0.99994086, 0.48090193, 0.95146351,
        0.98784307],
       [0.62683477, 0.99994086, 1.        , 0.49040872, 0.95475426,
        0.98947527],
       [0.9864306 , 0.48090193, 0.49040872, 1.        , 0.72739811,
        0.61135407],
       [0.83018982, 0.95146351, 0.95475426, 0.72739811, 1.        ,
        0.98773952],
       [0.73298243, 0.98784307, 0.98947527, 0.61135407, 0.98773952,
        1.        ]])

In [12]:
# Show each row of the array on one line
# (numpy is already imported in the first cell, so this import is redundant)
import numpy as np

# Set linewidth to a large value to prevent wrapping.
# This changes numpy's print options for every array printed afterwards.
np.set_printoptions(linewidth=np.inf)

In [13]:
# Recompute and redisplay the similarity matrix, now printed one row per line

cosine_matrix = cosine_similarity(df)
cosine_matrix

array([[1.        , 0.61832414, 0.62683477, 0.9864306 , 0.83018982, 0.73298243],
       [0.61832414, 1.        , 0.99994086, 0.48090193, 0.95146351, 0.98784307],
       [0.62683477, 0.99994086, 1.        , 0.49040872, 0.95475426, 0.98947527],
       [0.9864306 , 0.48090193, 0.49040872, 1.        , 0.72739811, 0.61135407],
       [0.83018982, 0.95146351, 0.95475426, 0.72739811, 1.        , 0.98773952],
       [0.73298243, 0.98784307, 0.98947527, 0.61135407, 0.98773952, 1.        ]])

In [18]:
# Turn the array into a DataFrame labelled by fruit name on both rows and columns

cosine_df = pd.DataFrame(cosine_matrix , index=df.index , columns=df.index)
cosine_df

Unnamed: 0,Banana,Lemon,Lime,Mango,Peach,Pineapple
Banana,1.0,0.618324,0.626835,0.986431,0.83019,0.732982
Lemon,0.618324,1.0,0.999941,0.480902,0.951464,0.987843
Lime,0.626835,0.999941,1.0,0.490409,0.954754,0.989475
Mango,0.986431,0.480902,0.490409,1.0,0.727398,0.611354
Peach,0.83019,0.951464,0.954754,0.727398,1.0,0.98774
Pineapple,0.732982,0.987843,0.989475,0.611354,0.98774,1.0


### b. Cosine similarity with all nutritional values

In [20]:
# View the full nutrition table (all seven nutritional columns)
nutrition

Unnamed: 0,energy_kcal,water_g,protein_g,totalfat_g,fiber_g,sugars_g,vitaminc_mg
Banana,89,74.9,1.09,0.33,2.6,12.2,8.7
Lemon,29,89.0,1.1,0.3,2.8,2.5,53.0
Lime,30,88.3,0.7,0.2,2.8,1.69,29.1
Mango,46,88.3,0.91,0.27,1.5,8.39,4.1
Peach,60,83.5,0.82,0.38,1.6,13.7,36.4
Pineapple,50,86.0,0.54,0.12,1.4,9.85,47.8


In [21]:
# Cosine similarity between fruits using all nutritional columns
# NOTE(review): the columns are on very different scales (e.g. energy_kcal and
# water_g vs. totalfat_g), so the large-valued columns dominate the result;
# consider standardising the features first.

cosine_matrix_all = cosine_similarity(nutrition)
cosine_matrix_all

array([[1.        , 0.77186848, 0.83412488, 0.92474189, 0.93676783, 0.87931265],
       [0.77186848, 1.        , 0.97739451, 0.87650678, 0.94133956, 0.9780653 ],
       [0.83412488, 0.97739451, 1.        , 0.95282112, 0.95471941, 0.96986632],
       [0.92474189, 0.87650678, 0.95282112, 1.        , 0.94580335, 0.91804609],
       [0.93676783, 0.94133956, 0.95471941, 0.94580335, 1.        , 0.98972148],
       [0.87931265, 0.9780653 , 0.96986632, 0.91804609, 0.98972148, 1.        ]])

In [29]:
# Convert the similarity matrix into a DataFrame labelled by fruit name on both axes

cosine_all_df = pd.DataFrame(cosine_matrix_all , index=nutrition.index , columns=nutrition.index)
cosine_all_df

Unnamed: 0,Banana,Lemon,Lime,Mango,Peach,Pineapple
Banana,1.0,0.771868,0.834125,0.924742,0.936768,0.879313
Lemon,0.771868,1.0,0.977395,0.876507,0.94134,0.978065
Lime,0.834125,0.977395,1.0,0.952821,0.954719,0.969866
Mango,0.924742,0.876507,0.952821,1.0,0.945803,0.918046
Peach,0.936768,0.94134,0.954719,0.945803,1.0,0.989721
Pineapple,0.879313,0.978065,0.969866,0.918046,0.989721,1.0


### c. Make a content-based filtering recommendation

In [27]:
# If I like mangos, what other fruits might I like? -- using sugar and vitamin C only
# (most similar first; Mango itself is always at the top with similarity 1.0)

cosine_df[['Mango']].sort_values(by='Mango' , ascending=False)

Unnamed: 0,Mango
Mango,1.0
Banana,0.986431
Peach,0.727398
Pineapple,0.611354
Lime,0.490409
Lemon,0.480902


In [34]:
# If I like mangos, what other fruits might I like? -- using all nutritional values
# (most similar first; Mango itself is always at the top with similarity 1.0)

cosine_all_df[['Mango']].sort_values(by='Mango' , ascending=False)

Unnamed: 0,Mango
Mango,1.0
Lime,0.952821
Peach,0.945803
Banana,0.924742
Pineapple,0.918046
Lemon,0.876507


### d. Function to Make Fruit Recommendations

In [35]:
# write a function to make fruit recommendations
def recommend_fruits_based_on_liking(fruit_name, similarity_df):
    """Rank every fruit by its similarity to ``fruit_name``.

    Parameters
    ----------
    fruit_name : str
        Label of the fruit the user likes; must be a column of
        ``similarity_df``.
    similarity_df : pandas.DataFrame
        Similarity matrix whose rows and columns are labelled by fruit name.

    Returns
    -------
    pandas.DataFrame or str
        A one-column DataFrame (column ``fruit_name``) sorted from most to
        least similar. The liked fruit itself is included in the ranking.
        If ``fruit_name`` is not in the matrix, a human-readable message
        string is returned instead.
    """
    # Validate against the matrix that was passed in, not the notebook-level
    # `df`, so the function works with any similarity matrix and does not
    # depend on hidden global state.
    if fruit_name not in similarity_df.columns:
        return f"{fruit_name} is not found in the fruit list."

    # Keep the result as a DataFrame (double brackets) and sort descending.
    return similarity_df[[fruit_name]].sort_values(by=fruit_name, ascending=False)

In [36]:
# Example usage: fruits most similar to Mango, based on sugar and vitamin C only

recommend_fruits_based_on_liking('Mango' , cosine_df)

Unnamed: 0,Mango
Mango,1.0
Banana,0.986431
Peach,0.727398
Pineapple,0.611354
Lime,0.490409
Lemon,0.480902


In [37]:
# Example usage: fruits most similar to Mango, based on all nutritional values

recommend_fruits_based_on_liking('Mango' , cosine_all_df)

Unnamed: 0,Mango
Mango,1.0
Lime,0.952821
Peach,0.945803
Banana,0.924742
Pineapple,0.918046
Lemon,0.876507


In [38]:
# Example with a fruit that is not in the data: the function returns a
# message string instead of raising an error

recommend_fruits_based_on_liking('kiwi' , cosine_all_df)

'kiwi is not found in the fruit list.'