In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv('C:\\Users\\aahsa\\Downloads\\Recipes_Features.csv')

**Define correlation:**

In [4]:
# Define the features to be used for calculating cosine similarity
features = ['MealType', 'Difficulty', 'CuisineType', 'Description', 'RecipeCategory', 'Keywords', 'RecipeIngredientQuantities', 'RecipeIngredientParts', 'RecipeInstructions', 'RecipeYield', 'AuthorName']

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Convert categorical columns to numeric
categorical_columns = ['MealType', 'Difficulty', 'CuisineType', 'Description', 'RecipeCategory', 'Keywords', 'RecipeIngredientQuantities', 'RecipeIngredientParts', 'RecipeInstructions', 'RecipeYield', 'AuthorName']
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])

# Sample a subset of the data (e.g., 10,000 rows)
df_sampled = df.sample(n=10000, random_state=42)

# Calculate the cosine similarity matrix for the sampled data
similarity_matrix = cosine_similarity(df_sampled[features])

# Display the similarity matrix
print(similarity_matrix)

[[1.         0.42308548 0.82535909 ... 0.78895115 0.65405422 0.80264749]
 [0.42308548 1.         0.66038227 ... 0.78911139 0.61573453 0.84069069]
 [0.82535909 0.66038227 1.         ... 0.97372566 0.60597066 0.7839983 ]
 ...
 [0.78895115 0.78911139 0.97372566 ... 1.         0.61882256 0.85212293]
 [0.65405422 0.61573453 0.60597066 ... 0.61882256 1.         0.84484752]
 [0.80264749 0.84069069 0.7839983  ... 0.85212293 0.84484752 1.        ]]


In [5]:
# Function to get recommendations
def get_recommendations(recipe_id, n_recommendations=5):
    # Get the similarity scores for the given recipe
    similarity_scores = similarity_matrix[recipe_id]
    # Get the indices of the most similar recipes
    similar_indices = similarity_scores.argsort()[-(n_recommendations + 1):][::-1]
    # Exclude the input recipe itself from the recommendations
    similar_indices = similar_indices[similar_indices != recipe_id]
    return similar_indices[:n_recommendations]

# Example usage
recipe_id = 6
recommendations = get_recommendations(recipe_id)
print(recommendations)

# Display the recommended recipes
recommended_recipes = df.iloc[recommendations]
print(recommended_recipes)


[1689  209 9153 5355 2976]
      RecipeId                                   Name  AuthorId  CookTime  \
1689      5002                      Chocolate Cookies      1866     480.0   
209        447               M&amp;m Super Snack Bars     60989    4500.0   
9153     18794  Roasted Almonds with Southwest Spices     17608     900.0   
5355     12646                     Cheddar Rum Spread     20371       1.0   
2976      8393                 Strawberry Mousse Cake     47510       1.0   

      PrepTime  TotalTime  Description  \
1689      1200       1680       115815   
209       2400       6900       151708   
9153       300       1200       173222   
5355       300        300       110002   
2976      1500       1500       185431   

                                                 Images  RecipeCategory  \
1689                                       character(0)              78   
209                                        character(0)              13   
9153                            

**K-Nearest Neighbor - similarity based on feature.**

In [13]:
from sklearn.neighbors import NearestNeighbors

nn_model = NearestNeighbors(n_neighbors=5, algorithm='auto', metric='cosine')

# Fit the model on the selected features
nn_model.fit(df[features])

# Function to get recommendations
def get_recommendations(recipe_id, n_recommendations=5):
    # Get the feature vector for the given recipe
    recipe_vector = df.loc[recipe_id, features].values.reshape(1, -1)
    
    # Find the nearest neighbors
    distances, indices = nn_model.kneighbors(recipe_vector, n_neighbors=n_recommendations+1)
    
    # Exclude the input recipe itself from the recommendations
    similar_recipes = indices.flatten()[1:]
    
    return similar_recipes

# Example usage
recipe_id = 6
recommendations = get_recommendations(recipe_id)
print(recommendations)

[190306  88742 164575 224781  31116]




In [15]:
# Display the recommended recipes
recommended_recipes = df.iloc[recommendations]
print(recommended_recipes)


        RecipeId                          Name  AuthorId  CookTime  PrepTime  \
190306    318937  Mom's Famous Oatmeal Cookies    437991     900.0       600   
88742     154332           Creamy Cheese Grits    282965     900.0       600   
164575    277990      Creamy Rice and Broccoli    721861    1200.0       600   
224781    374800                 Spinach Pasta    517443     420.0       900   
31116      56134                  Cheese Grits     19044     300.0       600   

        TotalTime  Description  \
190306       1500        71911   
88742        1500       122019   
164575       1800       122641   
224781       1320        90955   
31116         900       110188   

                                                   Images  RecipeCategory  \
190306  "https://img.sndimg.com/food/image/upload/w_55...              78   
88742                                        character(0)              31   
164575                                       character(0)             284   
224781

**INGREDIENT BASED FEATURES:**

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Ensure that the 'RecipeIngredientParts' column is properly formatted
def parse_ingredients(ingredient_str):
    try:
        # Attempt to evaluate the string as a list
        ingredients = eval(ingredient_str)
        if isinstance(ingredients, list):
            return ingredients
        else:
            return []
    except:
        return []

In [36]:
from scipy.sparse import csr_matrix

df['RecipeIngredientParts'] = df['RecipeIngredientParts'].apply(parse_ingredients)

# Combine all ingredient parts into a single string for each recipe
df['Ingredients'] = df['RecipeIngredientParts'].apply(lambda x: ' '.join(x) if x else '')

# Check for empty ingredient strings and handle them
df['Ingredients'] = df['Ingredients'].replace('', 'no_ingredients')

# Sample a subset of the data (e.g., 10,000 rows)
df_sampled = df.sample(n=10000, random_state=42)

# Use TF-IDF to convert the ingredient text into numerical features
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
ingredient_features = tfidf_vectorizer.fit_transform(df_sampled['Ingredients'])

# Convert the TF-IDF features to a sparse matrix
ingredient_features_sparse = csr_matrix(ingredient_features)

# Calculate the cosine similarity matrix based on ingredient features
ingredient_similarity_matrix = cosine_similarity(ingredient_features_sparse)

In [38]:
# Function to get ingredient-based recommendations
def get_ingredient_recommendations(recipe_id, n_recommendations=5):
    # Get the similarity scores for the given recipe
    similarity_scores = ingredient_similarity_matrix[recipe_id]
    # Get the indices of the most similar recipes
    similar_indices = similarity_scores.argsort()[-(n_recommendations + 1):][::-1]
    # Exclude the input recipe itself from the recommendations
    similar_indices = similar_indices[similar_indices != recipe_id]
    return similar_indices[:n_recommendations]

# Example usage
recipe_id = 6
ingredient_recommendations = get_ingredient_recommendations(recipe_id)
print(ingredient_recommendations)

# Display the recommended recipes
recommended_recipes = df.iloc[ingredient_recommendations]
print(recommended_recipes)

[9999 3329 3336 3335 3334]
      RecipeId                            Name  AuthorId  CookTime  PrepTime  \
9999     20209  Chicken - Artichoke Sandwiches      5060    1080.0       600   
3329      8985       Fried Macaroni and Cheese      2178    1800.0       900   
3336      8997                     Turkey Loaf      9441    3000.0       300   
3335      8995                Chocolate Mousse      9045     300.0       900   
3334      8994                      Rose Lassi      6357       1.0       600   

      TotalTime  Description  \
9999       1680       111894   
3329       2700       132967   
3336       3300       277774   
3335       1200          943   
3334        600       174282   

                                                 Images  RecipeCategory  \
9999  c("https://img.sndimg.com/food/image/upload/w_...             146   
3329                                       character(0)              47   
3336                                       character(0)             146   

**Model Based On Reviews:**

In [91]:
# Normalize the AggregatedRating and ReviewCount columns
df['NormalizedRating'] = df['AggregatedRating'] / df['AggregatedRating'].max()
df['NormalizedReviewCount'] = df['ReviewCount'] / df['ReviewCount'].max()

# Create a combined popularity score
df['Popularity'] = df['NormalizedRating'] + df['NormalizedReviewCount']

In [93]:
# Function to recommend recipes based on popularity
def recommend_recipes(n_recommendations=5):
    # Sort the recipes by popularity score in descending order
    recommended_recipes = df.sort_values(by='Popularity', ascending=False).head(n_recommendations)
    return recommended_recipes

# Example usage
recommendations = recommend_recipes()
print(recommendations[['RecipeId', 'Name', 'Popularity']])


       RecipeId                                     Name  Popularity
785        2886                        Best Banana Bread    2.000000
19529     35813                   Oatmeal Raisin Cookies    1.620326
30070     54257  Yes, Virginia There is a Great Meatloaf    1.608887
13303     25690                                 Pancakes    1.505939
41235     73440                               Beer Bread    1.457545


In [None]:
df.to_csv('Recipes_Model.csv', index=False)