In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import ast

In [2]:
import pandas as pd

# Location of the CSV file this notebook works from
file_path = 'data_ML2.csv'

# Read the CSV into a DataFrame; later cells refer to it as `data`
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
data.head(n=5)


Unnamed: 0,Order_ID,User_ID,Dish_ID,Dish_Name,Ingredients_List,Dish_Rating,Order_Date,Quantity,Order_Price,Delivery_Location,User_Age,User_Gender,User_Preferences,Month,Season,Vendor_ID
0,1,106,1112,Dosa,"Rice, Urad Dal, Potatoes, Spices",3.2,2024-07-09,5,130,Kozhikode,30,Male,Vegetarian,Jul,Monsoon,V2
1,2,113,1152,Dosa,"Rice, Urad Dal, Potatoes, Spices",4.1,2024-01-14,2,86,Ambarnath,21,Male,Vegetarian,Jan,Summer,V6
2,3,106,1337,Thepla,"Wheat Flour, Fenugreek Leaves, Spices",4.4,2024-03-02,3,98,Bettiah,38,Female,Vegetarian,Mar,Monsoon,V3
3,4,123,1291,Chawal,"Rice, Water, Salt",2.6,2024-04-23,2,197,Eluru,28,Female,Vegetarian,Apr,Monsoon,V6
4,5,124,1380,Egg Curry,"Eggs, Onion, Tomatoes, Spices",2.0,2024-08-17,2,154,Katni,45,Male,Non-Vegetarian,Aug,Summer,V5


In [8]:
from sklearn.preprocessing import LabelEncoder

# Columns holding category labels that must become integer codes
categorical_columns = ['User_Preferences', 'Season', 'User_Gender']

# Fitted encoders are stored here, keyed by column name, so the
# label <-> code mapping of each column stays available afterwards
label_encoders = dict()

# Encode on a copy so the raw `data` frame keeps its original labels
data_processed = data.copy()

for col in categorical_columns:
    # One independent encoder per column
    le = LabelEncoder()
    data_processed[col] = le.fit_transform(data_processed[col])
    label_encoders[col] = le

# Preview the encoded frame
data_processed.head()


Unnamed: 0,Order_ID,User_ID,Dish_ID,Dish_Name,Ingredients_List,Dish_Rating,Order_Date,Quantity,Order_Price,Delivery_Location,User_Age,User_Gender,User_Preferences,Month,Season,Vendor_ID
0,1,106,1112,Dosa,"Rice, Urad Dal, Potatoes, Spices",3.2,2024-07-09,5,130,Kozhikode,30,1,1,Jul,0,V2
1,2,113,1152,Dosa,"Rice, Urad Dal, Potatoes, Spices",4.1,2024-01-14,2,86,Ambarnath,21,1,1,Jan,1,V6
2,3,106,1337,Thepla,"Wheat Flour, Fenugreek Leaves, Spices",4.4,2024-03-02,3,98,Bettiah,38,0,1,Mar,0,V3
3,4,123,1291,Chawal,"Rice, Water, Salt",2.6,2024-04-23,2,197,Eluru,28,0,1,Apr,0,V6
4,5,124,1380,Egg Curry,"Eggs, Onion, Tomatoes, Spices",2.0,2024-08-17,2,154,Katni,45,1,0,Aug,1,V5


In [10]:
from scipy.sparse import csr_matrix
# TfidfVectorizer was used below without ever being imported, so this cell
# raised NameError on a fresh kernel (Restart & Run All).
from sklearn.feature_extraction.text import TfidfVectorizer

# Use TF-IDF vectorizer to convert ingredients list into numerical features.
# `vectorizer` is kept so later queries can be projected into the same
# vocabulary; `ingredient_matrix` has one row per row of `data`.
vectorizer = TfidfVectorizer(stop_words='english')
ingredient_matrix = vectorizer.fit_transform(data['Ingredients_List'])

In [12]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Columns fed to the neighbour search alongside the ingredient vectors.
# The order matters: every query row passed to `scaler` must follow it.
features = ['User_Preferences', 'Dish_Rating', 'Season', 'User_Age']

# Learn per-column mean / standard deviation, then standardise the same rows
scaler = StandardScaler()
X = scaler.fit(data_processed[features]).transform(data_processed[features])

In [13]:
from scipy.sparse import hstack

# Place the TF-IDF ingredient columns and the standardised numeric features
# side by side (ingredient columns first, then `features` in order).
# format='csr' is requested explicitly: hstack otherwise returns a COO matrix,
# which has to be converted again before row-wise neighbour queries.
X_combined = hstack(
    [ingredient_matrix, scaler.transform(data_processed[features])],
    format='csr',
)

In [14]:
# Build the neighbour index over the combined matrix (ingredient TF-IDF
# columns + scaled user/dish features); 5 neighbours are returned by default
knn_with_ingredients = NearestNeighbors(algorithm='auto', n_neighbors=5).fit(X_combined)

# Show the fitted estimator as the cell output
knn_with_ingredients

In [15]:
# Function to get dish recommendations with ingredients included in the feature set
def recommend_dishes_with_ingredients_v2(user_preference, user_age, current_season, n_recommendations=5, assumed_rating=4.0):
    """Recommend dishes for a user profile without any ingredient preference.

    The query is built in the same feature space as ``X_combined``: an
    all-zero ingredient block followed by the scaled values of ``features``.

    Parameters
    ----------
    user_preference : int
        Encoded ``User_Preferences`` value (as produced by the label encoder).
    user_age : int or float
        Age of the user.
    current_season : int
        Encoded ``Season`` value (as produced by the label encoder).
    n_recommendations : int, default 5
        Number of neighbours (rows of ``data``) to return.
    assumed_rating : float, default 4.0
        Value used for the ``Dish_Rating`` feature of the query, since a
        query has no rating of its own.

    Returns
    -------
    numpy.ndarray
        Object array with one row per neighbour and two columns:
        ``Dish_Name`` and ``Ingredients_List``.

    Raises
    ------
    KeyError
        If ``features`` contains a column this function does not populate.
    """
    # Build the query as a named one-row frame and let `features` decide the
    # column order. Passing a bare list to a scaler fitted on a DataFrame
    # triggers sklearn's "X does not have valid feature names" warning and
    # silently depends on positional order.
    query_frame = pd.DataFrame([{
        'User_Preferences': user_preference,
        'Dish_Rating': assumed_rating,
        'Season': current_season,
        'User_Age': user_age,
    }])[features]
    query_features = scaler.transform(query_frame)

    # All-zero ingredient block: no specific ingredient preference
    zero_ingredient_vector = csr_matrix((1, ingredient_matrix.shape[1]))

    # Same column layout as X_combined: ingredient columns, then features
    query_combined = hstack([zero_ingredient_vector, query_features], format='csr')

    # Only the neighbour indices are needed; distances are discarded
    _, indices = knn_with_ingredients.kneighbors(query_combined, n_neighbors=n_recommendations)

    # Neighbour indices are row positions in `data`
    recommendations = data.iloc[indices[0]][['Dish_Name', 'Ingredients_List']].values
    return recommendations

# Example query: vegetarian user (encoded 1), aged 30, in summer (encoded 1).
# This variant sends an all-zero ingredient vector, i.e. no ingredient preference.
recommend_dishes_with_ingredients_v2(
    user_preference=1,
    user_age=30,
    current_season=1,
)



array([['Rajma Chawal', 'Kidney Beans, Rice, Onion, Garlic, Spices'],
       ['Sabudana Khichdi', 'Sabudana, Peanuts, Potatoes, Spices'],
       ['Dal Tadka', 'Yellow Lentils, Onion, Garlic, Tomatoes, Spices'],
       ['Pulao', 'Basmati Rice, Mixed Vegetables, Onion, Spices'],
       ['Masoor Dal', 'Red Lentils, Onion, Garlic, Tomatoes, Spices']],
      dtype=object)

In [21]:
def recommend_dishes_with_ingredients_input(user_preference, user_age, current_season, user_ingredients, n_recommendations=5, assumed_rating=4.0):
    """Recommend dishes for a user profile plus a free-text ingredient query.

    The query is built in the same feature space as ``X_combined``: the
    TF-IDF vector of ``user_ingredients`` followed by the scaled values of
    ``features``.

    Parameters
    ----------
    user_preference : int
        Encoded ``User_Preferences`` value (as produced by the label encoder).
    user_age : int or float
        Age of the user.
    current_season : int
        Encoded ``Season`` value (as produced by the label encoder).
    user_ingredients : str, iterable of str, or None
        Ingredients the user wants, e.g. ``"Garlic, Onion, Spices"``. An
        iterable is joined with ``", "``; ``None`` or an empty string means
        "no ingredient preference" (all-zero ingredient vector).
    n_recommendations : int, default 5
        Number of neighbours (rows of ``data``) to return.
    assumed_rating : float, default 4.0
        Value used for the ``Dish_Rating`` feature of the query, since a
        query has no rating of its own.

    Returns
    -------
    numpy.ndarray
        Object array with one row per neighbour and two columns:
        ``Dish_Name`` and ``Ingredients_List``.

    Raises
    ------
    KeyError
        If ``features`` contains a column this function does not populate.

    Warns
    -----
    UserWarning
        If a non-empty ``user_ingredients`` shares no token with the fitted
        TF-IDF vocabulary, because the ingredient part of the query is then
        all zeros and has no influence on the result.
    """
    import warnings

    # Normalise the ingredient input to a single text document
    if user_ingredients is None:
        ingredients_text = ''
    elif isinstance(user_ingredients, str):
        ingredients_text = user_ingredients
    else:
        ingredients_text = ', '.join(user_ingredients)

    # Project the text into the vocabulary learned from the dataset
    user_ingredient_vector = vectorizer.transform([ingredients_text])

    # Tokens outside the fitted vocabulary are dropped by the vectorizer, so a
    # query such as a dish name can collapse to an all-zero vector without any
    # error. Surface that instead of silently ignoring the request.
    if ingredients_text.strip() and user_ingredient_vector.nnz == 0:
        warnings.warn(
            "None of the terms in user_ingredients=%r are in the fitted TF-IDF "
            "vocabulary; the ingredient preference is ignored." % (ingredients_text,),
            UserWarning,
            stacklevel=2,
        )

    # Build the profile as a named one-row frame and let `features` decide the
    # column order. Passing a bare list to a scaler fitted on a DataFrame
    # triggers sklearn's "X does not have valid feature names" warning and
    # silently depends on positional order.
    query_frame = pd.DataFrame([{
        'User_Preferences': user_preference,
        'Dish_Rating': assumed_rating,
        'Season': current_season,
        'User_Age': user_age,
    }])[features]
    query_features = scaler.transform(query_frame)

    # Same column layout as X_combined: ingredient columns, then features
    query_combined = hstack([user_ingredient_vector, query_features], format='csr')

    # Only the neighbour indices are needed; distances are discarded
    _, indices = knn_with_ingredients.kneighbors(query_combined, n_neighbors=n_recommendations)

    # Neighbour indices are row positions in `data`
    recommendations = data.iloc[indices[0]][['Dish_Name', 'Ingredients_List']].values
    return recommendations

# Example usage: a vegetarian user, aged 30, in summer, with the text "Kadhi"
# passed as the ingredient query.
# NOTE(review): the recorded output below is identical to the output of the
# no-ingredient example earlier in this notebook, which suggests "Kadhi" does
# not match any term in the fitted TF-IDF vocabulary - confirm, and consider
# querying with real ingredient names such as "Garlic, Onion, Spices".
recommend_dishes_with_ingredients_input(
    user_ingredients="Kadhi",
    user_preference=1,  # Vegetarian
    current_season=1,  # Summer
    user_age=30,
)




array([['Rajma Chawal', 'Kidney Beans, Rice, Onion, Garlic, Spices'],
       ['Sabudana Khichdi', 'Sabudana, Peanuts, Potatoes, Spices'],
       ['Dal Tadka', 'Yellow Lentils, Onion, Garlic, Tomatoes, Spices'],
       ['Pulao', 'Basmati Rice, Mixed Vegetables, Onion, Spices'],
       ['Masoor Dal', 'Red Lentils, Onion, Garlic, Tomatoes, Spices']],
      dtype=object)