In [None]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
import numpy as np
import re

In [None]:
# Load the raw food table from the Excel workbook (path is relative to the notebook's working directory).
df = pd.read_excel("../data/food_data.xlsx")

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# List the distinct values present in the main_category column.
df.main_category.unique()

In [None]:
# Three column labels contain stray spaces; map each one to its cleaned-up spelling.
column_fixes = {
    'vitamin_K_ UG': 'vitamin_K_UG',
    'vitamin D _UG': 'vitamin_D_UG',
    'vitamin B_12_UG': 'vitamin_B_12_UG',
}
df.rename(columns=column_fixes, inplace=True)

In [None]:
# (rows, columns) of the loaded table.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Column labels after the rename above.
df.columns

### Preprocessing

In [None]:
# Cleaning
# Vectorised string methods are used instead of per-row lambdas so that a
# missing description stays missing rather than raising AttributeError
# (missing values are only filled in the next cell).
description = df['description'].str.replace(r", raw\Z", "", regex=True)       # drop a trailing ", raw"
description = description.str.replace(r"^Game meat,\s*", "", regex=True)      # drop a leading "Game meat," prefix
df['description'] = description.str.capitalize()

# Fold the "Non Alcoholic" main category into "Veg"; every other value is left untouched.
df['main_category'] = df['main_category'].replace("Non Alcoholic", "Veg")

In [None]:
# Replace every missing value with 0; this applies to all columns, not only the numeric ones.
df = df.fillna(value=0)

In [None]:

# Columns grouped by the unit encoded in their suffix.
in_mg = [
    'calcium_MG', 'potassium_MG', 'zinc_MG', 'vitamin_C_MG',
    'iron_MG', 'magnesium_MG', 'phosphorus_MG', 'sodium_MG',
    'copper_MG', 'vitamin_E_MG', 'thiamin_MG', 'riboflavin_MG',
    'cholesterol_MG', 'Niacin_MG', 'vitamin_B_6_MG', 'choline_total_MG',
]

in_grams = [
    'carbohydrate_G', 'water_G', 'total_lipid_fat_G', 'protein_G',
    'fatty_acids_total_saturated_G', 'fiber_total_dietary_G', 'total_sugars_G',
    'fatty_acids_total_monounsaturated_G', 'fatty_acids_total_polyunsaturated_G',
]

in_ug = [
    'vitamin_A_UG', 'vitamin_K_UG', 'folate_total_UG',
    'vitamin_B_12_UG', 'selenium_UG', 'vitamin_D_UG',
]

# Descriptive / non-nutrient columns (not referenced again in this cell).
others = ['description', 'sub_category', 'main_category', 'category', 'energy (kJ)']

# Bring everything onto a milligram scale:
#   grams      -> milligrams : multiply by 1000
#   micrograms -> milligrams : divide by 1000
# The columns in `in_mg` are already in milligrams and are left as they are.
df[in_grams] = df[in_grams].mul(1000)
df[in_ug] = df[in_ug].div(1000)




In [None]:

# Remove a trailing _UG / _MG / _G unit tag from every column label.
unit_suffix_pattern = r'_(UG|MG|G)$'
df.columns = df.columns.str.replace(unit_suffix_pattern, '', regex=True)

In [None]:
# Column labels after the unit suffixes were stripped.
df.columns

In [None]:
# Largest value in the iron column (checked before any scaling is applied).
df['iron'].max()

In [None]:
# Nutrient columns that serve as model features.
nutrients = [
    'calcium', 'potassium', 'zinc', 'vitamin_C', 'iron', 'magnesium',
    'phosphorus', 'sodium', 'copper', 'vitamin_E', 'thiamin', 'riboflavin',
    'cholesterol', 'Niacin', 'vitamin_B_6', 'choline_total', 'vitamin_A',
    'vitamin_K', 'folate_total', 'vitamin_B_12', 'selenium', 'vitamin_D',
]

# Min-max scale each nutrient column and write the scaled values back in place.
# From here on these columns no longer hold milligram amounts.
scaler = MinMaxScaler()
scaled_nutrients = scaler.fit_transform(df[nutrients])
df[nutrients] = scaled_nutrients

# Persist the processed table (relative to the current working directory).
df.to_csv("processed_food_data.csv", index=False)
print("✅ Data preprocessing complete! File saved as 'processed_food_data.csv'.")


# Filtering data based on one deficiency.

In [None]:
# Read the processed table back from disk.
df = pd.read_csv("processed_food_data.csv")

# Feature columns, in the order used to build query vectors.
nutrients = [
    'calcium', 'potassium', 'zinc', 'vitamin_C', 'iron', 'magnesium',
    'phosphorus', 'sodium', 'copper', 'vitamin_E', 'thiamin', 'riboflavin',
    'cholesterol', 'Niacin', 'vitamin_B_6', 'choline_total', 'vitamin_A',
    'vitamin_K', 'folate_total', 'vitamin_B_12', 'selenium', 'vitamin_D',
]
X = df[nutrients]

# Index the foods for 10-nearest-neighbour lookups using Euclidean distance.
knn = NearestNeighbors(n_neighbors=10, metric='euclidean').fit(X)

def recommend_food(deficiency):
    """Look up the foods nearest to a profile dominated by one nutrient.

    Parameters
    ----------
    deficiency : str
        Name of the lacking nutrient; it must appear in the module-level
        ``nutrients`` list.

    Returns
    -------
    list of dict, or str
        One ``{'description': ..., <deficiency>: ...}`` record for each
        neighbour returned by the module-level ``knn`` model, or an error
        message when ``deficiency`` is not a known nutrient.
    """
    if deficiency in nutrients:
        # Query point: 1 on the requested nutrient's axis, 0 on all the others.
        target = np.zeros(len(nutrients))
        target[nutrients.index(deficiency)] = 1

        _, neighbour_rows = knn.kneighbors([target])
        matches = df.iloc[neighbour_rows[0]]
        return matches[['description', deficiency]].to_dict(orient="records")

    return "Invalid deficiency. Choose from: " + ", ".join(nutrients)

# Smoke-test the single-deficiency recommender on two nutrients.
print("✅ Model trained. Testing recommendations...\n")

demo_cases = (
    ("🥗 Vitamin C Deficiency:", "vitamin_C"),
    ("\n🥩 Iron Deficiency:", "iron"),
)
for heading, nutrient_name in demo_cases:
    print(heading)
    print(recommend_food(nutrient_name))


# Filtering data based on multiple deficiencies

In [None]:

# Read the processed table back from disk.
df = pd.read_csv("processed_food_data.csv")

# Feature columns, in the order used to build query vectors.
nutrients = [
    'calcium', 'potassium', 'zinc', 'vitamin_C', 'iron', 'magnesium',
    'phosphorus', 'sodium', 'copper', 'vitamin_E', 'thiamin', 'riboflavin',
    'cholesterol', 'Niacin', 'vitamin_B_6', 'choline_total', 'vitamin_A',
    'vitamin_K', 'folate_total', 'vitamin_B_12', 'selenium', 'vitamin_D',
]
X = df[nutrients]

# Index the foods for 40-nearest-neighbour lookups using Euclidean distance.
knn = NearestNeighbors(n_neighbors=40, metric='euclidean').fit(X)

def recommend_food(deficiencies):
    """Look up the foods nearest to a profile rich in several nutrients.

    Parameters
    ----------
    deficiencies : list of str
        Names of the lacking nutrients; each must appear in the module-level
        ``nutrients`` list.

    Returns
    -------
    str
        One line per neighbour returned by the module-level ``knn`` model,
        each line being the ``str`` of a record holding the description and
        the requested nutrient columns. An error message is returned instead
        when the argument is not a list or names an unknown nutrient.
    """
    if not isinstance(deficiencies, list):
        return "Invalid input. Provide a list of deficiencies."

    unknown = [name for name in deficiencies if name not in nutrients]
    if unknown:
        return f"Invalid deficiencies: {', '.join(unknown)}. Choose from: {', '.join(nutrients)}"

    # Query point: 1 on every requested nutrient's axis, 0 elsewhere.
    target = np.zeros(len(nutrients))
    for name in deficiencies:
        target[nutrients.index(name)] = 1

    _, neighbour_rows = knn.kneighbors([target])
    wanted_columns = ['description'] + deficiencies
    records = df.iloc[neighbour_rows[0]][wanted_columns].to_dict(orient="records")

    return "\n".join(str(record) for record in records)

# Smoke-test the multi-deficiency recommender on two nutrient pairs.
print("✅ Model trained. Testing recommendations...\n")

demo_cases = (
    ("🥗 Vitamin C and Iron Deficiency:", ["vitamin_C", "iron"]),
    ("\n🥩 Calcium and Magnesium Deficiency:", ["calcium", "magnesium"]),
)
for heading, nutrient_names in demo_cases:
    print(heading)
    print(recommend_food(nutrient_names))


### Filtering food based on category (Veg / Non-veg)

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd

# Reload the processed dataset from disk; this replaces any `df` already in the kernel.
df = pd.read_csv("processed_food_data.csv")

# Category-aware recommender; it reads the module-level `df` loaded above.
def recommend_food(deficiencies, category=None):
    """Recommend foods for one or more nutrient deficiencies, optionally within one category.

    A query point with 1 on each deficient nutrient and 0 elsewhere is
    compared (Euclidean distance) with the nutrient columns of the
    module-level ``df``; the closest rows are reported.

    Parameters
    ----------
    deficiencies : list of str
        Non-empty list of nutrient names taken from the ``nutrients`` list
        below. Repeated names are ignored.
    category : str, optional
        When given (and non-empty), only rows whose ``main_category`` equals
        this value are considered. ``None`` searches the whole table.

    Returns
    -------
    str
        A heading followed by one line per recommended food, or an error
        message when the input is invalid or the category has no rows.
        At most 40 foods are listed (fewer when the candidate set is smaller).
    """
    nutrients = ['calcium', 'potassium', 'zinc', 'vitamin_C', 'iron', 'magnesium', 'phosphorus', 'sodium', 'copper',
                 'vitamin_E', 'thiamin', 'riboflavin', 'cholesterol', 'Niacin', 'vitamin_B_6', 'choline_total',
                 'vitamin_A', 'vitamin_K', 'folate_total', 'vitamin_B_12', 'selenium', 'vitamin_D']

    # An empty list carries no deficiency to target, so treat it like any other invalid input.
    if not isinstance(deficiencies, list) or not deficiencies:
        return "Invalid input. Provide a list of deficiencies."

    # Check for invalid deficiencies (str() keeps the message buildable for non-string entries).
    invalid_nutrients = [str(d) for d in deficiencies if d not in nutrients]
    if invalid_nutrients:
        return f"Invalid deficiencies: {', '.join(invalid_nutrients)}. Choose from: {', '.join(nutrients)}"

    # Drop repeated names (order preserved) so each nutrient column is selected once.
    deficiencies = list(dict.fromkeys(deficiencies))

    # Filter by category. Every requested category is honoured; previously only the
    # literal 'Veg' was filtered and any other value silently searched the whole table.
    if category:
        df_filtered = df[df['main_category'] == category]
        if df_filtered.empty:
            return f"No data found for the category: {category}."
    else:
        df_filtered = df

    # Create a query vector: 1 for deficient nutrients, 0 for others
    sample = np.zeros(len(nutrients))
    for deficiency in deficiencies:
        sample[nutrients.index(deficiency)] = 1

    # Fit on the filtered rows only. NearestNeighbors rejects n_neighbors larger than
    # the number of fitted samples, so cap it for small categories. Fitting on a plain
    # array keeps the fit and query inputs the same kind of object.
    n_neighbors = min(40, len(df_filtered))
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    knn.fit(df_filtered[nutrients].to_numpy())

    # Find similar foods; the returned positions index into df_filtered.
    _, indices = knn.kneighbors([sample])
    recommendations = df_filtered.iloc[indices[0]][['description'] + deficiencies]

    # Format recommendations. The nutrient columns in processed_food_data.csv are
    # min-max scaled by the preprocessing cell, so they are labelled as scaled
    # values rather than milligrams.
    recommendation_list = [f"\nRecommendations for {' and '.join(deficiencies)} Deficiency:"]
    for _, row in recommendations.iterrows():
        values = ', '.join(f'{d.capitalize()}: {row[d]} (scaled 0-1)' for d in deficiencies)
        recommendation_list.append(f"Food: {row['description']}, {values}")

    return "\n".join(recommendation_list)

# Test the model with multiple deficiencies and category filtering.
# The headings announce "Veg Only", so the calls request the "Veg" category
# (they previously passed "Non-veg", contradicting the headings).
print("✅ Model trained. Testing recommendations...\n")

print("🥗 Vitamin C and Iron Deficiency (Veg Only):")
print(recommend_food(["vitamin_C", "iron"], category="Veg"))

print("\n🥩 Calcium and Magnesium Deficiency (Veg Only):")
print(recommend_food(["calcium", "magnesium"], category="Veg"))


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def evaluate_knn(features=None, n_clusters=2, random_state=42):
    """Print the silhouette score of a KMeans clustering of the food features.

    Note that, despite the function's name, this fits a ``KMeans`` model and
    scores its cluster assignment; it does not exercise the NearestNeighbors
    recommender.

    Parameters
    ----------
    features : array-like, optional
        Feature matrix to cluster. Defaults to the module-level ``X`` so the
        original zero-argument call keeps working.
    n_clusters : int, default 2
        Number of KMeans clusters.
    random_state : int, default 42
        Seed passed to ``KMeans`` for reproducible assignments.
    """
    if features is None:
        features = X  # fall back to the notebook-level feature matrix

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = kmeans.fit_predict(features)  # assign each food item to a cluster

    score = silhouette_score(features, labels)
    print(f"Silhouette Score for KNN-based Food Clustering: {score:.2f}")

evaluate_knn()


In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Silhouette score of a KMeans fit on X for each candidate cluster count.
scores = []
cluster_range = range(2, 15)  # Test cluster sizes from 2 to 14 (the range end is exclusive)

for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(X)
    score = silhouette_score(X, cluster_labels)
    scores.append(score)
# NOTE: after the loop, the module-level `kmeans`, `cluster_labels` and `score`
# hold the results of the last iteration (k=14).

# Plot silhouette scores
plt.plot(cluster_range, scores, marker='o', linestyle='-')
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Silhouette Score")
plt.title("Optimal Number of Clusters")
plt.show()

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Reduce to 2D for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Fit the clustering that this figure actually reports. The module-level
# `cluster_labels` is whatever the silhouette sweep's final iteration left
# behind, so reusing it would colour the points with a different cluster
# count than the one named in the title.
n_plot_clusters = 2
plot_cluster_labels = KMeans(n_clusters=n_plot_clusters, random_state=42).fit_predict(X)

# Plot clusters
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=plot_cluster_labels, cmap='viridis', marker='o')
plt.title(f"Food Clusters with K={n_plot_clusters}")
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(label='Cluster Label')
plt.show()


### KNN Neighborhood Visualization

In [None]:
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np

# Load the processed dataset from disk; plot_knn_recommendation below reads this module-level `df`.
df = pd.read_csv("processed_food_data.csv")

def plot_knn_recommendation(deficiencies, category=None, k=5):
    """Plot and print the foods nearest to a deficiency profile.

    A query point with 1 on each deficient nutrient and 0 elsewhere is
    compared (Euclidean distance) with the nutrient columns of the
    module-level ``df``. All candidate foods are drawn in gray on two
    nutrient axes, the nearest ones are highlighted and annotated, and a
    ranked list is printed.

    Parameters
    ----------
    deficiencies : list of str
        Non-empty list of nutrient names from the ``nutrients`` list below.
        The first name is the x axis; the second, if any, is the y axis.
    category : str, optional
        When given, only rows whose ``main_category`` equals this value are
        considered.
    k : int, default 5
        Number of foods to recommend; capped at the number of candidate rows.

    Raises
    ------
    ValueError
        If ``deficiencies`` is empty or names an unknown nutrient, or if the
        category filter leaves no rows.
    """
    # Define nutrients
    nutrients = ['calcium', 'potassium', 'zinc', 'vitamin_C', 'iron', 'magnesium', 'phosphorus', 'sodium', 'copper',
                 'vitamin_E', 'thiamin', 'riboflavin', 'cholesterol', 'Niacin', 'vitamin_B_6', 'choline_total',
                 'vitamin_A', 'vitamin_K', 'folate_total', 'vitamin_B_12', 'selenium', 'vitamin_D']

    # Validate deficiencies (an empty list would otherwise fail later with an IndexError).
    if not deficiencies:
        raise ValueError("Provide at least one deficiency.")
    for deficiency in deficiencies:
        if deficiency not in nutrients:
            raise ValueError(f"Invalid deficiency: {deficiency}. Choose from: {', '.join(nutrients)}")

    # Filter category if needed
    if category:
        df_filtered = df[df['main_category'] == category]
    else:
        df_filtered = df  # Use full dataset if no category is specified
    if df_filtered.empty:
        raise ValueError(f"No data found for the category: {category}.")

    # Create a deficiency-based query vector
    sample = np.zeros(len(nutrients))
    for deficiency in deficiencies:
        sample[nutrients.index(deficiency)] = 1  # Mark deficiencies

    # NearestNeighbors rejects n_neighbors larger than the number of fitted
    # samples, so cap k for small categories. Fitting on a plain array keeps
    # the fit and query inputs the same kind of object.
    n_neighbors = min(k, len(df_filtered))
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    knn.fit(df_filtered[nutrients].to_numpy())

    # Find nearest food recommendations; rows come back ordered nearest first.
    _, indices = knn.kneighbors([sample])
    recommended_foods = df_filtered.iloc[indices[0]]

    # Choose two nutrients for plotting. With a single deficiency the y axis
    # falls back to iron, or to calcium when iron itself is the deficiency,
    # so the two axes never show the same column.
    x_axis = deficiencies[0]
    if len(deficiencies) > 1:
        y_axis = deficiencies[1]
    else:
        y_axis = 'iron' if x_axis != 'iron' else 'calcium'

    # Plot all foods in gray
    plt.figure(figsize=(8, 6))
    plt.scatter(df_filtered[x_axis], df_filtered[y_axis], color='lightgray', label='All Foods', alpha=0.5)

    # Highlight recommended foods in blue (one scatter call gives one legend entry).
    plt.scatter(recommended_foods[x_axis], recommended_foods[y_axis], color='blue',
                label='Recommended Food', alpha=0.7)
    for _, food in recommended_foods.iterrows():
        plt.annotate(food['description'], (food[x_axis], food[y_axis]), fontsize=9)

    # Labels and title
    plt.xlabel(x_axis.capitalize())
    plt.ylabel(y_axis.capitalize())
    plt.title(f"KNN-Based Food Recommendation for {', '.join(deficiencies)} Deficiency")
    plt.legend()
    plt.show()

    # Print recommendations. Ranks come from enumerate: the DataFrame index label
    # is a row id from the CSV, not a position in the ranking. The nutrient
    # columns in processed_food_data.csv are min-max scaled by the preprocessing
    # cell, so they are labelled as scaled values rather than milligrams.
    print(f"\n🔍 Top {n_neighbors} Recommended Foods for {', '.join(deficiencies)} Deficiency:")
    for rank, (_, row) in enumerate(recommended_foods.iterrows(), start=1):
        details = ', '.join(f'{d.capitalize()}: {row[d]} (scaled 0-1)' for d in deficiencies)
        print(f"{rank}. {row['description']} - {details}")

# Example: plot recommendations for calcium, iron and niacin deficiency (Veg).
# The nutrient list spells it 'Niacin'; the lower-case form fails validation with ValueError.
plot_knn_recommendation(deficiencies=["calcium", "iron", "Niacin"], category="Veg", k=5)
