In [1]:
import numpy as np
import pandas as pd

In [2]:
food_data = pd.read_csv('preprocessed_food.csv')
disease_data = pd.read_csv('preprocessed_disease.csv')

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
selected_diseases = ['Hypertension', 'Diabetes(Type-1)', 'Skin Cancer', 'general']

# Create an empty dictionary to store similarity scores for each disease
similarity_scores_dict = {}

for selected_disease in selected_diseases:
    disease_limits = disease_data[disease_data['disease_name'] == selected_disease].iloc[:, 1:].values
    nutrient_columns = food_data.columns[1:]
    food_nutrition = food_data[nutrient_columns].values
    similarity_scores = cosine_similarity(disease_limits, food_nutrition)
    inverted_similarity_scores = 1 - similarity_scores
    top_food_indices = np.argsort(inverted_similarity_scores)[0]

    # Store the similarity scores in the dictionary
    similarity_scores_dict[selected_disease] = inverted_similarity_scores

    print("Top Recommended Foods for", selected_disease)
    for i, idx in enumerate(top_food_indices[:5]):
        food_name = food_data.loc[idx, 'name']
        print(f"{i+1}. {food_name}")
    print()

    print("\nNot Recommended Foods for", selected_disease)
    for i, idx in enumerate(top_food_indices[-5:][::-1]):
        food_name = food_data.loc[idx, 'name']
        print(f"{i+1}. {food_name}")
    print('=' * 50)

Top Recommended Foods for Hypertension
1. Restaurant, without meat or noodles, vegetable chow mein, Chinese
2. Soup, ready to serve, traditional, beef barley, PROGRESSO
3. Soup, ready to serve, beef barley
4. Soup, ready-to-serve, canned, beef and vegetables
5. Soup, ready-to-serve, canned, chunky vegetable


Not Recommended Foods for Hypertension
1. Seasoning mix, coriander & annatto, sazon, dry
2. Salt, table
3. Leavening agents, baking soda
4. Fish oil, fully hydrogenated, menhaden
5. Oil, fully hydrogenated, soy, industrial
Top Recommended Foods for Diabetes(Type-1)
1. Restaurant, without meat or noodles, vegetable chow mein, Chinese
2. Squash, without salt, drained, boiled, cooked, crookneck and straightneck, summer
3. Squash, with salt, drained, boiled, cooked, crookneck and straightneck, summer
4. Soup, ready-to-serve, canned, chicken and vegetable
5. Squash, includes skin, zucchini, summer


Not Recommended Foods for Diabetes(Type-1)
1. Seasoning mix, coriander & annatto, sazon

In [5]:
# Convert the dictionary to a DataFrame
similarity_scores_dict

{'Hypertension': array([[0.83033188, 0.92873466, 0.00226299, ..., 0.16279868, 0.04943411,
         0.04943411]]),
 'Diabetes(Type-1)': array([[0.86963811, 0.92891784, 0.00248732, ..., 0.15985101, 0.04879213,
         0.04879213]]),
 'Skin Cancer': array([[0.90135264, 0.93812756, 0.00318844, ..., 0.16082725, 0.0471918 ,
         0.0471918 ]]),
 'general': array([[0.8303457 , 0.92413273, 0.00233985, ..., 0.16149148, 0.05102753,
         0.05102753]])}

In [6]:

flattened_dict = {disease: array.flatten() for disease, array in similarity_scores_dict.items()}
similarity_scores_df = pd.DataFrame(flattened_dict)

In [7]:
similarity_scores_df

Unnamed: 0,Hypertension,Diabetes(Type-1),Skin Cancer,general
0,0.830332,0.869638,0.901353,0.830346
1,0.928735,0.928918,0.938128,0.924133
2,0.002263,0.002487,0.003188,0.002340
3,0.799281,0.838257,0.867219,0.800134
4,0.108112,0.121677,0.124822,0.107994
...,...,...,...,...
8181,0.050708,0.050044,0.048484,0.052267
8182,0.117612,0.117246,0.115550,0.119226
8183,0.162799,0.159851,0.160827,0.161491
8184,0.049434,0.048792,0.047192,0.051028


### Export similarity scores

In [8]:
import pickle

with open('similarity_scores.pkl', 'wb') as model_file:
    pickle.dump(similarity_scores_df, model_file)

with open('food.pkl', 'wb') as model_file:
    pickle.dump(food_data, model_file)
with open('disease.pkl', 'wb') as model_file:
    pickle.dump(disease_data, model_file)

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load data
food_data = pd.read_csv('preprocessed_food.csv')
disease_data = pd.read_csv('preprocessed_disease.csv')

# Get user input
selected_disease = input("Enter the disease name: ")

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load data
food_data = pd.read_csv('preprocessed_food.csv')
disease_data = pd.read_csv('preprocessed_disease.csv')

# Get user input
selected_disease = input("Enter the disease name: ")

# Extract disease limits
disease_limits = disease_data[disease_data['disease_name'] == selected_disease].iloc[:, 1:].values

# Extract nutrient columns
nutrient_columns = food_data.columns[1:]
food_nutrition = food_data[nutrient_columns].values

# Normalize data
scaler = StandardScaler()
food_nutrition_scaled = scaler.fit_transform(food_nutrition)
disease_limits_scaled = scaler.transform(disease_limits)

# Calculate distances
distances = disease_limits_scaled - food_nutrition_scaled

# Convert distances above the disease limit to negative and below to positive
distances_sign = np.where(distances > 0, 1, -1)
distances_magnitude = np.linalg.norm(distances, axis=1)
distances = distances_sign * distances_magnitude

# Create DataFrame with food names and distances
food_distance_df = pd.DataFrame({'Food': food_data['name'], 'Distance': distances})

# Train-test split
X = food_nutrition_scaled
y = distances
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict distances for the entire dataset
predicted_distances = model.predict(X)

# Evaluate the model
print("Model Evaluation:")
print("Train MSE:", mean_squared_error(y_train, model.predict(X_train)))
print("Test MSE:", mean_squared_error(y_test, model.predict(X_test)))
print("Train R-squared:", r2_score(y_train, model.predict(X_train)))
print("Test R-squared:", r2_score(y_test, model.predict(X_test)))

# Feature Importance
feature_importance = np.abs(model.coef_)  # Absolute values for better interpretation
sorted_indices = np.argsort(feature_importance)[::-1]  # Sort indices in descending order

# Display top 10 important features
print("\nTop 10 Important Features:")
for i in range(10):
    feature_index = sorted_indices[i]
    feature_name = nutrient_columns[feature_index]
    print(f"{i+1}. {feature_name}: {feature_importance[feature_index]}")

# Get top recommended foods and foods to avoid
top_n = 30
recommended_food_indices = np.argsort(predicted_distances)[:top_n]
avoid_food_indices = np.argsort(predicted_distances)[::-1][:top_n]

# Display recommended foods
print("\nTop Recommended Foods for", selected_disease)
for i, idx in enumerate(recommended_food_indices):
    food_name = food_data.loc[idx, 'name']
    print(f"{i+1}. {food_name} (Positive Distance)")

# Display foods to avoid
print("\nFoods to Avoid for", selected_disease)
for i, idx in enumerate(avoid_food_indices):
    food_name = food_data.loc[idx, 'name']
    print(f"{i+1}. {food_name} (Negative Distance)")



ValueError: operands could not be broadcast together with shapes (8186,73) (8186,) 

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load data
food_data = pd.read_csv('preprocessed_food.csv')
disease_data = pd.read_csv('preprocessed_disease.csv')

# Get user input
selected_disease = input("Enter the disease name: ")

# Extract disease limits
disease_limits = disease_data[disease_data['disease_name'] == selected_disease].iloc[:, 1:].values

# Extract nutrient columns
nutrient_columns = food_data.columns[1:]
food_nutrition = food_data[nutrient_columns].values

# Normalize data
scaler = StandardScaler()
food_nutrition_scaled = scaler.fit_transform(food_nutrition)
disease_limits_scaled = scaler.transform(disease_limits)

# Calculate distances
distances = disease_limits_scaled - food_nutrition_scaled

# Convert distances above the disease limit to negative and below to positive
distances_sign = np.where(distances > 0, 1, -1)
distances_magnitude = np.linalg.norm(distances, axis=1)
distances_signed = distances_sign * distances_magnitude[:, np.newaxis]  # Adding a new axis to match shapes

# Train-test split
X = food_nutrition_scaled
y = distances_signed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict distances for the entire dataset
predicted_distances = model.predict(X)

# Evaluate the model
print("Model Evaluation:")
print("Train MSE:", mean_squared_error(y_train, model.predict(X_train)))
print("Test MSE:", mean_squared_error(y_test, model.predict(X_test)))
print("Train R-squared:", r2_score(y_train, model.predict(X_train)))
print("Test R-squared:", r2_score(y_test, model.predict(X_test)))

# Get top recommended foods and foods to avoid
top_n = 30
recommended_food_indices = np.argsort(predicted_distances)[:top_n]
avoid_food_indices = np.argsort(predicted_distances)[::-1][:top_n]

# Display recommended foods
print("\nTop Recommended Foods for", selected_disease)
for i, idx in enumerate(recommended_food_indices):
    food_name = food_data.loc[idx, 'name']
    print(f"{i+1}. {food_name} (Positive Distance)")

# Display foods to avoid
print("\nFoods to Avoid for", selected_disease)
for i, idx in enumerate(avoid_food_indices):
    food_name = food_data.loc[idx, 'name']
    print(f"{i+1}. {food_name} (Negative Distance)")


Model Evaluation:
Train MSE: 44673.14362054429
Test MSE: 49918.75544688587
Train R-squared: 0.622887508674847
Test R-squared: -5.583544969027783

Top Recommended Foods for Skin Cancer
1. 54          Tomatoes, yellow
70     Frankfurter, meatless
3             Teff, uncooked
71          Emu, flat fillet
68          Fireweed, leaves
               ...          
37    Nuts, dried, beechnuts
50     KEEBLER, Waffle Bowls
2                   Eggplant
53          Tomatoes, orange
34    McDONALD'S, Hash Brown
Name: name, Length: 73, dtype: object (Positive Distance)
2. 63     Danish pastry, cheese
65     Spices, garlic powder
0                 Cornstarch
1               Nuts, pecans
66     Oil, soybean lecithin
               ...          
52          Durian or frozen
20         Grapes, muscadine
12      Mango nectar, canned
48          Chicory, witloof
43    Snack, Mixed Berry Bar
Name: name, Length: 73, dtype: object (Positive Distance)
3. 68          Fireweed, leaves
70     Frankfurter, meat

In [25]:
import pandas as pd

food_data = pd.read_csv('preprocessed_food.csv')
disease_data = pd.read_csv('preprocessed_disease.csv')
# Extract nutrient columns
nutrient_columns = food_data.columns[1:]
X = food_data[nutrient_columns].values

# Choose the disease you are interested in
selected_disease = "Hypertension"  # Update with the desired disease
y = abs(disease_data[disease_data['disease_name'] == selected_disease].iloc[:, 1:].values - X)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))
# Predict deviations for all foods
all_food_deviations = model.predict(X_scaled)

# Get indices of top recommended foods (lowest deviation)
top_n = 10  # Update with the desired number of recommendations
recommended_food_indices = all_food_deviations.sum(axis=1).argsort()[:top_n]

# Display recommended foods
recommended_foods = food_data.iloc[recommended_food_indices]['name']
print("Top Recommended Foods:")
print(recommended_foods)



Mean Squared Error: 2648797.21745573
R-squared: 0.6703879566240043
Top Recommended Foods:
3443      UDI'S, Classic French Dinner Rolls, Gluten Free
7392    Rolls, and potato starch, tapioca starch, made...
1754                      Wheat, uncooked, KAMUT khorasan
1345                    McDONALD'S, Low Fat Caramel Sauce
481                                       Spelt, uncooked
7338    Bagels, toasted, with calcium propionate (incl...
6201    Formulated bar, milk chocolate peanut, SLIM-FA...
5456    Beef, simmered, cooked, brain, variety meats a...
3385           Beef, brain, variety meats and by-products
6873    Bread, made with tapioca starch and brown rice...
Name: name, dtype: object


In [28]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load datasets
food_data = pd.read_csv('preprocessed_food.csv')
disease_data = pd.read_csv('preprocessed_disease.csv')

# Merge datasets on common columns
merged_data = pd.merge(food_data, disease_data, on=['total_fat', 'saturated_fat', 'cholesterol', 'sodium', 'choline', 'folate', 'folic_acid', 'niacin',
                  'pantothenic_acid', 'riboflavin', 'thiamin', 'vitamin_a', 'vitamin_a_rae', 'carotene_alpha',
                  'carotene_beta', 'cryptoxanthin_beta', 'lutein_zeaxanthin', 'lucopene', 'vitamin_b12', 'vitamin_b6',
                  'vitamin_c', 'vitamin_d', 'vitamin_e', 'tocopherol_alpha', 'vitamin_k', 'calcium', 'copper', 'irom',
                  'magnesium', 'manganese', 'phosphorous', 'potassium', 'selenium', 'zink', 'protein', 'alanine',
                  'arginine', 'aspartic_acid', 'cystine', 'glutamic_acid', 'glycine', 'histidine', 'hydroxyproline',
                  'isoleucine', 'leucine', 'lysine', 'methionine', 'phenylalanine', 'proline', 'serine', 'threonine',
                  'tryptophan', 'tyrosine', 'valine', 'carbohydrate', 'fiber', 'sugars', 'fructose', 'galactose',
                  'glucose', 'lactose', 'maltose', 'sucrose', 'fat', 'saturated_fatty_acids', 'monounsaturated_fatty_acids',
                  'polyunsaturated_fatty_acids', 'fatty_acids_total_trans', 'alcohol', 'ash', 'caffeine', 'theobromine', 'water'])

# Define features and target variable
X = merged_data.drop(['disease_name'], axis=1)  # Features
y = merged_data['disease_name']  # Target variable

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Use the trained model to recommend and avoid foods
patient_data = pd.DataFrame(...)  # Provide patient-specific data
patient_data_scaled = scaler.transform(patient_data)
recommendations = model.predict(patient_data_scaled)

# Display recommendations
print("Recommended foods:", merged_data.loc[recommendations == 1, 'name'])
print("Foods to avoid:", merged_data.loc[recommendations == 0, 'name'])


ValueError: Found array with 0 sample(s) (shape=(0, 74)) while a minimum of 1 is required by StandardScaler.

ModuleNotFoundError: No module named 'quaternion'

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load data
food_data = pd.read_csv('preprocessed_food.csv')
disease_data = pd.read_csv('preprocessed_disease.csv')

# Get user input
selected_disease = input("Enter the disease name: ")

# Extract disease limits
disease_limits = disease_data[disease_data['disease_name'] == selected_disease].iloc[:, 1:].values


In [6]:
!pip install tensorflow




In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load data
food_data = pd.read_csv('preprocessed_food.csv')
disease_data = pd.read_csv('preprocessed_disease.csv')

# Extract disease limits
disease_limits = disease_data[disease_data['disease_name'] == selected_disease].iloc[:, 1:].values

# Extract nutrient columns
nutrient_columns = food_data.columns[1:]
food_nutrition = food_data[nutrient_columns].values

# Normalize data
scaler = StandardScaler()
food_nutrition_scaled = scaler.fit_transform(food_nutrition)
disease_limits_scaled = scaler.transform(disease_limits)

# Train-test split
X = food_nutrition_scaled
y = disease_limits_scaled  # Target variable is disease_limits_scaled
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create TensorFlow Linear Regression Model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(units=1, activation='linear')
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Predictions
predicted_distances_train = model.predict(X_train)
predicted_distances_test = model.predict(X_test)

# Evaluate the model
print("Model Evaluation:")
print("Train MSE:", mean_squared_error(y_train, predicted_distances_train))
print("Test MSE:", mean_squared_error(y_test, predicted_distances_test))
print("Train R-squared:", r2_score(y_train, predicted_distances_train))
print("Test R-squared:", r2_score(y_test, predicted_distances_test))





ValueError: Found input variables with inconsistent numbers of samples: [8186, 1]

In [10]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load data
food_data = pd.read_csv('preprocessed_food.csv')
disease_data = pd.read_csv('preprocessed_disease.csv')

# Extract disease limits
disease_limits = disease_data[disease_data['disease_name'] == selected_disease].iloc[:, 1:].values

# Extract nutrient columns
nutrient_columns = food_data.columns[1:]
food_nutrition = food_data[nutrient_columns].values

# Normalize data
scaler = StandardScaler()
food_nutrition_scaled = scaler.fit_transform(food_nutrition)
disease_limits_scaled = scaler.fit_transform(disease_limits)



In [14]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load data
food_data = pd.read_csv('preprocessed_food.csv')
disease_data = pd.read_csv('preprocessed_disease.csv')

# Get user input
selected_disease = input("Enter the disease name: ")

# Extract disease limits for the selected disease
disease_limits = disease_data[disease_data['disease_name'] == selected_disease].iloc[:, 1:].values

# Extract nutrient columns
nutrient_columns = food_data.columns[1:]
food_nutrition = food_data[nutrient_columns].values

# Normalize data
scaler = StandardScaler()
food_nutrition_scaled = scaler.fit_transform(food_nutrition)
disease_limits_scaled = scaler.transform(disease_limits)

# Calculate distances
distance = food_nutrition_scaled - disease_limits_scaled
distance_magnitude = np.linalg.norm(distance, axis=1)

# Create DataFrame with food names and distances
food_distance_df = pd.DataFrame({'Food': food_data['name'], 'Distance': distance_magnitude})

# Sort foods based on distance (ascending order)
food_distance_df = food_distance_df.sort_values(by='Distance', ascending=True)

# Display recommended foods
print("\nTop Recommended Foods for", selected_disease)
for i, (food_name, dist) in enumerate(zip(food_distance_df['Food'], food_distance_df['Distance'])):
    print(f"{i+1}. {food_name} (Distance: {dist:.2f})")



Top Recommended Foods for Hypertension
1. Mollusks, moist heat, cooked, mixed species, clam (Distance: 523.12)
2. Lamb, liver, variety meats and by-products (Distance: 525.40)
3. Lamb, pan-fried, cooked, liver, variety meats and by-products (Distance: 526.62)
4. Beef, pan-fried, cooked, liver, variety meats and by-products (Distance: 527.51)
5. Lamb, braised, cooked, kidneys, variety meats and by-products (Distance: 527.70)
6. Veal, braised, cooked, liver, variety meats and by-products (Distance: 528.00)
7. Lamb, braised, cooked, liver, variety meats and by-products (Distance: 528.51)
8. Beef, boiled, cooked, variety meats and by-products liver, imported, New Zealand (Distance: 528.54)
9. Moose, braised (Alaska Native), liver (Distance: 530.08)
10. Beef, braised, cooked, liver, variety meats and by-products (Distance: 530.39)
11. Veal, pan-fried, cooked, liver, variety meats and by-products (Distance: 530.65)
12. Beef, liver, variety meats and by-products, imported, New Zealand (Dista

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load data
food_data = pd.read_csv('./datasets/preprocessed_food.csv')
disease_data = pd.read_csv('./datasets/preprocessed_recommendedfordisease.csv')

# Get user input
selected_disease = input("Enter the disease name: ")


In [4]:

# Extract disease limits for the selected disease
disease_limits = disease_data[disease_data['Disease'] == selected_disease].iloc[:, 1:].values

# Extract nutrient columns
nutrient_columns = food_data.columns[2:]
food_nutrition = food_data[nutrient_columns].values

food_nutrition



array([[7.000e+01, 1.030e+03, 6.890e+03, ..., 0.000e+00, 0.000e+00,
        8.750e+04],
       [5.100e+01, 3.340e+03, 4.870e+03, ..., 0.000e+00, 0.000e+00,
        8.904e+04],
       [6.000e+01, 3.280e+03, 4.670e+03, ..., 0.000e+00, 0.000e+00,
        8.810e+04],
       ...,
       [5.200e+01, 1.810e+03, 1.160e+04, ..., 0.000e+00, 0.000e+00,
        8.559e+04],
       [2.710e+02, 1.340e+03, 1.714e+04, ..., 0.000e+00, 0.000e+00,
        5.597e+04],
       [8.920e+02, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [5]:
# Normalize data
scaler = StandardScaler()
food_nutrition_scaled = scaler.fit_transform(food_nutrition)
disease_limits_scaled = scaler.transform(disease_limits)



In [12]:
# Calculate distances
distance = food_nutrition_scaled - disease_limits_scaled
distance_magnitude = np.linalg.norm(distance, axis=1)

(array([[-0.0228033 , -0.29825999, -0.20385197, ...,  0.        ,
          0.        ,  0.19964295],
        [-0.1579062 , -0.02183019, -0.29508875, ...,  0.        ,
          0.        ,  0.2573872 ],
        [-0.09391009, -0.02901019, -0.30412209, ...,  0.        ,
          0.        ,  0.22214071],
        ...,
        [-0.15079552, -0.20492006,  0.00888329, ...,  0.        ,
          0.        ,  0.1280251 ],
        [ 1.40644314, -0.26116335,  0.25910694, ...,  0.        ,
          0.        , -0.98261416],
        [ 5.82217467, -0.42151656, -0.51505068, ...,  0.        ,
          0.        , -3.08127989]]),
 array([ 1.51940552,  1.58542991,  1.59803665, ...,  0.79941605,
         3.50440328, 13.75475812]))

In [15]:
# Train-test split
X = food_nutrition_scaled
y = distance_magnitude
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [14]:
# Predict distances
predicted_distances_train = model.predict(X_train)
predicted_distances_test = model.predict(X_test)

# Evaluate the model
print("Model Evaluation:")
print("Train MSE:", mean_squared_error(y_train, predicted_distances_train))
print("Test MSE:", mean_squared_error(y_test, predicted_distances_test))
print("Train R-squared:", r2_score(y_train, predicted_distances_train))
print("Test R-squared:", r2_score(y_test, predicted_distances_test))

# Get top recommended foods
top_n = 30
recommended_food_indices = np.argsort(predicted_distances_test)[:top_n]

# Display recommended foods
print("\nTop Recommended Foods for", selected_disease)
for i, idx in enumerate(recommended_food_indices):
    food_name = food_data.loc[idx, 'Name']
    print(f"{i+1}. {food_name}")

Model Evaluation:
Train MSE: 2.252706192379339e-29
Test MSE: 2.230950952012088e-29
Train R-squared: 1.0
Test R-squared: 1.0

Top Recommended Foods for AcidReflux
1. 16                 Buttermilk, fat free (skim)
11                       Milk, fat free (skim)
3                      Milk, low sodium, whole
21                          Goat's milk, whole
4               Milk, calcium fortified, whole
15                   Milk, lactose free, whole
12            Milk, lactose free, low fat (1%)
10                          Milk, low fat (1%)
8              Milk, acidophilus, low fat (1%)
18                Buttermilk, reduced fat (2%)
19                           Buttermilk, whole
20                 Kefir, NS as to fat content
13         Milk, lactose free, fat free (skim)
14        Milk, lactose free, reduced fat (2%)
2                                  Milk, whole
7                       Milk, reduced fat (2%)
1                                    Milk, NFS
9          Milk, acidophilus, reduce

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

foods_df = pd.read_csv("preprocessed_food.csv")
disease_limits_df = pd.read_csv("preprocessed_disease.csv")




In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Load preprocessed datasets
foods_df = pd.read_csv('./datasets/preprocessed_food.csv')
recommend_df = pd.read_csv('./datasets/preprocessed_recommendedfordisease.csv')
avoid_df = pd.read_csv('./datasets/preprocessed_avoidfordisease.csv')





In [25]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Load datasets
food_df = pd.read_csv('./datasets/preprocessed_food.csv')
recommended_df = pd.read_csv('./datasets/preprocessed_recommendedfordisease.csv')

# Drop irrelevant columns for modeling
food_features = food_df.drop(['Name', 'Category'], axis=1)
recommended_features = recommended_df.drop('Disease', axis=1)



In [26]:
# Scale the features (optional but recommended for KNN)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
food_features_scaled = scaler.fit_transform(food_features)
recommended_features_scaled = scaler.transform(recommended_features)

In [27]:
# Train KNN model
k = 5  # Number of neighbors to consider
knn_model = NearestNeighbors(n_neighbors=k, metric='euclidean')
knn_model.fit(food_features_scaled)

In [28]:
def recommend_foods_for_disease(disease_name):
    disease_row = recommended_df[recommended_df['Disease'] == disease_name]
    if disease_row.empty:
        print("Disease not found in the database.")
        return
    
    disease_features = disease_row.iloc[0].drop('Disease')
    scaled_features = scaler.transform([disease_features])
    distances, indices = knn_model.kneighbors(scaled_features)
    recommended_foods = food_df.iloc[indices[0]]
    return recommended_foods



In [29]:
# Example: Input disease name and get recommended foods
disease_name = input("Enter the name of the disease: ")
recommended_foods = recommend_foods_for_disease(disease_name)
if recommended_foods is not None:
    print("Recommended foods for", disease_name, ":", recommended_foods['Name'].values)

Recommended foods for diarrhoea : ['Banana' 'Plantain, cooked with butter or margarine'
 'Plantain, cooked, fat added, NS as to fat type'
 'Plantain, cooked with oil' 'Green banana, fried']




In [17]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

food_df = pd.read_csv('./datasets/preprocessed_food.csv')
recommended_df = pd.read_csv('./datasets/preprocessed_recommendedfordisease.csv')

# Select only the nutritional information columns
nutritional_cols = ['Energy (kcal)', 'Protein (mg)', 'Carbohydrate (mg)', 'Sugars, total(mg)', 
                    'Fiber, total dietary (mg)', 'Total Fat (mg)', 'Fatty acids, total saturated (mg)', 
                    'Cholesterol (mg)', 'Niacin (mg)', 'Folic acid ((mg))', 'Choline, total (mg)', 
                    'Vitamin C (mg)', 'Vitamin E (alpha-tocopherol) (mg)', 'Calcium (mg)', 
                    'Phosphorus (mg)', 'Magnesium (mg)', 'Potassium (mg)', 'Sodium (mg)', 
                    'Caffeine (mg)', 'Theobromine (mg)', 'Alcohol (mg)', 'Water(mg)']

food_nutrition = food_df[nutritional_cols]
disease_nutrition = disease_df[nutritional_cols]

def recommend_food(disease):
    # Get the nutritional information for the disease
    disease_nutrition = disease_df[disease_df['Disease'] == disease][nutritional_cols]
    
    # Calculate cosine similarity
    similarity_scores = cosine_similarity(food_nutrition, disease_nutrition)
    
    # Find the index of the most similar food
    most_similar_index = similarity_scores.argmax()
    
    # Return the recommended food
    return food_df.iloc[most_similar_index]

# Test the function
print(recommend_food('Anemia'))


Name                                 Stuffed cabbage rolls with beef and rice
Category                                                    Meat mixed dishes
Energy (kcal)                                                             112
Protein (mg)                                                           8410.0
Carbohydrate (mg)                                                      8630.0
Sugars, total(mg)                                                      3450.0
Fiber, total dietary (mg)                                              1300.0
Total Fat (mg)                                                         5000.0
Fatty acids, total saturated (mg)                                      1874.0
Cholesterol (mg)                                                           38
Niacin (mg)                                                             1.907
Folic acid ((mg))                                                       0.005
Choline, total (mg)                                             

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, VotingRegressor, StackingRegressor
from sklearn.svm import SVR
import pandas as pd
import numpy as np
food_df = pd.read_csv('./datasets/preprocessed_food.csv')
recommended_df = pd.read_csv('./datasets/preprocessed_recommendedfordisease.csv')


In [8]:
# Define the transformers for specific columns
transformers = [
    ('col_tnf', OneHotEncoder(
        sparse_output=False, drop='first'),np.arange(food_df.shape[1])),
]

In [100]:
nutritional_cols = ['Energy (kcal)', 'Protein (mg)', 'Carbohydrate (mg)', 'Sugars, total(mg)', 
                    'Fiber, total dietary (mg)', 'Total Fat (mg)', 'Fatty acids, total saturated (mg)', 
                    'Cholesterol (mg)', 'Niacin (mg)', 'Folic acid ((mg))', 'Choline, total (mg)', 
                    'Vitamin C (mg)', 'Vitamin E (alpha-tocopherol) (mg)', 'Calcium (mg)', 
                    'Phosphorus (mg)', 'Magnesium (mg)', 'Potassium (mg)', 'Sodium (mg)', 
                    'Caffeine (mg)', 'Theobromine (mg)', 'Alcohol (mg)', 'Water(mg)']

In [101]:
#selected_disease="AcidReflux"
disease_limits = recommended_df[nutritional_cols].values

In [103]:
len(disease_limits)

20

In [104]:
# Extract nutrient columns
nutrient_columns = food_df.columns[2:]
food_nutrition = food_df[nutrient_columns].values
len(nutrient_columns),len(food_nutrition)

(22, 6529)

In [55]:
disease_limits

array([[7.32069094e+01, 3.52242515e+03, 1.14033221e+04, 7.22579101e+03,
        1.85626362e+03, 2.00456583e+03, 5.96563617e+02, 9.41929661e+00,
        9.96060146e-01, 5.41503000e-04, 1.79361562e+01, 1.13597059e+01,
        7.99354031e-01, 3.32163554e+01, 5.37591192e+01, 2.23386088e+01,
        2.34584283e+02, 1.17851167e+02, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 8.21756570e+04]])

In [108]:
scaler = StandardScaler()
food_nutrition_scaled = scaler.fit_transform(food_nutrition)
disease_limits_scaled = scaler.transform(disease_limits)
len(food_nutrition_scaled),len(disease_limits_scaled)

(6529, 20)

In [110]:
# Calculate distances
distance = food_nutrition_scaled - disease_limits_scaled.T
#distance_magnitude = np.linalg.norm(distance, axis=1)
distance

ValueError: operands could not be broadcast together with shapes (6529,22) (22,20) 

In [112]:
import numpy as np

# Reshape disease_limits_scaled to have a third dimension of size 1
disease_limits_reshaped = disease_limits_scaled.reshape((1, 20, 22))

# Expand food_nutrition_scaled along the first axis to match the shape of disease_limits_reshaped
food_nutrition_expanded = np.expand_dims(food_nutrition_scaled, axis=1)

# Calculate distances
distance = food_nutrition_expanded - disease_limits_reshaped

# Calculate distance magnitude along the axis of diseases (axis=2)
distance_magnitude = np.linalg.norm(distance, axis=2)

# distance_magnitude now contains the distances between each food item and each disease limit


In [114]:
len(distance),len(distance_magnitude)

(6529, 6529)

In [122]:
from sklearn.model_selection import train_test_split
# Train-test split
X = food_nutrition_scaled
y = distance_magnitude
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [125]:
y_train[0]

array([4.19130286, 3.59556665, 4.3182386 , 4.06464712, 4.37108987,
       3.91454942, 4.19130286, 4.13278435, 4.29182132, 4.0027943 ,
       4.18953257, 4.10581475, 2.89487251, 6.02094872, 3.82786477,
       3.82080398, 5.01769734, 3.91509648, 4.39501662, 4.12211656])

In [126]:
step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')

# Create a KNN regression model
step2 = KNeighborsRegressor(n_neighbors=3)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2),
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('R2 Score:', r2)
print('Mean Absolute Error:', mae)

ValueError: all features must be in [0, 21] or [-22, 0]

In [66]:
X_train.shape,y_train.shape

((5223, 22), (5223, 22))

In [71]:
!pip3 install torch

Collecting torch
  Downloading torch-2.2.0-cp310-cp310-win_amd64.whl (198.6 MB)
     -------------------------------------- 198.6/198.6 MB 5.8 MB/s eta 0:00:00
Collecting networkx
  Downloading networkx-3.2.1-py3-none-any.whl (1.6 MB)
     ---------------------------------------- 1.6/1.6 MB 17.6 MB/s eta 0:00:00
Collecting sympy
  Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
     ---------------------------------------- 5.7/5.7 MB 17.5 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.13.1-py3-none-any.whl (11 kB)
Collecting fsspec
  Downloading fsspec-2024.2.0-py3-none-any.whl (170 kB)
     ------------------------------------- 170.9/170.9 KB 10.7 MB/s eta 0:00:00
Collecting mpmath>=0.19
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
     ------------------------------------- 536.2/536.2 KB 17.0 MB/s eta 0:00:00
Installing collected packages: mpmath, sympy, networkx, fsspec, filelock, torch
Successfully installed filelock-3.13.1 fsspec-2024.2.0 mpmath-1.3.0 netwo

You should consider upgrading via the 'C:\Users\amosh\MajorProject\env\Scripts\python.exe -m pip install --upgrade pip' command.


In [72]:
import torch 
from torch import nn
device='cuda' if torch.cuda.is_available() else 'cpu'
print('Device:', device)


Device: cpu


In [117]:
X_train=torch.tensor(X_train, dtype=torch.float32)
X_test=torch.tensor(X_test, dtype=torch.float32)
y_train=torch.tensor(y_train, dtype=torch.float32)
y_test=torch.tensor(y_test, dtype=torch.float32)

In [118]:
class linearRegression(nn.Module):
    def __init__(self, input_size,hidden_units, output_size):
        super(linearRegression, self).__init__()
        self.linear = nn.Sequential(nn.Linear(input_size,hidden_units),
                                    nn.ReLU(),
                                    nn.Linear(hidden_units, hidden_units),
                                    nn.ReLU(),
                                    nn.Linear(hidden_units, 32),
                                    nn.ReLU(),
                                    nn.Linear(32, output_size)
                                    )
        

        
    def forward(self, x):
        x = self.linear(x)
        return x

In [119]:
model=linearRegression(22,256,1)
model


linearRegression(
  (linear): Sequential(
    (0): Linear(in_features=22, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [120]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)


In [121]:
X_train.shape,y_train.shape

(torch.Size([5223, 22]), torch.Size([5223, 20, 22]))

In [95]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.3.1-py3-none-any.whl (840 kB)
     -------------------------------------- 840.4/840.4 KB 3.3 MB/s eta 0:00:00
Collecting lightning-utilities>=0.8.0
  Downloading lightning_utilities-0.10.1-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.10.1 torchmetrics-1.3.1


You should consider upgrading via the 'C:\Users\amosh\MajorProject\env\Scripts\python.exe -m pip install --upgrade pip' command.


In [None]:
from torchmetrics import Accuracy

torchmetric_accuracy = Accuracy(task='multiclass', num_classes = 20).to(device)
torchmetric_accuracy(y_preds, y_test.to(device))

In [94]:
torch.manual_seed(42)

epochs = 10

# Placing data on CUDA
X_train = X_train.to(device)
X_test = X_test.to(device)
y_train = y_train.to(device)
y_test = y_test.to(device)

# Tracking
epoch_count = []
loss_values = []
test_loss_values = []

# Train Loop
for epoch in range(epochs):
  model.train()
  y_pred = model(X_train)
  loss = loss_fn(y_pred, y_train)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  # Testing
  if epoch % 2 == 0 or epoch == epochs - 1:
    model.eval()
    with torch.inference_mode():
      test_pred = model(X_test)
      test_loss = loss_fn(test_pred, y_test)

      epoch_count.append(epoch)
      loss_values.append(loss)
      test_loss_values.append(test_loss)
      print(f'Epoch: {epoch} ---- Train Loss: {loss:.4f} | Test Loss: {test_loss:.4f}')

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 0 ---- Train Loss: 1.0178 | Test Loss: 1.0038
Epoch: 2 ---- Train Loss: 1.0143 | Test Loss: 1.0004
Epoch: 4 ---- Train Loss: 1.0110 | Test Loss: 0.9971
Epoch: 6 ---- Train Loss: 1.0078 | Test Loss: 0.9940
Epoch: 8 ---- Train Loss: 1.0048 | Test Loss: 0.9910
Epoch: 9 ---- Train Loss: 1.0033 | Test Loss: 0.9896


In [16]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

# Load datasets
food_df = pd.read_csv('./datasets/preprocessed_food.csv')
recommended_df = pd.read_csv('./datasets/preprocessed_recommendedfordisease.csv')


In [17]:
# Drop irrelevant columns for modeling
food_features = food_df.drop(['Name', 'Category'], axis=1)
recommended_features = recommended_df.drop('Disease', axis=1)

In [19]:
# Scale the features using PyTorch
scaler = StandardScaler()
food_features_scaled = torch.tensor(scaler.fit_transform(food_features.values), dtype=torch.float32)
recommended_features_scaled = torch.tensor(scaler.transform(recommended_features.values), dtype=torch.float32)




In [21]:
# Train KNN Regressor
k = 5  # Number of neighbors to consider
knn_model = KNeighborsRegressor(n_neighbors=k, metric='euclidean')
knn_model.fit(food_features_scaled, food_features_scaled)  # Using food features for both input and output for demonstration


In [22]:
def recommend_foods_for_disease(disease_name):
    disease_row = recommended_df[recommended_df['Disease'] == disease_name]
    if disease_row.empty:
        print("Disease not found in the database.")
        return
    
    disease_features = disease_row.iloc[0].drop('Disease')
    scaled_features = scaler.transform([disease_features])
    scaled_features_tensor = torch.tensor(scaled_features, dtype=torch.float32)
    distances, indices = knn_model.kneighbors(scaled_features_tensor)
    recommended_foods = food_df.iloc[indices[0]]
    return recommended_foods


In [23]:

# Example: Input disease name and get recommended foods
disease_name = input("Enter the name of the disease: ")
recommended_foods = recommend_foods_for_disease(disease_name)
if recommended_foods is not None:
    print("Recommended foods for", disease_name, ":", recommended_foods['Name'].values)

Recommended foods for AcidReflux : ['Noodles with vegetables in tomato-based sauce, diet frozen meal'
 'Sweet potato, casserole or mashed'
 'Green beans, fresh, cooked, fat added, NS as to fat type'
 'Green beans, fresh, cooked with oil'
 'Green beans, fresh, cooked with butter or margarine']


In [24]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler

# Load datasets
food_df = pd.read_csv('./datasets/preprocessed_food.csv')
recommended_df = pd.read_csv('./datasets/preprocessed_recommendedfordisease.csv')

# Drop irrelevant columns for modeling
food_features = food_df.drop(['Name', 'Category'], axis=1)
recommended_features = recommended_df.drop('Disease', axis=1)





In [26]:
# Scale the features using PyTorch
scaler = StandardScaler()
food_features_scaled = torch.tensor(scaler.fit_transform(food_features.values), dtype=torch.float32)
recommended_features_scaled = torch.tensor(scaler.transform(recommended_features.values), dtype=torch.float32)

# Define a simple linear regression model
class LinearRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        
    def forward(self, x):
        return self.linear(x)



In [27]:
# Define model parameters
input_dim = food_features_scaled.shape[1]
output_dim = food_features_scaled.shape[1]  # Assuming output dimension is the same as input dimension


In [28]:
# Instantiate the linear regression model
model = LinearRegression(input_dim, output_dim)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)



In [29]:
# Train the linear regression model
num_epochs = 1000
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(food_features_scaled)
    loss = criterion(outputs, food_features_scaled)
    
    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 100 == 0:
        print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))


Epoch [100/1000], Loss: 0.9634
Epoch [200/1000], Loss: 0.7025
Epoch [300/1000], Loss: 0.5416
Epoch [400/1000], Loss: 0.4339
Epoch [500/1000], Loss: 0.3569
Epoch [600/1000], Loss: 0.2992
Epoch [700/1000], Loss: 0.2546
Epoch [800/1000], Loss: 0.2191
Epoch [900/1000], Loss: 0.1905
Epoch [1000/1000], Loss: 0.1669


In [30]:

# Function to recommend foods for a given disease using the trained model
def recommend_foods_for_disease(disease_name):
    disease_row = recommended_df[recommended_df['Disease'] == disease_name]
    if disease_row.empty:
        print("Disease not found in the database.")
        return
    
    disease_features = disease_row.iloc[0].drop('Disease')
    scaled_features = scaler.transform([disease_features])
    scaled_features_tensor = torch.tensor(scaled_features, dtype=torch.float32)
    
    # Predict using the trained linear regression model
    predicted_features = model(scaled_features_tensor).detach().numpy()
    
    # Find nearest neighbors based on predicted features
    distances = np.linalg.norm(food_features_scaled.numpy() - predicted_features, axis=1)
    indices = np.argsort(distances)[:5]  # Top 5 nearest neighbors
    
    recommended_foods = food_df.iloc[indices]
    return recommended_foods



In [31]:
# Example: Input disease name and get recommended foods
disease_name = input("Enter the name of the disease: ")
recommended_foods = recommend_foods_for_disease(disease_name)
if recommended_foods is not None:
    print("Recommended foods for", disease_name, ":", recommended_foods['Name'].values)

Recommended foods for AcidReflux : ['Potato, mashed, from school lunch'
 'Noodles with vegetables in tomato-based sauce, diet frozen meal'
 'Squash, summer, casserole, with rice and tomato sauce'
 'Sweet potato, casserole or mashed' 'Stuffed tomato, with rice and meat']
