In [139]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [80]:
df = pd.read_csv('nutrients_csvfile.csv')


In [81]:
df.sample(10)

Unnamed: 0,Food,Measure,Grams,Calories,Protein,Fat,Sat.Fat,Fiber,Carbs,Category
174,Cantaloupe,1/2 med.,380,40,1,t,0,2.2,9,Fruits A-F
101,Broccoli,1 cup,150,45,5,t,0,1.9,8,Vegetables A-E
118,Endive,2 oz.,57,10,1,t,0,0.6,2,Vegetables A-E
8,Goats' milk,1 cup,244,165,8,10,8,0.0,11,Dairy products
224,Whole-wheat,1-lb. loaf,454,1100,48,14,10,67.5,216,"Breads, cereals, fastfood,grains"
124,Iceberg,1/4 head,100,13,t,t,0,0.5,3,Vegetables F-P
5,Powdered milk,1 cup,103,515,27,28,24,0.0,39,Dairy products
139,Peppers with beef and crumbs,1 med.,150,255,19,9,8,1.0,24,Vegetables R-Z
76,Flounder,3 1/2 oz.,100,200,30,8,0,0.0,0,"Fish, Seafood"
39,Safflower seed oil,1 T.,14,125,0,14,3,0.0,0,"Fats, Oils, Shortenings"


In [120]:
df.columns

Index(['Food', 'Grams', 'Calories', 'Protein', 'Fat', 'Sat.Fat', 'Fiber',
       'Carbs', 'Category'],
      dtype='object')

In [82]:
df.shape

(335, 10)

In [95]:
df.isnull().sum()

Food        0
Grams       0
Calories    0
Protein     0
Fat         0
Sat.Fat     0
Fiber       0
Carbs       0
Category    0
dtype: int64

In [84]:
 df[df.isna().any(axis=1)]

Unnamed: 0,Food,Measure,Grams,Calories,Protein,Fat,Sat.Fat,Fiber,Carbs,Category
42,Salt pork,2 oz.,60,470.0,3,55,,0,0.0,"Meat, Poultry"
100,Beetroots,1 cup,165,1.0,12,0,,t,0.8,Vegetables A-E
134,Frozen peas,1 cup,100,,5,t,0.0,1.8,12.0,Vegetables R-Z


In [94]:
df = df.fillna(0)

In [86]:
df = df.drop('Measure', axis=1)

In [87]:
df.iloc[:, 1:8] = df.iloc[:, 1:8].replace('t', 0)

In [88]:
def clean_numeric_columns(df, columns):
    for col in columns:
        # Apply string operations only to string columns
        df[col] = df[col].astype(str).str.replace(',', '').str.replace('-', '0').str.replace('a', '0').str.replace('8-44', '0')
        # Convert columns to numeric
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

In [89]:
numeric_columns = df.columns[1:8]

In [96]:
df.dtypes

Food         object
Grams         int64
Calories      int64
Protein       int64
Fat         float64
Sat.Fat       int64
Fiber       float64
Carbs       float64
Category     object
dtype: object

In [91]:
df = clean_numeric_columns(df, numeric_columns)

In [127]:
X = df.drop('Calories', axis=1)
y = df['Calories']

In [128]:
transformer = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(sparse=False, drop='first'), ['Food', 'Category']),
    ('num', StandardScaler(), ['Grams', 'Protein', 'Fat', 'Sat.Fat', 'Fiber', 'Carbs'])
], remainder='passthrough')

In [130]:
X_transformed = transformer.fit_transform(X)



In [131]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed , y, test_size=0.2, random_state=42)

In [132]:
rf = RandomForestClassifier(random_state=42)

In [133]:
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20],       # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],   # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]      # Minimum number of samples required to be at a leaf node
}

In [134]:
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', error_score='raise')
grid_search.fit(X_train, y_train)



In [135]:
best_rf = grid_search.best_estimator_

In [137]:
y_pred = best_rf.predict(X_test)


In [140]:
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))

Mean Squared Error: 13602.358208955224


In [148]:
# Compare actual vs predicted calorie values
comparison_df = pd.DataFrame({'Actual_Calories': y_test, 'Predicted_Calories': y_pred})
print(comparison_df.head(10))  # Displaying first 10 rows for comparison

     Actual_Calories  Predicted_Calories
25               105                 105
109               20                  30
76               200                 180
196              330                 112
57               290                 245
255              285                 250
182              195                 220
181              130                 220
274              150                 155
198              200                 185


#### for example in the actual dataset at row 76, Food:Flounder and actual calorie was 200 cal/g , predicted calorie is 180 cal/g

In [144]:
y_pred

array([105,  30, 180, 112, 245, 250, 220, 220, 155, 185,  90, 275, 100,
       280, 100, 100, 180, 280, 180,   0,  30, 180, 155, 100, 660,  10,
       290, 230, 260,  30, 260, 110, 285,  55, 115, 265, 170, 340,  80,
       250, 665, 370, 220,  14, 137, 185, 180, 220, 245, 100, 425, 100,
       180,  50, 260, 105, 113,  80, 290, 185,  17, 180,  55, 770,  60,
       180, 100], dtype=int64)