## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## Dataset

In [9]:

# Location of the nutrition dataset, relative to the notebook's working directory.
file_path = 'FOOD-DATA-GROUP1.csv'

# Parse the CSV into the DataFrame that the later cells work from.
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as a quick sanity check of the parsed columns.
df.head(n=5)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,food,Caloric Value,Fat,Saturated Fats,Monounsaturated Fats,Polyunsaturated Fats,Carbohydrates,Sugars,...,Calcium,Copper,Iron,Magnesium,Manganese,Phosphorus,Potassium,Selenium,Zinc,Nutrition Density
0,0,0,cream cheese,51,5.0,2.9,1.3,0.2,0.8,0.5,...,0.008,14.1,0.082,0.027,1.3,0.091,15.5,19.1,0.039,7.07
1,1,1,neufchatel cheese,215,19.4,10.9,4.9,0.8,3.1,2.7,...,99.5,0.034,0.1,8.5,0.088,117.3,129.2,0.054,0.7,130.1
2,2,2,requeijao cremoso light catupiry,49,3.6,2.3,0.9,0.0,0.9,3.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.4
3,3,3,ricotta cheese,30,2.0,1.3,0.5,0.002,1.5,0.091,...,0.097,41.2,0.097,0.096,4.0,0.024,30.8,43.8,0.035,5.196
4,4,4,cream cheese low fat,30,2.3,1.4,0.6,0.042,1.2,0.9,...,22.2,0.072,0.008,1.2,0.098,22.8,37.1,0.034,0.053,27.007


In [3]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

# Summary statistics for the numeric columns; kept as the cell's last
# expression so the notebook renders it as a formatted table.
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551 entries, 0 to 550
Data columns (total 37 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0.1          551 non-null    int64  
 1   Unnamed: 0            551 non-null    int64  
 2   food                  551 non-null    object 
 3   Caloric Value         551 non-null    int64  
 4   Fat                   551 non-null    float64
 5   Saturated Fats        551 non-null    float64
 6   Monounsaturated Fats  551 non-null    float64
 7   Polyunsaturated Fats  551 non-null    float64
 8   Carbohydrates         551 non-null    float64
 9   Sugars                551 non-null    float64
 10  Protein               551 non-null    float64
 11  Dietary Fiber         551 non-null    float64
 12  Cholesterol           551 non-null    float64
 13  Sodium                551 non-null    float64
 14  Water                 551 non-null    float64
 15  Vitamin A             5

In [10]:

# Split the frame into predictors and the regression target.
#
# Columns named "Unnamed: ..." are row-index artifacts written by earlier CSV
# round-trips. Drop whichever of them are actually present instead of a
# hard-coded list: the displayed frame also contains "Unnamed: 0.2", which
# would otherwise leak into the model as a numeric feature, and a hard-coded
# name that is absent would raise KeyError.
index_artifact_columns = [
    column for column in df.columns if str(column).startswith('Unnamed:')
]

# NOTE(review): the free-text 'food' column is kept as a feature here; it looks
# like a per-row identifier -- confirm it is intended to be one-hot encoded.
features = df.drop(columns=['Nutrition Density', *index_artifact_columns])
target = df['Nutrition Density']


In [11]:



# Single seed shared by the train/test split and the forest so that a fresh
# "Restart & Run All" reproduces the same scores.
RANDOM_STATE = 42

# Route columns to the right preprocessing branch based on their dtype.
numerical_features = features.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_features = features.select_dtypes(include=['object']).columns.tolist()

# Numeric branch: fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill gaps with the mode, then one-hot encode.
# handle_unknown='ignore' keeps predict() from failing on categories that
# were not present in the training split.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each branch to its own column subset and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Full pipeline: preprocessing followed by the regressor. The step name
# 'regressor' is what the grid-search parameter keys ('regressor__...') refer to.
# The forest is seeded; without random_state every run trains a different model.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=RANDOM_STATE))
])

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=RANDOM_STATE
)


In [None]:

# Hyper-parameter candidates; the 'regressor__' prefix targets the pipeline
# step named 'regressor'.
param_grid = {
    'regressor__n_estimators': [50, 100],
    'regressor__max_depth': [None, 10]
}

# 3-fold cross-validated search on the training split. scikit-learn maximises
# scores, so MSE is exposed negated ('neg_mean_squared_error').
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best cross-validation score (negative MSE): {best_score}")
# RMSE is in the same units as the target, which is easier to interpret than
# a negated squared error.
print(f"Best cross-validation RMSE: {np.sqrt(-best_score)}")

# Evaluate the refitted best pipeline on the held-out test split.
# mean_squared_error comes from sklearn.metrics (imported at the top of the
# notebook).
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)

print(f"Test Mean Squared Error: {test_mse}")
print(f"Test Root Mean Squared Error: {np.sqrt(test_mse)}")
