In [3]:
import pandas as pd

food_df = pd.read_csv('../data/food_dataset_new.csv')
food_df.head()

Unnamed: 0,food,Caloric Value,Fat,Saturated Fats,Monounsaturated Fats,Polyunsaturated Fats,Carbohydrates,Sugars,Protein,Dietary Fiber,...,Phosphorus,Potassium,Selenium,Zinc,Nutrition Density,Carbohydrate_to_Protein_Ratio,Carbohydrate_to_Fat_Ratio,Fiber_to_Carbohydrate_Ratio,Sodium_to_Potassium_Ratio,Caloric_Density
0,cream cheese,51.0,5.0,2.9,1.3,0.2,0.8,0.5,0.9,0.0,...,0.091,15.5,19.1,0.039,7.07,0.888889,0.16,0.0,0.001032,3.566434
1,neufchatel cheese,215.0,19.4,10.9,4.9,0.8,3.1,2.7,7.8,0.0,...,117.3,129.2,0.054,0.7,130.1,0.397436,0.159794,0.0,0.002322,2.562574
2,requeijao cremoso light catupiry,49.0,3.6,2.3,0.9,0.0,0.9,3.4,0.8,0.1,...,0.0,0.0,0.0,0.0,5.4,1.125,0.25,0.111111,0.0,9.245283
3,ricotta cheese,30.0,2.0,1.3,0.5,0.002,1.5,0.091,1.5,0.0,...,0.024,30.8,43.8,0.035,5.196,1.0,0.75,0.0,0.000552,1.522843
4,cream cheese low fat,30.0,2.3,1.4,0.6,0.042,1.2,0.9,1.2,0.0,...,22.8,37.1,0.034,0.053,27.007,1.0,0.521739,0.0,0.00124,2.040816


In [4]:
from sklearn.model_selection import train_test_split

# Define the features (X) and target (y)
# Assuming 'Caloric Value' is the target variable for this example
X = food_df.drop(['Caloric Value', 'food'], axis=1)
y = food_df['Caloric Value']

# Split the data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

Training samples: 1916, Test samples: 479


In [5]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both train and test sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Initialize and train the model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Calculate evaluation metrics
print(mean_absolute_error(y_test, y_pred))
print(np.sqrt(mean_squared_error(y_test, y_pred)))

14.61822493970825
81.34022194245797


In [15]:
# Display coefficients of each feature
feature_importance = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': model.coef_})
feature_importance = feature_importance.sort_values(by='Coefficient', ascending=False)
print(feature_importance)

                          Feature  Coefficient
23                        Calcium   332.545678
19                      Vitamin C   259.353803
0                             Fat   258.224708
6                         Protein   205.721093
4                   Carbohydrates   202.108827
11                      Vitamin A    46.433448
2            Monounsaturated Fats    31.872706
1                  Saturated Fats    20.059391
25                           Iron    15.026634
7                   Dietary Fiber    14.784530
28                     Phosphorus    12.870404
16                     Vitamin B3     9.045248
10                          Water     5.003308
17                     Vitamin B5     4.465932
33  Carbohydrate_to_Protein_Ratio     4.144453
3            Polyunsaturated Fats     4.133666
14                    Vitamin B12     2.352683
29                      Potassium     2.043167
37                Caloric_Density     1.858853
30                       Selenium     1.539798
18           

In [17]:
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation
cv_mae = -cross_val_score(model, X_train_scaled, y_train, scoring='neg_mean_absolute_error', cv=5)
cv_rmse = np.sqrt(-cross_val_score(model, X_train_scaled, y_train, scoring='neg_mean_squared_error', cv=5))

print(f"Cross-Validated MAE: {cv_mae.mean():.4f}")
print(f"Cross-Validated RMSE: {cv_rmse.mean():.4f}")

Cross-Validated MAE: 122535007.2427
Cross-Validated RMSE: 2398057095.2943


In [18]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Define the Ridge model
ridge_model = Ridge()

# Set up a parameter grid for alpha (regularization strength)
param_grid = {'alpha': [0.1, 1, 10, 100, 1000]}

# Use GridSearchCV to find the best regularization parameter
grid_search = GridSearchCV(ridge_model, param_grid, scoring='neg_mean_absolute_error', cv=5)
grid_search.fit(X_train_scaled, y_train)

# Display best parameters and best score
print(f"Best Alpha for Ridge Regression: {grid_search.best_params_['alpha']}")
print(f"Best Cross-Validated MAE: {-grid_search.best_score_:.4f}")

# Get the best estimator from grid search
best_ridge_model = grid_search.best_estimator_

# Evaluate the model using cross-validated RMSE
cv_rmse = np.sqrt(-cross_val_score(best_ridge_model, X_train_scaled, y_train, scoring='neg_mean_squared_error', cv=5))
print(f"Cross-Validated RMSE for Ridge: {cv_rmse.mean():.4f}")

Best Alpha for Ridge Regression: 10
Best Cross-Validated MAE: 13.4488
Cross-Validated RMSE for Ridge: 35.3007
