In [11]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import mutual_info_classif, SelectKBest, RFE
from sklearn.linear_model import Lasso, LogisticRegression
from scipy.stats import zscore

# Load the dataset
data = pd.read_csv('Food_and_Nutrition__.csv')

# Step 1: Handling Missing Values
print("Missing Values Before Imputation:")
print(data.isnull().sum())

# Impute numerical features with the mean
num_imputer = SimpleImputer(strategy='mean')
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
data[numerical_cols] = num_imputer.fit_transform(data[numerical_cols])

# Impute categorical features with the mode
cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_cols = data.select_dtypes(include=['object']).columns
data[categorical_cols] = cat_imputer.fit_transform(data[categorical_cols])

print("\nMissing Values After Imputation:")
print(data.isnull().sum())

# Step 2: Scaling Data
scaler = StandardScaler()
normalizer = MinMaxScaler()

# Standardization
data_std = data.copy()
data_std[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Normalization
data_norm = data.copy()
data_norm[numerical_cols] = normalizer.fit_transform(data[numerical_cols])

# Step 3: Handling Noise
# Inject random noise into the "Calories" column
data_noisy = data.copy()
np.random.seed(42)
data_noisy['Calories'] += np.random.normal(0, 50, size=len(data))

# Smoothing using moving average
data_noisy['Calories_Smoothed'] = data_noisy['Calories'].rolling(window=5).mean()
data_noisy['Calories_Smoothed'].fillna(data_noisy['Calories'], inplace=True)

# Step 4: Handling Outliers
# Detect outliers using Z-scores
data['Z_Score'] = zscore(data['Calories'])
outliers = data[np.abs(data['Z_Score']) > 3]

print(f"\nNumber of Outliers Detected: {len(outliers)}")

# Remove outliers
data_cleaned = data[np.abs(data['Z_Score']) <= 3].drop(columns=['Z_Score'])

# Step 5: Feature Selection
# Filter Method: Correlation with target (e.g., Disease)
# Select only numerical features for correlation calculation
correlation = data_cleaned.select_dtypes(include=['number']).corr()  # Include only numerical features

print("\nCorrelation with Target Variable (Disease):")
# Check if 'Daily Calorie Target' is in the columns before accessing it
if 'Daily Calorie Target' in correlation.columns:
    print(correlation['Daily Calorie Target'].sort_values(ascending=False))
else:
    print("Target variable 'Daily Calorie Target' not found in numerical columns.")

# Wrapper Method: Recursive Feature Elimination (RFE)
X = data_cleaned[numerical_cols]
y = data_cleaned['Disease']
log_reg = LogisticRegression(max_iter=1000)
rfe = RFE(log_reg, n_features_to_select=5)
rfe.fit(X, y)

print("\nSelected Features by RFE:")
print(X.columns[rfe.support_])

# Embedded Method: Lasso Regression
# Convert target variable to numerical using Label Encoding
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder

le = LabelEncoder()  # Create a LabelEncoder object
y_encoded = le.fit_transform(y)  # Fit and transform the target variable

lasso = Lasso(alpha=0.1)
lasso.fit(X, y_encoded)  # Use the encoded target variable
lasso_coefficients = pd.Series(lasso.coef_, index=X.columns)

print("\nLasso Regression Feature Importance:")
print(lasso_coefficients[lasso_coefficients != 0].sort_values(ascending=False))


Missing Values Before Imputation:
Ages                    0
Gender                  0
Height                  0
Weight                  0
Activity Level          0
Dietary Preference      0
Daily Calorie Target    0
Protein                 0
Sugar                   0
Sodium                  0
Calories                0
Carbohydrates           0
Fiber                   0
Fat                     0
Breakfast Suggestion    0
Lunch Suggestion        0
Dinner Suggestion       0
Snack Suggestion        0
Disease                 0
dtype: int64

Missing Values After Imputation:
Ages                    0
Gender                  0
Height                  0
Weight                  0
Activity Level          0
Dietary Preference      0
Daily Calorie Target    0
Protein                 0
Sugar                   0
Sodium                  0
Calories                0
Carbohydrates           0
Fiber                   0
Fat                     0
Breakfast Suggestion    0
Lunch Suggestion        0
Dinner Su

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_noisy['Calories_Smoothed'].fillna(data_noisy['Calories'], inplace=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/module


Selected Features by RFE:
Index(['Ages', 'Height', 'Protein', 'Carbohydrates', 'Fat'], dtype='object')

Lasso Regression Feature Importance:
Carbohydrates           0.002190
Weight                  0.002106
Daily Calorie Target    0.001024
Calories                0.000721
Height                 -0.001626
Ages                   -0.001780
Protein                -0.017276
dtype: float64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
