In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

df = pd.read_csv("FOOD-DATA-GROUP1.csv")

features = df.drop(columns=['Nutrition Density', 'Unnamed: 0.1', 'Unnamed: 0'])
target = df['Nutrition Density']

numerical_features = features.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_features = features.select_dtypes(include=['object']).columns.tolist()

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print(X_train_pca.shape)
print(X_test_pca.shape)


(440, 10)
(111, 10)


## Principal Component Analysis (PCA)

In [15]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

## t-Distributed Stochastic Neighbor Embedding (t-SNE)

In [18]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)
X_train_tsne = tsne.fit_transform(X_train)
X_test_tsne = tsne.fit_transform(X_test)

## Independent Component Analysis (ICA)

In [19]:
from sklearn.decomposition import FastICA

ica = FastICA(n_components=10, random_state=42)
X_train_ica = ica.fit_transform(X_train)
X_test_ica = ica.transform(X_test)

## Feature Selection with SelectKBest

In [20]:
from sklearn.feature_selection import SelectKBest, f_regression

select_kbest = SelectKBest(f_regression, k=10)
X_train_kbest = select_kbest.fit_transform(X_train, y_train)
X_test_kbest = select_kbest.transform(X_test)

## Recursive Feature Elimination (RFE)

In [22]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression


rfe = RFE(estimator=LinearRegression(), n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)


## Evaluate Machine Learning Models and Training

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def train_and_evaluate(X_train, X_test, y_train, y_test):
  model = RandomForestRegressor(random_state=42)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  return mse

mse_pca = train_and_evaluate(X_train_pca, X_test_pca, y_train, y_test)
mse_tsne = train_and_evaluate(X_train_tsne, X_test_tsne, y_train, y_test)
mse_ica = train_and_evaluate(X_train_ica, X_test_ica, y_train, y_test)
mse_kbest = train_and_evaluate(X_train_kbest, X_test_kbest, y_train, y_test)
mse_rfe = train_and_evaluate(X_train_rfe, X_test_rfe, y_train, y_test)


print(f"MSE with PCA: {mse_pca}")
print(f"MSE with t-SNE: {mse_tsne}")
print(f"MSE with ICA: {mse_ica}")
print(f"MSE with SelectKBest: {mse_kbest}")
print(f"MSE with RFE: {mse_rfe}")


MSE with PCA: 6050.840597977944
MSE with t-SNE: 32298.664481709016
MSE with ICA: 5033.541170585949
MSE with SelectKBest: 962.7522695686941
MSE with RFE: 871.3214234252631
