In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

def preprocess_obesity_data(file_path, *, height_bounds=(1.4, 2.2),
                            weight_bounds=(40, 200)):
    """Load the obesity CSV and return model-ready features and the BMI target.

    Steps, in order:
      1. Read the CSV at ``file_path`` and print the per-column null counts.
      2. Keep only rows whose ``Height`` and ``Weight`` lie strictly inside
         ``height_bounds`` / ``weight_bounds``.
      3. Replace each categorical column with integer codes from a fresh
         ``LabelEncoder``.
      4. Standardize the numerical columns with ``StandardScaler``
         (``Weight`` is deliberately not scaled because it is dropped).
      5. Split into features ``X`` (everything except ``BMI`` and ``Weight``)
         and target ``y`` (``BMI``), and print the frame shape and the
         feature names.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    height_bounds : tuple of (low, high), keyword-only
        Exclusive lower/upper limits for ``Height``. Defaults to (1.4, 2.2).
    weight_bounds : tuple of (low, high), keyword-only
        Exclusive lower/upper limits for ``Weight``. Defaults to (40, 200).

    Returns
    -------
    X : pandas.DataFrame
        Encoded and scaled feature columns. The index is not reset, so the
        row labels of the surviving rows are those of the loaded file.
    y : pandas.Series
        The ``BMI`` column for the same rows.

    Raises
    ------
    KeyError
        If any column the pipeline needs is absent from the file.
    ValueError
        If no rows survive the height/weight filter.

    Notes
    -----
    The encoders and the scaler are fitted on every row that survives the
    filter and are then discarded. If the result is split into train/test
    sets afterwards, the scaling statistics were computed from both parts.
    """
    categorical_cols = ['Gender', 'family_history', 'FAVC', 'CAEC',
                        'SMOKE', 'SCC', 'CALC', 'MTRANS']
    numerical_cols = ['Age', 'Height', 'FCVC', 'NCP',
                      'CH2O', 'FAF', 'TUE']
    target_col = 'BMI'
    excluded_cols = [target_col, 'Weight']

    # 1. Load the data
    df = pd.read_csv(file_path)

    # 2. Check for missing values
    print("Missing values:\n", df.isnull().sum())

    # Report every absent column at once instead of failing on the first one
    # halfway through the pipeline.
    needed = categorical_cols + numerical_cols + excluded_cols
    absent = [col for col in needed if col not in df.columns]
    if absent:
        raise KeyError(f"Input file is missing required columns: {absent}")

    # 3. Simple data validation: drop rows with implausible height or weight.
    #    Comparisons are strict, and NaN fails every comparison, so rows with
    #    a missing Height or Weight are dropped here as well.
    low_height, high_height = height_bounds
    low_weight, high_weight = weight_bounds
    plausible = (
        (df['Height'] > low_height) & (df['Height'] < high_height) &
        (df['Weight'] > low_weight) & (df['Weight'] < high_weight)
    )
    # Explicit copy: the column assignments below must act on an independent
    # frame, never on a slice of the frame that was loaded.
    df = df.loc[plausible].copy()

    if df.empty:
        raise ValueError("No rows remain after height/weight validation")

    # 4. Encode categorical variables using Label Encoder (one per column,
    #    so each column gets its own 0..k-1 code range).
    for col in categorical_cols:
        df[col] = LabelEncoder().fit_transform(df[col])

    # 5. Scale numerical variables (Weight is excluded; it is dropped below)
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # 6. Split features and target, dropping both BMI and Weight
    X = df.drop(columns=excluded_cols)
    y = df[target_col]

    # 7. Print basic statistics (shape still counts BMI and Weight)
    print("\nDataset shape:", df.shape)
    print("\nFeature names:", list(X.columns))

    return X, y

In [13]:
# Run the preprocessing pipeline on the obesity CSV:
# x receives the feature frame, y the BMI target.
x, y = preprocess_obesity_data("ObesityPrediction.csv")

Missing values:
 Gender            0
Age               0
Height            0
Weight            0
family_history    0
FAVC              0
FCVC              0
NCP               0
CAEC              0
SMOKE             0
CH2O              0
SCC               0
FAF               0
TUE               0
CALC              0
MTRANS            0
BMI               0
dtype: int64

Dataset shape: (2105, 17)

Feature names: ['Gender', 'Age', 'Height', 'family_history', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS']


In [16]:
# Inspect the BMI target Series returned by preprocess_obesity_data.
print(y)

0       24.386526
1       24.238227
2       23.765432
3       26.851852
4       28.342381
          ...    
2106    44.901475
2107    43.741923
2108    43.543817
2109    44.071535
2110    44.144338
Name: BMI, Length: 2105, dtype: float64


In [15]:
# Inspect the encoded and scaled feature frame returned by preprocess_obesity_data.
print(x)

      Gender       Age    Height  family_history  FAVC      FCVC       NCP  \
0          0 -0.524137 -0.882009               1     0 -0.793282  0.402928   
1          0 -0.524137 -1.956374               1     0  1.087496  0.402928   
2          1 -0.209058  1.051847               1     0 -0.793282  0.402928   
3          1  0.421101  1.051847               0     0  1.087496  0.402928   
4          1 -0.366598  0.836974               0     0 -0.793282 -2.168920   
...      ...       ...       ...             ...   ...       ...       ...   
2106       0 -0.527786  0.092762               1     1  1.087496  0.402928   
2107       0 -0.369285  0.499452               1     1  1.087496  0.402928   
2108       0 -0.284041  0.538365               1     1  1.087496  0.402928   
2109       0  0.005501  0.401319               1     1  1.087496  0.402928   
2110       0 -0.104340  0.394723               1     1  1.087496  0.402928   

      CAEC  SMOKE      CH2O  SCC       FAF       TUE  CALC  MTR