In [28]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

def preprocess_obesity_data(file_path):
    # 1. Load the data
    df = pd.read_csv(file_path)
    
    # 2. Check for missing values
    print("Missing values:\n", df.isnull().sum())
    
    # 3. Simple data validation
    # Remove any rows where height or weight are unreasonable
    df = df[(df['Height'] > 1.4) & (df['Height'] < 2.2) &
            (df['Weight'] > 40) & (df['Weight'] < 200)]
    
    # 4. Encode categorical variables using Label Encoder
    categorical_cols = ['Gender', 'family_history', 'FAVC', 'CAEC', 
                       'SMOKE', 'SCC', 'CALC', 'MTRANS']
    
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
    
    # 5. Scale numerical variables (now without Weight)
    numerical_cols = ['Age', 'Height', 'FCVC', 'NCP', 
                     'CH2O', 'FAF', 'TUE']
    
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    
    # 6. Split features and target, dropping both BMI and Weight
    X = df.drop(['BMI', 'Weight'], axis=1)
    y = df['BMI']
    
    # 7. Print basic statistics
    print("\nDataset shape:", df.shape)
    print("\nFeature names:", list(X.columns))
    
    return X, y

In [29]:
x,y =preprocess_obesity_data("ObesityPrediction.csv")

Missing values:
 Gender            0
Age               0
Height            0
Weight            0
family_history    0
FAVC              0
FCVC              0
NCP               0
CAEC              0
SMOKE             0
CH2O              0
SCC               0
FAF               0
TUE               0
CALC              0
MTRANS            0
BMI               0
dtype: int64

Dataset shape: (2105, 17)

Feature names: ['Gender', 'Age', 'Height', 'family_history', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS']


In [30]:
print(y)

0       24.386526
1       24.238227
2       23.765432
3       26.851852
4       28.342381
          ...    
2106    44.901475
2107    43.741923
2108    43.543817
2109    44.071535
2110    44.144338
Name: BMI, Length: 2105, dtype: float64


In [31]:
print(x)

      Gender       Age    Height  family_history  FAVC      FCVC       NCP  \
0          0 -0.524137 -0.882009               1     0 -0.793282  0.402928   
1          0 -0.524137 -1.956374               1     0  1.087496  0.402928   
2          1 -0.209058  1.051847               1     0 -0.793282  0.402928   
3          1  0.421101  1.051847               0     0  1.087496  0.402928   
4          1 -0.366598  0.836974               0     0 -0.793282 -2.168920   
...      ...       ...       ...             ...   ...       ...       ...   
2106       0 -0.527786  0.092762               1     1  1.087496  0.402928   
2107       0 -0.369285  0.499452               1     1  1.087496  0.402928   
2108       0 -0.284041  0.538365               1     1  1.087496  0.402928   
2109       0  0.005501  0.401319               1     1  1.087496  0.402928   
2110       0 -0.104340  0.394723               1     1  1.087496  0.402928   

      CAEC  SMOKE      CH2O  SCC       FAF       TUE  CALC  MTR

In [32]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

class BMIDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X.values)
        self.y = torch.FloatTensor(y.values)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class BMINet(nn.Module):
    def __init__(self, input_dim):
        super(BMINet, self).__init__()
        self.model = nn.Sequential(
            # First hidden layer
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.3),
            
            # Second hidden layer
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(0.2),
            
            # Third hidden layer
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.BatchNorm1d(16),
            nn.Dropout(0.1),
            
            # Output layer
            nn.Linear(16, 1)
        )
    
    def forward(self, x):
        return self.model(x)

def evaluate_predictions(y_true, y_pred, threshold=2.0):
    """
    Evaluate whether predictions are within threshold BMI points
    Returns binary array where True means prediction was within threshold
    """
    differences = np.abs(y_true - y_pred)
    return differences <= threshold

def train_and_evaluate_bmi_model(X, y, epochs=100, batch_size=32, threshold=2.0):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create datasets and dataloaders
    train_dataset = BMIDataset(X_train, y_train)
    test_dataset = BMIDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    # Initialize model and optimizer
    model = BMINet(X.shape[1])
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters())
    
    # Training loop
    best_val_loss = float('inf')
    patience = 10
    patience_counter = 0
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_X, batch_y in test_loader:
                outputs = model(batch_X)
                val_loss += criterion(outputs, batch_y.unsqueeze(1)).item()
        
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
            
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(test_loader):.4f}')
    
    # Final evaluation
    model.eval()
    with torch.no_grad():
        # Get all predictions
        train_predictions = []
        train_true = []
        for batch_X, batch_y in train_loader:
            outputs = model(batch_X)
            train_predictions.extend(outputs.numpy().flatten())
            train_true.extend(batch_y.numpy().flatten())
            
        test_predictions = []
        test_true = []
        for batch_X, batch_y in test_loader:
            outputs = model(batch_X)
            test_predictions.extend(outputs.numpy().flatten())
            test_true.extend(batch_y.numpy().flatten())
    
    # Convert to numpy arrays
    train_predictions = np.array(train_predictions)
    train_true = np.array(train_true)
    test_predictions = np.array(test_predictions)
    test_true = np.array(test_true)
    
    # Evaluate predictions within threshold
    train_within_threshold = evaluate_predictions(train_true, train_predictions, threshold)
    test_within_threshold = evaluate_predictions(test_true, test_predictions, threshold)
    
    # Generate classification reports
    print("\nTraining Set Performance:")
    print(f"Accuracy (predictions within {threshold} BMI points): {accuracy_score(train_within_threshold, np.ones_like(train_within_threshold)):.4f}")
    print("\nTest Set Performance:")
    print(f"Accuracy (predictions within {threshold} BMI points): {accuracy_score(test_within_threshold, np.ones_like(test_within_threshold)):.4f}")
    
    # Additional statistics
    print("\nDetailed Error Analysis (Test Set):")
    errors = np.abs(test_true - test_predictions)
    print(f"Mean Absolute Error: {np.mean(errors):.2f} BMI points")
    print(f"Median Absolute Error: {np.median(errors):.2f} BMI points")
    print(f"90th percentile of absolute error: {np.percentile(errors, 90):.2f} BMI points")
    print(f"Percentage of predictions within {threshold} BMI points: {(100 * np.mean(test_within_threshold)):.1f}%")
    
    # Create error distribution bins
    error_bins = pd.cut(errors, bins=[0, 1, 2, 3, 4, 5, float('inf')], 
                       labels=['0-1', '1-2', '2-3', '3-4', '4-5', '5+'])
    error_distribution = pd.value_counts(error_bins, normalize=True).sort_index()
    print("\nError Distribution:")
    for bin_name, percentage in error_distribution.items():
        print(f"Error {bin_name} BMI points: {percentage*100:.1f}%")
    
    return model, errors

# Usage example:
if __name__ == "__main__":
    # Assuming X and y are already preprocessed using your preprocess_obesity_data function
    X, y = preprocess_obesity_data("ObesityPrediction.csv")
    
    # Train and evaluate the model
    model, errors = train_and_evaluate_bmi_model(X, y, threshold=2.0)

Missing values:
 Gender            0
Age               0
Height            0
Weight            0
family_history    0
FAVC              0
FCVC              0
NCP               0
CAEC              0
SMOKE             0
CH2O              0
SCC               0
FAF               0
TUE               0
CALC              0
MTRANS            0
BMI               0
dtype: int64

Dataset shape: (2105, 17)

Feature names: ['Gender', 'Age', 'Height', 'family_history', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS']
Epoch [10/100], Train Loss: 464.7842, Val Loss: 368.9088
Epoch [20/100], Train Loss: 46.9658, Val Loss: 28.5821
Epoch [30/100], Train Loss: 23.3892, Val Loss: 11.4793
Epoch [40/100], Train Loss: 22.5239, Val Loss: 11.7526
Early stopping at epoch 42

Training Set Performance:
Accuracy (predictions within 2.0 BMI points): 0.5784

Test Set Performance:
Accuracy (predictions within 2.0 BMI points): 0.5321

Detailed Error Analysis (Test Set):
Mean Absolut

  error_distribution = pd.value_counts(error_bins, normalize=True).sort_index()
