In [22]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

def preprocess_obesity_data(file_path):
    # 1. Load the data
    df = pd.read_csv(file_path)
    
    # 2. Check for missing values
    print("Missing values:\n", df.isnull().sum())
    
    # 3. Simple data validation
    # Remove any rows where height or weight are unreasonable
    df = df[(df['Height'] > 1.4) & (df['Height'] < 2.2) &
            (df['Weight'] > 40) & (df['Weight'] < 200)]
    
    # 4. Encode categorical variables using Label Encoder
    categorical_cols = ['Gender', 'family_history', 'FAVC', 'CAEC', 
                       'SMOKE', 'SCC', 'CALC', 'MTRANS']
    
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
    
    # 5. Scale numerical variables (now without Weight)
    numerical_cols = ['Age', 'Height', 'FCVC', 'NCP', 
                     'CH2O', 'FAF', 'TUE']
    
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    
    # 6. Split features and target, dropping both BMI and Weight
    X = df.drop(['BMI', 'Weight'], axis=1)
    y = df['BMI']
    
    # 7. Print basic statistics
    print("\nDataset shape:", df.shape)
    print("\nFeature names:", list(X.columns))
    
    return X, y

In [23]:
x,y =preprocess_obesity_data("ObesityPrediction.csv")

Missing values:
 Gender            0
Age               0
Height            0
Weight            0
family_history    0
FAVC              0
FCVC              0
NCP               0
CAEC              0
SMOKE             0
CH2O              0
SCC               0
FAF               0
TUE               0
CALC              0
MTRANS            0
BMI               0
dtype: int64

Dataset shape: (2105, 17)

Feature names: ['Gender', 'Age', 'Height', 'family_history', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS']


In [24]:
print(y)

0       24.386526
1       24.238227
2       23.765432
3       26.851852
4       28.342381
          ...    
2106    44.901475
2107    43.741923
2108    43.543817
2109    44.071535
2110    44.144338
Name: BMI, Length: 2105, dtype: float64


In [25]:
print(x)

      Gender       Age    Height  family_history  FAVC      FCVC       NCP  \
0          0 -0.524137 -0.882009               1     0 -0.793282  0.402928   
1          0 -0.524137 -1.956374               1     0  1.087496  0.402928   
2          1 -0.209058  1.051847               1     0 -0.793282  0.402928   
3          1  0.421101  1.051847               0     0  1.087496  0.402928   
4          1 -0.366598  0.836974               0     0 -0.793282 -2.168920   
...      ...       ...       ...             ...   ...       ...       ...   
2106       0 -0.527786  0.092762               1     1  1.087496  0.402928   
2107       0 -0.369285  0.499452               1     1  1.087496  0.402928   
2108       0 -0.284041  0.538365               1     1  1.087496  0.402928   
2109       0  0.005501  0.401319               1     1  1.087496  0.402928   
2110       0 -0.104340  0.394723               1     1  1.087496  0.402928   

      CAEC  SMOKE      CH2O  SCC       FAF       TUE  CALC  MTR

In [33]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import KFold
from sklearn.preprocessing import QuantileTransformer
import numpy as np
import pandas as pd

class AdvancedBMINet(nn.Module):
    def __init__(self, input_dim):
        super(AdvancedBMINet, self).__init__()
        
        # Initial feature transformation
        self.input_transform = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.GELU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3)
        )
        
        # Deep feature extraction pathway
        self.deep_pathway = nn.Sequential(
            nn.Linear(512, 256),
            nn.GELU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.2),
            
            nn.Linear(256, 128),
            nn.GELU(),
            nn.BatchNorm1d(128)
        )
        
        # Wide pathway for direct feature processing
        self.wide_pathway = nn.Sequential(
            nn.Linear(512, 128),
            nn.GELU(),
            nn.BatchNorm1d(128)
        )
        
        # Expert subsystems
        self.expert1 = nn.Sequential(
            nn.Linear(256, 64),
            nn.GELU(),
            nn.BatchNorm1d(64)
        )
        
        self.expert2 = nn.Sequential(
            nn.Linear(256, 64),
            nn.GELU(),
            nn.BatchNorm1d(64)
        )
        
        self.expert3 = nn.Sequential(
            nn.Linear(256, 64),
            nn.GELU(),
            nn.BatchNorm1d(64)
        )
        
        # Gating network for expert weighting
        self.gate = nn.Sequential(
            nn.Linear(256, 3),
            nn.Softmax(dim=1)
        )
        
        # Final prediction layers
        self.final = nn.Sequential(
            nn.Linear(64, 32),
            nn.GELU(),
            nn.BatchNorm1d(32),
            nn.Dropout(0.1),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        # Initial transformation
        x = self.input_transform(x)
        
        # Process through pathways
        deep_features = self.deep_pathway(x)
        wide_features = self.wide_pathway(x)
        
        # Combine pathways
        combined = torch.cat([deep_features, wide_features], dim=1)
        
        # Expert processing
        expert1_out = self.expert1(combined)
        expert2_out = self.expert2(combined)
        expert3_out = self.expert3(combined)
        
        # Gate the experts
        gates = self.gate(combined)
        
        # Combine expert outputs
        expert_out = (gates[:, 0].unsqueeze(1) * expert1_out +
                     gates[:, 1].unsqueeze(1) * expert2_out +
                     gates[:, 2].unsqueeze(1) * expert3_out)
        
        # Final prediction
        return self.final(expert_out)

class EnsemblePredictor:
    def __init__(self, input_dim, n_models=5):
        self.models = [AdvancedBMINet(input_dim) for _ in range(n_models)]
        self.n_models = n_models
        self.quantile_transformer = QuantileTransformer(output_distribution='normal')
        
    def preprocess_data(self, X, y=None, train=False):
        if train:
            X_processed = self.quantile_transformer.fit_transform(X)
        else:
            X_processed = self.quantile_transformer.transform(X)
        return torch.FloatTensor(X_processed), None if y is None else torch.FloatTensor(y)
    
    def train(self, X, y, epochs=200, batch_size=128):
        X_processed, y_processed = self.preprocess_data(X.values, y.values, train=True)
        
        # K-fold training for ensemble
        kf = KFold(n_splits=self.n_models, shuffle=True, random_state=42)
        
        for model_idx, (train_idx, val_idx) in enumerate(kf.split(X_processed)):
            print(f"\nTraining Model {model_idx + 1}/{self.n_models}")
            
            # Prepare data
            X_train, X_val = X_processed[train_idx], X_processed[val_idx]
            y_train, y_val = y_processed[train_idx], y_processed[val_idx]
            
            train_dataset = TensorDataset(X_train, y_train.unsqueeze(1))
            val_dataset = TensorDataset(X_val, y_val.unsqueeze(1))
            
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=batch_size)
            
            # Initialize training components
            model = self.models[model_idx]
            criterion = nn.HuberLoss(delta=1.0)
            optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', 
                                                                factor=0.5, patience=5, verbose=True)
            
            best_val_loss = float('inf')
            patience = 20
            patience_counter = 0
            
            # Training loop
            for epoch in range(epochs):
                model.train()
                train_loss = 0
                for batch_X, batch_y in train_loader:
                    optimizer.zero_grad()
                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    optimizer.step()
                    # Optimizer step only (scheduler.step() moved to after validation)
                    train_loss += loss.item()
                
                # Validation
                model.eval()
                val_loss = 0
                with torch.no_grad():
                    for batch_X, batch_y in val_loader:
                        outputs = model(batch_X)
                        val_loss += criterion(outputs, batch_y).item()
                
                if (epoch + 1) % 20 == 0:
                    print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss/len(train_loader):.4f}, '
                          f'Val Loss: {val_loss/len(val_loader):.4f}')
                
                # Early stopping
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    patience_counter = 0
                else:
                    patience_counter += 1
                    
                # Update learning rate scheduler
                scheduler.step(val_loss)
                
                if patience_counter >= patience:
                    print(f"Early stopping at epoch {epoch}")
                    break
    
    def predict(self, X):
        X_processed, _ = self.preprocess_data(X.values)
        
        predictions = []
        for model in self.models:
            model.eval()
            with torch.no_grad():
                pred = model(X_processed)
                predictions.append(pred.numpy())
        
        # Ensemble prediction (weighted median)
        ensemble_pred = np.median(predictions, axis=0)
        return ensemble_pred.flatten()

def evaluate_ensemble_model(X, y, threshold=2.0):
    # Split the data
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create and train ensemble
    ensemble = EnsemblePredictor(input_dim=X.shape[1])
    ensemble.train(X_train, y_train)
    
    # Make predictions
    train_predictions = ensemble.predict(X_train)
    test_predictions = ensemble.predict(X_test)
    
    # Evaluate predictions
    train_errors = np.abs(y_train - train_predictions)
    test_errors = np.abs(y_test - test_predictions)
    
    train_within_threshold = train_errors <= threshold
    test_within_threshold = test_errors <= threshold
    
    # Print results
    print("\nTraining Set Performance:")
    print(f"Accuracy (predictions within {threshold} BMI points): {np.mean(train_within_threshold):.4f}")
    print(f"Mean Absolute Error: {np.mean(train_errors):.2f}")
    
    print("\nTest Set Performance:")
    print(f"Accuracy (predictions within {threshold} BMI points): {np.mean(test_within_threshold):.4f}")
    print(f"Mean Absolute Error: {np.mean(test_errors):.2f}")
    
    # Detailed error analysis
    print("\nDetailed Error Analysis (Test Set):")
    print(f"Median Absolute Error: {np.median(test_errors):.2f}")
    print(f"90th percentile of absolute error: {np.percentile(test_errors, 90):.2f}")
    
    error_bins = pd.cut(test_errors, bins=[0, 1, 2, 3, 4, 5, float('inf')],
                       labels=['0-1', '1-2', '2-3', '3-4', '4-5', '5+'])
    error_distribution = pd.value_counts(error_bins, normalize=True).sort_index()
    
    print("\nError Distribution:")
    for bin_name, percentage in error_distribution.items():
        print(f"Error {bin_name} BMI points: {percentage*100:.1f}%")
    
    return ensemble, test_errors

# Usage example:
if __name__ == "__main__":
   
    
    # Train and evaluate the ensemble model
    ensemble_model, errors = evaluate_ensemble_model(x, y, threshold=2.0)


Training Model 1/5




Epoch [20/200], Train Loss: 26.3244, Val Loss: 26.5694
Epoch [40/200], Train Loss: 19.2669, Val Loss: 19.2073
Epoch [60/200], Train Loss: 8.9139, Val Loss: 8.6395
Epoch [80/200], Train Loss: 2.4676, Val Loss: 1.8835
Epoch [100/200], Train Loss: 1.9709, Val Loss: 1.7956
Epoch [120/200], Train Loss: 1.8658, Val Loss: 1.6052
Epoch [140/200], Train Loss: 1.7139, Val Loss: 1.4959
Early stopping at epoch 155

Training Model 2/5
Epoch [20/200], Train Loss: 26.5179, Val Loss: 26.0841
Epoch [40/200], Train Loss: 19.5062, Val Loss: 18.7551
Epoch [60/200], Train Loss: 8.8219, Val Loss: 8.5327
Epoch [80/200], Train Loss: 2.4452, Val Loss: 2.0320
Epoch [100/200], Train Loss: 1.9348, Val Loss: 1.6039
Epoch [120/200], Train Loss: 1.6685, Val Loss: 1.5545
Epoch [140/200], Train Loss: 1.5275, Val Loss: 1.4455
Epoch [160/200], Train Loss: 1.5604, Val Loss: 1.4421
Early stopping at epoch 167

Training Model 3/5
Epoch [20/200], Train Loss: 26.4208, Val Loss: 26.4255
Epoch [40/200], Train Loss: 19.3265, Va

  error_distribution = pd.value_counts(error_bins, normalize=True).sort_index()
