In [9]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Load data
df = pd.read_excel('../../data_set/hd_sessions/cleaned_session_data.xlsx')

# Create target and features
df['Next Dry weight (kg)'] = df.groupby('Subject_ID')['Dry weight (kg)'].shift(-1)
df['Dry weight adjustment (kg)'] = df['Next Dry weight (kg)'] - df['Dry weight (kg)']
df = df.dropna(subset=['Dry weight adjustment (kg)']).reset_index(drop=True)
# Ensure the dry weight adjustment is between -5 and 5 kg
df = df[(df['Dry weight adjustment (kg)'] >= -5) & (df['Dry weight adjustment (kg)'] <= 5)]

# Rule-based features
df['High_SBP'] = df['SYS (mmHg)'] > 140
df['UFR'] = df['PUF (ml)'] / (df['HD duration (h)'] * df['Pre HD weight (kg)'])
df['UFR_below_15'] = df['UFR'] < 15

df['High_SBP'] = df['High_SBP'].astype(int)
df['UFR_below_15'] = df['UFR_below_15'].astype(int)

# Binary adjustment class
def get_adjustment_class(row, threshold=0.1):
    diff = row['Next Dry weight (kg)'] - row['Dry weight (kg)']
    if abs(diff) >= threshold:
        return 1
    else:
        return 0

df['Adjustment_Class'] = df.apply(get_adjustment_class, axis=1)
df['Adjustment_Direction'] = df['Dry weight adjustment (kg)'].apply(
    lambda x: 1 if x >= 0.5 else (2 if x <= -0.5 else 0)
)

# Additional features
df['Weight_gain_pct'] = df['Weight gain (kg)'] / df['Dry weight (kg)'] * 100

df['Weight_gain_avg_3'] = df.groupby('Subject_ID')['Weight gain (kg)'].transform(
    lambda x: x.rolling(3, min_periods=1).mean()
)
df['SYS_avg_3'] = df.groupby('Subject_ID')['SYS (mmHg)'].transform(
    lambda x: x.rolling(3, min_periods=1).mean()
)


# Outlier handling
df = df[df['UFR'].between(0, 20)]
df = df[df['Weight gain (kg)'] >= 0]
df = df[df['SYS (mmHg)'].between(50, 250)].reset_index(drop=True)


# Selected features 
features = [
    'AP (mmHg)', 'AUF (ml)', 'High_SBP', 'BFR (ml/min)', 'SYS_avg_3', 'HD duration (h)', 'UFR',
    'PUF (ml)', 'TMP (mmHg)', 'VP (mmHg)', 'Weight gain (kg)', 'SYS (mmHg)', 'DIA (mmHg)',
    'Pre HD weight (kg)', 'Post HD weight (kg)', 'Dry weight (kg)', 'Weight_gain_avg_3', 'Weight_gain_pct'
]

# Prepare data
X = df[features].fillna(df[features].median())
y = df['Adjustment_Class']
original_indices = df.index

# Print class distribution
print("Class Distribution:\n", df['Adjustment_Class'].value_counts())


# Train a Random Forest model to get feature importance
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# Create a DataFrame of feature importance
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Print feature importance
print("Feature Importance:\n", feature_importance)

# Select top features (e.g., top 15 or based on a threshold)
top_features = feature_importance['Feature'].head(15).tolist()  # Adjust number as needed
print("Selected Top Features:\n", top_features)

# Update X to include only top features
X_selected = X[top_features]

Class Distribution:
 Adjustment_Class
0    3321
1     287
Name: count, dtype: int64
Feature Importance:
                 Feature  Importance
4             SYS_avg_3    0.078858
9             VP (mmHg)    0.077965
16    Weight_gain_avg_3    0.070133
14  Post HD weight (kg)    0.069458
13   Pre HD weight (kg)    0.067400
0             AP (mmHg)    0.066959
17      Weight_gain_pct    0.066491
11           SYS (mmHg)    0.066480
6                   UFR    0.065522
8            TMP (mmHg)    0.065054
12           DIA (mmHg)    0.062294
15      Dry weight (kg)    0.058190
1              AUF (ml)    0.051648
10     Weight gain (kg)    0.048304
7              PUF (ml)    0.044974
3          BFR (ml/min)    0.029558
2              High_SBP    0.005952
5       HD duration (h)    0.004760
Selected Top Features:
 ['SYS_avg_3', 'VP (mmHg)', 'Weight_gain_avg_3', 'Post HD weight (kg)', 'Pre HD weight (kg)', 'AP (mmHg)', 'Weight_gain_pct', 'SYS (mmHg)', 'UFR', 'TMP (mmHg)', 'DIA (mmHg)', 'Dry weight (

In [18]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.combine import SMOTEENN
import warnings
warnings.filterwarnings('ignore')

# Assuming X, y, and top_15_features are defined from your code


# Initialize SMOTEENN
smoteenn = SMOTEENN(random_state=42, sampling_strategy=1.0)  # Equalize class 0 and class 1

# Initialize LightGBM
base_model = LGBMClassifier(
    random_state=42,
    objective='binary',
    metric='binary_logloss'
)

# Define hyperparameter grid
param_grid = {
    'num_leaves': [15, 31],  # Smaller range for speed
    'max_depth': [5, 7],     # Limits overfitting
    'learning_rate': [0.05, 0.1],  # Controls step size
    'n_estimators': [100, 200]     # Number of trees
}

# Cross-validation with standard KFold (no groupingrivastava
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(kf.split(X_selected, y)):
    X_train, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Apply SMOTEENN to training data
    X_train_resampled, y_train_resampled = smoteenn.fit_resample(X_train, y_train)
    
    # Perform grid search on resampled data
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        scoring='precision_macro',  # Optimize for precision
        cv=3,  # Inner CV for grid search
        n_jobs=-1
    )
    grid_search.fit(X_train_resampled, y_train_resampled)
    
    # Best model for this fold
    model = grid_search.best_estimator_
    print(f"\nFold {fold + 1} Best Parameters: {grid_search.best_params_}")
    
    # Predict and evaluate on validation data
    y_pred = model.predict(X_val)
    y_scores = model.predict_proba(X_val)[:, 1]
    
    print(f"\nFold {fold + 1} Results:")
    print(classification_report(y_val, y_pred))
    print(f"ROC-AUC: {roc_auc_score(y_val, y_scores):.3f}")

# Train final model with SMOTEENN and best parameters from last fold
X_resampled, y_resampled = smoteenn.fit_resample(X_selected, y)
final_model = grid_search.best_estimator_
final_model.fit(X_resampled, y_resampled)

# Save feature importance
feature_importance = pd.DataFrame({
    'Feature': top_15_features,
    'Importance': final_model.feature_importances_ / final_model.feature_importances_.sum()
}).sort_values(by='Importance', ascending=False)
print("\nFinal Model Feature Importance:\n", feature_importance)

[LightGBM] [Info] Number of positive: 2378, number of negative: 1662
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3670
[LightGBM] [Info] Number of data points in the train set: 4040, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.588614 -> initscore=0.358238
[LightGBM] [Info] Start training from score 0.358238

Fold 1 Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'num_leaves': 31}

Fold 1 Results:
              precision    recall  f1-score   support

           0       0.93      0.89      0.91       662
           1       0.17      0.25      0.20        60

    accuracy                           0.83       722
   macro avg       0.55      0.57      0.55       722
weighted avg       0.87      0.83      0.85       722

ROC-AUC: 0.657
[LightGBM] [Info] Number of positive: 2386, numbe