In [42]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import OneClassSVM
import joblib

In [None]:
def load_static_data(filepath):
    """
    Load the LMCH dataset from a CSV file.
    Assumes the file contains a 'patient_id' column and a target column 'risk'.
    """
    data = pd.read_csv(filepath)
    return data


In [None]:

def preprocess_static_data(df, target_col):
    """
    Preprocess the LMCH dataset.
    
    - Drops the identifier column.
    - Fills missing values for numeric columns using the median.
    - For non-numeric columns (excluding the target), fills missing values using the mode.
    - Separates features from the target and applies one-hot encoding for any categorical variables.
    - Scales the features using StandardScaler.
    
    Returns the scaled features, target, fitted scaler, and list of feature column names.
    """

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    for col in num_cols:
        df[col].fillna(df[col].median(), inplace=True)
    
    non_num_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
    for col in non_num_cols:
        if col != target_col:
            df[col].fillna(df[col].mode()[0], inplace=True)
    
    target_col = 'Class'
    X = df.drop(columns=[target_col])
    y = df[target_col]

    print("Original label distribution:")
    print(Counter(y))

    # Apply SMOTE to balance the imbalanced count of labels
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Display resampled label distribution
    print("\nResampled label distribution:")
    print(Counter(y_resampled))
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_resampled)
    
    # Return feature names for later use in prediction alignment
    return X_scaled, y_resampled, scaler, X.columns


In [45]:

# -----------------------------------------------------------
# 2. Model Training Functions
# -----------------------------------------------------------
def train_risk_model(X_train, y_train):
    """
    Train a RandomForest classifier for multi-class diabetes risk prediction.
    """
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    return clf

def train_anomaly_detector(X_train):
    """
    Train a One-Class SVM for anomaly detection on the static features.
    The detector is designed to flag unusual patient records.
    """
    anomaly_model = OneClassSVM(nu=0.01, kernel='rbf', gamma=0.1)
    anomaly_model.fit(X_train)
    return anomaly_model


In [46]:

# -----------------------------------------------------------
# 3. Integrated Prediction Function
# -----------------------------------------------------------
def predict_patient_risk(patient_features, risk_model, scaler, anomaly_detector, feature_columns):
    """
    Given patient features as a dictionary (or numpy array in the same column order),
    this function returns:
      - The risk category prediction from the risk_model.
      - An anomaly flag indicating if the record is unusual.
    
    It aligns input features to the trained model's feature columns, scales the input,
    and finally produces an integrated prediction.
    """
    # Convert to DataFrame for consistency if input is a dictionary
    if isinstance(patient_features, dict):
        patient_df = pd.DataFrame([patient_features])
    elif isinstance(patient_features, np.ndarray):
        patient_df = pd.DataFrame([patient_features], columns=feature_columns)
    else:
        raise ValueError("patient_features should be of type dict or numpy array")
        
    # Ensure all required feature columns are present; add missing ones with zero values
    for col in feature_columns:
        if col not in patient_df.columns:
            patient_df[col] = 0
    patient_df = patient_df[feature_columns]
    
    # Scale the features
    X_input = scaler.transform(patient_df)
    
    # Risk prediction and anomaly detection
    risk_prediction = risk_model.predict(X_input)[0]
    anomaly = anomaly_detector.predict(X_input)[0]  # 1 indicates inlier, -1 indicates anomaly
    anomaly_flag = (anomaly == -1)
    
    return {
        "risk_prediction": risk_prediction,
        "anomaly_detected": anomaly_flag
    }


In [48]:

# -----------------------------------------------------------
# 4. Main Pipeline Execution
# -----------------------------------------------------------
if __name__ == "__main__":
    # Load the LMCH dataset (ensure 'lmch_static.csv' is in your working directory)
    df = load_static_data('lmch_static.csv')
    
    # Preprocess the static data and get scaled features along with the target
    X_scaled, y, scaler, feature_cols = preprocess_static_data(df, target_col='Class')
    
    # Split data into training and testing sets (using stratification to preserve class distribution)
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Train the risk classifier and the anomaly detection model
    risk_model = train_risk_model(X_train, y_train)
    anomaly_detector = train_anomaly_detector(X_train)
    
    # Evaluate the risk classifier on the test set
    y_pred = risk_model.predict(X_test)
    print("Risk Classifier Evaluation:")
    print(classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    
    anomaly_predictions = anomaly_detector.predict(X_test)

    # Create a boolean mask for anomalies
    anomaly_flags = (anomaly_predictions == -1)

    # Calculate the number of anomalies and total samples
    num_anomalies = np.sum(anomaly_flags)
    total_samples = X_test.shape[0]

    print("Anomaly Detection Test Results:")
    print(f"Total test samples: {total_samples}")
    print(f"Number of detected anomalies: {num_anomalies}")
    print("Indices of anomalous samples:", np.where(anomaly_flags)[0])
   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

Original label distribution:
Counter({2: 128, 0: 96, 1: 40})

Resampled label distribution:
Counter({0: 128, 1: 128, 2: 128})
Risk Classifier Evaluation:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        25
           1       1.00      0.96      0.98        26
           2       0.96      0.92      0.94        26

    accuracy                           0.96        77
   macro avg       0.96      0.96      0.96        77
weighted avg       0.96      0.96      0.96        77

Accuracy: 0.961038961038961
Anomaly Detection Test Results:
Total test samples: 77
Number of detected anomalies: 14
Indices of anomalous samples: [ 2  7 10 14 18 22 28 29 33 38 55 64 73 74]


In [None]:
 
# -----------------------------------------------------------
# 5. Integrated Prediction Example
# -----------------------------------------------------------
# For demonstration, select a sample patient from the test set
sample_features = pd.DataFrame(X_test, columns=feature_cols).iloc[0].to_dict()

result = predict_patient_risk(
    sample_features, risk_model, scaler, anomaly_detector, feature_cols
)

print("Integrated Prediction:")
print(result)

# -----------------------------------------------------------
# 6. Save the Models and Scaler for Deployment
# -----------------------------------------------------------
joblib.dump(risk_model, 'lmch_risk_model.pkl')
joblib.dump(anomaly_detector, 'lmch_anomaly_detector.pkl')
print("Models and scaler saved successfully.")


Integrated Prediction:
{'risk_prediction': 0, 'anomaly_detected': True}
Models and scaler saved successfully.
