In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
import os

# Create models dir if needed
os.makedirs('models', exist_ok=True)

def train_and_save(disease, X, y_col=None, feature_cols=None, y=None, save_path='models'):
    """Flexible: Handles CSV path (str) or pre-loaded data (array/df)."""
    if isinstance(X, str):  # CSV path: Load and extract X/y
        df = pd.read_csv(X)
        if feature_cols is None:
            feature_cols = [col for col in df.columns if col != y_col]
        X_df = df[feature_cols]
        y = LabelEncoder().fit_transform(df[y_col])
    else:  # Pre-loaded: X is array/df, y is passed
        if isinstance(X, pd.DataFrame):
            X_df = X
        else:
            # Assume X is numpy array; need feature_cols
            if feature_cols is None:
                raise ValueError(f"feature_cols required when X is array for {disease}")
            X_df = pd.DataFrame(X, columns=feature_cols)
        if y is None:
            raise ValueError(f"y required when X is pre-loaded for {disease}")
        y = LabelEncoder().fit_transform(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)
    
    # Full pipeline: Impute → Scale → Classify
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),  # Handles zeros/missing
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    
    pipeline.fit(X_train, y_train)
    
    # Evaluate
    y_pred = pipeline.predict(X_test)
    print(f"\n🩺 {disease} Classification Report:\n", classification_report(y_test, y_pred, target_names=['No Disease', 'Has Disease']))
    
    # Save
    joblib.dump(pipeline, f"{save_path}/{disease}_pipeline.joblib")
    print(f"✅ Saved {disease}_pipeline.joblib (Test accuracy: {pipeline.score(X_test, y_test):.3f})")

# =====================
# 1. Diabetes: Load RAW from UCI URL (no scaled CSV needed)
# =====================
url_diabetes = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
col_names_diabetes = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
raw_diabetes = pd.read_csv(url_diabetes, header=None, names=col_names_diabetes)
X_diabetes = raw_diabetes.drop('Outcome', axis=1)
y_diabetes = raw_diabetes['Outcome'].values
train_and_save("Diabetes", X_diabetes, y_col='dummy', feature_cols=col_names_diabetes[:-1], y=y_diabetes)  # y_col dummy since pre-loaded

# =====================
# 2. Heart: Load from your local RAW CSV (fixed path)
# =====================
heart_path = r"C:\Users\anush\Downloads\Multi-Disease-Prediction-latest-akila\Multi-Disease-Prediction\notebooks\heart_dataset.csv"
train_and_save("Heart", heart_path, "HeartDisease")

# =====================
# 3. Hypertension: Generate RAW synthetic (matches frontend; no scaled CSV needed)
# =====================
np.random.seed(42)  # Reproducible
n_samples = 1000
raw_hypertension = pd.DataFrame({
    'Age': np.random.randint(18, 80, n_samples),
    'Salt_Intake': np.random.uniform(1, 15, n_samples),  # g/day
    'Stress_Score': np.random.uniform(0, 100, n_samples),  # 0-100
    'BP_History': np.random.randint(0, 3, n_samples),  # 0=none, 1=mild, 2=severe
    'Sleep_Duration': np.random.uniform(4, 10, n_samples),  # hours
    'BMI': np.random.uniform(15, 40, n_samples),
    'Medication': np.random.randint(0, 4, n_samples),  # 0=none, 1-3 types
    'Family_History': np.random.randint(0, 2, n_samples),  # 0=no, 1=yes
    'Exercise_Level': np.random.randint(0, 5, n_samples),  # 0=sedentary, 4=high
    'Smoking_Status': np.random.randint(0, 2, n_samples),  # 0=no, 1=yes
    'Has_Hypertension': np.random.choice([0, 1], n_samples, p=[0.5, 0.5])  # Balanced start
})
# Realism: High BMI + family history → more likely positive
mask_positive = (raw_hypertension['BMI'] > 25) & (raw_hypertension['Family_History'] == 1)
raw_hypertension.loc[mask_positive, 'Has_Hypertension'] = 1
X_hypertension = raw_hypertension.drop('Has_Hypertension', axis=1)
y_hypertension = raw_hypertension['Has_Hypertension'].values
feature_cols_hypertension = ['Age', 'Salt_Intake', 'Stress_Score', 'BP_History', 'Sleep_Duration', 'BMI', 'Medication', 'Family_History', 'Exercise_Level', 'Smoking_Status']
train_and_save("Hypertension", X_hypertension, y_col='dummy', feature_cols=feature_cols_hypertension, y=y_hypertension)

# =====================
# 4. Kidney: Load from your local RAW CSV (fixed path)
# =====================
kidney_path = r"C:\Users\anush\Downloads\Multi-Disease-Prediction-latest-akila\Multi-Disease-Prediction\notebooks\kidney_dataset.csv"
train_and_save("Kidney", kidney_path, "CKD")

# =====================
# 5. Liver: Generate RAW synthetic (matches frontend; ~70% positive)
# =====================
np.random.seed(42)
n_samples_liver = 600
raw_liver = pd.DataFrame({
    'Age': np.random.randint(4, 70, n_samples_liver),
    'Gender': np.random.choice([0, 1], n_samples_liver, p=[0.5, 0.5]),  # 0=F, 1=M
    'BMI': np.random.uniform(15, 35, n_samples_liver),
    'AlcoholConsumption': np.random.uniform(0, 50, n_samples_liver),  # units/week
    'Smoking': np.random.choice([0, 1], n_samples_liver, p=[0.6, 0.4]),
    'GeneticRisk': np.random.choice([0, 1], n_samples_liver, p=[0.7, 0.3]),
    'PhysicalActivity': np.random.uniform(0, 10, n_samples_liver),  # hours/week
    'Diabetes': np.random.choice([0, 1], n_samples_liver, p=[0.8, 0.2]),
    'Hypertension': np.random.choice([0, 1], n_samples_liver, p=[0.75, 0.25]),
    'LiverFunctionTest': np.random.uniform(0, 100, n_samples_liver),  # e.g., ALT levels
    'Diagnosis': np.random.choice([0, 1], n_samples_liver, p=[0.3, 0.7])  # ILPD-like ratio
})
# Realism: High alcohol + low activity → higher disease chance
mask_disease = (raw_liver['AlcoholConsumption'] > 20) | (raw_liver['PhysicalActivity'] < 3)
raw_liver.loc[mask_disease, 'Diagnosis'] = 1
X_liver = raw_liver.drop('Diagnosis', axis=1)
y_liver = raw_liver['Diagnosis'].values
feature_cols_liver = ['Age', 'Gender', 'BMI', 'AlcoholConsumption', 'Smoking', 'GeneticRisk', 'PhysicalActivity', 'Diabetes', 'Hypertension', 'LiverFunctionTest']
train_and_save("Liver", X_liver, y_col='dummy', feature_cols=feature_cols_liver, y=y_liver)

print("\n🎉 All models retrained and saved! Restart Flask and test.")


🩺 Diabetes Classification Report:
               precision    recall  f1-score   support

  No Disease       0.79      0.78      0.78        99
 Has Disease       0.61      0.62      0.61        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154

✅ Saved Diabetes_pipeline.joblib (Test accuracy: 0.721)

🩺 Heart Classification Report:
               precision    recall  f1-score   support

  No Disease       1.00      0.99      0.99       166
 Has Disease       1.00      1.00      1.00       434

    accuracy                           1.00       600
   macro avg       1.00      0.99      1.00       600
weighted avg       1.00      1.00      1.00       600

✅ Saved Heart_pipeline.joblib (Test accuracy: 0.997)

🩺 Hypertension Classification Report:
               precision    recall  f1-score   support

  No Disease       0.59      0.53      0.56        77
 Has Disease       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
