In [None]:
# Appendicitis dataset pipeline: cleaning, EDA, split-before-impute, modelling, feature importance
# Paste into a Jupyter cell and run. Edit `DATA_PATH` if needed.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.utils import resample

# --------------------------
# 0. Configuration
# --------------------------
DATA_PATH = "C:/Users/HP/Desktop/PYTHON PROJECT FROM DATALABS/appendicitis_data.csv"   # ← change to your csv path if different
SAVE_DIR = "C:/Users/HP/Desktop/PYTHON PROJECT FROM DATALABS"                          # ← change to a local folder if needed
N_FEATURES_SELECT = 20                          # number of top features to keep per target
RANDOM_STATE = 42

# --------------------------
# 1. Load dataset
# --------------------------
# Read from the configured DATA_PATH (it used to be ignored in favour of a
# hard-coded file name). If that path does not exist, fall back to a file with
# the same name in the current working directory, so the cell still runs when
# the CSV simply sits next to the notebook.
data_path = Path(DATA_PATH)
if not data_path.is_file():
    data_path = Path(data_path.name)
print("Reading dataset from:", data_path)
df = pd.read_csv(data_path)
print("Loaded dataset shape:", df.shape)
print("Columns (first 40):", list(df.columns[:40]))

# --------------------------
# 2. Quick cleaning steps
# --------------------------
# 2.1 Column names: trim whitespace, lower-case, spaces -> underscores
df.columns = [name.strip().lower().replace(' ', '_') for name in df.columns]

# 2.2 Columns without a single observed value are dropped
all_missing = df.isna().all()
empty_cols = df.columns[all_missing].tolist()
if len(empty_cols) > 0:
    print("Dropping empty columns:", empty_cols)
    df = df.drop(columns=empty_cols)

# 2.3 Exact duplicate rows: keep the first occurrence, renumber the index
is_duplicate = df.duplicated()
dup_count = is_duplicate.sum()
print(f"Duplicate rows: {dup_count}")
if dup_count > 0:
    df = df[~is_duplicate].reset_index(drop=True)

# --------------------------
# 3. Identify targets (heuristic)
# --------------------------
# The script tries to find typical column names; change if your target columns differ.
# `candidates` is ordered by priority. Selection happens in two passes so that
# a column named exactly like a candidate (e.g. 'diagnosis') always wins over a
# column that merely contains it (e.g. 'diagnosis_presumptive'); otherwise the
# real outcome column could be left among the features.
candidates = ['diagnosis','diagnose','appendicitis','management','severity','complicated','pas','score','paediatric']
targets = []

# Pass 1: exact column-name matches, in candidate priority order
for t in candidates:
    if t in df.columns and t not in targets:
        targets.append(t)

# Pass 2: substring matches, only used to fill the remaining slots
for t in candidates:
    for c in df.columns:
        if t in c and c not in targets:
            targets.append(c)

# ensure we have 3 targets (fallback to last 3 columns)
if len(targets) < 3:
    targets = list(df.columns[-3:])   # fallback
targets = targets[:3]
print("Selected target columns:", targets)

# --------------------------
# 4. Basic datatype fixes
# --------------------------
# Convert object columns that are numeric-like into numeric.
# The numeric fraction is measured over ALL observed (non-missing) values of the
# column, using the same parser that performs the conversion. Measuring it only
# over values that happen to contain digits would coerce a mostly-text column
# with a handful of numbers and wipe out its text.
for c in df.select_dtypes(include=['object']).columns:
    observed = df[c].notna()
    if not observed.any():
        continue
    stripped = df[c].astype(str).str.replace('[^0-9.+-]', '', regex=True)
    parsed = pd.to_numeric(stripped, errors='coerce')   # '' and malformed text -> NaN
    frac_numeric = parsed[observed].notna().mean()
    if frac_numeric > 0.6:
        df[c] = parsed

# Replace negative numeric values with NaN for later imputation (common error)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
negative_counts = (df[numeric_cols] < 0).sum()
negative_summary = {c: int(n) for c, n in negative_counts.items() if n > 0}
if negative_summary:
    print("Negative values found (set to NaN):", negative_summary)
    for c in negative_summary:
        # mask() upcasts integer columns to float itself instead of relying on
        # an implicit dtype change during .loc assignment.
        df[c] = df[c].mask(df[c] < 0)

# Remove control characters in object fields.
# astype(str) turns NaN into the literal text 'nan', so remember which cells
# were missing and restore them; blank strings are treated as missing as well.
for c in df.select_dtypes(include=['object']).columns:
    was_missing = df[c].isna()
    text = df[c].astype(str).str.replace(r'[\x00-\x1f]', '', regex=True).str.strip()
    df[c] = text.mask(was_missing | (text == ''))

# --------------------------
# 5. Split into train/test BEFORE imputation
# --------------------------
features = [c for c in df.columns if c not in targets]
X = df[features]
y_df = df[targets]

# Try to find a good stratify column (one of the targets with at least 2 examples per class).
# Stratification uses a stringified copy of the labels: missing labels become
# their own 'nan' stratum, so a NaN in the target cannot make train_test_split fail.
stratify_col = None
stratify_target_name = None
for t in targets:
    strata = y_df[t].astype(str)
    vc = strata.value_counts()
    if len(vc) > 1 and vc.min() >= 2:
        stratify_col = strata
        stratify_target_name = t
        break

if stratify_col is not None:
    print(f"Stratified split on '{stratify_target_name}'")
    X_train, X_test, y_train_df, y_test_df = train_test_split(X, y_df, test_size=0.2, random_state=RANDOM_STATE, stratify=stratify_col)
else:
    print("Random split (no suitable stratify column).")
    X_train, X_test, y_train_df, y_test_df = train_test_split(X, y_df, test_size=0.2, random_state=RANDOM_STATE)

print("Train/test shapes:", X_train.shape, X_test.shape)

# --------------------------
# 6. Impute missing values (fit imputers on TRAIN only)
# --------------------------
numeric_feats = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_feats = [c for c in X_train.columns if c not in numeric_feats]

# A feature with no observed value in the training split cannot be imputed:
# SimpleImputer discards such columns by default, which would make the
# `columns=` lists below disagree with the transformed array. Drop them up front.
unobserved = [c for c in X_train.columns if not X_train[c].notna().any()]
if unobserved:
    print("Dropping features with no observed training values:", unobserved)
    numeric_feats = [c for c in numeric_feats if c not in unobserved]
    cat_feats = [c for c in cat_feats if c not in unobserved]

num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

if numeric_feats:
    X_train_num = pd.DataFrame(num_imputer.fit_transform(X_train[numeric_feats]), columns=numeric_feats, index=X_train.index)
    X_test_num = pd.DataFrame(num_imputer.transform(X_test[numeric_feats]), columns=numeric_feats, index=X_test.index)
else:
    # Nothing numeric to impute; keep empty frames so the concat below still works.
    X_train_num = pd.DataFrame(index=X_train.index)
    X_test_num = pd.DataFrame(index=X_test.index)

# Reduce cardinality on categorical cols (keep top 10 categories in train; others -> '__other__')
X_train_cat = pd.DataFrame(index=X_train.index)
X_test_cat = pd.DataFrame(index=X_test.index)
if cat_feats:
    X_train_cat_full = X_train[cat_feats].astype(object).copy()
    X_test_cat_full = X_test[cat_feats].astype(object).copy()
    for c in cat_feats:
        top = X_train_cat_full[c].value_counts().nlargest(10).index.tolist()
        for frame in (X_train_cat_full, X_test_cat_full):
            # Leave missing cells missing so the imputer below fills them;
            # only rare, observed categories are folded into '__other__'.
            keep = frame[c].isin(top) | frame[c].isna()
            frame[c] = frame[c].where(keep, other='__other__')
    X_train_cat = pd.DataFrame(cat_imputer.fit_transform(X_train_cat_full), columns=cat_feats, index=X_train.index)
    X_test_cat = pd.DataFrame(cat_imputer.transform(X_test_cat_full), columns=cat_feats, index=X_test.index)

# One-hot encode categorical features (fit on train)
if not X_train_cat.empty:
    # scikit-learn changed the OneHotEncoder parameter name from 'sparse' to 'sparse_output' in newer versions.
    # Try the newer parameter first and fall back to the older name for compatibility.
    try:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    X_train_cat_ohe = pd.DataFrame(ohe.fit_transform(X_train_cat), columns=ohe.get_feature_names_out(cat_feats), index=X_train.index)
    X_test_cat_ohe = pd.DataFrame(ohe.transform(X_test_cat), columns=ohe.get_feature_names_out(cat_feats), index=X_test.index)
else:
    X_train_cat_ohe = pd.DataFrame(index=X_train.index)
    X_test_cat_ohe = pd.DataFrame(index=X_test.index)

# Reconstruct final X_train and X_test (imputed + encoded)
X_train_clean = pd.concat([X_train_num, X_train_cat_ohe], axis=1)
X_test_clean  = pd.concat([X_test_num, X_test_cat_ohe], axis=1)

print("After imputation/encoding - shapes:", X_train_clean.shape, X_test_clean.shape)

# --------------------------
# 7. Scale: fit scaler on X_train only, apply to both sets
# --------------------------
# The scaler learns its statistics from the training frame only; the same
# fitted transform is then applied to train and test, keeping labels and index.
scaler = StandardScaler()
scaler.fit(X_train_clean)

train_scaled_values = scaler.transform(X_train_clean)
test_scaled_values = scaler.transform(X_test_clean)

X_train_scaled = pd.DataFrame(
    train_scaled_values,
    columns=X_train_clean.columns,
    index=X_train_clean.index,
)
X_test_scaled = pd.DataFrame(
    test_scaled_values,
    columns=X_test_clean.columns,
    index=X_test_clean.index,
)

# --------------------------
# 8. Model training loop (per target)
#     - map common textual labels into binary where logical
#     - drop rows whose label is missing (a missing label is not a class)
#     - select top K features via mutual_info_classif (seeded)
#     - upsample minority in train
#     - train LogisticRegression, RandomForest, KNN, GradientBoost
#     - evaluate on test (independent)
# --------------------------
from sklearn.feature_selection import SelectKBest, mutual_info_classif

models = {
    'logistic': LogisticRegression(max_iter=1000),
    'random_forest': RandomForestClassifier(n_estimators=150, random_state=RANDOM_STATE),
    'knn': KNeighborsClassifier(n_neighbors=5),
    'gradient_boost': GradientBoostingClassifier(n_estimators=150, random_state=RANDOM_STATE)
}

results = {}  # target name -> selected features, per-model metrics, train/test shapes


def _normalise_labels(labels):
    """Return (lower-cased, stripped string labels, boolean mask of missing labels)."""
    text = labels.astype(str).str.lower().str.strip()
    # NaN stringifies to 'nan'; it and blank strings mean "no label", not a class.
    return text, labels.isna() | text.isin(['', 'nan'])


def _is_positive_diagnosis(label):
    """True unless the label reads as a negative diagnosis."""
    # NOTE: 'no' is a plain substring test, kept from the original heuristic.
    # '0.0' covers numeric zero labels that were stored as floats.
    return not ('no' in label or label in ['0', '0.0', 'none', 'nan', 'na', 'n'])


def _is_surgical(label):
    """True when the management label mentions a surgical procedure."""
    return 'surg' in label or 'oper' in label or 'lapar' in label or 'appendectomy' in label


def _is_complicated(label):
    """True for complicated/perforated/gangrenous labels, False for negated forms."""
    # 'complic' is a substring of 'uncomplicated'; without this guard every
    # severity label maps to 1 and the target is skipped as single-class.
    negated = ('uncomplic', 'noncomplic', 'non-complic', 'non complic', 'not complic')
    if any(token in label for token in negated):
        return False
    return 'complic' in label or 'perfor' in label or 'gangre' in label


def _mi_scores(X_mat, y_vec):
    """mutual_info_classif with a fixed seed so feature selection is reproducible."""
    return mutual_info_classif(X_mat, y_vec, random_state=RANDOM_STATE)


for target in targets:
    print("\n" + "="*80)
    print("Target:", target)
    # Normalize strings and remember which rows have no label at all
    y_train_raw, train_missing = _normalise_labels(y_train_df[target])
    y_test_raw, test_missing = _normalise_labels(y_test_df[target])

    # Pick the binary rule that matches the target's name, if any
    if 'append' in target or 'diagn' in target:
        is_positive = _is_positive_diagnosis
    elif 'manage' in target:
        is_positive = _is_surgical
    elif 'sever' in target or 'complic' in target:
        is_positive = _is_complicated
    else:
        is_positive = None

    if is_positive is not None:
        y_train = y_train_raw.map(is_positive).astype(int)
        y_test = y_test_raw.map(is_positive).astype(int)
    else:
        # fallback label-encoding of the labels observed in train;
        # test labels never seen in train become -1 and are dropped below
        uniques = y_train_raw[~train_missing].unique()
        mapping = {v: i for i, v in enumerate(uniques)}
        y_train = y_train_raw.map(mapping).fillna(-1).astype(int)
        y_test = y_test_raw.map(mapping).fillna(-1).astype(int)

    # Rows without a label are marked -1 so they are excluded from both
    # training and evaluation instead of being counted as class 0.
    y_train = y_train.mask(train_missing, -1)
    y_test = y_test.mask(test_missing, -1)

    # keep rows where a label mapping exists
    valid_train_idx = y_train[y_train != -1].index
    valid_test_idx = y_test[y_test != -1].index

    X_tr = X_train_scaled.loc[valid_train_idx]
    y_tr = y_train.loc[valid_train_idx]
    X_te = X_test_scaled.loc[valid_test_idx]
    y_te = y_test.loc[valid_test_idx]

    if X_tr.shape[0] == 0 or X_te.shape[0] == 0:
        print("No valid train/test rows for this target after mapping — skipping.")
        continue

    # require at least 2 classes to proceed
    if len(y_tr.unique()) < 2:
        print("Not enough classes in train after mapping — skipping.")
        continue

    # Feature selection: mutual_info_classif. Choose top K features
    k = min(N_FEATURES_SELECT, X_tr.shape[1])
    try:
        selector = SelectKBest(_mi_scores, k=k)
        selector.fit(X_tr.fillna(0), y_tr)
        selected_cols = list(X_tr.columns[selector.get_support()])
        print(f"Selected top {len(selected_cols)} features for {target}.")
    except Exception as e:
        print("Feature selection failed, using all features. Error:", e)
        selected_cols = X_tr.columns.tolist()

    X_tr_sel = X_tr[selected_cols]
    X_te_sel = X_te[selected_cols]

    # Upsample training set to balance classes (test set is left untouched).
    # The label column gets a name that cannot collide with a feature column.
    label_col = '__target__'
    df_tr = pd.concat([X_tr_sel, y_tr.rename(label_col)], axis=1)
    max_n = df_tr[label_col].value_counts().max()
    parts = []
    for cl, grp in df_tr.groupby(label_col):
        if len(grp) < max_n:
            parts.append(resample(grp, replace=True, n_samples=max_n, random_state=RANDOM_STATE))
        else:
            parts.append(grp)
    df_bal = pd.concat(parts)
    X_tr_bal = df_bal.drop(columns=[label_col])
    y_tr_bal = df_bal[label_col]

    # Train models and evaluate
    target_results = {}
    for name, model in models.items():
        try:
            model.fit(X_tr_bal, y_tr_bal)
            preds = model.predict(X_te_sel)

            # ROC AUC only for a truly binary problem: probability column 1 is
            # the positive class only when the model itself has two classes.
            prob = None
            roc = None
            binary_problem = len(np.unique(y_te)) == 2 and len(getattr(model, "classes_", [])) == 2
            if hasattr(model, "predict_proba") and binary_problem:
                try:
                    prob = model.predict_proba(X_te_sel)[:, 1]
                    roc = roc_auc_score(y_te, prob)
                except Exception as e:
                    print(f"ROC AUC unavailable for {name} on {target}: {e}")
                    prob = None
                    roc = None

            acc = accuracy_score(y_te, preds)
            cls_report = classification_report(y_te, preds, zero_division=0)
            cm = confusion_matrix(y_te, preds)

            # Permutation importance (fast - few repeats)
            try:
                pi = permutation_importance(model, X_te_sel, y_te, n_repeats=3, random_state=RANDOM_STATE, n_jobs=1)
                imp_df = pd.DataFrame({'feature': X_te_sel.columns, 'importance_mean': pi.importances_mean}).sort_values('importance_mean', ascending=False)
            except Exception as e:
                # Best effort: report zeros rather than losing the model's other metrics.
                print(f"Permutation importance failed for {name} on {target}: {e}")
                imp_df = pd.DataFrame({'feature': X_te_sel.columns, 'importance_mean': 0})

            target_results[name] = {
                'accuracy': acc,
                'roc_auc': roc,
                'classification_report': cls_report,
                'confusion_matrix': cm.tolist(),
                'permutation_importance': imp_df.head(15)
            }

            print(f"[{target}] Model {name}: acc={acc:.3f}, roc={roc}")
        except Exception as e:
            print(f"Model {name} failed for target {target}: {e}")

    results[target] = {
        'selected_features': selected_cols,
        'models': target_results,
        'train_shape': X_tr_sel.shape,
        'test_shape': X_te_sel.shape
    }

# --------------------------
# 9. Save cleaned dataset and results
# --------------------------
# Everything is written under SAVE_DIR, which is created if it does not exist.
out_dir = Path(SAVE_DIR)
out_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(out_dir / "appendicitis_cleaned.csv", index=False)
pd.to_pickle(results, out_dir / "modeling_results.pkl")

# Also save a CSV summary of model metrics: one row per (target, model) pair
rows = [
    {
        "target": t,
        "model": mname,
        "accuracy": mobj.get('accuracy'),
        "roc_auc": mobj.get('roc_auc'),
    }
    for t, info in results.items()
    for mname, mobj in info['models'].items()
]
summary_df = pd.DataFrame(rows)
summary_df.to_csv(out_dir / "model_summary.csv", index=False)

print("\nSaved: appendicitis_cleaned.csv, modeling_results.pkl, model_summary.csv in", SAVE_DIR)

# --------------------------
# 10. Quick descriptive analysis & plots (examples)
# --------------------------
# Summary statistics, one row per column (first 20 columns shown)
summary_table = df.describe(include='all').T
display(summary_table.head(20))

# Correlation heat-map over the numeric columns (needs at least two of them)
numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()
if len(numeric_features) >= 2:
    corr = df[numeric_features].corr()
    plt.figure(figsize=(8,6))
    plt.title("Correlation matrix (numeric features)")
    plt.imshow(corr, aspect='auto')
    plt.colorbar()
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.index)), corr.index)
    plt.tight_layout()
    plt.show()

# One histogram per numeric column, limited to the first four
for c in numeric_features[:4]:
    plt.figure(figsize=(5,3))
    plt.title(f"Histogram: {c}")
    plt.hist(df[c].dropna(), bins=30)
    plt.xlabel(c)
    plt.ylabel("count")
    plt.tight_layout()
    plt.show()

# Per-model accuracy and top permutation-importance features for the first
# target only (drop the [:1] slice to print every target)
for t, info in list(results.items())[:1]:
    print("\nTarget:", t)
    for m, mv in info['models'].items():
        print(" Model:", m, " Accuracy:", mv['accuracy'])
        print(" Top features by permutation importance:")
        display(mv['permutation_importance'].head(10))

# --------------------------
# End of notebook
# --------------------------


Loaded dataset shape: (782, 69)
Columns (first 40): ['Age', 'BMI', 'Sex', 'Height', 'Weight', 'Length_of_Stay', 'Management', 'Severity', 'Diagnosis_Presumptive', 'Diagnosis', 'Alvarado_Score', 'Paedriatic_Appendicitis_Score', 'Appendix_on_US', 'Appendix_Diameter', 'Migratory_Pain', 'Lower_Right_Abd_Pain', 'Contralateral_Rebound_Tenderness', 'Coughing_Pain', 'Nausea', 'Loss_of_Appetite', 'Body_Temperature', 'WBC_Count', 'Neutrophil_Percentage', 'Segmented_Neutrophils', 'Neutrophilia', 'RBC_Count', 'Hemoglobin', 'RDW', 'Thrombocyte_Count', 'Ketones_in_Urine', 'RBC_in_Urine', 'WBC_in_Urine', 'CRP', 'Dysuria', 'Stool', 'Peritonitis', 'Psoas_Sign', 'Ipsilateral_Rebound_Tenderness', 'US_Performed', 'US_Number']
Dropping empty columns: ['unnamed:_58', 'unnamed:_59', 'unnamed:_60', 'unnamed:_61', 'unnamed:_62', 'unnamed:_63', 'unnamed:_64', 'unnamed:_65', 'unnamed:_66', 'unnamed:_67', 'unnamed:_68']
Duplicate rows: 0
Selected target columns: ['management', 'severity', 'diagnosis_presumptive']

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'