In [None]:
# If you're in Colab and want SMOTE; otherwise you can skip this cell.
!pip -q install imbalanced-learn

In [None]:
!pip install tabpfn

In [None]:
!pip install shap

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px

import plotly.io as pio
pio.renderers.default = "iframe"

from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

import pickle

In [None]:
# Read the preprocessed spreadsheet into a DataFrame (hard-coded Colab-style Drive path).
df = pd.read_excel('/content/drive/MyDrive/Laminitis_27_Oct/preprocessed.xlsx')

# Count missing values per *numeric* column and report only the columns that have any.
nan_counts = df.select_dtypes(include='number').isna().sum()
for col, num_missing in nan_counts[nan_counts > 0].items():
    print(f"{col}: {num_missing} NaN values")

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd

# Target vector: the 'Class' column.
y = df['Class']

# Raw feature frame: every column except the target.
x = df.drop(columns=['Class'])

# Model-ready features: each column is coerced to numeric; values that cannot be
# parsed become NaN and are then replaced by 0.
# NOTE(review): any text-valued column would collapse to all zeros here - confirm
# that the spreadsheet is already fully numeric.
X = x.apply(lambda column: pd.to_numeric(column, errors='coerce')).fillna(0)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Absolute pairwise correlations between the feature columns of X.
correlation_matrix = X.corr().abs()

# Draw the annotated heatmap on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(16, 14))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    linewidths=0.8,
    cmap='coolwarm',
    annot_kws={"size": 10},
    ax=ax,
)
ax.set_title('Correlation Matrix', fontsize=18)

# Vertical x tick labels, horizontal y tick labels.
plt.setp(ax.get_xticklabels(), rotation=90)
plt.setp(ax.get_yticklabels(), rotation=0)

# Save before showing so the written file contains the rendered figure.
fig.savefig("Correlation matrix.pdf", format='pdf', dpi=500)
plt.show()

In [None]:
# Keep only the strict upper triangle (diagonal excluded) so every feature pair
# appears exactly once; all other cells become NaN.
upper_mask = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
upper = correlation_matrix.where(upper_mask)

# Collect (column, row, value) triples whose absolute correlation exceeds 0.7.
# NaN cells never satisfy the comparison, so masked entries are skipped.
high_corr = []
for column in upper.columns:
    for idx, corr in upper[column].items():
        if corr > 0.7:
            high_corr.append((column, idx, corr))

pd.DataFrame(high_corr, columns=["Feature_1", "Feature_2", "Correlation"])


In [None]:
threshold = 0.7

# A column is dropped when its upper-triangle entries (its correlations with the
# columns listed before it) contain at least one value above the threshold.
to_drop = []
for column in upper.columns:
    if (upper[column] > threshold).any():
        to_drop.append(column)

# errors='ignore' keeps this cell re-runnable after X has already been reduced.
X_uncorr = X.drop(columns=to_drop, errors='ignore')
X = X_uncorr

print("Dropped:", to_drop)
print("Remaining features:", X_uncorr.shape[1])

In [None]:
# Display the names of the features kept after the correlation filter.
X_uncorr.columns

In [None]:
# Hold out 20% of the rows. The split is seeded and stratified on y so that it is
# reproducible across kernel restarts and matches the split performed later in
# the Classification section.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Oversample with SMOTE on the training portion only; the test portion is left as-is.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Display the held-out feature rows produced by the split above.
X_test

In [None]:
# Display the held-out labels produced by the split above.
y_test

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score, matthews_corrcoef
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline 1: L2-regularised logistic regression on standardized features,
# with class weights balanced.
logreg = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(
        penalty='l2',
        solver='lbfgs',
        max_iter=1000,
        class_weight='balanced',
        random_state=42,
    )),
])

# Baseline 2: shallow random forest (depth capped at 5), class weights balanced.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=5,
    min_samples_split=5,
    class_weight='balanced',
    random_state=42,
)

# Name -> estimator registry for the two baselines.
models = dict(LogisticRegression=logreg, RandomForest=rf)

In [None]:
# from sklearn.model_selection import permutation_test_score
# from sklearn.model_selection import StratifiedKFold
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# score, perm_scores, pvalue = permutation_test_score(
#     rf, x, y, cv=cv, n_permutations=100, scoring="accuracy", n_jobs=-1
# )
# print(f"Permutation Accuracy: {score:.3f}, p-value: {pvalue:.5f}")

In [None]:
# Fit a seeded forest on the model-ready feature matrix X (numeric, NaN-free and
# de-correlated). The raw frame `x` is not used because it has not been through
# the numeric coercion and correlation filter that every other model relies on.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# Importances for all features, largest first; kept in full for later inspection.
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

# Plot only the ten largest so that the figure matches its title.
top_importances = importances.head(10)
top_importances.plot(kind='bar', figsize=(8, 4))
plt.title("Top 10 Feature Importances (Random Forest)")
plt.tight_layout()
plt.show()

## Classification

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import cycle
from tabpfn import TabPFNClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, ConfusionMatrixDisplay
)

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier

# Shared seed, passed as random_state to the split, the CV splitter and the models below.
RNG = 42

def is_binary(y):
    """Return True when `y` holds exactly two distinct values, else False."""
    distinct_labels = np.unique(y)
    return distinct_labels.size == 2

def metric_avg(y_true):
    """Pick the `average=` mode for precision/recall/F1.

    Returns 'binary' for a two-class target and 'macro' otherwise, so that in
    the multi-class case every class carries equal weight.
    """
    if is_binary(y_true):
        return 'binary'
    return 'macro'

def safe_proba(estimator, X):
    """Return per-class scores for `X`, preferring real probabilities.

    Resolution order:
      1. `estimator.predict_proba(X)` when the estimator exposes it.
      2. Otherwise `estimator.decision_function(X)`, squashed into [0, 1]:
         1-D scores go through the logistic sigmoid and are returned as two
         columns `[1 - p, p]`; 2-D scores go through a row-wise softmax.
         These are pseudo-probabilities, not calibrated ones.

    Raises:
        ValueError: if the estimator has neither method.
    """
    if hasattr(estimator, "predict_proba"):
        return estimator.predict_proba(X)
    if not hasattr(estimator, "decision_function"):
        raise ValueError("Estimator has neither predict_proba nor decision_function.")

    scores = estimator.decision_function(X)

    if scores.ndim != 1:
        # Multiclass: subtract the row maximum before exponentiating so the
        # softmax does not overflow.
        shifted = scores - scores.max(axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / exps.sum(axis=1, keepdims=True)

    # Binary: a single score per row, mapped through the sigmoid.
    from scipy.special import expit
    positive = expit(scores)
    return np.column_stack([1 - positive, positive])

In [None]:
# Seeded 80/20 hold-out split that preserves the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.20,
    random_state=RNG,
)

# Oversample with SMOTE on the training portion only; the test portion is untouched.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Class labels and their count, taken from the full target vector.
classes = np.unique(y)
n_classes = len(classes)
print("Train:", X_train.shape, "Test:", X_test.shape, "Classes:", classes)


In [None]:
def _scaled(estimator):
    """Wrap `estimator` in a Pipeline that standardizes the features first.

    The steps are named 'scaler' and 'clf'.
    """
    return Pipeline([("scaler", StandardScaler()), ("clf", estimator)])


# The XGBoost objective follows the number of classes found in y.
xgb_objective = "binary:logistic" if n_classes == 2 else "multi:softprob"

# Name -> estimator registry. Insertion order is the order in which the models
# are evaluated and plotted by the cells below.
models = {
    "LogReg": _scaled(
        LogisticRegression(max_iter=2000, class_weight='balanced', random_state=RNG)
    ),
    "SVM": _scaled(
        SVC(kernel="rbf", probability=True, class_weight='balanced', random_state=RNG)
    ),
    "KNN": _scaled(KNeighborsClassifier(n_neighbors=5)),
    "RandomForest": RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        class_weight='balanced',
        random_state=RNG,
        n_jobs=-1,
    ),
    "XGBoost": XGBClassifier(
        n_estimators=400,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        objective=xgb_objective,
        eval_metric="logloss",
        random_state=RNG,
        n_jobs=-1,
    ),
    "DecisionTree": DecisionTreeClassifier(
        max_depth=None, class_weight='balanced', random_state=RNG
    ),
    "GradientBoosting": GradientBoostingClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=3, random_state=RNG
    ),
    "TabularPFN": TabPFNClassifier(),
    "ANN": _scaled(
        MLPClassifier(hidden_layer_sizes=(64, 32), activation="relu",
                      max_iter=500, random_state=RNG)
    ),
}


In [None]:
# ========================================
# Drop the TabularPFN entries from the model registry
# ========================================
models_to_remove = ['TabularPFN', 'TabularPFNClassifier']

# Announce every registered model that is about to be excluded.
for name in models:
    if name in models_to_remove:
        print(f"‚ö†Ô∏è  Skipping {name} (requires authentication)")

# Rebuild the registry without the excluded names (original order preserved),
# then rebind `models` so the following cells only see the kept estimators.
filtered_models = {
    name: model
    for name, model in models.items()
    if name not in models_to_remove
}
models = filtered_models

print("Remaining models:", list(models.keys()))

In [None]:
# ========================================
# 3) 5-Fold Stratified Cross Validation + collect predictions
# ========================================
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score
)
import numpy as np
import pandas as pd


def is_binary(y):
    """Return True when `y` holds exactly two distinct values."""
    return len(np.unique(y)) == 2


def avg_mode(y):
    """Return the `average=` mode for precision/recall/F1: 'binary' for two classes, else 'macro'."""
    return "binary" if is_binary(y) else "macro"


# Seeded, shuffled 5-fold splitter that keeps the class proportions in every fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RNG)

final_results = []    # one summary dict (mean/std of each metric) per model
roc_data = {}         # {model: (fpr, tpr, auc)}
conf_matrices = {}    # {model: cm}
labels_global = np.unique(y)   # fixed label order for the confusion matrices

# Imports used inside the loop, hoisted so they run once rather than per fold / per model.
from scipy.special import expit
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.preprocessing import label_binarize

# Loop invariants, all derived from the full target vector.
binary_task = is_binary(y)
avg = avg_mode(y)
classes_ = np.unique(y)

for name, model in models.items():
    print(f"\n===== {name} =====")
    fold_metrics = []

    # Out-of-fold targets, predictions and probabilities pooled over all folds;
    # used after the fold loop for the aggregate ROC curve and confusion matrix.
    y_true_all, y_pred_all, y_proba_all = [], [], []

    for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), start=1):
        # cv.split yields *positional* indices, so X and y (a pandas Series) are
        # both sliced with .iloc; plain y[train_idx] would look the integers up
        # as index labels and only works by accident on a default RangeIndex.
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Class scores for ROC: real probabilities when available, otherwise
        # decision scores squashed with a sigmoid (1-D) or softmax (2-D).
        if hasattr(model, "predict_proba"):
            y_proba = model.predict_proba(X_test)
        elif hasattr(model, "decision_function"):
            z = model.decision_function(X_test)
            if z.ndim == 1:
                y_proba = np.vstack([1 - expit(z), expit(z)]).T
            else:
                ez = np.exp(z - z.max(axis=1, keepdims=True))
                y_proba = ez / ez.sum(axis=1, keepdims=True)
        else:
            # Last-resort placeholder: all zeros, which makes the ROC meaningless.
            y_proba = np.zeros((len(y_pred), len(classes_)))

        # Pool this fold's results.
        y_true_all.extend(y_test)
        y_pred_all.extend(y_pred)
        if binary_task:
            # binary: keep only the score of the second class column
            y_proba_all.extend(y_proba[:, 1])
        else:
            # multi-class: keep the full score row per sample
            y_proba_all.extend(list(y_proba))

        # Per-fold metrics.
        acc  = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average=avg, zero_division=0)
        rec  = recall_score(y_test, y_pred, average=avg, zero_division=0)
        f1   = f1_score(y_test, y_pred, average=avg, zero_division=0)

        if binary_task:
            roc_auc = roc_auc_score(y_test, y_proba[:, 1])
        else:
            Y_test_bin = label_binarize(y_test, classes=classes_)
            roc_auc = roc_auc_score(Y_test_bin, y_proba, average='macro', multi_class='ovr')

        fold_metrics.append([acc, prec, rec, f1, roc_auc])
        print(f"Fold {fold}:  Acc={acc:.3f}, Prec={prec:.3f}, Rec={rec:.3f}, F1={f1:.3f}, AUC={roc_auc:.3f}")

    # Mean and (population) standard deviation of each metric over the folds.
    arr = np.array(fold_metrics)
    mean = arr.mean(axis=0); std = arr.std(axis=0)
    print(f"Mean ¬± SD:  Acc={mean[0]:.3f}¬±{std[0]:.3f}, Prec={mean[1]:.3f}¬±{std[1]:.3f}, "
          f"Rec={mean[2]:.3f}¬±{std[2]:.3f}, F1={mean[3]:.3f}¬±{std[3]:.3f}, AUC={mean[4]:.3f}¬±{std[4]:.3f}")

    final_results.append({
        "Model": name,
        "Acc_mean": mean[0], "Acc_std": std[0],
        "Prec_mean": mean[1], "Prec_std": std[1],
        "Rec_mean": mean[2], "Rec_std": std[2],
        "F1_mean": mean[3], "F1_std": std[3],
        "AUC_mean": mean[4], "AUC_std": std[4],
    })

    # ===== aggregate ROC and CM over the pooled out-of-fold predictions =====
    y_true_all = np.array(y_true_all)
    y_pred_all = np.array(y_pred_all)

    if binary_task:
        y_proba_all = np.array(y_proba_all)  # shape (N,)
        fpr, tpr, _ = roc_curve(y_true_all, y_proba_all)
        roc_auc = auc(fpr, tpr)
        roc_data[name] = (fpr, tpr, roc_auc)
    else:
        # Macro-average ROC (one-vs-rest): average the per-class TPR curves on
        # the union of all per-class FPR grid points.
        Y_bin = label_binarize(y_true_all, classes=classes_)
        Yp = np.vstack(y_proba_all)  # shape (N, C)
        fpr_all = []
        tpr_all = []
        for i in range(Y_bin.shape[1]):
            fpr_i, tpr_i, _ = roc_curve(Y_bin[:, i], Yp[:, i])
            fpr_all.append(fpr_i); tpr_all.append(tpr_i)
        all_fpr = np.unique(np.concatenate(fpr_all))
        mean_tpr = np.zeros_like(all_fpr)
        for tpr_i, fpr_i in zip(tpr_all, fpr_all):
            mean_tpr += np.interp(all_fpr, fpr_i, tpr_i)
        mean_tpr /= Y_bin.shape[1]
        macro_auc = roc_auc_score(Y_bin, Yp, average='macro', multi_class='ovr')
        roc_data[name] = (all_fpr, mean_tpr, macro_auc)

    cm = confusion_matrix(y_true_all, y_pred_all, labels=labels_global)
    conf_matrices[name] = cm

# Summary table: one row per model, best mean F1 first, rounded to 3 decimals.
df_final = pd.DataFrame(final_results).sort_values("F1_mean", ascending=False).round(3)
df_final


### Saving the Best Model and Loading It for Prediction

In [None]:
import numpy as np
import pandas as pd
import joblib

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone


# ========================================
# Create Website-Ready Model Package
# ========================================
def create_website_package(models, X, y, df_final,
                           best_model_name=None,
                           out_path="website_model_package.pkl"):
    """Refit the chosen model on all data and save it with its metadata.

    Args:
        models: dict mapping model name to an estimator; the selected entry is
            cloned, so the object stored in the dict is not refitted.
        X: pandas DataFrame of features (column names are stored in the package).
        y: array-like target.
        df_final: DataFrame with at least the columns "Model", "Acc_mean"
            and "F1_mean".
        best_model_name: key into `models`. When None, the row of `df_final`
            with the highest "F1_mean" decides.
        out_path: file the package is written to with joblib.

    Returns:
        The dict that was written to `out_path`.
    """
    # Resolve the model name together with its row of scores.
    if best_model_name is not None:
        best_row = df_final[df_final["Model"] == best_model_name].iloc[0]
    else:
        best_row = df_final.sort_values("F1_mean", ascending=False).iloc[0]
        best_model_name = best_row["Model"]

    # Work on an unfitted copy so the caller's estimator stays untouched.
    best_model = clone(models[best_model_name])

    # Scaler + model, fitted on the complete dataset.
    # NOTE(review): if the selected entry is itself a Pipeline that already
    # contains a scaler, the data passes through two scalers - confirm intended.
    website_pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("model", best_model),
    ])
    website_pipeline.fit(X, y)

    # Per-feature min/max observed in X.
    feature_names = X.columns.tolist()
    feature_ranges = {}
    for col in feature_names:
        feature_ranges[col] = {
            "min": float(X[col].min()),
            "max": float(X[col].max()),
        }

    class_labels = np.unique(y)

    website_package = {
        "pipeline": website_pipeline,
        "feature_names": feature_names,
        "feature_ranges": feature_ranges,
        "model_type": best_model_name,
        "is_binary": len(class_labels) == 2,
        "classes": class_labels.tolist(),
        "performance": {
            "accuracy": float(best_row["Acc_mean"]),
            "f1_score": float(best_row["F1_mean"]),
        },
    }

    # Persist the whole dict (compressed) and report what was saved.
    joblib.dump(website_package, out_path, compress=3)
    print(f"üéØ Website package saved ‚Üí {out_path}")
    saved_f1 = website_package['performance']['f1_score']
    print(
        f"   Best model: {best_model_name} "
        f"(F1: {saved_f1:.3f})"
    )

    return website_package


# ===== Example call (adapt to your notebook) =====
# Take the model with the highest mean F1 in df_final and package it.
ranked = df_final.sort_values("F1_mean", ascending=False)
best_model_name = ranked.iloc[0]["Model"]
website_package = create_website_package(
    models, X, y, df_final, best_model_name
)


In [None]:
# test_model_loading.py
import joblib
import pandas as pd

def test_website_model(path="website_model_package.pkl"):
    """Smoke-test a saved package: load it, print its metadata, predict once.

    The prediction uses a single all-zero row with the stored feature names.
    Any exception is caught and printed rather than raised.

    Returns:
        True when loading and the probe prediction both succeed, else False.
    """
    succeeded = True
    try:
        bundle = joblib.load(path)
        print("‚úÖ Model loaded successfully!")
        print(f"ü§ñ Model Type: {bundle['model_type']}")
        print(f"üìä Features: {len(bundle['feature_names'])}")
        print(f"üìà Performance: F1={bundle['performance']['f1_score']:.3f}")
        print(f"üî¢ Binary Classification: {bundle['is_binary']}")
        print(f"üéØ Classes: {bundle['classes']}")

        # One probe row of zeros, one value per stored feature.
        columns = bundle["feature_names"]
        probe_row = pd.DataFrame([[0.0] * len(columns)], columns=columns)
        prediction = bundle["pipeline"].predict(probe_row)
        print(f"üß™ Test prediction: {prediction[0]}")
    except Exception as err:
        # Deliberately broad: this is a diagnostic helper that reports and moves on.
        print(f"‚ùå Error: {err}")
        succeeded = False
    return succeeded

# Run locally in your training environment.
# Uses the default path "website_model_package.pkl", the file written by
# create_website_package above.
test_website_model()


# Confusion Matrix

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

RNG = 42
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RNG)

labels_global = np.unique(y)   # fixed label order for every confusion matrix
cm_mean_per_model = {}   # {model_name: mean_normalized_cm}

for name, model in models.items():
    print(f"\n===== {name} (mean CM over 5 folds) =====")
    cms_norm = []

    for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), start=1):
        # cv.split yields *positional* indices, so X and y (a pandas Series) are
        # both sliced with .iloc; plain y[train_idx] would look the integers up
        # as index labels and only works by accident on a default RangeIndex.
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Raw confusion matrix for this fold.
        cm = confusion_matrix(y_test, y_pred, labels=labels_global)

        # Row-normalize (per true class); clip(min=1) avoids dividing by zero
        # when a class has no samples in the fold.
        row_sums = cm.sum(axis=1, keepdims=True).clip(min=1)
        cm_norm = cm.astype(float) / row_sums
        cms_norm.append(cm_norm)

    # Mean of the normalized matrices across folds, shape (C, C).
    cm_mean = np.mean(np.stack(cms_norm, axis=0), axis=0)
    cm_mean_per_model[name] = cm_mean



In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from math import ceil

# Grid layout: three panels per row, as many rows as needed.
num_models = len(cm_mean_per_model)
cols = 3
rows = ceil(num_models / cols)

fig, axes = plt.subplots(rows, cols, figsize=(6*cols, 6*rows), dpi=500)
axes = axes.ravel()

# One panel per model, in registry order.
for ax, (name, cm_mean) in zip(axes, cm_mean_per_model.items()):
    # Fractions -> percentages for display.
    disp = ConfusionMatrixDisplay(confusion_matrix=cm_mean * 100.0,
                                  display_labels=labels_global)
    disp.plot(ax=ax, cmap='viridis', colorbar=False, values_format=".0f")

    ax.set_title(name, fontsize=18, weight='bold')
    ax.set_xlabel("Predicted label", fontsize=14)
    ax.set_ylabel("True label", fontsize=14)

    # Enlarge each cell value and append a percent sign.
    for cell_text in ax.texts:
        cell_text.set_text(f"{cell_text.get_text()}%")
        cell_text.set_fontsize(25)
        cell_text.set_weight('bold')

# Blank out the panels that have no model.
for spare_ax in axes[num_models:]:
    spare_ax.axis('off')

fig.suptitle("Confusion Matrices over 5 Folds ‚Äî All Models",
             fontsize=22, weight='bold')
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()

# Optional: also write the figure to a PDF file.
fig.savefig("mean_confusion_matrices_all_models.pdf",
            format='pdf', dpi=500)


In [None]:
# ========================================
# ROC Curves ‚Äî All Models (Publication Quality)
# ========================================
import matplotlib.pyplot as plt
from itertools import cycle

# Ten-colour palette, cycled so any number of models gets a colour.
palette = [
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
    "#9467bd", "#8c564b", "#e377c2", "#7f7f7f",
    "#bcbd22", "#17becf",
]
colors = cycle(palette)

roc_fig, roc_ax = plt.subplots(figsize=(8, 7), dpi=500)

# One curve per model, labelled with its AUC.
for (name, curve), color in zip(roc_data.items(), colors):
    fpr, tpr, roc_auc = curve
    roc_ax.plot(fpr, tpr, color=color, lw=2.5,
                label=f"{name} (AUC = {roc_auc:.3f})")

# Diagonal reference line.
roc_ax.plot([0, 1], [0, 1], 'k--', lw=1.2, alpha=0.6)

# Title, axis labels and limits.
roc_ax.set_title("Receiver Operating Characteristic (ROC) ‚Äî All Models",
                 fontsize=10, weight='bold', pad=15)
roc_ax.set_xlabel("False Positive Rate", fontsize=15)
roc_ax.set_ylabel("True Positive Rate", fontsize=15)
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])

# Light dashed grid and a framed legend in the lower-right corner.
roc_ax.grid(True, which="major", linestyle="--", linewidth=0.6, alpha=0.4)
roc_ax.legend(
    loc="lower right",
    fontsize=11,
    frameon=True,
    edgecolor='gray',
    fancybox=True,
    shadow=False,
)

# Tighten the margins, save to PDF, then display.
roc_fig.tight_layout()
roc_fig.savefig("roc_curves.pdf", format='pdf', dpi=500)
plt.show()

## Explainability

In [None]:
pip install scikit-learn shap lime eli5 captum

In [None]:
# Display the feature names kept after the correlation filter.
X_uncorr.columns

In [None]:
import shap
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Feature names passed to eli5 below. They are read from the feature matrix
# itself instead of a hand-copied list, so they always match, in number and
# order, the columns the explained models are fitted on.
feature_names = X.columns.tolist()


In [None]:
import eli5
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Train the model on the most recent training split (X_train / y_train), not on
# the held-out rows: fitting on X_test would explain a model that saw only the
# small test fold.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Display feature importances
eli5.show_weights(model, feature_names=feature_names)

In [None]:
import eli5
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Train the model on the most recent training split (X_train / y_train), not on
# the held-out rows: fitting on X_test would explain a model that saw only the
# small test fold.
# NOTE(review): the features are not standardized here, unlike the "LogReg"
# pipeline in `models`, so coefficient sizes depend on feature units - confirm.
model = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=RNG)
model.fit(X_train, y_train)

# Display the coefficients as feature weights
eli5.show_weights(model, feature_names=feature_names)

In [None]:
import eli5
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC # Import SVC

# Train a linear-kernel SVM on the most recent training split (X_train /
# y_train), not on the held-out rows: fitting on X_test would explain a model
# that saw only the small test fold.
model = SVC(kernel="linear")
model.fit(X_train, y_train)

# Display the linear coefficients as feature weights
eli5.show_weights(model, feature_names=feature_names)

In [None]:
import eli5
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Train the model on the most recent training split (X_train / y_train), not on
# the held-out rows: fitting on X_test would explain a model that saw only the
# small test fold. Hyperparameters match the "XGBoost" entry in `models`.
model = XGBClassifier(
        n_estimators=400, max_depth=5, learning_rate=0.05, subsample=0.9, colsample_bytree=0.9,
        reg_lambda=1.0, objective="binary:logistic" if n_classes==2 else "multi:softprob",
        eval_metric="logloss", random_state=RNG, n_jobs=-1
    )
model.fit(X_train, y_train)

# Display feature importances
eli5.show_weights(model, feature_names=feature_names)

In [None]:
import eli5
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Train the model on the most recent training split (X_train / y_train), not on
# the held-out rows: fitting on X_test would explain a model that saw only the
# small test fold. Hyperparameters match the "GradientBoosting" entry in `models`.
model = GradientBoostingClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=3, random_state=RNG
    )
model.fit(X_train, y_train)

# Display feature importances
eli5.show_weights(model, feature_names=feature_names)

In [None]:
import eli5
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Train the model on the most recent training split (X_train / y_train), not on
# the held-out rows: fitting on X_test would explain a model that saw only the
# small test fold. Hyperparameters match the "DecisionTree" entry in `models`.
model = DecisionTreeClassifier(
        max_depth=None, class_weight='balanced', random_state=RNG
    )
model.fit(X_train, y_train)

# Display feature importances
eli5.show_weights(model, feature_names=feature_names)

In [None]:
import eli5
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Train the model on the most recent training split (X_train / y_train), not on
# the held-out rows: fitting on X_test would explain a model that saw only the
# small test fold.
# NOTE(review): n_neighbors=2 differs from the n_neighbors=5 used for "KNN" in
# `models` - confirm which is intended.
model = KNeighborsClassifier(n_neighbors=2)
model.fit(X_train, y_train)

# NOTE(review): k-NN exposes neither coefficients nor feature importances, so
# eli5 presumably has nothing to show for it - verify the output of this cell.
eli5.show_weights(model, feature_names=feature_names)



## Laminitis