## Imports

In [49]:
import os, platform, sys, time
import pandas as pd
import numpy as np
import psutil
import time
from pathlib import Path

sys.path.append(str(Path("..") / "src"))
from data_utils import load_data
from preprocessing import build_preprocessor
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, matthews_corrcoef, accuracy_score, f1_score, confusion_matrix
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Notebook-wide seed constant; the call below seeds NumPy's legacy global RNG with it.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Data

In [7]:
# Load the three frames returned by the project helper `load_data` and report
# their shapes as a quick sanity check.
# NOTE(review): `sample` is presumably the sample-submission frame — confirm in data_utils.
train, test, sample = load_data()

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("Sample shape:", sample.shape)

Looking in: C:\Codes\CSE 572 DM\binary_prediction_of_poisonous_mushrooms\data\train.csv
Train shape: (3116945, 22)
Test shape: (2077964, 21)
Sample shape: (2077964, 2)


In [8]:
# Show the class balance of the target, first as raw counts and then as proportions.
# Guarded so the cell does nothing when the frame has no "class" column.
if "class" in train.columns:
    print("Target distribution:")
    print(train["class"].value_counts())
    print(train["class"].value_counts(normalize=True))

Target distribution:
class
p    1705396
e    1411549
Name: count, dtype: int64
class
p    0.547137
e    0.452863
Name: proportion, dtype: float64


In [9]:
# Fraction of missing values per column, sorted from most to least missing.
missing = train.isna().mean().sort_values(ascending=False)
print("\nTop 10 columns by missing fraction:")
print(missing.head(10))


Top 10 columns by missing fraction:
veil-type            0.948843
spore-print-color    0.914255
stem-root            0.884527
veil-color           0.879370
stem-surface         0.635514
gill-spacing         0.403740
cap-surface          0.215282
gill-attachment      0.168093
ring-type            0.041348
gill-color           0.000018
dtype: float64


In [10]:
# Exploratory split of the raw training columns by dtype.
# cat_cols: object-dtype columns other than the target "class".
# num_cols: every numeric column of `train` (no exclusions, so an identifier
# column such as "id" is counted as numeric here).
# Both names are recomputed from the feature frame X in a later cell.
cat_cols = [c for c in train.columns if train[c].dtype == "object" and c != "class"]
num_cols = [c for c in train.columns if pd.api.types.is_numeric_dtype(train[c])]
print("\nCategorical columns (first 10):", cat_cols[:10])
print("Numeric columns:", num_cols)
print(f"\nCount: {len(cat_cols)} categorical, {len(num_cols)} numeric")


Categorical columns (first 10): ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root', 'stem-surface', 'stem-color']
Numeric columns: ['id', 'cap-diameter', 'stem-height', 'stem-width']

Count: 17 categorical, 4 numeric


In [11]:
# Drop the three sparsest columns (roughly 88-95% missing in the summary above).
# errors="ignore" keeps this cell idempotent: re-running it after `train` has
# already been reduced no longer raises KeyError.
# NOTE(review): "veil-color" (~88% missing) is kept — confirm that is intentional.
drop_cols = ["veil-type", "spore-print-color", "stem-root"]
train = train.drop(columns=drop_cols, errors="ignore")

# Target variable: map edible = 0, poisonous = 1
y = train["class"].map({"e": 0, "p": 1})

# Series.map turns any label outside the mapping into NaN; fail loudly here
# instead of handing NaN targets to the classifiers further down.
if y.isna().any():
    unexpected = train.loc[y.isna(), "class"].dropna().unique().tolist()
    raise ValueError(f"Unmapped or missing 'class' labels: {unexpected}")

# Features (exclude id + class)
X = train.drop(columns=["class", "id"])

# Separate categorical and numeric columns
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Categorical columns:", len(cat_cols))
print("Numeric columns:", num_cols)

X shape: (3116945, 17)
y shape: (3116945,)
Categorical columns: 14
Numeric columns: ['cap-diameter', 'stem-height', 'stem-width']


In [13]:
# Work on a fixed-size random subsample so the model experiments stay fast.
SAMPLE_SIZE = 100_000

# Cap at the number of available rows: DataFrame.sample raises ValueError when
# n exceeds the population (sampling without replacement). The draw is seeded
# from RANDOM_SEED so the notebook has a single source of truth for its seed.
X_small = X.sample(n=min(SAMPLE_SIZE, len(X)), random_state=RANDOM_SEED)
y_small = y.loc[X_small.index]  # align targets on the sampled index labels

print("Sample shape:", X_small.shape, y_small.shape)

Sample shape: (100000, 17) (100000,)


In [17]:
# Stratified 80/20 hold-out split of the subsample. The seed comes from
# RANDOM_SEED instead of a repeated literal so changing the constant actually
# changes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X_small,
    y_small,
    test_size=0.2,
    stratify=y_small,
    random_state=RANDOM_SEED,
)

In [19]:
# Build the project preprocessor for the categorical/numeric column lists,
# fit it on the training split only, then apply the fitted transform to the
# validation split (the validation rows never influence the fit).
preprocessor = build_preprocessor(cat_cols, num_cols)
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)

## XGBoost

In [20]:
start = time.time()

# Baseline gradient-boosted model.
# `use_label_encoder` is no longer passed: the installed XGBoost reports it as
# an unused parameter, and the target is already encoded as 0/1.
clf = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    eval_metric="logloss",
    random_state=RANDOM_SEED,
    n_jobs=-1,
)

print("Fitting XGBoost...")
clf.fit(X_train_transformed, y_train)

# Elapsed wall-clock time, measured from before the estimator was constructed.
fit_time = time.time() - start
print(f"Trained in {fit_time:.1f}s")

Fitting XGBoost...


Parameters: { "use_label_encoder" } are not used.



Trained in 0.9s


In [23]:
# Score the baseline XGBoost model on the hold-out split with the three
# metrics used throughout the notebook.
y_pred = clf.predict(X_val_transformed)
acc, f1, mcc = (
    score_fn(y_val, y_pred)
    for score_fn in (accuracy_score, f1_score, matthews_corrcoef)
)

print("XGBoost baseline (100k sample):")
print(f"Accuracy: {acc:.6f}")
print(f"F1:       {f1:.6f}")
print(f"MCC:      {mcc:.6f}")

XGBoost baseline (100k sample):
Accuracy: 0.988500
F1:       0.989471
MCC:      0.976807


In [24]:
print("\nTop 20 feature importances (if feature names available):")

# The importances themselves always come from the fitted model; only the
# labels attached to them are best-effort.
importances = clf.feature_importances_

feat_names = None
try:
    feat_names = list(preprocessor.get_feature_names_out())
except Exception:
    # Fallback: rebuild the names from the "cat" pipeline's encoder plus the
    # column list stored on the second transformer entry.
    try:
        cat_names = (
            preprocessor.named_transformers_["cat"]
            .named_steps["encoder"]
            .get_feature_names_out()
        )
        num_cols_local = preprocessor.transformers_[1][2]
        feat_names = list(cat_names) + list(num_cols_local)
    except Exception as e:
        # Report the failure instead of swallowing it, then fall back to
        # positional labels below.
        print("Could not render feature names — showing raw importances. Error:", repr(e))
        feat_names = None

# Names that do not line up one-to-one with the importances would mislabel
# the features, so discard them and use the positional index instead.
if feat_names is not None and len(feat_names) != len(importances):
    feat_names = None

imps = pd.Series(importances, index=feat_names).sort_values(ascending=False)
display(imps.head(20))


Top 20 feature importances (if feature names available):


cat__ring-type_z               0.049022
cat__does-bruise-or-bleed_t    0.035177
cat__cap-surface_l             0.032884
cat__cap-shape_c               0.032462
cat__stem-surface_g            0.028171
cat__cap-surface_g             0.027755
cat__gill-attachment_p         0.027374
cat__cap-color_r               0.023689
cat__stem-color_w              0.023509
cat__stem-color_p              0.022374
cat__gill-spacing_d            0.021660
cat__cap-color_e               0.021154
cat__gill-spacing_c            0.020627
cat__cap-surface_w             0.018939
cat__cap-surface_i             0.018208
cat__cap-surface_k             0.017328
cat__veil-color_w              0.016732
cat__habitat_u                 0.015018
cat__cap-surface_h             0.014952
cat__stem-surface_s            0.014900
dtype: float32

## SVM

In [28]:
# Standardize the preprocessed features for the SVM: fit the scaler on the
# training matrix only, then reuse it for the validation matrix.
# with_mean=False scales without centering.
# NOTE(review): presumably chosen because the preprocessor output is sparse — confirm.
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_transformed)
X_val_scaled = scaler.transform(X_val_transformed)

In [29]:
# RBF-kernel SVM baseline trained on the scaled features. The seed is taken
# from RANDOM_SEED rather than a repeated literal.
svm = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=RANDOM_SEED)
svm.fit(X_train_scaled, y_train)

# Rebinds `y_pred`: from here on it holds the SVM predictions, not XGBoost's.
y_pred = svm.predict(X_val_scaled)

In [30]:
# Hold-out metrics for the SVM; `y_pred` is the prediction vector produced by
# the SVM cell directly above.
print("SVM (RBF) baseline:")
print("Accuracy:", accuracy_score(y_val, y_pred))
print("F1:", f1_score(y_val, y_pred))
print("MCC:", matthews_corrcoef(y_val, y_pred))

SVM (RBF) baseline:
Accuracy: 0.9848
F1: 0.9860703812316716
MCC: 0.9693575433178304


## Random Forest fine-tuning

In [34]:
# Hyperparameter search is scored with the Matthews correlation coefficient.
mcc_scorer = make_scorer(matthews_corrcoef)

# Base Random Forest for the search; seeded from RANDOM_SEED so the notebook
# has a single seed to change.
rf = RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=-1)

In [35]:
# Random Forest search space for the randomized search: each key is an
# estimator parameter and each list holds the candidate values to sample from.
# NOTE: the name `param_dist` is rebound to the XGBoost search space in a later
# cell, so re-run this cell before re-creating the Random Forest search.
param_dist = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_depth": [None, 10, 20, 30, 40, 50],
    "max_features": ["sqrt", "log2", 0.5, 0.7],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}


In [36]:
start = time.time()

# Randomized search: 15 sampled configurations x 3 CV folds, ranked by MCC.
# The sampling seed comes from RANDOM_SEED instead of a repeated literal.
# NOTE(review): both the forest and the search request n_jobs=-1; nested
# parallelism may oversubscribe CPU cores — consider n_jobs=1 on one of them.
rf_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=15,
    scoring=mcc_scorer,
    cv=3,
    verbose=2,
    random_state=RANDOM_SEED,
    n_jobs=-1,
)
rf_search.fit(X_train_transformed, y_train)
elapsed = time.time() - start

Fitting 3 folds for each of 15 candidates, totalling 45 fits


In [37]:
# Summarize the Random Forest search: wall-clock duration (seconds converted
# to minutes), the winning parameter set, and its mean cross-validated MCC.
print(f"\nRandom Forest tuning completed in {elapsed/60:.2f} min")
print("Best Parameters:\n", rf_search.best_params_)
print("Best MCC (CV):", rf_search.best_score_)


Random Forest tuning completed in 6.95 min
Best Parameters:
 {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 40, 'bootstrap': True}
Best MCC (CV): 0.9798087491103149


In [38]:
# Take the best forest found by the search and score it on the hold-out split.
best_rf = rf_search.best_estimator_
y_pred = best_rf.predict(X_val_transformed)
acc, f1, mcc = (
    score_fn(y_val, y_pred)
    for score_fn in (accuracy_score, f1_score, matthews_corrcoef)
)

In [39]:
# Print the hold-out metrics computed in the previous cell for the tuned forest.
print("\nValidation Metrics with tuned RF:")
print(f"Accuracy: {acc:.6f}")
print(f"F1:       {f1:.6f}")
print(f"MCC:      {mcc:.6f}")


Validation Metrics with tuned RF:
Accuracy: 0.990400
F1:       0.991215
MCC:      0.980635


## XGBoost fine-tuning

In [40]:
# Hyperparameter search is scored with the Matthews correlation coefficient.
mcc_scorer = make_scorer(matthews_corrcoef)

# Base estimator
# `use_label_encoder` is no longer passed: the installed XGBoost reports it as
# an unused parameter, and the target is already encoded as 0/1.
xgb = XGBClassifier(
    eval_metric="logloss",
    random_state=RANDOM_SEED,
    n_jobs=-1,
)

In [41]:
# XGBoost search space for the randomized search: each key is an estimator
# parameter and each list holds the candidate values to sample from.
# NOTE: this rebinds `param_dist`, which earlier held the Random Forest space.
param_dist = {
    "n_estimators": [200, 300, 500, 800],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "max_depth": [3, 5, 7, 9, 12],
    "subsample": [0.6, 0.7, 0.8, 1.0],
    "colsample_bytree": [0.5, 0.7, 0.9, 1.0],
    "gamma": [0, 0.1, 0.3, 1.0],
    "reg_alpha": [0, 0.5, 1.0],
    "reg_lambda": [1.0, 1.5, 2.0]
}

In [42]:
# Randomized search over the XGBoost space: 20 sampled configurations x 3 CV
# folds, ranked by MCC. The sampling seed comes from RANDOM_SEED instead of a
# repeated literal.
rnd_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,
    scoring=mcc_scorer,
    cv=3,
    verbose=2,
    random_state=RANDOM_SEED,
    n_jobs=-1,
)

In [46]:
# Run the XGBoost search on the preprocessed training split and time it.
start = time.time()
rnd_search.fit(X_train_transformed, y_train)
elapsed = time.time() - start

# Duration is reported in minutes; best_score_ is the mean CV score (MCC scorer).
print(f"\nXGBoost tuning completed in {elapsed/60:.2f} minutes")
print("Best parameters:\n", rnd_search.best_params_)
print("Best CV MCC:", rnd_search.best_score_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.




XGBoost tuning completed in 0.46 minutes
Best parameters:
 {'subsample': 1.0, 'reg_lambda': 1.0, 'reg_alpha': 1.0, 'n_estimators': 500, 'max_depth': 12, 'learning_rate': 0.03, 'gamma': 0.1, 'colsample_bytree': 0.7}
Best CV MCC: 0.9787479347262836


In [47]:
# Take the best booster found by the search and score it on the hold-out split.
best_xgb = rnd_search.best_estimator_
y_pred = best_xgb.predict(X_val_transformed)
acc, f1, mcc = (
    score_fn(y_val, y_pred)
    for score_fn in (accuracy_score, f1_score, matthews_corrcoef)
)

In [48]:
# Print the hold-out metrics computed in the previous cell for the tuned XGBoost model.
print("\nValidation Metrics with tuned XGBoost:")
print(f"Accuracy: {acc:.6f}")
print(f"F1:       {f1:.6f}")
print(f"MCC:      {mcc:.6f}")


Validation Metrics with tuned XGBoost:
Accuracy: 0.990100
F1:       0.990938
MCC:      0.980032


## Ensemble

In [50]:
# Per-row probabilities from both tuned models on the hold-out split.
# Column 1 of predict_proba is taken as the positive class (label 1 = poisonous
# under the e->0 / p->1 target mapping) — assumes each model's classes_ is [0, 1].
probs_rf = best_rf.predict_proba(X_val_transformed)[:, 1]
probs_xgb = best_xgb.predict_proba(X_val_transformed)[:, 1]

In [51]:
# Unweighted soft vote: mean of the two models' positive-class probabilities,
# cut at 0.5 to obtain hard labels.
probs_avg = (probs_rf + probs_xgb) / 2.0
y_pred_avg_05 = (probs_avg >= 0.5).astype(int)

# Same hold-out labels for every metric; the confusion matrix comes last.
acc_avg, f1_avg, mcc_avg, cm = (
    score_fn(y_val, y_pred_avg_05)
    for score_fn in (accuracy_score, f1_score, matthews_corrcoef, confusion_matrix)
)

In [52]:
# Report the simple-average ensemble at the default 0.5 cut-off, followed by
# its confusion matrix.
print("Simple average ensemble @ 0.5")
print(f"Accuracy: {acc_avg:.6f}")
print(f"F1:       {f1_avg:.6f}")
print(f"MCC:      {mcc_avg:.6f}")
print("Confusion matrix (rows=true, cols=pred):")
print(cm)

Simple average ensemble @ 0.5
Accuracy: 0.990450
F1:       0.991257
MCC:      0.980739
Confusion matrix (rows=true, cols=pred):
[[ 8981    82]
 [  109 10828]]


In [53]:
# Sweep 101 evenly spaced cut-offs in [0, 1] and score the averaged
# probabilities at each one.
# NOTE(review): the cut-off is selected on the same hold-out rows it is scored
# on, so the reported best MCC is optimistic.
ths = np.linspace(0.00, 1.00, 101)
mccs = [matthews_corrcoef(y_val, (probs_avg >= t).astype(int)) for t in ths]

# Keep the first cut-off that attains the highest MCC (strict '>' means ties
# never replace an earlier winner).
best_mcc, best_thr = -1.0, None
for t, m in zip(ths, mccs):
    if m > best_mcc:
        best_mcc, best_thr = m, t

print(f"Best MCC for simple average across thresholds: {best_mcc:.6f} at threshold {best_thr:.2f}")

Best MCC for simple average across thresholds: 0.981145 at threshold 0.53


In [54]:
# Hold-out MCC of each model on its own at the 0.5 cut-off; these values are
# what the weighting cell below turns into ensemble weights.
mcc_rf = matthews_corrcoef(y_val, (probs_rf >= 0.5).astype(int))
mcc_xgb = matthews_corrcoef(y_val, (probs_xgb >= 0.5).astype(int))
print(f"Validation MCCs individually -> RF: {mcc_rf:.6f}, XGB: {mcc_xgb:.6f}")

Validation MCCs individually -> RF: 0.980635, XGB: 0.980032


In [55]:
# Turn each model's hold-out MCC into a non-negative weight, then normalize
# the pair so the weights sum to 1. If both clip to zero, weight them equally.
w_rf, w_xgb = max(mcc_rf, 0.0), max(mcc_xgb, 0.0)
s = w_rf + w_xgb
if s == 0:
    w_rf = w_xgb = 0.5
else:
    w_rf, w_xgb = w_rf / s, w_xgb / s

print(f"Using weights -> RF: {w_rf:.3f}, XGB: {w_xgb:.3f}")

Using weights -> RF: 0.500, XGB: 0.500


In [56]:
# MCC-weighted blend of the two models' positive-class probabilities.
probs_weighted = w_rf * probs_rf + w_xgb * probs_xgb

# Scan the cut-offs in `ths` and keep the first one reaching the highest MCC
# (strict '>' means ties never replace an earlier winner).
best_mcc_w, best_thr_w = -1.0, None
for t in ths:
    m = matthews_corrcoef(y_val, (probs_weighted >= t).astype(int))
    if m > best_mcc_w:
        best_mcc_w, best_thr_w = m, t

print(f"Best MCC for weighted ensemble across thresholds: {best_mcc_w:.6f} at threshold {best_thr_w:.2f}")

Best MCC for weighted ensemble across thresholds: 0.981145 at threshold 0.53


In [57]:
def report_for(probs, thr, label, y_true=None):
    """Threshold positive-class probabilities and score the resulting labels.

    Args:
        probs: Array of positive-class probabilities (anything supporting an
            element-wise ``>=`` whose result has ``.astype``).
        thr: Decision threshold; entries with ``probs >= thr`` are predicted
            as class 1.
        label: Descriptive tag for the call site. It is not used in the
            computation and is kept so existing calls keep working.
        y_true: Ground-truth labels to score against. Defaults to the
            notebook-level ``y_val`` when omitted.

    Returns:
        Tuple ``(accuracy, f1, mcc)`` for the thresholded predictions.
    """
    if y_true is None:
        y_true = y_val
    y_pred = (probs >= thr).astype(int)
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        matthews_corrcoef(y_true, y_pred),
    )

# Weighted ensemble at the default cut-off and at the tuned cut-off.
acc_w_05, f1_w_05, mcc_w_05 = report_for(probs_weighted, 0.5, "weighted@0.5")
acc_w_best, f1_w_best, mcc_w_best = report_for(probs_weighted, best_thr_w, "weighted@best")

In [59]:
# Side-by-side report for the weighted ensemble: default 0.5 cut-off versus
# the cut-off selected by the threshold sweep.
print("Weighted ensemble metrics:")
print(f"@0.5 -> ACC: {acc_w_05:.6f}, F1: {f1_w_05:.6f}, MCC: {mcc_w_05:.6f}")
print(f"@best({best_thr_w:.2f}) -> ACC: {acc_w_best:.6f}, F1: {f1_w_best:.6f}, MCC: {mcc_w_best:.6f}")

Weighted ensemble metrics:
@0.5 -> ACC: 0.990450, F1: 0.991257, MCC: 0.980739
@best(0.53) -> ACC: 0.990650, F1: 0.991439, MCC: 0.981145
