In [2]:
import pandas as pd
import sys
from pathlib import Path

# Make sure src/ is on the Python path so the project modules imported below
# can be found. The location is resolved to an absolute path so it keeps
# working if the working directory changes later in the session, and the
# membership check stops re-runs of this cell from appending duplicate entries.
SRC_DIR = str((Path("..") / "src").resolve())
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from data_utils import load_data
from modeling import train_baseline
from preprocessing import build_preprocessor


In [3]:
# Load the three tables returned by the project's loader.
train, test, sample = load_data()

# Report the dimensions of each table, one line per table.
for label, frame in (("Train", train), ("Test", test), ("Sample", sample)):
    print(f"{label} shape:", frame.shape)

# Show the first rows of the training table as the cell's rich output.
train.head()


Looking in: C:\Codes\CSE 572 DM\binary_prediction_of_poisonous_mushrooms\data\train.csv
Train shape: (3116945, 22)
Test shape: (2077964, 21)
Sample shape: (2077964, 2)


Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [4]:
# Drop columns with too many missing values.
# errors="ignore" keeps this cell idempotent: `train` is rebound here, so a
# second run would otherwise raise KeyError because the columns are already gone.
drop_cols = ["veil-type", "spore-print-color", "stem-root"]
train = train.drop(columns=drop_cols, errors="ignore")

# Target variable: map edible = 0, poisonous = 1.
y = train["class"].map({"e": 0, "p": 1})

# Series.map() turns every label that is not a key of the dict into NaN.
# Fail loudly here instead of letting NaN targets reach the models.
if y.isna().any():
    unexpected = train.loc[y.isna(), "class"].unique().tolist()
    raise ValueError(
        f"Unexpected or missing values in 'class' (expected 'e' or 'p'): {unexpected}"
    )

# Features (exclude id + class).
X = train.drop(columns=["class", "id"])

# Separate categorical (object dtype) and numeric (everything else) columns.
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Categorical columns:", len(cat_cols))
print("Numeric columns:", num_cols)


X shape: (3116945, 17)
y shape: (3116945,)
Categorical columns: 14
Numeric columns: ['cap-diameter', 'stem-height', 'stem-width']


In [5]:
# Quick baseline: train on a seeded 100k-row subset rather than every row.
X_small = X.sample(n=100_000, random_state=42)
# Pick the labels for exactly the sampled rows by matching on the index.
y_small = y.loc[X_small.index]

# NOTE(review): the recorded run of this cell ended with a scikit-learn
# "iterations reached limit" warning, so the metrics below come from a solver
# that stopped early. Presumably max_iter or feature scaling needs adjusting
# inside train_baseline -- confirm against src/modeling.py.
model, metrics = train_baseline(X_small, y_small, cat_cols, num_cols)

print("Baseline Logistic Regression Metrics (100k sample):")
for metric_name, score in metrics.items():
    print(f"{metric_name}: {score:.4f}")

Baseline Logistic Regression Metrics (100k sample):
accuracy: 0.8034
f1: 0.8165
mcc: 0.6057


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import train_test_split

# Seeded 100k-row subset to keep the runtime manageable.
X_small = X.sample(n=100_000, random_state=42)
y_small = y.loc[X_small.index]

# Hold out 20% of the subset for validation, stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X_small,
    y_small,
    test_size=0.2,
    random_state=42,
    stratify=y_small,
)

# Fit the preprocessing on the training fold only, then apply it to the
# validation fold.
preprocessor = build_preprocessor(cat_cols, num_cols)
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)

# Random Forest baseline: 100 trees, using every available core.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rf.fit(X_train_transformed, y_train)

y_pred = rf.predict(X_val_transformed)

# Score the validation predictions with each metric in turn.
print("Random Forest Baseline (100k sample):")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("F1:", f1_score),
    ("MCC:", matthews_corrcoef),
):
    print(metric_label, metric_fn(y_val, y_pred))


Random Forest Baseline (100k sample):
Accuracy: 0.99005
F1: 0.9908978639710927
MCC: 0.9799264272821623


In [7]:
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Seeded 100k-row subset, for consistency with the other baselines.
X_small = X.sample(n=100_000, random_state=42)
y_small = y.loc[X_small.index]

# Hold out 20% of the subset for validation, stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X_small,
    y_small,
    test_size=0.2,
    random_state=42,
    stratify=y_small,
)

# Fit the preprocessing on the training fold only, then apply it to the
# validation fold.
preprocessor = build_preprocessor(cat_cols, num_cols)
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)

# Decision Tree baseline: max_depth=None places no cap on tree depth.
dt = DecisionTreeClassifier(random_state=42, max_depth=None)
dt.fit(X_train_transformed, y_train)

y_pred = dt.predict(X_val_transformed)

# Score the validation predictions with each metric in turn.
print("Decision Tree Baseline (100k sample):")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("F1:", f1_score),
    ("MCC:", matthews_corrcoef),
):
    print(metric_label, metric_fn(y_val, y_pred))

Decision Tree Baseline (100k sample):
Accuracy: 0.9762
F1: 0.9782370153621068
MCC: 0.9519793123645256
