# Binary Prediction of Poisonous Mushrooms

This notebook implements a mushroom classification system using XGBoost. It handles both numerical and categorical features, imputes missing values (training median for numerical columns, training mode for categorical columns), and label-encodes the categorical variables. The model achieves a high Matthews Correlation Coefficient (MCC) score on a held-out 20% validation split of the training data. The code is structured into modular functions for data preparation, feature handling, model training, and prediction generation.

Dataset: https://www.kaggle.com/competitions/playground-series-s4e8/data

Hugging Face: https://huggingface.co/spaces/alperugurcan/mushroom-edibility

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef
import pickle

def prepare_data(train_path, test_path):
    """Load the train/test CSV files and preprocess their feature columns.

    The ``id`` column is dropped from both frames. Feature columns are
    split into numeric (``int64``/``float64``) and categorical (``object``)
    groups based on the training frame's dtypes, with ``class`` excluded
    from both groups. Imputation and encoding are delegated to
    ``handle_features``.

    Args:
        train_path: Path to the training CSV (must contain an ``id`` column).
        test_path: Path to the test CSV (must contain an ``id`` column).

    Returns:
        Tuple ``(df_train, df_test, encoders, numeric_cols, categorical_cols)``
        where ``encoders`` is whatever ``handle_features`` returns and the two
        column collections are plain lists of column names.
    """
    def _feature_names(frame, dtypes):
        # The target column is never treated as a feature.
        selected = frame.select_dtypes(include=dtypes).columns
        return [name for name in selected if name != 'class']

    train_frame = pd.read_csv(train_path)
    train_frame = train_frame.drop('id', axis=1)
    test_frame = pd.read_csv(test_path)
    test_frame = test_frame.drop('id', axis=1)

    num_features = _feature_names(train_frame, ['int64', 'float64'])
    cat_features = _feature_names(train_frame, ['object'])

    fitted_encoders = handle_features(
        train_frame, test_frame, num_features, cat_features
    )

    return train_frame, test_frame, fitted_encoders, num_features, cat_features

def handle_features(df_train, df_test, numeric_cols, categorical_cols):
    """Impute missing values and label-encode categorical columns in place.

    Numeric columns are filled with the training median. Categorical columns
    are filled with the training mode and then integer-encoded with a
    ``LabelEncoder`` fitted on the training column. Test values that the
    encoder did not see during fitting are replaced by the training mode
    before encoding, so ``transform`` does not fail on unknown labels.

    Both frames are modified in place (columns are reassigned).

    Args:
        df_train: Training frame; all statistics and encoders are fitted on it.
        df_test: Test frame; transformed using the training statistics.
        numeric_cols: Names of numeric feature columns present in both frames.
        categorical_cols: Names of categorical feature columns present in
            both frames.

    Returns:
        Dict mapping each categorical column name to its fitted
        ``LabelEncoder``.
    """
    for col in numeric_cols:
        median = df_train[col].median()
        # Assign the filled column back rather than calling
        # ``df[col].fillna(..., inplace=True)``: the chained in-place form
        # operates on a temporary under pandas Copy-on-Write and would leave
        # the frame unchanged.
        df_train[col] = df_train[col].fillna(median)
        df_test[col] = df_test[col].fillna(median)

    encoders = {}
    for col in categorical_cols:
        mode = df_train[col].mode()[0]
        df_train[col] = df_train[col].fillna(mode)
        df_test[col] = df_test[col].fillna(mode)

        encoder = LabelEncoder()
        df_train[col] = encoder.fit_transform(df_train[col])

        # Vectorised replacement of unseen categories; avoids rebuilding the
        # set of known classes for every row.
        is_known = df_test[col].isin(encoder.classes_)
        df_test[col] = df_test[col].where(is_known, mode)
        df_test[col] = encoder.transform(df_test[col])

        encoders[col] = encoder

    return encoders

def train_model(X_train, y_train):
    """Fit an ``XGBClassifier`` with fixed hyperparameters and return it.

    Args:
        X_train: Feature matrix passed to ``fit``.
        y_train: Target labels passed to ``fit``.

    Returns:
        The fitted ``XGBClassifier`` instance.
    """
    # Fixed hyperparameters for the binary logistic objective.
    hyperparams = {
        'alpha': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.6,
        'max_depth': 14,
        'min_child_weight': 7,
        'gamma': 1e-6,
        'n_estimators': 100,
        'objective': 'binary:logistic',
    }
    classifier = XGBClassifier(**hyperparams)
    classifier.fit(X_train, y_train)
    return classifier

def save_artifacts(model, target_encoder, encoders, numeric_cols, categorical_cols,
                   output_path='model_artifacts.pkl'):
    """Pickle the model and its preprocessing metadata into a single file.

    The pickled object is a dict with the keys ``'model'``,
    ``'target_encoder'``, ``'feature_encoders'``, ``'numeric_columns'`` and
    ``'categorical_columns'``.

    Args:
        model: Trained model object.
        target_encoder: Encoder used for the target column.
        encoders: Per-column feature encoders.
        numeric_cols: Names of the numeric feature columns.
        categorical_cols: Names of the categorical feature columns.
        output_path: Destination file. Defaults to ``'model_artifacts.pkl'``
            in the current working directory.

    Note:
        Pickle files must only be loaded from trusted sources, since
        unpickling can execute arbitrary code.
    """
    artifacts = {
        'model': model,
        'target_encoder': target_encoder,
        'feature_encoders': encoders,
        'numeric_columns': numeric_cols,
        'categorical_columns': categorical_cols,
    }
    with open(output_path, 'wb') as f:
        pickle.dump(artifacts, f)

def create_submission(model, df_test, target_encoder, test_path,
                      output_path='submission.csv'):
    """Predict on the test features and write an ``id``/``class`` CSV.

    Args:
        model: Fitted model exposing ``predict``.
        df_test: Preprocessed test features (without the ``id`` column).
        target_encoder: Encoder whose ``inverse_transform`` maps the model's
            predictions back to the original class labels.
        test_path: Path to the raw test CSV; only its ``id`` column is read.
            Rows are assumed to be in the same order as ``df_test``.
        output_path: Destination CSV. Defaults to ``'submission.csv'`` in the
            current working directory.
    """
    predictions = model.predict(df_test)
    predictions = target_encoder.inverse_transform(predictions)

    # Only the identifiers are needed here, so skip parsing every other column.
    ids = pd.read_csv(test_path, usecols=['id'])['id']

    submission = pd.DataFrame({
        'id': ids,
        'class': predictions,
    })
    submission.to_csv(output_path, index=False)

def main():
    """Run the full pipeline: preprocess, train, evaluate, persist, submit.

    Reads the competition CSVs from the Kaggle input directory, holds out a
    stratified 20% of the training rows for validation, prints the
    Matthews correlation coefficient on that hold-out, then saves the model
    artifacts and writes the submission file.
    """
    train_path = "/kaggle/input/playground-series-s4e8/train.csv"
    test_path = "/kaggle/input/playground-series-s4e8/test.csv"

    prepared = prepare_data(train_path, test_path)
    df_train, df_test, encoders, numeric_cols, categorical_cols = prepared

    # Encode the string target into integer labels.
    target_encoder = LabelEncoder()
    labels = target_encoder.fit_transform(df_train['class'])
    features = df_train.drop('class', axis=1)

    # Stratified split keeps the class balance identical in both partitions.
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )

    model = train_model(fit_X, fit_y)
    holdout_pred = model.predict(holdout_X)
    score = matthews_corrcoef(holdout_y, holdout_pred)
    print(f'MCC Score: {score}')

    save_artifacts(model, target_encoder, encoders, numeric_cols, categorical_cols)
    create_submission(model, df_test, target_encoder, test_path)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Train columns types: id                        int64
class                    object
cap-diameter            float64
cap-shape                object
cap-surface              object
cap-color                object
does-bruise-or-bleed     object
gill-attachment          object
gill-spacing             object
gill-color               object
stem-height             float64
stem-width              float64
stem-root                object
stem-surface             object
stem-color               object
veil-type                object
veil-color               object
has-ring                 object
ring-type                object
spore-print-color        object
habitat                  object
season                   object
dtype: object

Test columns types: id                        int64
cap-diameter            float64
cap-shape                object
cap-surface              object
cap-color                object
does-bruise-or-bleed     object
gill-attachment          object
gill-spacing    