# Nakaam Mushrooms - A deep learning project to tell poisonous and non-poisonous mushrooms apart

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, f_classif
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, roc_auc_score, classification_report, confusion_matrix


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

## Importing Data

In [None]:
# Load the competition's train.csv and discard exact duplicate rows.
# NOTE: despite its name, `test_data` holds the contents of train.csv here.
test_data = pd.read_csv(
    '/kaggle/input/playground-series-s4e8/train.csv',
    low_memory=False,
).drop_duplicates()
test_data.head()

In [None]:
# Number of leading rows set aside as a hold-out frame (`final_test_df`).
HOLDOUT_ROWS = 5000

# Take explicit copies: `df` is modified in place further down
# (`df.drop(..., inplace=True)`), and doing that on a plain slice of
# `test_data` makes pandas emit SettingWithCopyWarning.
df = test_data.iloc[HOLDOUT_ROWS:].copy()

final_test_df = test_data.iloc[:HOLDOUT_ROWS].copy()

In [None]:
# Count the missing values in each column of `df`.
df.isnull().sum()

## Encoding/standardisation of data

In [None]:
# Column holding the label.
label_encode = ['class']

# Columns routed through the categorical transformer.
cat_cols = [
    'cap-shape',
    'cap-color',
    'does-bruise-or-bleed',
    'gill-color',
    'stem-color',
    'has-ring',
    'ring-type',
    'habitat',
    'season',
]

# Columns routed through the numeric transformer.
num_cols = ['cap-diameter', 'stem-height', 'stem-width']

# Columns removed from the frame before preprocessing.
columns_to_drop = [
    'id',
    'cap-surface',
    'gill-attachment',
    'gill-spacing',
    'stem-root',
    'stem-surface',
    'veil-type',
    'veil-color',
    'spore-print-color',
]

In [None]:
# Fit the label encoder on the 'class' column and encode it to integers.
# `le` stays in scope so the same mapping can be applied to other frames.
le = LabelEncoder()
le.fit(df['class'])
y = le.transform(df['class'])

# Remove the columns named in `columns_to_drop` (modifies `df` in place).
df.drop(columns_to_drop, axis='columns', inplace=True)

### Preprocessing Pipeline

In [None]:
# Numeric columns: fill missing values with the column mean, then standardise.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old spelling. Try the current name
# first and fall back for older installations so the cell runs on both.
try:
    _onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2
    _onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories unseen at fit time are
# ignored at transform time (handle_unknown='ignore').
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', _onehot_encoder),
])

# Route each column group to its transformer. Columns not listed in
# `num_cols` or `cat_cols` are not passed to either transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols),
    ]
)


def fit_pipeline(df, target, k=20):
    """Fit the shared preprocessor and a univariate feature selector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the ``target`` column.
    target : str
        Name of the label column; it is removed before preprocessing and
        used as the supervision signal for feature scoring.
    k : int or "all", default 20
        Number of top-scoring features ``SelectKBest`` keeps. The default
        matches the previously hard-coded value.

    Returns
    -------
    tuple
        ``(preprocessor, selector)``: the module-level ``preprocessor``
        (fitted in place) and the fitted ``SelectKBest`` instance.
    """
    features = df.drop(columns=[target])
    labels = df[target]

    # Fit and encode in one call instead of a separate fit() and transform().
    encoded = preprocessor.fit_transform(features)

    # Score every encoded column against the labels with the ANOVA F-test.
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(encoded, labels)

    return preprocessor, selector

def transform_pipeline(df, preprocessor, selector):
    """Apply a fitted preprocessor and selector and return a named DataFrame.

    ``df`` is encoded by ``preprocessor`` and reduced by ``selector``. The
    surviving columns are labelled with the numeric column names from
    ``num_cols`` followed by the one-hot feature names derived from
    ``cat_cols``. The returned frame is built from the raw array, so it has
    a fresh default index.
    """
    selected_values = selector.transform(preprocessor.transform(df))

    # Rebuild the full encoded column order: numeric names first, then the
    # names generated by the fitted one-hot encoder.
    onehot_step = preprocessor.named_transformers_['cat']['onehot']
    feature_names = [*num_cols, *onehot_step.get_feature_names_out(cat_cols)]

    # Keep only the names at the positions the selector retained.
    kept_names = [feature_names[idx] for idx in selector.get_support(indices=True)]

    return pd.DataFrame(selected_values, columns=kept_names)

# Fit the preprocessing and feature-selection steps on `df`, using 'class' as the label column.
preprocessor, selector = fit_pipeline(df, 'class')

In [None]:
# Encode the feature columns (label column removed first), then append the
# integer-encoded labels `y` as a new 'class' column on the result.
df = transform_pipeline(df.drop(columns=['class']), preprocessor, selector)
df['class'] = y

In [None]:
# Display the current contents of `df`.
df

## Correlation Heatmap

In [None]:
# Pairwise correlations between all columns of `df`, drawn as a heatmap.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Replace the feature names with positional integers 0..n-1 and name the
# last column 'target'. `col` is reused later to relabel frames produced
# by `transform_pipeline` before they are passed to the models.
col = list(range(df.shape[1] - 1))

df.columns = col + ['target']

## Applying Models

In [None]:
# One entry is appended to each list per trained model, in training order:
# the partial-AUC score, the confusion matrix and the classification report.
pAUCscores, confMatrix, cfReport = [], [], []

### Model Evaluation Parameter

In [None]:
class ParticipantVisibleError(Exception):
    """Raised when a submission is malformed (e.g. non-numeric predictions)."""


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, min_tpr: float=0.80) -> float:
    """Compute the partial area under the ROC curve above a minimum TPR.

    Labels and scores are flipped (``1 - label`` and ``-score``) so that the
    region "TPR >= min_tpr" of the original ROC curve becomes the region
    "FPR <= 1 - min_tpr" of the flipped curve, which is then integrated.

    Parameters
    ----------
    solution : pandas.DataFrame
        Ground-truth frame: the row-id column plus the label column.
    submission : pandas.DataFrame
        Prediction frame: the row-id column plus the score column.
    row_id_column_name : str
        Name of the id column present in both frames; it is ignored for
        scoring.
    min_tpr : float, default 0.80
        Lower bound on the true positive rate for the integrated region.

    Returns
    -------
    float
        The partial AUC; the full AUC when ``min_tpr`` is 0.

    Raises
    ------
    ParticipantVisibleError
        If the submission values are not numeric.
    ValueError
        If ``min_tpr`` maps to a ``max_fpr`` outside ``(0, 1]``.
    """
    # Drop the id column on copies so the caller's frames are left untouched
    # (the previous `del frame[col]` mutated the arguments).
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    if not pd.api.types.is_numeric_dtype(submission.values):
        raise ParticipantVisibleError('Submission target column must be numeric')

    # Flip labels and negate scores.
    v_gt = abs(np.asarray(solution.values) - 1)
    v_pred = -1.0 * np.asarray(submission.values)

    max_fpr = abs(1 - min_tpr)

    fpr, tpr, _ = roc_curve(v_gt, v_pred, sample_weight=None)
    if max_fpr == 1:
        return auc(fpr, tpr)
    if max_fpr <= 0 or max_fpr > 1:
        raise ValueError("Expected min_tpr in range [0, 1), got: %r" % min_tpr)

    # Cut the curve at max_fpr, linearly interpolating the TPR at the cut.
    stop = np.searchsorted(fpr, max_fpr, "right")
    x_interp = [fpr[stop - 1], fpr[stop]]
    y_interp = [tpr[stop - 1], tpr[stop]]
    tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp))
    fpr = np.append(fpr[:stop], max_fpr)
    partial_auc = auc(fpr, tpr)

    return partial_auc

### LGBM Classifier

In [None]:
# Split features from the 'target' column and hold out 20% of rows for evaluation.
X = df.drop(columns=['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM classifier constructed without any explicit hyperparameters.
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_train, y_train)

# Scores are the second predict_proba column; hard labels use a 0.5 threshold.
y_scores = lgb_model.predict_proba(X_test)[:, 1]
y_pred = (y_scores > 0.5).astype(int)

# score() takes frames carrying a row-id column, which it discards before scoring.
solution = pd.DataFrame({'row_id': range(len(y_test)), 'target': y_test})
submission = pd.DataFrame({'row_id': range(len(y_scores)), 'prediction': y_scores})

pAUC = score(solution, submission, 'row_id', min_tpr=0.80)

# Record this model's metrics in the shared result lists.
pAUCscores.append(pAUC)
confMatrix.append(confusion_matrix(y_test, y_pred))
cfReport.append(classification_report(y_test, y_pred))

### XGB Classifier

In [None]:
# Split features from the 'target' column and hold out 20% of rows for evaluation.
X = df.drop(columns=['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost classifier evaluated with log-loss during training.
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Scores are the second predict_proba column; hard labels use a 0.5 threshold.
y_scores = xgb_model.predict_proba(X_test)[:, 1]
y_true = y_test
y_pred = (y_scores > 0.5).astype(int)

# score() takes frames carrying a row-id column, which it discards before scoring.
solution = pd.DataFrame({'row_id': range(len(y_test)), 'target': y_true})
submission = pd.DataFrame({'row_id': range(len(y_scores)), 'prediction': y_scores})

pAUC = score(solution, submission, 'row_id', min_tpr=0.80)

# Record this model's metrics in the shared result lists.
pAUCscores.append(pAUC)
confMatrix.append(confusion_matrix(y_test, y_pred))
cfReport.append(classification_report(y_test, y_pred))

### CatBoost Classifier

In [None]:
# Split features from the 'target' column and hold out 20% of rows for evaluation.
X = df.drop(columns=['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# CatBoost classifier with training output silenced (verbose=0).
catboost_model = CatBoostClassifier(verbose=0)
catboost_model.fit(X_train, y_train)

# Hard labels come from predict(); scores are the second predict_proba column.
y_pred = catboost_model.predict(X_test)
y_true = y_test
y_scores = catboost_model.predict_proba(X_test)[:, 1]

# score() takes frames carrying a row-id column, which it discards before scoring.
solution = pd.DataFrame({'row_id': range(len(y_true)), 'target': y_true})
submission = pd.DataFrame({'row_id': range(len(y_scores)), 'prediction': y_scores})

pAUC = score(solution, submission, 'row_id', min_tpr=0.80)

# Record this model's metrics in the shared result lists.
pAUCscores.append(pAUC)
confMatrix.append(confusion_matrix(y_true, y_pred))
cfReport.append(classification_report(y_true, y_pred))

### AdaBoost Classifier

In [None]:
# Split features from the 'target' column and hold out 20% of rows for evaluation.
X = df.drop(columns=['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# AdaBoost ensemble of 100 estimators with a fixed seed.
adaboost_model = AdaBoostClassifier(n_estimators=100, random_state=42)
adaboost_model.fit(X_train, y_train)

# Hard labels come from predict(); scores are the second predict_proba column.
y_pred = adaboost_model.predict(X_test)
# Bind y_true in this cell rather than relying on whatever value an
# earlier cell happened to leave in the notebook namespace.
y_true = y_test
y_scores = adaboost_model.predict_proba(X_test)[:, 1]

# score() takes frames carrying a row-id column, which it discards before scoring.
solution = pd.DataFrame({'row_id': range(len(y_true)), 'target': y_true})
submission = pd.DataFrame({'row_id': range(len(y_scores)), 'prediction': y_scores})

pAUC = score(solution, submission, 'row_id', min_tpr=0.80)
conf_matrix = confusion_matrix(y_test, y_pred)
cf_report = classification_report(y_test, y_pred)

# Record this model's metrics in the shared result lists.
pAUCscores.append(pAUC)
confMatrix.append(conf_matrix)
cfReport.append(cf_report)

### GBM Classifier

In [None]:
# Split features from the 'target' column and hold out 20% of rows for evaluation.
X = df.drop(columns=['target'])
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scikit-learn gradient boosting classifier constructed without explicit hyperparameters.
gbm_model = GradientBoostingClassifier()
gbm_model.fit(X_train, y_train)

# Hard labels come from predict(); scores are the second predict_proba column.
y_pred = gbm_model.predict(X_test)
y_true = y_test

y_scores = gbm_model.predict_proba(X_test)[:, 1]

# score() takes frames carrying a row-id column, which it discards before scoring.
solution = pd.DataFrame({'row_id': range(len(y_true)), 'target': y_true})
submission = pd.DataFrame({'row_id': range(len(y_scores)), 'prediction': y_scores})

pAUC = score(solution, submission, 'row_id', min_tpr=0.80)

# Record this model's metrics in the shared result lists.
pAUCscores.append(pAUC)
confMatrix.append(confusion_matrix(y_true, y_pred))
cfReport.append(classification_report(y_true, y_pred))

### SVM Classifier

In [None]:
# Split features from the 'target' column and hold out 20% of rows for evaluation.
X = df.drop(columns=['target'])
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# probability=True is required so that predict_proba is available below.
svm_model = SVC(probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Hard labels come from predict(); scores are the second predict_proba column.
y_pred = svm_model.predict(X_test)
y_true = y_test

y_scores = svm_model.predict_proba(X_test)[:, 1]

# score() takes frames carrying a row-id column, which it discards before scoring.
solution = pd.DataFrame({'row_id': range(len(y_true)), 'target': y_true})
submission = pd.DataFrame({'row_id': range(len(y_scores)), 'prediction': y_scores})

pAUC = score(solution, submission, 'row_id', min_tpr=0.80)

# Record this model's metrics in the shared result lists.
pAUCscores.append(pAUC)
confMatrix.append(confusion_matrix(y_true, y_pred))
cfReport.append(classification_report(y_true, y_pred))


### Random Forest Classifier

In [None]:
# Split features from the 'target' column and hold out 20% of rows for evaluation.
X = df.drop(columns=['target'])
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest with a fixed seed and otherwise unspecified hyperparameters.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)

# Hard labels come from predict(); scores are the second predict_proba column.
y_pred = random_forest_model.predict(X_test)
y_true = y_test

y_scores = random_forest_model.predict_proba(X_test)[:, 1]

# score() takes frames carrying a row-id column, which it discards before scoring.
solution = pd.DataFrame({'row_id': range(len(y_true)), 'target': y_true})
submission = pd.DataFrame({'row_id': range(len(y_scores)), 'prediction': y_scores})

pAUC = score(solution, submission, 'row_id', min_tpr=0.80)

# Record this model's metrics in the shared result lists.
pAUCscores.append(pAUC)
confMatrix.append(confusion_matrix(y_true, y_pred))
cfReport.append(classification_report(y_true, y_pred))

### Deep Neural Networks

In [None]:
# Split features from the 'target' column and hold out 20% of rows for evaluation.
X = df.drop(columns=['target'])
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Two hidden ReLU layers (64 and 32 units) with 30% dropout after each,
# followed by a single sigmoid output unit.
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])

# 20% of the training split is held back by Keras for per-epoch validation.
history = model.fit(X_train, y_train, epochs=40, validation_split=0.2, batch_size=64, verbose=3)

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

# Run inference once and derive the hard labels from the same scores
# (previously model.predict was called twice on X_test).
y_scores = model.predict(X_test).flatten()
y_pred = (y_scores > 0.5).astype("int32")
y_true = y_test

# score() takes frames carrying a row-id column, which it discards before scoring.
solution = pd.DataFrame({'row_id': range(len(y_true)), 'target': y_true})
submission = pd.DataFrame({'row_id': range(len(y_scores)), 'prediction': y_scores})

pAUC = score(solution, submission, 'row_id', min_tpr=0.80)

# Record this model's metrics in the shared result lists.
pAUCscores.append(pAUC)
confMatrix.append(confusion_matrix(y_true, y_pred))
cfReport.append(classification_report(y_true, y_pred))

In [None]:
# Draw one figure per tracked metric, each showing the training curve and
# the matching validation curve across epochs.
for metric, axis_label, legend_loc in (
    ('accuracy', 'Accuracy', 'lower right'),
    ('loss', 'Loss', 'upper right'),
):
    val_metric = 'val_' + metric
    plt.plot(history.history[metric], label=metric)
    plt.plot(history.history[val_metric], label=val_metric)
    plt.xlabel('Epoch')
    plt.ylabel(axis_label)
    plt.legend(loc=legend_loc)
    plt.show()

In [None]:
# Disabled snippet kept as a string literal: if run as code it would remove
# the most recent entry from each of the three result lists.
'''pAUCscores.pop(-1)
confMatrix.pop(-1)
cfReport.pop(-1)'''

## Models Evaluation

In [None]:
# Short labels, listed in the same order the models were trained and their
# metrics appended to the result lists.
models = ['LGBM', 'XGB', 'CAT', 'ADB', 'GBM', 'SVM', 'RF', 'DNN']

# Print every recorded metric for each model, one section per model.
for label, pauc_value, matrix, report in zip(models, pAUCscores, confMatrix, cfReport):
    print('----------------------------------')
    print('Metrics for', label)
    print('pAUC Score:\n', pauc_value)
    print('Confusion Matrix:\n', matrix)
    print('Other Metrics Report:\n', report)

## Best Model

In [None]:
def predict(test_data: pd.DataFrame, model_) -> pd.DataFrame:
    """Score raw rows with a fitted model and return them keyed by id.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Raw rows containing an 'id' column plus the columns the fitted
        module-level ``preprocessor`` was trained on.
    model_ : object
        Fitted model exposing either ``predict_proba`` or ``predict``.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'id' (copied from ``test_data``) and 'target' (the
        model's score for each row).
    """
    ids = test_data['id']

    test_data_processed = transform_pipeline(test_data, preprocessor, selector)

    # The models were trained on integer column labels (`col`), so relabel
    # the processed frame to match.
    test_data_processed.columns = col

    # Choose the scoring method explicitly instead of using a bare `except`,
    # which hid genuine errors and silently fell back to hard labels.
    if hasattr(model_, 'predict_proba'):
        proba = np.asarray(model_.predict_proba(test_data_processed))
        if proba.ndim == 2 and proba.shape[1] > 1:
            test_predictions = proba[:, 1]
        else:
            # Single-column output: use it directly as the score.
            test_predictions = proba.ravel()
    else:
        # Models without predict_proba: use the flattened predict() output
        # as the score.
        test_predictions = np.asarray(model_.predict(test_data_processed)).flatten()

    predictions_df = pd.DataFrame({'id': ids, 'target': test_predictions})

    return predictions_df


# Evaluate every trained model on the hold-out rows and keep the one with
# the highest pAUC.
X = final_test_df.drop(columns=['class'])
y = le.transform(final_test_df['class'])

mdls = [lgb_model, xgb_model, catboost_model, adaboost_model, gbm_model, svm_model, random_forest_model, model]
mdls_name = ['lgb_model','xgb_model', 'catboost_model', 'adaboost_model', 'gbm_model', 'svm_model', 'random_forest_model', 'dnn']

best_model = None
best_model_name = None
# Start below any attainable score so a model is always selected, even if
# every pAUC is 0 (with a start of 0 and a strict '>' comparison,
# best_model would stay None and the final prediction cell would fail).
max_puac = float('-inf')
sum_puac = 0
for m, n in zip(mdls, mdls_name):
    print(n)
    predicted_df = predict(X, m)

    predicted_df['actual_target'] = y
    # score() takes frames carrying a row-id column, which it discards.
    row_ids = range(len(predicted_df))
    solution = pd.DataFrame({'row_id': row_ids, 'target': predicted_df['actual_target']})
    submission = pd.DataFrame({'row_id': row_ids, 'prediction': predicted_df['target']})
    pAUC = score(solution, submission, 'row_id', min_tpr=0.80)
    print(pAUC)

    sum_puac += pAUC
    if pAUC > max_puac:
        max_puac = pAUC
        best_model = m
        best_model_name = n


print(f'Best pAUC Model: {best_model_name}')
print(f'Average pAUC:{sum_puac/len(mdls)}')

## Final Predictions

In [None]:
# Load the competition's test.csv (rebinding `test_data`, which previously held train.csv).
test_data = pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv', low_memory=False)
# Score it with the selected model and write the result without the index column.
predicted_df = predict(test_data, best_model)
predicted_df.to_csv('submission.csv', index=False)
# NOTE(review): predict() returns columns 'id' and 'target' holding raw scores;
# confirm this matches the submission format the competition expects.
predicted_df