In [1]:
import numpy as np 
import pandas as pd 
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
try:
    # Removed in scikit-learn 1.2; tolerate its absence so the notebook still imports.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

from sklearn.metrics import roc_curve, roc_auc_score

In [2]:
# Load train.csv from the catalanmushrooms input dataset.
train_csv_path = '/kaggle/input/catalanmushrooms/train.csv'
mushrooms = pd.read_csv(train_csv_path)

In [3]:
# Load test.csv from the catalanmushrooms input dataset.
test_csv_path = '/kaggle/input/catalanmushrooms/test.csv'
mushrooms_test = pd.read_csv(test_csv_path)

In [4]:
# Columns to retain from the training frame: every test-set column plus the target.
keep = [*mushrooms_test.columns, 'poisonous']

In [5]:
# Restrict the training frame to the labels listed in `keep`.
train_mush = mushrooms.filter(items=keep)

In [6]:
# Number of missing values in each column of the training frame.
missing_per_column = train_mush.isna().sum()
missing_per_column

In [7]:
# Separate the features from the 'poisonous' target column.
target_col = 'poisonous'
X = train_mush.drop(columns=[target_col])
y = train_mush[target_col]

# Hold out 20% of the rows for evaluation; the seed fixes which rows those are.
split_parts = train_test_split(X, y, train_size=0.8, random_state=8)
X_train, X_test, y_train, y_test = split_parts
X_train.head()

In [8]:
# Names of the object-dtype columns, which are the ones that get one-hot encoded.
cat_col = X_train.select_dtypes(include=["object"]).columns.tolist()

In [9]:
# One-hot encode the categorical columns; categories not seen during fit are
# encoded as all zeros instead of raising (handle_unknown='ignore').
categorical_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

# Only the columns in cat_col are forwarded; ColumnTransformer drops the rest by default.
full_processor = ColumnTransformer(transformers=[
    ('category', categorical_pipeline, cat_col)
])

tree_pipeline = Pipeline(steps=[
    ('preprocess', full_processor),
    # random_state pins the tree so re-running the notebook reproduces the same model.
    ('model', DecisionTreeClassifier(max_depth=9, random_state=8))
])

# Pipeline.fit returns the pipeline itself, so dt_model and tree_pipeline are the same object.
dt_model = tree_pipeline.fit(X_train, y_train)

In [10]:
# Class predictions of the fitted decision-tree pipeline on the held-out rows
# (dt_model is the object returned by tree_pipeline.fit).
dt_preds = dt_model.predict(X_test)

In [11]:
# plot_confusion_matrix was removed in scikit-learn 1.2; build the display from
# the held-out decision-tree predictions instead.
ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, dt_preds),
    display_labels=['Not poisonous', 'Poisonous'],
).plot();

In [12]:
# Random forest on the one-hot encoded features: 500 trees, 5 candidate features per split.
rf_pipeline = Pipeline(steps=[
    ('processor', full_processor),
    # random_state makes the forest (and everything derived from it, including
    # the submission) reproducible across notebook runs.
    ('model', RandomForestClassifier(max_features=5, n_estimators=500, random_state=8))
])

# Pipeline.fit returns the pipeline itself, so rf_model and rf_pipeline are the same object.
rf_model = rf_pipeline.fit(X_train, y_train)

In [13]:
# Class predictions of the fitted random-forest pipeline on the held-out rows
# (rf_model is the object returned by rf_pipeline.fit).
rf_preds = rf_model.predict(X_test)

In [14]:
# plot_confusion_matrix was removed in scikit-learn 1.2; build the display from
# the held-out random-forest predictions instead.
ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, rf_preds),
    display_labels=['Not poisonous', 'Poisonous'],
).plot();

In [15]:
# k-nearest-neighbours (k=6) on the same one-hot encoded features.
knn_steps = [
    ('processor', full_processor),
    ('model', KNeighborsClassifier(n_neighbors=6)),
]
knn_pipeline = Pipeline(steps=knn_steps)

# fit returns the pipeline itself; knn_model is an alias for knn_pipeline.
knn_model = knn_pipeline.fit(X_train, y_train)

In [16]:
# Class predictions of the fitted KNN pipeline on the held-out rows
# (knn_model is the object returned by knn_pipeline.fit).
knn_preds = knn_model.predict(X_test)

In [17]:
# plot_confusion_matrix was removed in scikit-learn 1.2; build the display from
# the held-out KNN predictions instead.
ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, knn_preds),
    display_labels=['Not poisonous', 'Poisonous'],
).plot();

In [18]:
# Constant score of 0 for every held-out row, used as the no-skill reference.
r_proba = [0] * len(y_test)

# Per-class probability estimates from each fitted pipeline, in the same order
# as the names on the left.
dt_proba, rf_proba, knn_proba = (
    fitted.predict_proba(X_test)
    for fitted in (tree_pipeline, rf_pipeline, knn_pipeline)
)

In [19]:
# Keep only the second predict_proba column (presumably the poisonous class —
# confirm against each model's classes_). The values are recomputed from the
# pipelines rather than sliced in place so the cell can be re-run: slicing the
# already 1-D result with [:, 1] a second time raises IndexError.
dt_proba = tree_pipeline.predict_proba(X_test)[:, 1]
rf_proba = rf_pipeline.predict_proba(X_test)[:, 1]
knn_proba = knn_pipeline.predict_proba(X_test)[:, 1]

In [20]:
# Area under the ROC curve on the held-out rows for the baseline and each model,
# in the same order as the names on the left.
r_auc, dt_auc, rf_auc, knn_auc = (
    roc_auc_score(y_test, scores)
    for scores in (r_proba, dt_proba, rf_proba, knn_proba)
)

In [21]:
# Report the held-out AUC of the baseline and of each model.
# The last two labels previously read "AOC", a typo for "AUC".
print('No-Skill Prediction: AUC = %.3f' % (r_auc))
print('Decision Tree: AUC = %.3f' % (dt_auc))
print('Random Forest: AUC = %.3f' % (rf_auc))
print('KNN: AUC = %.3f' % (knn_auc))

In [22]:
# False/true positive rates along each ROC curve; the thresholds (third return
# value) are not used.
r_fpr, r_tpr, _ = roc_curve(y_true=y_test, y_score=r_proba)
dt_fpr, dt_tpr, _ = roc_curve(y_true=y_test, y_score=dt_proba)
rf_fpr, rf_tpr, _ = roc_curve(y_true=y_test, y_score=rf_proba)
knn_fpr, knn_tpr, _ = roc_curve(y_true=y_test, y_score=knn_proba)

In [23]:
# Overlay the ROC curve of the no-skill baseline and of each model; every legend
# entry carries that curve's AUC.
plt.plot(r_fpr, r_tpr, linestyle='--', label='Random prediction (AUC = %0.3f)' % r_auc)
plt.plot(dt_fpr, dt_tpr, marker='.', label='DecisionTreeClassifier (AUC = %0.3f)' % dt_auc)
plt.plot(rf_fpr, rf_tpr, marker='.', label='Random forest (AUC = %0.3f)' % rf_auc)
# This curve is the KNN model; it was previously mislabelled 'Random forest'.
plt.plot(knn_fpr, knn_tpr, marker='.', label='KNN (AUC = %0.3f)' % knn_auc)


plt.title('ROC Plot')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()  
plt.show()

In [24]:
# Expose the held-out labels as a pandas extension array and display them.
# NOTE(review): testy is not referenced anywhere else in the visible notebook,
# and this prints every held-out label — looks like leftover inspection; confirm
# before removing.
testy = y_test.array
testy

In [25]:
# Predict the competition test set with the fitted random-forest pipeline.
sub_preds = rf_pipeline.predict(mushrooms_test)

# Build the frame first and write it in a separate step: DataFrame.to_csv
# returns None when given a path, so chaining it onto the constructor left
# submission_file bound to None instead of the submission table.
submission_file = pd.DataFrame({
    'Id': mushrooms_test['Id'],
    'poisonous': sub_preds
})
submission_file.to_csv('submission.csv', index=False)