In [71]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.metrics import roc_auc_score

## 1. Reading data

In [51]:
data = pd.read_csv("mushroom.csv")
X, y = data.drop('class', axis=1).copy(), data['class'].copy()
data.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,...,s,y,w,u,w,t,g,,d,w
1,p,16.6,x,g,o,f,e,,w,17.99,...,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.8,...,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,...,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,...,s,y,w,u,w,t,p,,d,w


In [52]:
#по умолчанию describe делается по числовым колонкам
data.describe()

Unnamed: 0,cap-diameter,stem-height,stem-width
count,61069.0,61069.0,61069.0
mean,6.733854,6.581538,12.14941
std,5.264845,3.370017,10.035955
min,0.38,0.0,0.0
25%,3.48,4.64,5.21
50%,5.86,5.95,10.19
75%,8.54,7.74,16.57
max,62.34,33.92,103.91


In [53]:
#describe по не числовым колонкам
data.describe(exclude=[np.number])

Unnamed: 0,class,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
count,61069,61069,46949,61069,61069,51185,36006,61069,9531,22945,61069,3177,7413,61069,58598,6354,61069,61069
unique,2,7,11,12,2,7,3,12,5,8,13,1,6,2,8,7,8,4
top,p,x,t,n,f,a,c,w,s,s,w,u,w,f,f,k,d,a
freq,33888,26934,8196,24218,50479,12698,24710,18521,3177,6025,22926,3177,5474,45890,48361,2118,44209,30177


In [54]:
NaN_percent = X.isnull().sum(axis=0) / len(X) * 100 # % NaN
print(NaN_percent)
#баланс классов
print('-'*20)
print(y.describe())
print({k:(y == k).mean() for k in y.unique()})

cap-diameter             0.000000
cap-shape                0.000000
cap-surface             23.121387
cap-color                0.000000
does-bruise-or-bleed     0.000000
gill-attachment         16.184971
gill-spacing            41.040462
gill-color               0.000000
stem-height              0.000000
stem-width               0.000000
stem-root               84.393064
stem-surface            62.427746
stem-color               0.000000
veil-type               94.797688
veil-color              87.861272
has-ring                 0.000000
ring-type                4.046243
spore-print-color       89.595376
habitat                  0.000000
season                   0.000000
dtype: float64
--------------------
count     61069
unique        2
top           p
freq      33888
Name: class, dtype: object
{'p': 0.5549132947976878, 'e': 0.44508670520231214}


* Существует проблема, что есть признаки, у которых  большое количество пропусков, поэтому для некоторых моделей это может стать проблемой

  Так как я не разбираюсь в грибах принимаю решение избавиться от тех признаков, что содержат много пропусков, а там где пропусков мало заполним модой

* Классы выглядят сбалансированными

In [55]:
#y transformation
LE = LabelEncoder().fit(y)
y = LE.transform(y)

In [56]:
#x transformation
categorical_columns = X.select_dtypes(exclude = [np.number]).columns.to_list()

for col in categorical_columns:
    oh = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
    one_hot_encoded = oh.fit_transform(X[[col]])
    X = pd.concat([X,one_hot_encoded],axis=1).drop(columns=[col])

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## 2. Training models

#### 2.1 OneRule

In [60]:
from mlxtend.classifier import OneRClassifier

In [69]:
OneR = OneRClassifier().fit(X_train.to_numpy(), y_train)
y_pred = OneR.predict(X_test.to_numpy())
print(f'Accuracy: {(y_pred == y_test).mean()}')

Accuracy: 0.6037334206648108


#### 2.2 LogisticRegression

In [72]:
from sklearn.linear_model import LogisticRegression

In [88]:
LogReg = LogisticRegression(max_iter=2000).fit(X_train, y_train)
print(
f'''Accuracy: {LogReg.score(X_test, y_test)}
roc_auc score {roc_auc_score(y_test, LogReg.predict_proba(X_test)[:, 1])}'''
)#Разберись что пихают в рок аук

Accuracy: 0.8633262376507832
roc_auc score 0.9369836822359202


#### 2.3 KNN

In [90]:
from sklearn.neighbors import KNeighborsClassifier

In [91]:
KNN = KNeighborsClassifier().fit(X_train, y_train)
print(
f'''Accuracy: {KNN.score(X_test, y_test)}
roc_auc score {roc_auc_score(y_test, KNN.predict_proba(X_test)[:, 1])}'''
)

Accuracy: 0.9994541782653785
roc_auc score 0.9999009671549097


#### 2.4 Bayes Classifier

In [92]:
from sklearn.naive_bayes import CategoricalNB

In [101]:
MrBayes = CategoricalNB().fit(X_train, y_train)
print(
f'''Accuracy: {MrBayes.score(X_test, y_test)}
roc_auc score {roc_auc_score(y_test, MrBayes.predict_proba(X_test)[:, 1])}'''
)

IndexError: index 103 is out of bounds for axis 1 with size 103

In [103]:
MrBayes.predict(X_test)

IndexError: index 103 is out of bounds for axis 1 with size 103

#### 2.5 DescisionTree

In [104]:
from sklearn.tree import DecisionTreeClassifier

In [105]:
tree = DecisionTreeClassifier().fit(X_train, y_train)
print(
f'''train acc {(tree.predict(X_train) == y_train).mean()}
test acc {(tree.predict(X_test) == y_test).mean()}
'''
)

train acc 1.0
test acc 0.9990721030511435



#### 2.6 Boosting