In [37]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.metrics import roc_auc_score

## 1. Reading data

In [38]:
data = pd.read_csv("mushroom.csv")
X, y = data.drop('class', axis=1).copy(), data['class'].copy()
data.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,...,s,y,w,u,w,t,g,,d,w
1,p,16.6,x,g,o,f,e,,w,17.99,...,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.8,...,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,...,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,...,s,y,w,u,w,t,p,,d,w


In [39]:
#по умолчанию describe делается по числовым колонкам
data.describe()

Unnamed: 0,cap-diameter,stem-height,stem-width
count,61069.0,61069.0,61069.0
mean,6.733854,6.581538,12.14941
std,5.264845,3.370017,10.035955
min,0.38,0.0,0.0
25%,3.48,4.64,5.21
50%,5.86,5.95,10.19
75%,8.54,7.74,16.57
max,62.34,33.92,103.91


In [40]:
#describe по не числовым колонкам
data.describe(exclude=[np.number])

Unnamed: 0,class,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
count,61069,61069,46949,61069,61069,51185,36006,61069,9531,22945,61069,3177,7413,61069,58598,6354,61069,61069
unique,2,7,11,12,2,7,3,12,5,8,13,1,6,2,8,7,8,4
top,p,x,t,n,f,a,c,w,s,s,w,u,w,f,f,k,d,a
freq,33888,26934,8196,24218,50479,12698,24710,18521,3177,6025,22926,3177,5474,45890,48361,2118,44209,30177


In [41]:
NaN_percent = X.isnull().sum(axis=0) / len(X) * 100 # % NaN
print(NaN_percent)
#баланс классов
print('-'*20)
print(y.describe())
print({k:(y == k).mean() for k in y.unique()})

cap-diameter             0.000000
cap-shape                0.000000
cap-surface             23.121387
cap-color                0.000000
does-bruise-or-bleed     0.000000
gill-attachment         16.184971
gill-spacing            41.040462
gill-color               0.000000
stem-height              0.000000
stem-width               0.000000
stem-root               84.393064
stem-surface            62.427746
stem-color               0.000000
veil-type               94.797688
veil-color              87.861272
has-ring                 0.000000
ring-type                4.046243
spore-print-color       89.595376
habitat                  0.000000
season                   0.000000
dtype: float64
--------------------
count     61069
unique        2
top           p
freq      33888
Name: class, dtype: object
{'p': 0.5549132947976878, 'e': 0.44508670520231214}


* Существует проблема, что есть признаки, у которых  большое количество пропусков, поэтому для некоторых моделей это может стать проблемой

  Так как я не разбираюсь в грибах принимаю решение избавиться от тех признаков, что содержат много пропусков, а там где пропусков мало заполним модой

* Классы выглядят сбалансированными

In [42]:
#y transformation
LE = LabelEncoder().fit(y)
y = LE.transform(y)

In [43]:
#x transformation
categorical_columns = X.select_dtypes(exclude = [np.number]).columns.to_list()

for col in categorical_columns:
    oh = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
    one_hot_encoded = oh.fit_transform(X[[col]])
    X = pd.concat([X,one_hot_encoded],axis=1).drop(columns=[col])

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## 2. Training models

#### 2.1 OneRule

In [45]:
from mlxtend.classifier import OneRClassifier

In [46]:
OneR = OneRClassifier().fit(X_train.to_numpy(), y_train)
y_pred = OneR.predict(X_test.to_numpy())
print(f'Accuracy: {(y_pred == y_test).mean()}')

Accuracy: 0.6036242563178866


#### 2.2 LogisticRegression

In [47]:
from sklearn.linear_model import LogisticRegression

In [48]:
LogReg = LogisticRegression(max_iter=2000).fit(X_train, y_train)
print(
f'''Accuracy: {LogReg.score(X_test, y_test)}
roc_auc score {roc_auc_score(y_test, LogReg.predict_proba(X_test)[:, 1])}'''
)#Разберись что пихают в рок аук

Accuracy: 0.8624529228753889
roc_auc score 0.9351077148424406


#### 2.3 KNN

In [49]:
from sklearn.neighbors import KNeighborsClassifier

In [50]:
KNN = KNeighborsClassifier().fit(X_train, y_train)
print(
f'''Accuracy: {KNN.score(X_test, y_test)}
roc_auc score {roc_auc_score(y_test, KNN.predict_proba(X_test)[:, 1])}'''
)

Accuracy: 0.9994541782653785
roc_auc score 0.9998890264940532


#### 2.4 Bayes Classifier

In [51]:
from sklearn.naive_bayes import CategoricalNB

In [52]:
MrBayes = CategoricalNB().fit(X_train, y_train)
print(
f'''Accuracy: {MrBayes.score(X_test, y_test)}
roc_auc score {roc_auc_score(y_test, MrBayes.predict_proba(X_test)[:, 1])}'''
)

IndexError: index 62 is out of bounds for axis 1 with size 60

In [53]:
MrBayes.predict(X_test)

IndexError: index 62 is out of bounds for axis 1 with size 60

#### 2.5 DescisionTree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
tree = DecisionTreeClassifier().fit(X_train, y_train)
print(
f'''train acc {(tree.predict(X_train) == y_train).mean()}
test acc {(tree.predict(X_test) == y_test).mean()}
'''
)

#### 2.6 Boosting

In [1]:
from catboost import CatBoostClassifier, Pool

In [60]:
acc = (preds_class == y_test).mean()
acc

0.9652857376780744

In [59]:
test_pool = Pool(X_test, y_test)

model = CatBoostClassifier(iterations=10000,
                           depth=3,
                           learning_rate=0.001,
                           loss_function='Logloss',
                           task_type='GPU',
                           verbose=True)
# train the model
model.fit(X_train, y_train)
# make the prediction using the resulting model
preds_class = model.predict(test_pool)
preds_proba = model.predict_proba(test_pool)
print("class = ", preds_class)
print("proba = ", preds_proba)

0:	learn: 0.6929744	total: 9.26ms	remaining: 1m 32s
1:	learn: 0.6928010	total: 19.9ms	remaining: 1m 39s
2:	learn: 0.6926277	total: 30.4ms	remaining: 1m 41s
3:	learn: 0.6924546	total: 39.3ms	remaining: 1m 38s
4:	learn: 0.6922821	total: 49.9ms	remaining: 1m 39s
5:	learn: 0.6921115	total: 58.7ms	remaining: 1m 37s
6:	learn: 0.6919410	total: 68.7ms	remaining: 1m 38s
7:	learn: 0.6917698	total: 78.3ms	remaining: 1m 37s
8:	learn: 0.6915985	total: 87.5ms	remaining: 1m 37s
9:	learn: 0.6914279	total: 97.6ms	remaining: 1m 37s
10:	learn: 0.6912578	total: 106ms	remaining: 1m 36s
11:	learn: 0.6910879	total: 116ms	remaining: 1m 36s
12:	learn: 0.6909197	total: 125ms	remaining: 1m 36s
13:	learn: 0.6907503	total: 134ms	remaining: 1m 35s
14:	learn: 0.6905817	total: 143ms	remaining: 1m 35s
15:	learn: 0.6904135	total: 152ms	remaining: 1m 34s
16:	learn: 0.6902509	total: 161ms	remaining: 1m 34s
17:	learn: 0.6900835	total: 170ms	remaining: 1m 34s
18:	learn: 0.6899236	total: 178ms	remaining: 1m 33s
19:	learn: 0