In [187]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

## 1. Reading data

In [103]:
data = pd.read_csv("mushroom.csv")
X, y = data.drop('class', axis=1).copy(), data['class'].copy()
data.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,...,s,y,w,u,w,t,g,,d,w
1,p,16.6,x,g,o,f,e,,w,17.99,...,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.8,...,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,...,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,...,s,y,w,u,w,t,p,,d,w


In [104]:
#по умолчанию describe делается по числовым колонкам
data.describe()

Unnamed: 0,cap-diameter,stem-height,stem-width
count,61069.0,61069.0,61069.0
mean,6.733854,6.581538,12.14941
std,5.264845,3.370017,10.035955
min,0.38,0.0,0.0
25%,3.48,4.64,5.21
50%,5.86,5.95,10.19
75%,8.54,7.74,16.57
max,62.34,33.92,103.91


In [105]:
#describe по не числовым колонкам
data.describe(exclude=[np.number])

Unnamed: 0,class,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
count,61069,61069,46949,61069,61069,51185,36006,61069,9531,22945,61069,3177,7413,61069,58598,6354,61069,61069
unique,2,7,11,12,2,7,3,12,5,8,13,1,6,2,8,7,8,4
top,p,x,t,n,f,a,c,w,s,s,w,u,w,f,f,k,d,a
freq,33888,26934,8196,24218,50479,12698,24710,18521,3177,6025,22926,3177,5474,45890,48361,2118,44209,30177


In [106]:
NaN_percent = X.isnull().sum(axis=0) / len(X) * 100 # % NaN
print(NaN_percent)
#баланс классов
print('-'*20)
print(y.describe())
print({k:(y == k).mean() for k in y.unique()})

cap-diameter             0.000000
cap-shape                0.000000
cap-surface             23.121387
cap-color                0.000000
does-bruise-or-bleed     0.000000
gill-attachment         16.184971
gill-spacing            41.040462
gill-color               0.000000
stem-height              0.000000
stem-width               0.000000
stem-root               84.393064
stem-surface            62.427746
stem-color               0.000000
veil-type               94.797688
veil-color              87.861272
has-ring                 0.000000
ring-type                4.046243
spore-print-color       89.595376
habitat                  0.000000
season                   0.000000
dtype: float64
--------------------
count     61069
unique        2
top           p
freq      33888
Name: class, dtype: object
{'p': 0.5549132947976878, 'e': 0.44508670520231214}


* Существует проблема, что есть признаки, у которых  большое количество пропусков, поэтому для некоторых моделей это может стать проблемой

  Так как я не разбираюсь в грибах принимаю решение избавиться от тех признаков, что содержат много пропусков, а там где пропусков мало заполним модой

* Классы выглядят сбалансированными

In [107]:
#y transformation
LE = LabelEncoder().fit(y)
y = LE.transform(y)

In [108]:
#x transformation
categorical_columns = X.select_dtypes(exclude = [np.number]).columns.to_list()

for col in categorical_columns:
    oh = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
    one_hot_encoded = oh.fit_transform(X[[col]])
    X = pd.concat([X,one_hot_encoded],axis=1).drop(columns=[col])

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## 2. Training models

In [110]:
models_dict = {}

#### 2.1 OneRule

In [111]:
from mlxtend.classifier import OneRClassifier

In [112]:
OneR = OneRClassifier().fit(X_train.to_numpy(), y_train)
y_pred = OneR.predict(X_test.to_numpy())
print(f'Accuracy: {(y_pred == y_test).mean()}')

Accuracy: 0.6013863872059385


In [113]:
models_dict['OneR'] = OneR

#### 2.2 LogisticRegression

In [114]:
from sklearn.linear_model import LogisticRegression

In [115]:
LogReg = LogisticRegression(max_iter=2000).fit(X_train, y_train)
print(
f'''Accuracy: {LogReg.score(X_test, y_test)}
roc_auc score {roc_auc_score(y_test, LogReg.predict_proba(X_test)[:, 1])}'''
)#Разберись что пихают в рок аук

Accuracy: 0.8613066972326838
roc_auc score 0.9357636421807044


In [116]:
models_dict['LogReg'] = LogReg

#### 2.3 KNN

In [117]:
from sklearn.neighbors import KNeighborsClassifier

In [118]:
KNN = KNeighborsClassifier().fit(X_train, y_train)
print(
f'''Accuracy: {KNN.score(X_test, y_test)}
roc_auc score {roc_auc_score(y_test, KNN.predict_proba(X_test)[:, 1])}'''
)

Accuracy: 0.9995633426123028
roc_auc score 0.9999384352994417


In [119]:
models_dict['KNN'] = KNN

#### 2.4 Bayes Classifier

In [120]:
from sklearn.naive_bayes import CategoricalNB

In [121]:
MrBayes = CategoricalNB().fit(X_train, y_train)
print(
f'''Accuracy: {MrBayes.score(X_test, y_test)}
roc_auc score {roc_auc_score(y_test, MrBayes.predict_proba(X_test)[:, 1])}'''
)
#что-то с фичами не так, нужен фикс

Accuracy: 0.7872932700180121
roc_auc score 0.8804257162275968


In [122]:
models_dict['Bayes'] = MrBayes

#### 2.5 DescisionTree

In [123]:
from sklearn.tree import DecisionTreeClassifier

In [124]:
tree = DecisionTreeClassifier().fit(X_train, y_train)
print(
f'''train acc {(tree.predict(X_train) == y_train).mean()}
test acc {(tree.predict(X_test) == y_test).mean()}
'''
)

train acc 1.0
test acc 0.9991812673980678



In [125]:
models_dict['tree'] = tree

#### 2.6 Boosting

In [154]:
from catboost import CatBoostClassifier, Pool

In [157]:
test_pool = Pool(X_test, y_test)

Boosting = CatBoostClassifier(iterations=256,
                           depth=3,
                           learning_rate=0.001,
                           loss_function='Logloss',
                           task_type='GPU',
                           verbose=False)
# train the model
Boosting.fit(X_train, y_train)
# make the prediction using the resulting model
preds_class = model.predict(test_pool)
preds_proba = model.predict_proba(test_pool)
score = model.score(test_pool)
print("class = ", preds_class)
print("proba = ", preds_proba)
print("score = ", score)

class =  [0 1 0 ... 0 0 1]
proba =  [[0.57208783 0.42791217]
 [0.46681048 0.53318952]
 [0.50599494 0.49400506]
 ...
 [0.50593279 0.49406721]
 [0.50599494 0.49400506]
 [0.41506982 0.58493018]]
score =  0.7109873915179302


In [158]:
models_dict['CatBoost'] = Boosting

#### 2.7 SVC

In [129]:
from sklearn.svm import SVC

In [131]:
vector_classif = SVC().fit(X_train, y_train)
print(
f'''Accuracy: {vector_classi.score(X_test, y_test)}'''
)

Accuracy: 0.9574259046995252


In [132]:
models_dict['SVM'] = vector_classif

#### 2.8 MLP

In [133]:
from sklearn.neural_network import MLPClassifier

In [134]:
MLP = MLPClassifier().fit(X_train, y_train)
print(
f'''Accuracy: {MLP.score(X_test, y_test)}
roc_auc score {roc_auc_score(y_test, MLP.predict_proba(X_test)[:, 1])}'''
)

Accuracy: 1.0
roc_auc score 1.0


In [135]:
models_dict['MLP'] = MLP

#### 2.9 Ensemble

In [136]:
from sklearn.ensemble import RandomForestClassifier

In [137]:
forest = RandomForestClassifier(n_estimators=50, 
                                criterion='entropy', 
                                max_depth=30)
forest.fit(X_train, y_train)
print(
f'''train acc {(forest.predict(X_train) == y_train).mean()}
test acc {(forest.predict(X_test) == y_test).mean()}'''
)

train acc 1.0
test acc 1.0


In [139]:
models_dict['RandomForest'] = forest 

## 3. Testing models

In [171]:
#accuracy
print(f'{'Model name':13}|{'accuracy':>10}')
print('-'*25)
for name, model in models_dict.items():    
    print(f'{name:13}|{(model.predict(X_test.to_numpy()) == y_test).mean() if name == 'OneR' else (model.predict(X_test) == y_test).mean():>10.2f}')

Model name   |  accuracy
-------------------------
OneR         |      0.60
LogReg       |      0.86
KNN          |      1.00
Bayes        |      0.79
tree         |      1.00
CatBoost     |      0.71
SVM          |      0.96
MLP          |      1.00
RandomForest |      1.00


In [189]:
#f1-score
print(f'{'Model name':13}|{'f1-score':>10}')
print('-'*25)
for name, model in models_dict.items():
    y_pred = model.predict(X_test.to_numpy()) if name == 'OneR' else model.predict(X_test)
    print(f'{name:13}|{f1_score(y_test, y_pred):>10.2f}')

Model name   |  f1-score
-------------------------
OneR         |      0.65
LogReg       |      0.87
KNN          |      1.00
Bayes        |      0.80
tree         |      1.00
CatBoost     |      0.72
SVM          |      0.96
MLP          |      1.00
RandomForest |      1.00


In [190]:
roc_auc_score?

[0;31mSignature:[0m
[0mroc_auc_score[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0my_true[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my_score[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maverage[0m[0;34m=[0m[0;34m'macro'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msample_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_fpr[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmulti_class[0m[0;34m=[0m[0;34m'raise'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlabels[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)     from prediction scores.

Note: this implementation can be used with binary, multiclass and
multilabel classification, but some restrictions apply (see Parameters).

Read more in the :ref:`User Guide <roc_me

In [193]:
y_score

array([[8.11268864e-01, 1.88731136e-01],
       [9.23463932e-01, 7.65360681e-02],
       [5.66494328e-01, 4.33505672e-01],
       ...,
       [9.99916732e-01, 8.32677531e-05],
       [4.81176400e-03, 9.95188236e-01],
       [2.75152358e-03, 9.97248476e-01]])

In [199]:
OneR.predict(X_test.to_numpy())

array([1, 1, 1, ..., 0, 1, 1])

In [196]:
vector_classif.decision_function(X_test)

array([-1.36530201, -0.87809441, -0.55838995, ..., -1.20146127,
        1.60168681,  2.83995353])

In [213]:
#roc_auc_score
#Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores

print(f'{'Model name':13}|{'f1-score':>10}')
print('-'*25)
for name, model in models_dict.items():
    if name == 'OneR':
        continue

    y_score = model.decision_function(X_test) if name == 'SVM' else model.predict_proba(X_test)[:, 1]
    print(f'{name:13}|{roc_auc_score(y_test, y_score):>10.2f}')

Model name   |  f1-score
-------------------------
LogReg       |      0.94
KNN          |      1.00
Bayes        |      0.88
tree         |      1.00
CatBoost     |      0.80
SVM          |      0.99
MLP          |      1.00
RandomForest |      1.00
