In [19]:
# import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, accuracy_score
from sklearn.tree import export_graphviz
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [20]:
# import csv file
df = pd.read_csv("mushrooms.csv")

In [21]:
# encode df
labelencoder=LabelEncoder()
for column in df.columns:
    df[column] = labelencoder.fit_transform(df[column])

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [22]:
X = df.drop(['class'], axis =1)
y = df['class']


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)

In [24]:
scaler = StandardScaler()
scaler.fit(X_train)
X2 = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [25]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=23)
rf.fit(X_train,y_train)

RandomForestClassifier(random_state=23)

In [26]:
importances = rf.feature_importances_
importances_sorted = sorted(zip(rf.feature_importances_, X.columns), reverse=True)
importances_sorted

[(0.1581755058575411, 'odor'),
 (0.12933627642838447, 'gill-size'),
 (0.09855281277878224, 'gill-color'),
 (0.09136665878595977, 'spore-print-color'),
 (0.07903977751800717, 'ring-type'),
 (0.05737117482189133, 'population'),
 (0.053680852639674294, 'bruises'),
 (0.05365619393835093, 'stalk-root'),
 (0.05250130243759179, 'stalk-surface-above-ring'),
 (0.04796332896400079, 'gill-spacing'),
 (0.04308281089684837, 'stalk-surface-below-ring'),
 (0.03635576400733289, 'habitat'),
 (0.02216198743231622, 'stalk-shape'),
 (0.01679707566150379, 'stalk-color-above-ring'),
 (0.016100069713408622, 'cap-color'),
 (0.013281602557990472, 'ring-number'),
 (0.012188766213459287, 'stalk-color-below-ring'),
 (0.009874124482614799, 'cap-surface'),
 (0.003473358003752621, 'cap-shape'),
 (0.0026122522829187802, 'gill-attachment'),
 (0.002428304577670025, 'veil-color'),
 (0.0, 'veil-type')]

In [27]:
# drop column that isn't needed for analysis since there's only one value
df = df.drop(["veil-type"], axis=1)
X = df.drop(['class'], axis =1)
y = df['class']

In [28]:
# Lets check the correlation of each variable 

selected_columns = ['odor', 'gill-color', 'gill-size', 
'spore-print-color','ring-type','stalk-root',
'population','stalk-surface-below-ring','bruises','stalk-surface-above-ring',
'habitat','gill-spacing','stalk-shape','stalk-color-below-ring','stalk-color-above-ring',
'cap-color','ring-number','cap-surface','cap-shape','gill-attachment','veil-color']

correlation_matrix = X[selected_columns].corr()
correlation_matrix

Unnamed: 0,odor,gill-color,gill-size,spore-print-color,ring-type,stalk-root,population,stalk-surface-below-ring,bruises,stalk-surface-above-ring,...,gill-spacing,stalk-shape,stalk-color-below-ring,stalk-color-above-ring,cap-color,ring-number,cap-surface,cap-shape,gill-attachment,veil-color
odor,1.0,-0.129213,0.310495,0.469055,-0.281387,-0.205215,-0.043623,0.06182,-0.061825,0.118617,...,0.063936,0.459766,0.169407,0.174532,-0.387121,0.111905,0.045233,-0.021935,-0.05959,-0.057747
gill-color,-0.129213,1.0,-0.516736,-0.416135,0.629398,0.31508,-0.03409,0.257224,0.52712,0.224287,...,0.100193,-0.175699,-0.074781,-0.058299,0.084659,0.096054,-0.161017,-0.006039,-0.128567,-0.097583
gill-size,0.310495,-0.516736,1.0,0.622991,-0.460872,-0.344345,0.147682,0.010894,-0.369596,0.05631,...,-0.108333,0.214576,0.278708,0.296548,-0.169464,-0.171362,0.2081,0.05405,0.108984,0.103809
spore-print-color,0.469055,-0.416135,0.622991,1.0,-0.487048,-0.536996,-0.126859,0.130974,-0.285008,0.100764,...,0.047323,0.258831,0.254518,0.271533,-0.293523,0.338417,0.230364,-0.073416,-0.029524,-0.0036
ring-type,-0.281387,0.629398,-0.460872,-0.487048,1.0,0.210155,0.211763,0.394644,0.692973,0.390091,...,-0.195897,-0.291444,-0.034284,-0.048878,0.162513,0.058312,-0.106407,-0.025457,-0.146689,-0.143673
stalk-root,-0.205215,0.31508,-0.344345,-0.536996,0.210155,1.0,-0.306747,0.087454,0.244188,-0.027065,...,0.350548,-0.163422,0.159805,0.15714,0.321274,-0.247357,-0.126245,0.030191,0.144063,0.156213
population,-0.043623,-0.03409,0.147682,-0.126859,0.211763,-0.306747,1.0,0.046797,0.088137,0.079604,...,-0.529253,0.087383,-0.242792,-0.240261,-0.14477,-0.24202,0.021555,0.063413,0.165575,0.124924
stalk-surface-below-ring,0.06182,0.257224,0.010894,0.130974,0.394644,0.087454,0.046797,1.0,0.458983,0.437164,...,-0.213775,-0.034399,0.110656,0.106933,-0.04771,0.040006,0.107965,-0.032591,-0.116177,-0.077284
bruises,-0.061825,0.52712,-0.369596,-0.285008,0.692973,0.244188,0.088137,0.458983,1.0,0.460824,...,-0.299473,0.099364,0.092874,0.083538,-0.000764,0.056788,0.070228,-0.035374,0.137359,0.11977
stalk-surface-above-ring,0.118617,0.224287,0.05631,0.100764,0.390091,-0.027065,0.079604,0.437164,0.460824,1.0,...,-0.212359,0.015193,0.142835,0.132708,-0.060837,0.107904,0.08909,-0.030417,-0.088916,-0.090591


In [29]:
# https://www.statology.org/how-to-calculate-vif-in-python/

from statsmodels.stats.outliers_influence import variance_inflation_factor


vif_data = pd.DataFrame()
vif_data["variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data)

                    variable         VIF
0                  cap-shape    5.463186
1                cap-surface    3.904712
2                  cap-color    5.950600
3                    bruises    7.984231
4                       odor    8.918271
5            gill-attachment  256.122838
6               gill-spacing    4.534113
7                  gill-size    5.758180
8                 gill-color    6.284702
9                stalk-shape    9.243136
10                stalk-root    8.129864
11  stalk-surface-above-ring   11.707764
12  stalk-surface-below-ring   11.768772
13    stalk-color-above-ring   17.163156
14    stalk-color-below-ring   17.215182
15                veil-color  280.123888
16               ring-number   65.370463
17                 ring-type   13.429124
18         spore-print-color   16.937589
19                population   21.434074
20                   habitat    2.800932


In [30]:

selected_columns = [
'population','stalk-surface-below-ring','bruises','stalk-surface-above-ring',
'stalk-shape','stalk-color-below-ring','stalk-color-above-ring',
'ring-number','gill-attachment','veil-color']

correlation_matrix = X[selected_columns].corr()
correlation_matrix

Unnamed: 0,population,stalk-surface-below-ring,bruises,stalk-surface-above-ring,stalk-shape,stalk-color-below-ring,stalk-color-above-ring,ring-number,gill-attachment,veil-color
population,1.0,0.046797,0.088137,0.079604,0.087383,-0.242792,-0.240261,-0.24202,0.165575,0.124924
stalk-surface-below-ring,0.046797,1.0,0.458983,0.437164,-0.034399,0.110656,0.106933,0.040006,-0.116177,-0.077284
bruises,0.088137,0.458983,1.0,0.460824,0.099364,0.092874,0.083538,0.056788,0.137359,0.11977
stalk-surface-above-ring,0.079604,0.437164,0.460824,1.0,0.015193,0.142835,0.132708,0.107904,-0.088916,-0.090591
stalk-shape,0.087383,-0.034399,0.099364,0.015193,1.0,0.235794,0.223439,-0.293221,0.186485,0.162604
stalk-color-below-ring,-0.242792,0.110656,0.092874,0.142835,0.235794,1.0,0.49151,0.08758,0.09716,0.065567
stalk-color-above-ring,-0.240261,0.106933,0.083538,0.132708,0.223439,0.49151,1.0,0.084917,0.099299,0.067377
ring-number,-0.24202,0.040006,0.056788,0.107904,-0.293221,0.08758,0.084917,1.0,0.093236,0.03638
gill-attachment,0.165575,-0.116177,0.137359,-0.088916,0.186485,0.09716,0.099299,0.093236,1.0,0.897518
veil-color,0.124924,-0.077284,0.11977,-0.090591,0.162604,0.065567,0.067377,0.03638,0.897518,1.0


In [31]:
selected_columns = ['odor', 'gill-color', 'gill-size']

In [32]:
import statsmodels.api as sm
# Add constant term to predictor variables
X_select = X[selected_columns]
X_select = sm.add_constant(X_select)

# Fit logistic regression model
log_reg_model = sm.Logit(y, X_select).fit()

# Print the summary of the logistic regression model
print(log_reg_model.summary())

Optimization terminated successfully.
         Current function value: 0.408973
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  class   No. Observations:                 8124
Model:                          Logit   Df Residuals:                     8120
Method:                           MLE   Df Model:                            3
Date:                Wed, 30 Aug 2023   Pseudo R-squ.:                  0.4094
Time:                        11:47:11   Log-Likelihood:                -3322.5
converged:                       True   LL-Null:                       -5625.9
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.6540      0.093     28.387      0.000       2.471       2.837
odor          -0.5336      0.

In [33]:
selected_columns = ['odor', 'gill-color', 'gill-size', 
'spore-print-color','ring-type','stalk-root',
'population','stalk-surface-below-ring','bruises','stalk-surface-above-ring',
'habitat','gill-spacing','stalk-shape','stalk-color-below-ring','stalk-color-above-ring',
'cap-color','ring-number','cap-surface','cap-shape','gill-attachment','veil-color']

correlation_matrix = X[selected_columns].corr()
correlation_matrix

Unnamed: 0,odor,gill-color,gill-size,spore-print-color,ring-type,stalk-root,population,stalk-surface-below-ring,bruises,stalk-surface-above-ring,...,gill-spacing,stalk-shape,stalk-color-below-ring,stalk-color-above-ring,cap-color,ring-number,cap-surface,cap-shape,gill-attachment,veil-color
odor,1.0,-0.129213,0.310495,0.469055,-0.281387,-0.205215,-0.043623,0.06182,-0.061825,0.118617,...,0.063936,0.459766,0.169407,0.174532,-0.387121,0.111905,0.045233,-0.021935,-0.05959,-0.057747
gill-color,-0.129213,1.0,-0.516736,-0.416135,0.629398,0.31508,-0.03409,0.257224,0.52712,0.224287,...,0.100193,-0.175699,-0.074781,-0.058299,0.084659,0.096054,-0.161017,-0.006039,-0.128567,-0.097583
gill-size,0.310495,-0.516736,1.0,0.622991,-0.460872,-0.344345,0.147682,0.010894,-0.369596,0.05631,...,-0.108333,0.214576,0.278708,0.296548,-0.169464,-0.171362,0.2081,0.05405,0.108984,0.103809
spore-print-color,0.469055,-0.416135,0.622991,1.0,-0.487048,-0.536996,-0.126859,0.130974,-0.285008,0.100764,...,0.047323,0.258831,0.254518,0.271533,-0.293523,0.338417,0.230364,-0.073416,-0.029524,-0.0036
ring-type,-0.281387,0.629398,-0.460872,-0.487048,1.0,0.210155,0.211763,0.394644,0.692973,0.390091,...,-0.195897,-0.291444,-0.034284,-0.048878,0.162513,0.058312,-0.106407,-0.025457,-0.146689,-0.143673
stalk-root,-0.205215,0.31508,-0.344345,-0.536996,0.210155,1.0,-0.306747,0.087454,0.244188,-0.027065,...,0.350548,-0.163422,0.159805,0.15714,0.321274,-0.247357,-0.126245,0.030191,0.144063,0.156213
population,-0.043623,-0.03409,0.147682,-0.126859,0.211763,-0.306747,1.0,0.046797,0.088137,0.079604,...,-0.529253,0.087383,-0.242792,-0.240261,-0.14477,-0.24202,0.021555,0.063413,0.165575,0.124924
stalk-surface-below-ring,0.06182,0.257224,0.010894,0.130974,0.394644,0.087454,0.046797,1.0,0.458983,0.437164,...,-0.213775,-0.034399,0.110656,0.106933,-0.04771,0.040006,0.107965,-0.032591,-0.116177,-0.077284
bruises,-0.061825,0.52712,-0.369596,-0.285008,0.692973,0.244188,0.088137,0.458983,1.0,0.460824,...,-0.299473,0.099364,0.092874,0.083538,-0.000764,0.056788,0.070228,-0.035374,0.137359,0.11977
stalk-surface-above-ring,0.118617,0.224287,0.05631,0.100764,0.390091,-0.027065,0.079604,0.437164,0.460824,1.0,...,-0.212359,0.015193,0.142835,0.132708,-0.060837,0.107904,0.08909,-0.030417,-0.088916,-0.090591


In [34]:
# selected_columns = ['odor', 'gill-color', 'gill-size', 'spore-print-color','ring-type','stalk-root','population']

selected_columns_1 = ['stalk-shape','stalk-color-below-ring','stalk-color-above-ring',
'cap-color','ring-number','cap-surface','cap-shape','gill-attachment','veil-color']

In [35]:
import statsmodels.api as sm
# Add constant term to predictor variables
X_select = X[selected_columns_1]
X_select = sm.add_constant(X_select)

# Fit logistic regression model
log_reg_model = sm.Logit(y, X_select).fit()

# Print the summary of the logistic regression model
print(log_reg_model.summary())

         Current function value: 0.583891
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                  class   No. Observations:                 8124
Model:                          Logit   Df Residuals:                     8114
Method:                           MLE   Df Model:                            9
Date:                Wed, 30 Aug 2023   Pseudo R-squ.:                  0.1568
Time:                        11:47:12   Log-Likelihood:                -4743.5
converged:                      False   LL-Null:                       -5625.9
Covariance Type:            nonrobust   LLR p-value:                     0.000
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                    -55.7370   1.79e+04     -0.003      0.998   -3.51e+04     3.5e+04
stalk-shape               -1.



In [36]:
nonselect_columns = ['veil-color', 'gill-attachment', 'cap-shape', 
'cap-surface','ring-number','cap-color']

In [37]:
# Add constant term to predictor variables
X_nonselect = X[nonselect_columns]
X_nonselect = sm.add_constant(X_nonselect)

# Fit logistic regression model
log_reg_model = sm.Logit(y, X_nonselect).fit()

# Print the summary of the logistic regression model
print(log_reg_model.summary())

         Current function value: 0.631109
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                  class   No. Observations:                 8124
Model:                          Logit   Df Residuals:                     8117
Method:                           MLE   Df Model:                            6
Date:                Wed, 30 Aug 2023   Pseudo R-squ.:                 0.08865
Time:                        11:47:12   Log-Likelihood:                -5127.1
converged:                      False   LL-Null:                       -5625.9
Covariance Type:            nonrobust   LLR p-value:                3.109e-212
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const             -69.5712   3.79e+05     -0.000      1.000   -7.42e+05    7.42e+05
veil-color         48.5881   3.78e+05      0.000  



In [38]:
import scipy.stats as st
import scipy.stats as stats

veil_color_test=stats.ttest_ind(y, X['veil-color'], equal_var=False)
gill_attach_test = stats.ttest_ind(y, X['gill-attachment'], equal_var=False)
cap_shape_test = stats.ttest_ind(y, X['cap-shape'], equal_var=False)
cap_surface_test = stats.ttest_ind(y, X['cap-surface'], equal_var=False)
ring_number_test = stats.ttest_ind(y, X['ring-number'], equal_var=False)


In [39]:
# print(veil_color_test)
# print(gill_attach_test)
# print(cap_shape_test)
# print(cap_surface_test)
# print(ring_number_test)

In [40]:
# selected_columns = ['odor', 'gill-color', 'gill-size', 
# 'spore-print-color','ring-type','stalk-root',
# 'population','stalk-surface-below-ring','bruises','stalk-surface-above-ring',
# 'habitat','gill-spacing','stalk-shape','stalk-color-below-ring','stalk-color-above-ring',
# 'cap-color','ring-number','cap-surface','cap-shape','gill-attachment','veil-color']

In [41]:
print(st.normaltest(X['odor']))
print(st.normaltest(X['gill-color']))
print(st.normaltest(X['gill-size']))
print(st.normaltest(X['spore-print-color']))
print(st.normaltest(X['ring-type']))
print(st.normaltest(X['stalk-root']))
print(st.normaltest(X['population']))
print(st.normaltest(X['stalk-surface-below-ring']))
print(st.normaltest(X['bruises']))
print(st.normaltest(X['stalk-surface-above-ring']))
print(st.normaltest(X['habitat']))
print(st.normaltest(X['gill-spacing']))
print(st.normaltest(X['stalk-shape']))
print(st.normaltest(X['stalk-color-below-ring']))
print(st.normaltest(X['stalk-color-above-ring']))
print(st.normaltest(X['cap-color']))
print(st.normaltest(X['ring-number']))
print(st.normaltest(X['cap-surface']))
print(st.normaltest(X['cap-shape']))
print(st.normaltest(X['gill-attachment']))
print(st.normaltest(X['veil-color']))


NormaltestResult(statistic=624.4495030832669, pvalue=2.5264650329942025e-136)
NormaltestResult(statistic=21063.39497710686, pvalue=0.0)
NormaltestResult(statistic=53901.06580328662, pvalue=0.0)
NormaltestResult(statistic=752386.8991575951, pvalue=0.0)
NormaltestResult(statistic=35185.72937602552, pvalue=0.0)
NormaltestResult(statistic=890.4803687835673, pvalue=4.3116628814186303e-194)
NormaltestResult(statistic=1876.5069768457386, pvalue=0.0)
NormaltestResult(statistic=637.0866472034288, pvalue=4.5539925169479305e-139)
NormaltestResult(statistic=30303.18649748688, pvalue=0.0)
NormaltestResult(statistic=1122.6649415536767, pvalue=1.6459075162514516e-244)
NormaltestResult(statistic=971.912714552137, pvalue=8.950258715861973e-212)
NormaltestResult(statistic=2423.555047869679, pvalue=0.0)
NormaltestResult(statistic=29374.136616654418, pvalue=0.0)
NormaltestResult(statistic=2565.089206031342, pvalue=0.0)
NormaltestResult(statistic=2654.02901354856, pvalue=0.0)
NormaltestResult(statistic=137

In [42]:


odor_test = stats.ttest_ind(y, X['odor'], equal_var=False)
gill_color_test = stats.ttest_ind(y, X['gill-color'], equal_var=False)
gill_size_test = stats.ttest_ind(y, X['gill-size'], equal_var=False)
spore_print_color_test = stats.ttest_ind(y, X['spore-print-color'], equal_var=False)
ring_type_test = stats.ttest_ind(y, X['ring-type'], equal_var=False)
stalk_root_test = stats.ttest_ind(y, X['stalk-root'], equal_var=False)
population_test = stats.ttest_ind(y, X['population'], equal_var=False)
stalk_surface_below_ring_test = stats.ttest_ind(y, X['stalk-surface-below-ring'], equal_var=False)
bruises_test = stats.ttest_ind(y, X['bruises'], equal_var=False)
stalk_surface_above_ring_test = stats.ttest_ind(y, X['stalk-surface-above-ring'], equal_var=False)
gill_spacing_test = stats.ttest_ind(y, X['gill-spacing'], equal_var=False)
stalk_shape_test = stats.ttest_ind(y, X['stalk-shape'], equal_var=False)
stalk_color_below_ring_test = stats.ttest_ind(y, X['stalk-color-below-ring'], equal_var=False)
stalk_color_above_ring_test = stats.ttest_ind(y, X['stalk-color-above-ring'], equal_var=False)
cap_color_test = stats.ttest_ind(y, X['cap-color'], equal_var=False)
ring_number_test = stats.ttest_ind(y, X['ring-number'], equal_var=False)
cap_surface_test = stats.ttest_ind(y, X['cap-surface'], equal_var=False)
cap_shape_test = stats.ttest_ind(y, X['cap-shape'], equal_var=False)
gill_attachment_test = stats.ttest_ind(y, X['gill-attachment'], equal_var=False)
veil_color_test = stats.ttest_ind(y, X['veil-color'], equal_var=False)


In [43]:
print("Odor t-test:")
print("Ho: There is no relation between the class of mushroom and the odor.")
print("Ha: There is a relation between the class of mushroom and the odor.")
print(odor_test)
print()

print("Gill Color t-test:")
print("Ho: There is no relation between the class of mushroom and the gill color.")
print("Ha: There is a relation between the class of mushroom and the gill color.")
print(gill_color_test)
print()

print("Gill Size t-test:")
print("Ho: There is no relation between the class of mushroom and the gill size.")
print("Ha: There is a relation between the class of mushroom and the gill size.")
print(gill_size_test)
print()






Odor t-test:
Ho: There is no relation between the class of mushroom and the odor.
Ha: There is a relation between the class of mushroom and the odor.
Ttest_indResult(statistic=-152.67958042193732, pvalue=0.0)

Gill Color t-test:
Ho: There is no relation between the class of mushroom and the gill color.
Ha: There is a relation between the class of mushroom and the gill color.
Ttest_indResult(statistic=-109.12071206855487, pvalue=0.0)

Gill Size t-test:
Ho: There is no relation between the class of mushroom and the gill size.
Ha: There is a relation between the class of mushroom and the gill size.
Ttest_indResult(statistic=22.884209163534898, pvalue=4.328435202228122e-114)



In [44]:
print("Spore Print Color t-test:")
print("Ho: There is no relation between the class of mushroom and the spore print color.")
print("Ha: There is a relation between the class of mushroom and the spore print color.")
print(spore_print_color_test)
print()

print("Ring Type t-test:")
print("Ho: There is no relation between the class of mushroom and the ring type.")
print("Ha: There is a relation between the class of mushroom and the ring type.")
print(ring_type_test)
print()

print("Stalk Root t-test:")
print("Ho: There is no relation between the class of mushroom and the stalk root.")
print("Ha: There is a relation between the class of mushroom and the stalk root.")
print(stalk_root_test)
print()


Spore Print Color t-test:
Ho: There is no relation between the class of mushroom and the spore print color.
Ha: There is a relation between the class of mushroom and the spore print color.
Ttest_indResult(statistic=-115.31727817550028, pvalue=0.0)

Ring Type t-test:
Ho: There is no relation between the class of mushroom and the ring type.
Ha: There is a relation between the class of mushroom and the ring type.
Ttest_indResult(statistic=-87.25324956432573, pvalue=0.0)

Stalk Root t-test:
Ho: There is no relation between the class of mushroom and the stalk root.
Ha: There is a relation between the class of mushroom and the stalk root.
Ttest_indResult(statistic=-48.24259208207392, pvalue=0.0)



In [45]:
print("Population t-test:")
print("Ho: There is no relation between the class of mushroom and the population.")
print("Ha: There is a relation between the class of mushroom and the population.")
print(population_test)
print()

print("Stalk Surface Below Ring t-test:")
print("Ho: There is no relation between the class of mushroom and the stalk surface below the ring.")
print("Ha: There is a relation between the class of mushroom and the stalk surface below the ring.")
print(stalk_surface_below_ring_test)
print()

print("Bruises t-test:")
print("Ho: There is no relation between the class of mushroom and the presence of bruises.")
print("Ha: There is a relation between the class of mushroom and the presence of bruises.")
print(bruises_test)
print()



Population t-test:
Ho: There is no relation between the class of mushroom and the population.
Ha: There is a relation between the class of mushroom and the population.
Ttest_indResult(statistic=-211.40637734183275, pvalue=0.0)

Stalk Surface Below Ring t-test:
Ho: There is no relation between the class of mushroom and the stalk surface below the ring.
Ha: There is a relation between the class of mushroom and the stalk surface below the ring.
Ttest_indResult(statistic=-120.26158858422612, pvalue=0.0)

Bruises t-test:
Ho: There is no relation between the class of mushroom and the presence of bruises.
Ha: There is a relation between the class of mushroom and the presence of bruises.
Ttest_indResult(statistic=8.536077652126417, pvalue=1.5097703176396305e-17)



In [46]:
print("Stalk Surface Above Ring t-test:")
print("Ho: There is no relation between the class of mushroom and the stalk surface above the ring.")
print("Ha: There is a relation between the class of mushroom and the stalk surface above the ring.")
print(stalk_surface_above_ring_test)
print()

print("Gill Spacing t-test:")
print("Ho: There is no relation between the class of mushroom and the gill spacing.")
print("Ha: There is a relation between the class of mushroom and the gill spacing.")
print(gill_spacing_test)
print()

print("Stalk Shape t-test:")
print("Ho: There is no relation between the class of mushroom and the stalk shape.")
print("Ha: There is a relation between the class of mushroom and the stalk shape.")
print(stalk_shape_test)
print()

print("Stalk Color Below Ring t-test:")
print("Ho: There is no relation between the class of mushroom and the stalk color below the ring.")
print("Ha: There is a relation between the class of mushroom and the stalk color below the ring.")
print(stalk_color_below_ring_test)
print()

Stalk Surface Above Ring t-test:
Ho: There is no relation between the class of mushroom and the stalk surface above the ring.
Ha: There is a relation between the class of mushroom and the stalk surface above the ring.
Ttest_indResult(statistic=-123.54550750304384, pvalue=0.0)

Gill Spacing t-test:
Ho: There is no relation between the class of mushroom and the gill spacing.
Ha: There is a relation between the class of mushroom and the gill spacing.
Ttest_indResult(statistic=46.55289558246135, pvalue=0.0)

Stalk Shape t-test:
Ho: There is no relation between the class of mushroom and the stalk shape.
Ha: There is a relation between the class of mushroom and the stalk shape.
Ttest_indResult(statistic=-10.90992075623163, pvalue=1.2885534075146685e-27)

Stalk Color Below Ring t-test:
Ho: There is no relation between the class of mushroom and the stalk color below the ring.
Ha: There is a relation between the class of mushroom and the stalk color below the ring.
Ttest_indResult(statistic=-24

In [47]:
print("Stalk Color Above Ring t-test:")
print("Ho: There is no relation between the class of mushroom and the stalk color above the ring.")
print("Ha: There is a relation between the class of mushroom and the stalk color above the ring.")
print(stalk_color_above_ring_test)
print()

print("Cap Color t-test:")
print("Ho: There is no relation between the class of mushroom and the cap color.")
print("Ha: There is a relation between the class of mushroom and the cap color.")
print(cap_color_test)
print()

print("Ring Number t-test:")
print("Ho: There is no relation between the class of mushroom and the number of rings.")
print("Ha: There is a relation between the class of mushroom and the number of rings.")
print(ring_number_test)
print()

print("Cap Surface t-test:")
print("Ho: There is no relation between the class of mushroom and the cap surface.")
print("Ha: There is a relation between the class of mushroom and the cap surface.")
print(cap_surface_test)
print()



Stalk Color Above Ring t-test:
Ho: There is no relation between the class of mushroom and the stalk color above the ring.
Ha: There is a relation between the class of mushroom and the stalk color above the ring.
Ttest_indResult(statistic=-244.5193612663866, pvalue=0.0)

Cap Color t-test:
Ho: There is no relation between the class of mushroom and the cap color.
Ha: There is a relation between the class of mushroom and the cap color.
Ttest_indResult(statistic=-139.75267620514018, pvalue=0.0)

Ring Number t-test:
Ho: There is no relation between the class of mushroom and the number of rings.
Ha: There is a relation between the class of mushroom and the number of rings.
Ttest_indResult(statistic=-93.1302488351084, pvalue=0.0)

Cap Surface t-test:
Ho: There is no relation between the class of mushroom and the cap surface.
Ha: There is a relation between the class of mushroom and the cap surface.
Ttest_indResult(statistic=-91.36407484933889, pvalue=0.0)



In [48]:
print("Cap Shape t-test:")
print("Ho: There is no relation between the class of mushroom and the cap shape.")
print("Ha: There is a relation between the class of mushroom and the cap shape.")
print(cap_shape_test)
print()

print("Gill Attachment t-test:")
print("Ho: There is no relation between the class of mushroom and the gill attachment.")
print("Ha: There is a relation between the class of mushroom and the gill attachment.")
print(gill_attachment_test)
print()

print("Veil Color t-test:")
print("Ho: There is no relation between the class of mushroom and the veil color.")
print("Ha: There is a relation between the class of mushroom and the veil color.")
print(veil_color_test)
print()

Cap Shape t-test:
Ho: There is no relation between the class of mushroom and the cap shape.
Ha: There is a relation between the class of mushroom and the cap shape.
Ttest_indResult(statistic=-153.7349519927033, pvalue=0.0)

Gill Attachment t-test:
Ho: There is no relation between the class of mushroom and the gill attachment.
Ha: There is a relation between the class of mushroom and the gill attachment.
Ttest_indResult(statistic=-84.60124863289589, pvalue=0.0)

Veil Color t-test:
Ho: There is no relation between the class of mushroom and the veil color.
Ha: There is a relation between the class of mushroom and the veil color.
Ttest_indResult(statistic=-240.7016021589974, pvalue=0.0)



In [49]:
X.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,2,4,1,6,1,0,1,4,0,...,2,2,7,7,2,1,4,2,3,5
1,5,2,9,1,0,1,0,0,4,0,...,2,2,7,7,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,...,2,2,7,7,2,1,4,3,2,3
3,5,3,8,1,6,1,0,1,5,0,...,2,2,7,7,2,1,4,2,3,5
4,5,2,3,0,5,1,1,0,4,1,...,2,2,7,7,2,1,0,3,0,1


In [50]:
# https://www.statology.org/how-to-calculate-vif-in-python/

from statsmodels.stats.outliers_influence import variance_inflation_factor


vif_data = pd.DataFrame()
vif_data["variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data)

                    variable         VIF
0                  cap-shape    5.463186
1                cap-surface    3.904712
2                  cap-color    5.950600
3                    bruises    7.984231
4                       odor    8.918271
5            gill-attachment  256.122838
6               gill-spacing    4.534113
7                  gill-size    5.758180
8                 gill-color    6.284702
9                stalk-shape    9.243136
10                stalk-root    8.129864
11  stalk-surface-above-ring   11.707764
12  stalk-surface-below-ring   11.768772
13    stalk-color-above-ring   17.163156
14    stalk-color-below-ring   17.215182
15                veil-color  280.123888
16               ring-number   65.370463
17                 ring-type   13.429124
18         spore-print-color   16.937589
19                population   21.434074
20                   habitat    2.800932


In [51]:
X = X[['cap-shape','cap-surface','cap-color','bruises','odor','gill-spacing','habitat',
'gill-size','gill-color','stalk-shape','stalk-root']]

In [52]:
X.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-spacing,habitat,gill-size,gill-color,stalk-shape,stalk-root
0,5,2,4,1,6,0,5,1,4,0,3
1,5,2,9,1,0,0,1,0,4,0,2
2,0,2,8,1,3,0,3,0,5,0,2
3,5,3,8,1,6,0,5,1,5,0,3
4,5,2,3,0,5,1,1,0,4,1,3


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)

In [54]:
scaler = StandardScaler()
scaler.fit(X_train)
X2 = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [55]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression()

In [56]:
print("test accuracy: {}%".format(round(lr.score(X_test,y_test)*100,2)))

test accuracy: 83.6%




In [57]:
print("TRAINING RESULTS:\n")
print(f'Accuracy Score: {(accuracy_score(y_train,lr.predict(X_train)))}')
print(f'Classification Report:{(classification_report(y_train,lr.predict(X_train)))}')
print(f'Confusion Matrix: {(confusion_matrix(y_train,lr.predict(X_train)))}')

TRAINING RESULTS:

Accuracy Score: 0.901362218939767
Classification Report:              precision    recall  f1-score   support

           0       0.91      0.90      0.90      3149
           1       0.90      0.90      0.90      2944

    accuracy                           0.90      6093
   macro avg       0.90      0.90      0.90      6093
weighted avg       0.90      0.90      0.90      6093

Confusion Matrix: [[2843  306]
 [ 295 2649]]


In [58]:
print("TEST RESULTS:\n")
print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_test,lr.predict(X_test))))
print('Classification Report:\n{}\n'.format(classification_report(y_test,lr.predict(X_test))))
print('Confusion Matrix:\n{}\n'.format(confusion_matrix(y_test,lr.predict(X_test))))

TEST RESULTS:

Accuracy Score: 0.8360

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.70      0.82      1059
           1       0.75      0.98      0.85       972

    accuracy                           0.84      2031
   macro avg       0.86      0.84      0.83      2031
weighted avg       0.87      0.84      0.83      2031


Confusion Matrix:
[[744 315]
 [ 18 954]]





In [59]:
from sklearn.svm import SVC

svm = SVC(random_state=23, gamma="auto")
svm.fit(X_train,y_train)

SVC(gamma='auto', random_state=23)

In [60]:
print(f"Test Accuracy: {svm.score(X_test,y_test)*100}")

Test Accuracy: 46.77498769079271




In [61]:
from sklearn.tree import DecisionTreeClassifier as DT

dt = DT(criterion='entropy',random_state=23)
dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=23)

In [62]:
print("Test RESULTS:\n")
print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_test,dt.predict(X_test))))
print('Classification Report:\n{}\n'.format(classification_report(y_test,dt.predict(X_test))))
print('Confusion Matrix:\n{}\n'.format(confusion_matrix(y_test,dt.predict(X_test))))

Test RESULTS:

Accuracy Score: 0.6760

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.56      0.64      1059
           1       0.63      0.80      0.70       972

    accuracy                           0.68      2031
   macro avg       0.69      0.68      0.67      2031
weighted avg       0.69      0.68      0.67      2031


Confusion Matrix:
[[597 462]
 [196 776]]





//Random Forest

In [63]:
rf = RandomForestClassifier(n_estimators=100, random_state=23)
rf.fit(X_train,y_train)


RandomForestClassifier(random_state=23)

In [64]:
print("Test RESULTS:\n")
print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_test,rf.predict(X_test))))
print('Classification Report:\n{}\n'.format(classification_report(y_test,rf.predict(X_test))))
print('Confusion Matrix:\n{}\n'.format(confusion_matrix(y_test,rf.predict(X_test))))


Test RESULTS:

Accuracy Score: 0.6519

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.42      0.56      1059
           1       0.59      0.90      0.71       972

    accuracy                           0.65      2031
   macro avg       0.71      0.66      0.64      2031
weighted avg       0.71      0.65      0.63      2031


Confusion Matrix:
[[445 614]
 [ 93 879]]



