In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the cleaned mushroom dataset (path is relative to the working directory).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# a row index written out with the CSV; consider index_col=0 once it is
# confirmed that nothing downstream relies on that column.
df = pd.read_csv(filepath_or_buffer='second_data_cleaned.csv')

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,...,s,y,w,u,w,t,g,,d,w
1,p,16.6,x,g,o,f,e,,w,17.99,...,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.8,...,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,...,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,...,s,y,w,u,w,t,p,,d,w


Main Hypothesis:
The characteristics of mushrooms (e.g., cap diameter, cap color, and habitat) have significant associations with their edibility.

Sub-Hypotheses:
H1a: The mean cap diameter of poisonous mushrooms is different from that of edible mushrooms.
H1b: The distribution of cap color is different between poisonous and edible mushrooms.
H1c: The habitat preference of poisonous mushrooms is different from that of edible mushrooms.
To test these sub-hypotheses, we'll run the following tests at a significance level of α = 0.05:

Independent-samples t-test (Welch's variant, which does not assume equal variances) for H1a (as before).
Chi-squared test for independence for H1b.
Chi-squared test for independence for H1c.

In [5]:
# ---------------------------------------------------------------------------
# H1a: compare mean cap diameter between poisonous and edible mushrooms.
# ---------------------------------------------------------------------------
# Select the cap-diameter values for each class label in a single .loc step.
poisonous = df.loc[df['class'] == 'p', 'cap-diameter']
edible = df.loc[df['class'] == 'e', 'cap-diameter']

# equal_var=False requests Welch's t-test (no equal-variance assumption);
# nan_policy='omit' makes the test ignore missing diameters.
t_stat, p_value = stats.ttest_ind(
    poisonous,
    edible,
    equal_var=False,
    nan_policy='omit',
)
print(f"(H1a) Cap Diameter - T-statistic: {t_stat}, P-value: {p_value}")

# ---------------------------------------------------------------------------
# H1b: chi-squared test of independence between class and cap color.
# ---------------------------------------------------------------------------
# Rows are class labels, columns are cap-color codes, cells are counts.
contingency_table_color = pd.crosstab(index=df['class'], columns=df['cap-color'])
chi2_stat_color, p_value_color, _, _ = stats.chi2_contingency(
    contingency_table_color
)
print(f"(H1b) Cap Color - Chi2 Statistic: {chi2_stat_color}, P-value: {p_value_color}")

# ---------------------------------------------------------------------------
# H1c: chi-squared test of independence between class and habitat.
# ---------------------------------------------------------------------------
contingency_table_habitat = pd.crosstab(index=df['class'], columns=df['habitat'])
chi2_stat_habitat, p_value_habitat, _, _ = stats.chi2_contingency(
    contingency_table_habitat
)
print(f"(H1c) Habitat - Chi2 Statistic: {chi2_stat_habitat}, P-value: {p_value_habitat}")

# ---------------------------------------------------------------------------
# Interpretation: compare each p-value against the significance level.
# ---------------------------------------------------------------------------
alpha = 0.05
for hypothesis, p in (
    ("H1a: Cap Diameter", p_value),
    ("H1b: Cap Color", p_value_color),
    ("H1c: Habitat", p_value_habitat),
):
    # Keep the `p < alpha` direction: a NaN p-value compares False and so
    # lands in the "fail to reject" message.
    print(
        f"We reject the null hypothesis for {hypothesis} and conclude that it has a significant association with edibility."
        if p < alpha
        else f"We fail to reject the null hypothesis for {hypothesis}."
    )


(H1a) Cap Diameter - T-statistic: -43.35888031518477, P-value: 0.0
(H1b) Cap Color - Chi2 Statistic: 3541.079818275377, P-value: 0.0
(H1c) Habitat - Chi2 Statistic: 1907.309416697382, P-value: 0.0
We reject the null hypothesis for H1a: Cap Diameter and conclude that it has a significant association with edibility.
We reject the null hypothesis for H1b: Cap Color and conclude that it has a significant association with edibility.
We reject the null hypothesis for H1c: Habitat and conclude that it has a significant association with edibility.


In [9]:
# Show the class-by-cap-color count table that was passed to the H1b chi-squared test.
contingency_table_color

cap-color,b,e,g,k,l,n,o,p,r,u,w,y
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
e,974,880,2297,501,452,12407,1071,449,198,611,3750,3591
p,256,3155,2123,778,376,11811,2585,1254,1584,1098,3916,4952
