In [18]:
import pandas as pd
import numpy as np
import itertools
import seaborn as sns
from tabulate import tabulate
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import chi2_contingency
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [5]:
# Load the preprocessed feature table. Judging by the file name this is the
# version without categorical encoding; its 'cap-color' column is the one
# cross-tabulated against the class labels further down.
df_no_encoded = pd.read_csv("Data_Preproc_NoEncoding.csv")

In [6]:
# Load the encoded counterpart of the preprocessed features (per the file name).
# NOTE(review): this frame is not referenced in the cells visible here —
# confirm it is used later in the notebook, otherwise the load can be dropped.
df_encoded = pd.read_csv("Data_Preproc_Encoded.csv")

In [7]:
# Load the target labels; the 'class' column of this frame is what the
# contingency table below is built from.
df_class = pd.read_csv("Labels.csv")

In [25]:
# Recode the labels numerically: 'p' becomes 1 and 'e' becomes 0.
# The result is rebound to the same name instead of mutating the frame in place.
df_class = df_class.replace('p', 1).replace('e', 0)

In [26]:
# Count observations for every (cap-color, class) combination.
# NOTE(review): crosstab pairs the two Series by index, so this assumes the
# rows of df_no_encoded and df_class are in the same order — TODO confirm.
contingency_table = pd.crosstab(df_no_encoded['cap-color'], df_class['class'])

In [27]:
# Display the cap-color x class counts (columns are the recoded class values).
contingency_table

class,0,1
cap-color,Unnamed: 1_level_1,Unnamed: 2_level_1
b,974,256
e,880,3155
g,2297,2123
k,501,778
l,452,376
n,12407,11811
o,1071,2585
p,449,1254
r,198,1584
u,611,1098


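Scale the counts of each cap-color group down to 10% (a deterministic down-scaling of the contingency table, rounded to whole counts, rather than a random sample of rows)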

In [29]:
# Shrink every cell of the contingency table to 10% of its count and round to
# a whole number. This is a deterministic down-scaling of the counts, not a
# random draw of rows, so the group proportions stay (almost) unchanged while
# the effective sample size drops by a factor of ten.
sampled_table = contingency_table.mul(0.10).round()

In [30]:
# Pairwise two-proportion z-tests between every pair of cap colors, run on the
# down-scaled counts in `sampled_table`.
#
# The "success" count passed to the test is column 1 of the table, i.e. the
# observations whose class was recoded to 1. A negative z-statistic therefore
# means the FIRST color of the pair has the lower proportion of class-1
# observations.
combinations = list(itertools.combinations(sampled_table.index, 2))

print("\nPairwise Two-Proportion Z-tests on Sample:")

alpha = 0.05
# Many pairs are tested against the same table, so judging each one at the raw
# alpha inflates the family-wise error rate. Use a Bonferroni correction: each
# individual comparison must clear alpha / (number of comparisons).
n_comparisons = max(len(combinations), 1)
alpha_adjusted = alpha / n_comparisons

# Per-color totals, computed once rather than re-summed for every pair.
group_totals = sampled_table.sum(axis=1)

for combo in combinations:
    first, second = combo
    count1 = sampled_table.loc[first, 1]
    count2 = sampled_table.loc[second, 1]
    nobs1 = group_totals.loc[first]
    nobs2 = group_totals.loc[second]

    # A group whose scaled counts rounded down to zero cannot be tested; the
    # z-test would return NaN, which would otherwise be reported as
    # "no significant difference".
    if nobs1 == 0 or nobs2 == 0:
        print(f"\nComparing {first} vs {second}: skipped (a group has no observations after down-scaling).")
        continue

    stat, pval = proportions_ztest([count1, count2], [nobs1, nobs2])

    print(f"\nComparing {first} vs {second}:")
    print(f"Z-statistic: {stat}")
    print(f"P-value: {pval}")

    # Decision uses the Bonferroni-adjusted threshold, not the raw alpha.
    if pval < alpha_adjusted:
        print(f"The proportions of successes between {first} and {second} are significantly different.")
    else:
        print(f"No significant difference in proportions between {first} and {second}.")



Pairwise Two-Proportion Z-tests on Sample:

Comparing b vs e:
Z-statistic: -11.612631068906298
P-value: 3.5550537785408604e-31
The proportions of successes between b and e are significantly different.

Comparing b vs g:
Z-statistic: -5.329357255385819
P-value: 9.856095029033209e-08
The proportions of successes between b and g are significantly different.

Comparing b vs k:
Z-statistic: -6.398738672189651
P-value: 1.566657086067309e-10
The proportions of successes between b and k are significantly different.

Comparing b vs l:
Z-statistic: -3.749026289566177
P-value: 0.00017752247763547276
The proportions of successes between b and l are significantly different.

Comparing b vs n:
Z-statistic: -5.985148307250141
P-value: 2.161927796213129e-09
The proportions of successes between b and n are significantly different.

Comparing b vs o:
Z-statistic: -9.634952007100905
P-value: 5.691859136667364e-22
The proportions of successes between b and o are significantly different.

Comparing b vs p

1. **Significant Differences in Proportions**:
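   - **b vs. e**: There's a significant difference between cap colors 'b' and 'e' in the proportion of poisonous mushrooms. Because the test counts class 1 (poisonous, after the 'p' → 1 recoding) as the "success", the negative Z-score indicates that 'b' has a lower proportion of poisonous mushrooms (i.e., a higher proportion of edible mushrooms) than 'e'.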
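   - **g vs. y**: Similarly, there's a significant difference between the proportions of edible and poisonous mushrooms with caps colored 'g' and 'y'. Since class 1 (poisonous) is the counted outcome, a positive Z-score would mean that 'g' has a higher proportion of poisonous mushrooms than 'y'. (Note: cap color 'y' does not appear in the contingency table shown above, so this comparison should be re-checked against the current data.)
   - **b vs. y**: A positive Z-score would likewise mean that the proportion of poisonous mushrooms with cap color 'b' is higher than for cap color 'y'; the same caveat about 'y' applies.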

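2. **Non-significant Differences in Proportions** (these claims need re-checking against the output above):
   - **e vs. g**: This pair was reported as not significantly different, but the contingency table shows roughly 78% poisonous for 'e' (3155 of 4035) versus roughly 48% for 'g' (2123 of 4420); a gap this large at these sample sizes should be significant, so verify it against the full test output.
   - **e vs. y**: This pair was reported as not significantly different; however, cap color 'y' does not appear in the contingency table shown above, so this cannot be confirmed from the current data.
   - **g vs. b**: The test output above reports a significant difference for b vs. g (Z ≈ -5.33, p ≈ 9.9e-08), so the proportions for cap colors 'g' and 'b' should not be described as not significantly different.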

**Implications**: 
If the goal is to understand which cap colors are more commonly associated with edible mushrooms (and which with poisonous ones), this analysis suggests:
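- Mushrooms with cap color 'b' might have a higher likelihood of being edible compared to those with cap color 'y'; note that 'y' is absent from the contingency table shown above, so this needs to be re-checked against the current data.
- Mushrooms with cap color 'g' might also have a higher likelihood of being edible compared to those with cap color 'y', subject to the same re-check.
- Among cap colors 'b', 'e', and 'g', the test output above shows that 'b' differs significantly from both 'e' and 'g': about 21% of 'b' mushrooms are poisonous, versus about 78% for 'e' and about 48% for 'g' in the contingency table, so 'b' is the most edible-leaning of the three.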

In [33]:
# Chi-squared test of independence on the full cap-color x class table.
# Only the statistic and the p-value are used; the remaining outputs are discarded.
chi2, p, *_ = chi2_contingency(contingency_table)

print(f"Chi-squared Value = {chi2}")
print(f"P-value = {p}")

alpha = 0.05
# Pick the conclusion text first, then print it once.
verdict = (
    "We reject the null hypothesis. There's a significant association between cap-color and class."
    if p < alpha
    else "We fail to reject the null hypothesis. No significant association between cap-color and class."
)
print(verdict)

Chi-squared Value = 3541.079818275377
P-value = 0.0
We reject the null hypothesis. There's a significant association between cap-color and class.


Given that the chi-squared value is very large and the p-value is effectively 0 (it prints as 0.0 because it is smaller than floating-point precision can represent), it is clear that cap color and edibility are not independent in this dataset.

Thus, the cap color of a mushroom could be considered as one of the predictors or indicators of its edibility, at least based on this dataset. However, caution should always be taken when using such results in real-world applications, especially when health and safety are concerned. Always remember that while there's a significant association between the two variables in this dataset, it does not guarantee that this association will hold true in every scenario or region. Other factors could also come into play, and proper identification by experts is essential.