# Assignment - Preprocessing Mushroom Data

This assignment is about getting the mushroom data ready for machine learning.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [None]:
# Column names for the mushroom file, in file order. They are supplied
# explicitly through `names=` when the CSV is read below.
cols = [
    'class',
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises',
    'odor',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-shape',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'population',
    'habitat',
]

# Read the raw file straight from the UCI repository.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
df = pd.read_csv(url, names=cols)

# Row count first, then a preview of the first rows as the cell output.
print(df.shape[0])
df.head()

## Select Columns

Only three columns are needed for this assignment: class, odor, and gill-color.

In [None]:
# Keep the target column plus the two features under study. The copy makes
# `data` independent of `df`, so columns added later do not touch the raw frame.
selected = ['class', 'odor', 'gill-color']
data = df.loc[:, selected].copy()
data.head()

In [None]:
# Print the distinct category codes found in each selected column,
# one array per column, in the order class -> odor -> gill-color.
for column in ('class', 'odor', 'gill-color'):
    print(data[column].unique())

## Convert to Numbers

The single-letter category codes need to be converted to numbers before scikit-learn can use them.

In [None]:
# Lookup tables from the single-letter category codes to integer codes.
# The integers are arbitrary labels for nominal categories, not a ranking.
class_codes = {'e': 0, 'p': 1}
odor_codes = {
    'a': 0, 'l': 1, 'c': 2, 'y': 3, 'f': 4, 'm': 5, 'n': 6, 'p': 7, 's': 8
}
gill_codes = {
    'k': 0, 'n': 1, 'b': 2, 'h': 3, 'g': 4, 'r': 5, 'o': 6, 'p': 7, 'u': 8, 'e': 9, 'w': 10, 'y': 11
}

# Series.map builds each numeric column directly from its lookup table.
# Series.replace on a string column instead relies on implicit object->int
# downcasting, which pandas has deprecated, and it silently leaves unmapped
# letters in place as strings.
data['poisonous'] = data['class'].map(class_codes)
data['odor_num'] = data['odor'].map(odor_codes)
data['gill_num'] = data['gill-color'].map(gill_codes)

# map() yields NaN for any letter missing from a lookup table; fail loudly here
# rather than letting NaN leak into the statistics and plots further down.
encoded = data[['poisonous', 'odor_num', 'gill_num']]
assert encoded.notna().all().all(), "found category codes with no numeric mapping"

data.head()

In [None]:
# Summary statistics for the three encoded columns.
encoded_cols = ['poisonous', 'odor_num', 'gill_num']
data.loc[:, encoded_cols].describe()

## Distribution Plots

In [None]:
# Class balance: number of edible (0) vs poisonous (1) mushrooms.
# value_counts() orders its result by frequency, so sort by the class code to
# guarantee bar 0 is edible and bar 1 is poisonous. The tick labels below are
# assigned by position and would otherwise be swapped whenever poisonous is
# the majority class.
class_counts = data['poisonous'].value_counts().sort_index()

class_counts.plot(kind='bar')
plt.title('Edible vs Poisonous')
plt.ylabel('Count')
plt.xticks([0, 1], ['Edible', 'Poisonous'], rotation=0)
plt.show()

# Same counts as text, in the same order as the bars.
print(class_counts)

In [None]:
# Bar chart of how many mushrooms carry each odor code (most frequent first).
odor_counts = data['odor'].value_counts()
ax = odor_counts.plot.bar()
ax.set_title('Odor Types')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Bar chart of how many mushrooms carry each gill-color code (most frequent first).
gill_counts = data['gill-color'].value_counts()
ax = gill_counts.plot.bar()
ax.set(title='Gill Colors', ylabel='Count')
plt.show()

## Scatterplots

In [None]:
# Encoded odor (x) against the 0/1 poisonous flag (y), drawn with
# translucent markers on a fresh figure.
fig, ax = plt.subplots()
ax.scatter(data['odor_num'], data['poisonous'], alpha=0.3)
ax.set_xlabel('Odor')
ax.set_ylabel('Poisonous')
ax.set_title('Odor vs Poisonous')
plt.show()

In [None]:
# Encoded gill color (x) against the 0/1 poisonous flag (y), drawn with
# translucent markers on a fresh figure.
fig, ax = plt.subplots()
ax.scatter(data['gill_num'], data['poisonous'], alpha=0.3)
ax.set(xlabel='Gill Color', ylabel='Poisonous', title='Gill Color vs Poisonous')
plt.show()

In [None]:
# Share of poisonous mushrooms within each odor code, listed in the order the
# codes first appear in the data.
print("Poisonous percentage by odor:")
for code in data['odor'].unique():
    # 0/1 flags for the rows with this odor; their sum is the poisonous count
    flags = data.loc[data['odor'] == code, 'poisonous']
    share = (flags.sum() / len(flags)) * 100
    print(f"{code}: {share:.1f}%")

In [None]:
# Share of poisonous mushrooms within each gill-color code, listed in the
# order the codes first appear in the data.
print("Poisonous percentage by gill color:")
for code in data['gill-color'].unique():
    # 0/1 flags for the rows with this gill color; their sum is the poisonous count
    flags = data.loc[data['gill-color'] == code, 'poisonous']
    share = (flags.sum() / len(flags)) * 100
    print(f"{code}: {share:.1f}%")

In [None]:
# Pairwise correlation matrix of the three encoded columns, printed as text.
encoded_cols = ['poisonous', 'odor_num', 'gill_num']
corr = data[encoded_cols].corr()
print(corr)

# The same matrix as an annotated heatmap.
ax = sns.heatmap(corr, annot=True)
ax.set_title('Correlations')
plt.show()

## Conclusions

Odor looks like a better predictor than gill color. Some odors like foul and fishy are almost 100% poisonous, while others like none and almond are mostly edible. 

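Gill color has some patterns, but they're not as strong. The correlation numbers show odor has a stronger relationship with being poisonous. Note that the numeric codes for odor and gill color are arbitrary labels rather than a ranking, so these correlations are only a rough indicator; the per-category percentages above are the more reliable evidence.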

For the next project, odor should give better predictions.

In [None]:
# Write the selected and encoded columns to a CSV (without the row index)
# so the next assignment can load them.
output_file = 'mushroom_data_processed.csv'
data.to_csv(output_file, index=False)
print("Saved!")