# Project - Predictive Analysis

This notebook uses machine learning to predict whether a mushroom is poisonous based on its odor and gill color.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

## Load Data

In [None]:
# Load the processed file produced by the previous assignment. Only a missing
# file triggers the fallback; any other failure (corrupt CSV, permissions,
# interrupt) is allowed to surface instead of being silently swallowed.
try:
    df = pd.read_csv('mushroom_data_processed.csv')
except FileNotFoundError:
    # Fallback: rebuild the three columns we need from the raw UCI data.
    cols = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
            'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
            'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
            'stalk-surface-below-ring', 'stalk-color-above-ring',
            'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
            'ring-type', 'spore-print-color', 'population', 'habitat']
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
    temp = pd.read_csv(url, names=cols)
    df = temp[['class', 'odor', 'gill-color']].copy()

    # Use .map rather than .replace: replacing strings with ints relies on
    # pandas' deprecated silent downcasting, without which the column stays
    # object dtype. .map yields a numeric column directly.
    df['poisonous'] = df['class'].map({'e': 0, 'p': 1})
    df['odor_num'] = df['odor'].map(
        {'a': 0, 'l': 1, 'c': 2, 'y': 3, 'f': 4, 'm': 5, 'n': 6, 'p': 7, 's': 8})
    df['gill_num'] = df['gill-color'].map(
        {'k': 0, 'n': 1, 'b': 2, 'h': 3, 'g': 4, 'r': 5, 'o': 6, 'p': 7,
         'u': 8, 'e': 9, 'w': 10, 'y': 11})

# Fail early, with a clear message, if the columns used by the later cells
# are not present (e.g. the processed file came from a different pipeline).
missing_cols = {'odor', 'gill-color', 'poisonous'} - set(df.columns)
if missing_cols:
    raise ValueError(f"Loaded data is missing required columns: {sorted(missing_cols)}")

print(df.shape)
df.head()

## Get Dummies

Convert the categorical `odor` and `gill-color` columns into binary (one-hot) indicator columns with `pd.get_dummies`, as the assignment requires.

In [None]:
# One-hot encode each categorical predictor; the prefix records which
# original column every indicator column came from.
odor_dummies = pd.get_dummies(df['odor'], prefix='odor')
gill_dummies = pd.get_dummies(df['gill-color'], prefix='gill')

# Report how many indicator columns each predictor expanded into.
for label, frame in (("Odor", odor_dummies), ("Gill", gill_dummies)):
    print(f"{label} columns: {len(frame.columns)}")

odor_dummies.head()

In [None]:
# Target vector plus three candidate feature matrices:
# odor only, gill colour only, and the two side by side.
y = df['poisonous']
X_odor, X_gill = odor_dummies, gill_dummies
X_both = pd.concat([odor_dummies, gill_dummies], axis=1)

# Sanity-check the shapes: every matrix should have the same row count as y.
for label, data in (
    ("Target", y),
    ("Odor features", X_odor),
    ("Gill features", X_gill),
    ("Both features", X_both),
):
    print(f"{label}: {data.shape}")

## Split Data

In [None]:
# 70/30 train/test split. All three feature sets share the same settings
# (including the seed), so the split parameters are declared once.
split_kwargs = dict(test_size=0.3, random_state=42)

X_odor_train, X_odor_test, y_odor_train, y_odor_test = train_test_split(
    X_odor, y, **split_kwargs)
X_gill_train, X_gill_test, y_gill_train, y_gill_test = train_test_split(
    X_gill, y, **split_kwargs)
X_both_train, X_both_test, y_both_train, y_both_test = train_test_split(
    X_both, y, **split_kwargs)

n_train, n_test = len(X_odor_train), len(X_odor_test)
print(f"Train: {n_train}")
print(f"Test: {n_test}")

## Logistic Regression

In [None]:
# Fit one logistic regression per feature set (odor, gill, both) and record
# its test accuracy. Iteration order matters only in that `model` / `pred`
# end up holding the "both" run, as before.
lr_splits = {
    'odor': (X_odor_train, X_odor_test, y_odor_train, y_odor_test),
    'gill': (X_gill_train, X_gill_test, y_gill_train, y_gill_test),
    'both': (X_both_train, X_both_test, y_both_train, y_both_test),
}

lr_scores = {}
for name, (X_tr, X_te, y_tr, y_te) in lr_splits.items():
    model = LogisticRegression(max_iter=1000)
    model.fit(X_tr, y_tr)
    pred = model.predict(X_te)
    lr_scores[name] = accuracy_score(y_te, pred)

acc_odor, acc_gill, acc_both = (lr_scores[k] for k in ('odor', 'gill', 'both'))

print("Logistic Regression:")
print(f"Odor: {acc_odor:.4f}")
print(f"Gill: {acc_gill:.4f}")
print(f"Both: {acc_both:.4f}")

## Decision Tree

In [None]:
def _fit_tree(X_tr, y_tr, X_te, y_te):
    """Fit a seeded decision tree and return (tree, test predictions, test accuracy)."""
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_tr, y_tr)
    predictions = tree.predict(X_te)
    return tree, predictions, accuracy_score(y_te, predictions)


# Same three experiments as for logistic regression: odor, gill, both.
model, pred, acc_odor = _fit_tree(X_odor_train, y_odor_train, X_odor_test, y_odor_test)
model, pred, acc_gill = _fit_tree(X_gill_train, y_gill_train, X_gill_test, y_gill_test)
model, pred, acc_both = _fit_tree(X_both_train, y_both_train, X_both_test, y_both_test)

print("Decision Tree:")
print(f"Odor: {acc_odor:.4f}")
print(f"Gill: {acc_gill:.4f}")
print(f"Both: {acc_both:.4f}")

## Random Forest

In [None]:
# Train a 100-tree random forest on each feature set. The fitted forests are
# kept (rf_odor / rf_gill / rf_both) because the confusion-matrix,
# feature-importance and summary cells reuse them.
rf_splits = {
    'odor': (X_odor_train, X_odor_test, y_odor_train, y_odor_test),
    'gill': (X_gill_train, X_gill_test, y_gill_train, y_gill_test),
    'both': (X_both_train, X_both_test, y_both_train, y_both_test),
}

rf_models = {}
rf_scores = {}
for name, (X_tr, X_te, y_tr, y_te) in rf_splits.items():
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_tr, y_tr)
    pred = model.predict(X_te)
    rf_models[name] = model
    rf_scores[name] = accuracy_score(y_te, pred)

rf_odor, rf_gill, rf_both = (rf_models[k] for k in ('odor', 'gill', 'both'))
acc_odor, acc_gill, acc_both = (rf_scores[k] for k in ('odor', 'gill', 'both'))

print("Random Forest:")
print(f"Odor: {acc_odor:.4f}")
print(f"Gill: {acc_gill:.4f}")
print(f"Both: {acc_both:.4f}")

## Confusion Matrix

In [None]:
# Confusion matrix for the odor-only random forest
# (rows = actual class, columns = predicted class).
pred = rf_odor.predict(X_odor_test)
cm = confusion_matrix(y_odor_test, pred)
print("Odor predictor:")
print(cm)

fig, ax = plt.subplots(figsize=(6, 4))
heatmap = ax.imshow(cm, cmap='Blues')
fig.colorbar(heatmap, ax=ax)
ax.set_title('Confusion Matrix - Odor')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
# Write the raw count into each of the four cells of the 2x2 matrix.
for i, j in np.ndindex(2, 2):
    ax.text(j, i, cm[i, j], ha='center', va='center')
plt.show()

In [None]:
# Confusion matrix for the gill-colour-only random forest
# (rows = actual class, columns = predicted class).
pred = rf_gill.predict(X_gill_test)
cm = confusion_matrix(y_gill_test, pred)
print("Gill predictor:")
print(cm)

fig, ax = plt.subplots(figsize=(6, 4))
heatmap = ax.imshow(cm, cmap='Blues')
fig.colorbar(heatmap, ax=ax)
ax.set_title('Confusion Matrix - Gill Color')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
# Write the raw count into each of the four cells of the 2x2 matrix.
for i, j in np.ndindex(2, 2):
    ax.text(j, i, cm[i, j], ha='center', va='center')
plt.show()

## Feature Importance

In [None]:
# Rank the one-hot columns by how much the combined-feature forest relied on them.
importances = rf_both.feature_importances_
features = X_both.columns

# Pair each column name with its importance and sort, most important first.
imp_df = (
    pd.DataFrame({'feature': features, 'importance': importances})
    .sort_values('importance', ascending=False)
)

print("Top 10 features:")
print(imp_df.head(10))

In [None]:
# Aggregate per-column importances back to the original predictor they came
# from: any column whose name contains 'odor' counts toward odor, everything
# else toward gill colour.
odor_total = 0
gill_total = 0

for feature, weight in zip(features, importances):
    if 'odor' in feature:
        odor_total += weight
    else:
        gill_total += weight

print(f"Odor total importance: {odor_total:.4f}")
print(f"Gill total importance: {gill_total:.4f}")

group_labels = ['Odor', 'Gill']
group_totals = [odor_total, gill_total]
plt.bar(group_labels, group_totals)
plt.title('Feature Importance by Type')
plt.ylabel('Total Importance')
plt.show()

## Summary

In [None]:
# Build one table of test accuracy: rows = model, columns = feature set.
#
# The table is assembled from complete rows of floats rather than by creating
# integer-zero placeholder columns and writing floats into them afterwards:
# assigning a float into an int64 column is a deprecated upcast in pandas
# (PDEP-6) and stops working in newer versions.
summary_splits = {
    'Odor': (X_odor_train, X_odor_test, y_odor_train, y_odor_test),
    'Gill': (X_gill_train, X_gill_test, y_gill_train, y_gill_test),
    'Both': (X_both_train, X_both_test, y_both_train, y_both_test),
}

# Logistic regression and decision tree are refit here, because the earlier
# cells did not keep those fitted models.
refit_factories = [
    ('Logistic Regression', lambda: LogisticRegression(max_iter=1000)),
    ('Decision Tree', lambda: DecisionTreeClassifier(random_state=42)),
]

summary_rows = []
for model_name, make_model in refit_factories:
    row = {'Model': model_name}
    for feature_name, (X_tr, X_te, y_tr, y_te) in summary_splits.items():
        model = make_model()
        model.fit(X_tr, y_tr)
        row[feature_name] = float(accuracy_score(y_te, model.predict(X_te)))
    summary_rows.append(row)

# Random forest: reuse the forests already fitted in the Random Forest section.
fitted_forests = {'Odor': rf_odor, 'Gill': rf_gill, 'Both': rf_both}
forest_row = {'Model': 'Random Forest'}
for feature_name, (_, X_te, _, y_te) in summary_splits.items():
    forest = fitted_forests[feature_name]
    forest_row[feature_name] = float(accuracy_score(y_te, forest.predict(X_te)))
summary_rows.append(forest_row)

results = pd.DataFrame(summary_rows, columns=['Model', 'Odor', 'Gill', 'Both'])

print(results)

In [None]:
# Grouped bar chart: one cluster per model, one bar per feature set.
x = [0, 1, 2]
width = 0.25

# Shift each feature set's bars left / centre / right within the cluster.
for offset, column in zip((-width, 0, width), ('Odor', 'Gill', 'Both')):
    positions = [pos + offset for pos in x]
    plt.bar(positions, results[column], width, label=column)

plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Model Comparison')
plt.xticks(x, results['Model'])
plt.legend()
plt.ylim([0.8, 1.0])
plt.show()

## Conclusions

Odor is definitely the better predictor. It achieves over 97% accuracy with every model, and Random Forest is almost perfect.

Gill color is okay but not as good - around 90-92% accuracy.

Using both features together helps only slightly, because odor alone is already such a strong predictor.

The feature importance chart shows odor features are way more important than gill color features.

Random Forest worked best overall.