# Project 8

In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import SVG
# You may need to install the Python graphviz library. At the command line:
#   pip install graphviz
# You will also need to install the graphviz executables. You can use apt,
# macports, or other installer for your system.
from graphviz import Source


In [None]:
# Load the mushroom dataset and recode the target column as integers:
# 'e' -> 0 and 'p' -> 1.
mushrooms_csv = "data/agaricus-lepiota.csv"
target_codes = {'e': 0, 'p': 1}

df = pd.read_csv(mushrooms_csv)
df['poisonous'] = df['poisonous'].map(target_codes)
df

## Pick characteristics more correlated with poisonous mushrooms via research
https://environment.co/how-to-identify-poisonous-mushrooms/

Research says
- bulbous growth at the base of a mushroom stem
- White gills
- Red on cap or stem
- Strange odor

### Find most correlated odors with poisonous mushrooms

In [None]:


# One-hot encode the odor column.  Every level is kept (no drop_first) so that
# each odor -- including the first category -- shows up in the correlation
# heatmap below instead of being silently absorbed as the reference level.
df_odors = pd.get_dummies(df['odor'], prefix='odor')

# Remove any odor dummy columns left over from a previous run of this cell
# before attaching the fresh ones; otherwise re-executing the cell would leave
# duplicate column labels in df and break later df[feature_cols] selections.
df = df.drop(columns=df_odors.columns, errors='ignore')
df = pd.concat([df, df_odors], axis=1)

# Features for this analysis are exactly the odor dummy columns.
related = df_odors.columns.tolist()
X = df[related]
y = df['poisonous']

# Combine features and target so a single corr() call yields both the
# odor-vs-odor and the odor-vs-poisonous correlations.
df_corr = pd.concat([X, y], axis=1)
correlation_matrix = df_corr.corr()

# Annotated heatmap of the full correlation matrix.
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix of Mushroom Features')
plt.show()


### Create new feature for any red color

In [None]:
# Color columns inspected for the red marker.
red_related_columns = ['cap-color', 'gill-color', 'stalk-color-above-ring', 'stalk-color-below-ring']

# Flag rows where at least one of the color columns holds the code 'e'
# (the value this notebook treats as red).  The comparison is vectorised over
# the whole frame rather than applying a Python lambda to every row.
df['contains-red'] = df[red_related_columns].eq('e').any(axis=1)

# Show only the flagged rows; the column is already boolean, so it is used
# directly as the mask.
df[df['contains-red']]

### Create new fields for gill-color is white and bulbous stem type

In [None]:
# Boolean flag: gill color is coded 'w' (treated as white in this notebook).
df['gill-white'] = df['gill-color'].eq('w')
# Disabled experiment, kept for reference:
#df['skirt-ring'] = df['ring-type'].isin(['p', 'f'])
# Boolean flag: stalk-root code is one of the values counted as a bulbous base.
# NOTE(review): confirm 'b' and 'u' are the intended codes in the dataset key.
bulbous_codes = ['b', 'u']
df['bulbous-stem'] = df['stalk-root'].isin(bulbous_codes)
df

## Create decision tree

In [None]:

# Predictors: two odor dummy columns plus the three engineered boolean flags.
feature_cols = ['odor_f', 'odor_n', 'contains-red', 'gill-white', 'bulbous-stem']

# Target vector and feature matrix
y = df['poisonous']
X = df[feature_cols]

# Depth-3 classification tree trained on the full dataset (no hold-out in
# this cell), with a fixed random_state.
treeclf = DecisionTreeClassifier(random_state=1, max_depth=3)
treeclf.fit(X, y)

# Export the fitted tree as Graphviz DOT text, then render it inline as SVG.
dot_text = tree.export_graphviz(
    treeclf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
)
graph = Source(dot_text)
display(SVG(graph.pipe(format='svg')))

### Test Tree

In [None]:

# Hold out 20% of the rows for evaluation, using a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Refit a depth-3 tree on the training portion only and predict the held-out rows.
treeclf = DecisionTreeClassifier(random_state=1, max_depth=3).fit(X_train, y_train)
y_pred = treeclf.predict(X_test)

# average=None yields one score per class; the prints below read index 1 as
# the poisonous class and index 0 as the non-poisonous class.
per_class_scores = precision_recall_fscore_support(y_test, y_pred, average=None)
precision, recall, fscore, _ = per_class_scores

print(f"Precision (Poisonous): {precision[1]:.2f}, Precision (Non-Poisonous): {precision[0]:.2f}")
print(f"Recall (Poisonous): {recall[1]:.2f}, Recall (Non-Poisonous): {recall[0]:.2f}")
print(f"F1 Score (Poisonous): {fscore[1]:.2f}, F1 Score (Non-Poisonous): {fscore[0]:.2f}")
