In [19]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

In [20]:
# read the mushroom dataset from its CSV export into a dataframe
mushroom_csv_path = "/Users/cartermain/Downloads/mushroom.csv"
df = pd.read_csv(mushroom_csv_path)

In [86]:
# tally the missing values in each column and show the per-column totals
null_counts = df.isnull().sum()
print(null_counts)

cap-shape                         0
cap-surface                       0
cap-color                         0
bruises%3F                        0
odor                              0
gill-attachment                   0
gill-spacing                      0
gill-size                         0
gill-color                        0
stalk-shape                       0
stalk-root                        0
stalk-surface-above-ring          0
stalk-surface-below-ring          0
stalk-color-above-ring            0
stalk-color-below-ring            0
veil-type                         0
veil-color                        0
ring-number                       0
ring-type                         0
spore-print-color                 0
population                        0
habitat                           0
class                             0
cap-shape_class                   0
cap-surface_class                 0
cap-color_class                   0
bruises%3F_class                  0
odor_class                  

In [23]:
# Use LabelEncoder to turn each object-dtype column into an integer "<column>_class" column.
# Only columns that do not already end in "_class" are encoded, so re-running this cell
# does not stack "<column>_class_class" copies. Without that guard a rerun would also
# create "class_class_class", a copy of the target that the later `"_class" in feature`
# filter would pull into the feature set.
le = preprocessing.LabelEncoder()
raw_columns = [column for column in df.columns if not column.endswith("_class")]
for feature in raw_columns:
    # `le` is refit on every column, so each column gets its own integer codes
    df[feature + "_class"] = le.fit_transform(df[feature])

In [87]:
# preview the first five rows now that the encoded columns have been added
print(df.head(n=5))

  cap-shape cap-surface cap-color bruises%3F  odor gill-attachment  \
0      b'x'        b's'      b'n'       b't'  b'p'            b'f'   
1      b'x'        b's'      b'y'       b't'  b'a'            b'f'   
2      b'b'        b's'      b'w'       b't'  b'l'            b'f'   
3      b'x'        b'y'      b'w'       b't'  b'p'            b'f'   
4      b'x'        b's'      b'g'       b'f'  b'n'            b'f'   

  gill-spacing gill-size gill-color stalk-shape  ...  \
0         b'c'      b'n'       b'k'        b'e'  ...   
1         b'c'      b'b'       b'k'        b'e'  ...   
2         b'c'      b'b'       b'n'        b'e'  ...   
3         b'c'      b'n'       b'n'        b'e'  ...   
4         b'w'      b'b'       b'k'        b't'  ...   

  stalk-color-above-ring_class stalk-color-below-ring_class veil-type_class  \
0                            7                            7               0   
1                            7                            7               0   
2    

In [25]:
# names of the integer-encoded columns (every column whose name contains "_class")
feature_names = [column for column in df.columns if "_class" in column]

In [26]:
# feature set: all rows, restricted to the encoded columns
features = df.loc[:, feature_names]

In [27]:
# Drop the encoded target ("class_class") from the feature set.
# The frame is rebuilt from `df[feature_names]` instead of from `features` itself, so
# re-running this cell gives the same result rather than raising a KeyError because the
# column is already gone. A genuinely missing target column still raises.
features = df[feature_names].drop(columns="class_class")

In [29]:
# split features and encoded target, putting 70% of the rows in the training set (fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    features,
    df["class_class"],
    train_size=0.7,
    random_state=42,
)

In [30]:
# train a 10-tree random forest (fixed seed) and report its accuracy on the held-out rows
model = RandomForestClassifier(n_estimators=10, random_state=42).fit(x_train, y_train)
test_accuracy = model.score(x_test, y_test)
print(test_accuracy)

1.0


Let's run recursive feature elimination to see whether we can reduce the number of features this model needs while maintaining 100% accuracy, which would make it more efficient.

In [34]:
max_score = 0
best_x = 0 
for x in range(1,len(features.columns)):
    rfe = RFE(estimator = model, n_features_to_select = x)
    rfe.fit(x_train, y_train)
    score = rfe.score(x_test, y_test)
    if score > max_score:
        max_score = score
        best_x = x
        best_support = rfe.support_
print(max_score, best_x, best_support)

1.0 6 [False False False False  True False False  True  True False False False
  True False False False False False  True  True False False]


Nice, we kept the accuracy and significantly reduced the number of features needed. Let's update the feature set with those 6 and retrain before dropping in some test data.

In [36]:
# pair each feature column with its entry in the RFE support mask and keep the selected ones
kept_features = [
    column for column, is_kept in zip(features.columns, best_support) if is_kept
]

In [38]:
# reduced feature frame: all rows, only the columns kept by the elimination step
revised_features = df.loc[:, kept_features]

In [39]:
# redo the split on the reduced feature set, with the same training share and seed
x_train, x_test, y_train, y_test = train_test_split(
    revised_features,
    df["class_class"],
    train_size=0.7,
    random_state=42,
)

In [40]:
# retrain the same model object on the reduced features, then report held-out accuracy
# (`fit` returns the fitted estimator, so the score call can be chained)
print(model.fit(x_train, y_train).score(x_test, y_test))

1.0


As a final step, let's load in some supplemental data that has no edibility classification and use our model to determine whether each mushroom is edible or poisonous.

In [75]:
# read the supplemental mushroom records from their CSV file
test_csv_path = "/Users/cartermain/Downloads/Mushroom Test Data.csv"
test_data = pd.read_csv(test_csv_path)

In [76]:
# Encode each test column with an encoder fitted on the matching column of the original
# dataset, so the integer codes match what the model was trained on.
# Columns already ending in "_class" are skipped. Without that guard, re-running the cell
# would encode the encoded columns again and add "<column>_class_class" columns that the
# following cells would then feed to the model.
raw_test_columns = [column for column in test_data.columns if not column.endswith("_class")]
for feature in raw_test_columns:
    # refit on the training column; `transform` raises if the test data holds a value
    # that never appears in that column
    le.fit(df[feature])
    test_data[feature + "_class"] = le.transform(test_data[feature])

In [77]:
# names of the encoded test columns (every column whose name contains "_class")
class_test_data = [column for column in test_data.columns if "_class" in column]

In [85]:
# Build the prediction matrix from exactly the columns the model was refit on
# (`kept_features`), in that order. Taking every "_class" column in file order only
# lines up with the model when the test CSV happens to hold the same columns in the
# same order; recent scikit-learn versions validate feature names at predict time.
missing_columns = [column for column in kept_features if column not in class_test_data]
if missing_columns:
    raise KeyError(f"test data is missing encoded columns the model needs: {missing_columns}")
test_features = test_data[kept_features]

In [80]:
# run the trained model on the supplemental rows and store the predicted class codes
# NOTE(review): the stored values are the encoded class codes, not booleans -- confirm
# which code stands for "edible" before reading this column
predicted_classes = model.predict(test_features)
test_data["edible?"] = predicted_classes

In [84]:
# show the predicted class code for each supplemental mushroom
print(test_data.loc[:, "edible?"])

0    0
1    0
2    1
3    1
Name: edible?, dtype: int64


Mushrooms 0 and 1 are predicted to be edible (predicted class 0); mushrooms 2 and 3 are not (predicted class 1).