In [1]:
# import dependencies
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

In [2]:
# Read the cleaned mushroom dataset that the model is trained on,
# then list its columns so the available features are visible.
mushrooms_csv = '../Resources/mushrooms_df_clean.csv'
orig_df = pd.read_csv(filepath_or_buffer=mushrooms_csv)
orig_df.columns

Index(['Unnamed: 0', 'Poisonous or Edible', 'Cap-Shape', 'Cap-Surface',
       'Cap-Color', 'Bruises', 'Odor', 'Gill-attachment', 'gill-spacing',
       'Gill-size', 'Gill-color', 'stalk-shape', 'stalk-root',
       'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'stalk-color-above-ring', 'stalk color below ring', 'veil-type',
       'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
       'population', 'habitat'],
      dtype='object')

In [3]:
# Work on an independent copy holding only the target column plus the
# feature columns that can be filtered on.
keep_cols = ['Poisonous or Edible', 'Bruises', 'Odor', 'stalk-shape', 'ring-type']
new_df = orig_df.loc[:, keep_cols].copy()
new_df.head()

Unnamed: 0,Poisonous or Edible,Bruises,Odor,stalk-shape,ring-type
0,p,t,p,e,p
1,e,f,n,t,e
2,e,f,n,t,e
3,e,f,n,t,e
4,e,f,n,t,e


In [4]:
# Swap the hyphenated feature names for underscore versions; every other
# column keeps its existing name.
renamed_cols = {'stalk-shape': 'stalk_shape', 'ring-type': 'ring_type'}
new_df.columns = [renamed_cols.get(col, col) for col in new_df.columns]
new_df.head()

Unnamed: 0,Poisonous or Edible,Bruises,Odor,stalk_shape,ring_type
0,p,t,p,e,p
1,e,f,n,t,e
2,e,f,n,t,e
3,e,f,n,t,e
4,e,f,n,t,e


In [8]:
# How many rows fall into each ring_type category
ring_type_col = new_df["ring_type"]
ring_type_col.value_counts()

p    3488
l    1296
e     824
n      36
Name: ring_type, dtype: int64

In [5]:
# create function to one-hot encode the data for machine learning
def encodeData(data, target='Poisonous or Edible', positive_label='p'):
    """One-hot encode the categorical columns and split features from target.

    Every ``object``-dtype column of ``data`` (the target included, when it
    is object-typed) is passed through a single ``OneHotEncoder``. The fitted
    encoder is pickled to ``../Resources/encoder.pkl``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the categorical feature columns and the target column.
    target : str, default 'Poisonous or Edible'
        Name of the object-dtype column that holds the class label.
    positive_label : str, default 'p'
        Category of ``target`` that is encoded as 1 in ``y``.

    Returns
    -------
    X : numpy.ndarray of int
        All columns left after encoding, minus the one-hot columns that were
        generated from ``target``.
    y : pandas.Series of int
        1 where ``target == positive_label``, otherwise 0.

    Raises
    ------
    KeyError
        If ``target`` is not one of the object-dtype columns, or if
        ``positive_label`` is not one of its categories.
    """
    df = data

    # setup data for OneHotEncoder
    app_cat = df.dtypes[df.dtypes == "object"].index.tolist()
    if target not in app_cat:
        raise KeyError(
            "target column {!r} is not a categorical (object) column".format(target)
        )

    # Create a OneHotEncoder instance that returns a dense array.
    # The keyword was renamed from `sparse` to `sparse_output` in newer
    # scikit-learn releases, so try the new spelling first.
    try:
        enc = OneHotEncoder(sparse_output=False)
    except TypeError:
        enc = OneHotEncoder(sparse=False)

    # Fit and transform the OneHotEncoder using the categorical variable list.
    # Reusing df.index keeps the index-based merge below aligned even when
    # `data` does not carry a default 0..n-1 index.
    encoded_df = pd.DataFrame(enc.fit_transform(df[app_cat]), index=df.index)

    # Save encoder state. This has to happen AFTER fitting, otherwise the
    # pickle would not contain the learned categories.
    # NOTE(review): the saved encoder was also fitted on the target column,
    # so whatever loads it must supply that column too - confirm with the
    # consumer before changing this.
    enc_file = '../Resources/encoder.pkl'
    with open(enc_file, 'wb') as enc_handle:
        pickle.dump(enc, enc_handle)

    # Add the encoded variable names to the dataframe.
    # `get_feature_names` is absent from newer scikit-learn releases and
    # `get_feature_names_out` from older ones; use whichever is available.
    feature_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names
    encoded_df.columns = feature_names(app_cat)

    # Merge one-hot encoded features and drop the originals
    mushroom_df = df.merge(encoded_df, left_index=True, right_index=True)
    mushroom_df = mushroom_df.drop(app_cat, axis=1)

    # One-hot columns generated from the target, named "<target>_<category>"
    target_categories = list(enc.categories_[app_cat.index(target)])
    if positive_label not in target_categories:
        raise KeyError(
            "label {!r} not found in column {!r} (categories: {})".format(
                positive_label, target, target_categories
            )
        )
    target_cols = ['{}_{}'.format(target, cat) for cat in target_categories]
    positive_col = '{}_{}'.format(target, positive_label)

    # Remove class target from features data.
    # Use Poisonous as our "True" value for predictions. The column is
    # selected by name: the encoder sorts categories, so the first target
    # column is the 'e' (edible) indicator, not the poisonous one.
    X = mushroom_df.drop(columns=target_cols).values.astype(int)
    y = mushroom_df[positive_col].astype(int)

    return X, y

In [6]:
# Split the preprocessed data into a training and testing dataset
X, y = encodeData(new_df)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

# USE SELECTED CLASSIFIER MODEL ON DATA (XGBoost)
# XGBoost supports categorical data as "experimental" starting with version 1.5
# NOTE(review): the GPU settings below presumably need a CUDA-enabled XGBoost
# build and an available GPU; newer XGBoost releases may expect different
# parameter names - verify against the installed version.

# fit model
model = XGBClassifier(
    tree_method='gpu_hist',     # only use gpu_hist or gpu_predictor
    enable_categorical=True,
    use_label_encoder=False,    # label encoding removed in v1.6 Use OneHotEncoding.
    max_depth=4,
    predictor='gpu_predictor',
    random_state=1
)

model.fit(X_train,y_train)

# Save model to Pickle (context manager so the file is flushed and closed)
filename = '../Resources/fit_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Accuracy on the held-out 20% split, shown as the cell output so the saved
# model is not shipped without any validation
model.score(X_test, y_test)

