# Dependencies

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Preprocessing

In [14]:
#import data
filepath = 'Resources/Raw/primary_data.csv'
filepath2 = 'Resources/Raw/secondary_data.csv'
df = pd.read_csv(filepath, sep=';')
df.head()

Unnamed: 0,family,name,class,cap-diameter,cap-shape,Cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,Spore-print-color,habitat,season
0,Amanita Family,Fly Agaric,p,"[10, 20]","[x, f]","[g, h]","[e, o]",[f],[e],,...,[s],[y],[w],[u],[w],[t],"[g, p]",,[d],"[u, a, w]"
1,Amanita Family,Panther Cap,p,"[5, 10]","[p, x]",[g],[n],[f],[e],,...,,[y],[w],[u],[w],[t],[p],,[d],"[u, a]"
2,Amanita Family,False Panther Cap,p,"[10, 15]","[x, f]",,"[g, n]",[f],[e],,...,,,[w],[u],[w],[t],"[e, g]",,[d],"[u, a]"
3,Amanita Family,The Blusher,e,"[5, 15]","[x, f]",,[n],[t],,,...,[b],,[w],[u],[w],[t],[g],,[d],"[u, a]"
4,Amanita Family,Death Cap,p,"[5, 12]","[x, f]",[h],[r],[f],,[c],...,,,[w],[u],[w],[t],"[g, p]",,[d],"[u, a]"


In [4]:
#check num of null values
df.isnull().sum()

class                       0
cap-diameter                0
cap-shape                   0
cap-surface             14120
cap-color                   0
does-bruise-or-bleed        0
gill-attachment          9884
gill-spacing            25063
gill-color                  0
stem-height                 0
stem-width                  0
stem-root               51538
stem-surface            38124
stem-color                  0
veil-type               57892
veil-color              53656
has-ring                    0
ring-type                2471
spore-print-color       54715
habitat                     0
season                      0
dtype: int64

In [5]:
#drop features with more than 75% null values
df_smaller = df.drop(columns={'stem-root', 'stem-surface','veil-type', 'veil-color', 'spore-print-color', 'gill-spacing'})
df_smaller.head()

#dropped columns that had large number of nan
#nan could be a useful classifier, will consider re-running with the nans left in

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,p,15.26,x,g,o,f,e,w,16.95,17.09,w,t,g,d,w
1,p,16.6,x,g,o,f,e,w,17.99,18.19,w,t,g,d,u
2,p,14.07,x,g,o,f,e,w,17.8,17.74,w,t,g,d,w
3,p,14.17,f,h,e,f,e,w,15.77,15.98,w,t,p,d,w
4,p,14.64,x,h,o,f,e,w,16.53,17.2,w,t,p,d,w


In [6]:
#dropping rows with nans
df_smaller.dropna(inplace=True)

In [7]:
#one-hot encoding
df_dummies = pd.get_dummies(df_smaller)
df_dummies

Unnamed: 0,cap-diameter,stem-height,stem-width,class_e,class_p,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_o,cap-shape_p,...,habitat_g,habitat_h,habitat_l,habitat_m,habitat_u,habitat_w,season_a,season_s,season_u,season_w
0,15.26,16.95,17.09,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,16.60,17.99,18.19,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,14.07,17.80,17.74,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,14.17,15.77,15.98,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,14.64,16.53,17.20,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,1.18,3.93,6.22,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
61065,1.27,3.18,5.43,0,1,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
61066,1.27,3.86,6.37,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
61067,1.24,3.56,5.44,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [8]:
#separate X and y data
X = df_dummies.drop(columns = {'class_e','class_p'})
y = df_dummies['class_e']

In [9]:
# splitting dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
# scaling the data
# using standard scaler because assuming the mushroom data is normal
X_scaler = StandardScaler()
X_scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Export

In [11]:
# traindata export
train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns.to_list())
train_df = pd.concat([train_df, y_train], axis=1, join='inner')
train_df.to_csv('Resources/Cleaned/train.csv', index=False)

# test data export
test_df = pd.DataFrame(X_test_scaled, columns=X_test.columns.to_list())
test_df = pd.concat([test_df, y_test], axis=1, join='inner')
test_df.to_csv('Resources/Cleaned/test.csv', index=False)