# Data preparation

In [45]:
import pandas as pd
import pickle as rick
from sklearn.preprocessing import StandardScaler

In [39]:
# Column names for the data file, in file order: the target column first,
# followed by the 22 feature columns (grouped below by mushroom part).
labels = [
    # target
    "poisonous_edible",
    # cap
    "cap-shape", "cap-surface", "cap-color",
    # bruising and smell
    "bruises?", "odor",
    # gills
    "gill-attachment", "gill-spacing", "gill-size", "gill-color",
    # stalk
    "stalk-shape", "stalk-root",
    "stalk-surface-above-ring", "stalk-surface-below-ring",
    "stalk-color-above-ring", "stalk-color-below-ring",
    # veil
    "veil-type", "veil-color",
    # ring
    "ring-number", "ring-type",
    # spores, population and habitat
    "spore-print-color", "population", "habitat",
]

In [40]:
# Load the raw data file; it is read without a header row, so the column
# names are supplied from `labels`.
df = pd.read_csv(
    'data/agaricus-lepiota.data',
    header=None,  # implied by passing `names`; spelled out for the reader
    names=labels,
    encoding='unicode_escape',
)

In [41]:
# Target 'y': the 'poisonous_edible' column as a plain Python list.
# It is taken here, before the columns are converted to numbers below,
# so the values stay as the original letters.
y = df.loc[:, 'poisonous_edible'].to_list()

# Preview the first ten targets
y[:10]

['p', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'e']

In [42]:
# Convert every value in every column to a numeric representation: each
# value is replaced by its Unicode code point via ord().
#
# Columns that are already numeric are skipped, so re-running this cell is a
# no-op instead of raising TypeError from calling ord() on an integer.
# ord() itself still raises TypeError for any value that is not a
# one-character string.
#
# NOTE(review): code points give the categories an arbitrary numeric order;
# confirm that is acceptable for the downstream model, or consider one-hot
# encoding instead.
for label in labels:
    if pd.api.types.is_numeric_dtype(df[label]):
        continue  # already converted on an earlier run of this cell
    df[label] = df[label].map(ord)


In [43]:
# Feature matrix 'X': every column of `df` except the target column
X = df.drop('poisonous_edible', axis='columns')

# Preview the first ten rows
X.head(10)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,120,115,110,116,112,102,99,110,107,101,...,115,119,119,112,119,111,112,107,115,117
1,120,115,121,116,97,102,99,98,107,101,...,115,119,119,112,119,111,112,110,110,103
2,98,115,119,116,108,102,99,98,110,101,...,115,119,119,112,119,111,112,110,110,109
3,120,121,119,116,112,102,99,110,110,101,...,115,119,119,112,119,111,112,107,115,117
4,120,115,103,102,110,102,119,98,107,116,...,115,119,119,112,119,111,101,110,97,103
5,120,121,121,116,97,102,99,98,110,101,...,115,119,119,112,119,111,112,107,110,103
6,98,115,119,116,97,102,99,98,103,101,...,115,119,119,112,119,111,112,107,110,109
7,98,121,119,116,108,102,99,98,110,101,...,115,119,119,112,119,111,112,110,115,109
8,120,121,119,116,112,102,99,110,112,101,...,115,119,119,112,119,111,112,107,118,103
9,98,115,121,116,97,102,99,98,103,101,...,115,119,119,112,119,111,112,107,115,109


In [49]:
# Z-scale (standard score) transformation: centre each column on its mean
# and divide by its standard deviation. Both options are the defaults and
# are spelled out only for readability.
scaler = StandardScaler(with_mean=True, with_std=True)

# Learn the per-column statistics from the feature matrix.
# X must still be the unscaled features at this point: a later cell rebinds
# X to the transformed array, so rebuild X before re-running this cell.
scaler.fit(X)

In [54]:
# Apply the fitted Z-scale transformation.
# The input is rebuilt from `df` rather than read from `X`, because this cell
# rebinds `X` to the scaled array: transforming `X` itself would scale the
# already-scaled values again each time the cell is re-run. On a first run
# the result is the same as transforming `X`.
X = scaler.transform(df.drop(columns=['poisonous_edible']))

# Show transformed data
print(X)

[[ 1.07774273  0.16938584  0.12010595 ... -0.65193203 -0.1598263
   2.20918348]
 [ 1.07774273  0.16938584  1.5976869  ... -0.14070656 -0.96730694
  -0.33531051]
 [-1.40169814  0.16938584  1.32903582 ... -0.14070656 -0.96730694
   0.75518692]
 ...
 [-0.95089071  0.16938584  0.12010595 ... -2.18560841 -2.74376435
   0.57343735]
 [-0.38738142  0.93786532  0.12010595 ...  1.39296982  0.32466208
   0.57343735]
 [ 1.07774273  0.16938584  0.12010595 ...  0.02970192 -2.74376435
   0.57343735]]


In [55]:
# Save 'X' and 'y' as pickle objects, one file each, written to the current
# working directory ('X.pickle' first, then 'y.pickle').
for filename, payload in (('X.pickle', X), ('y.pickle', y)):
    with open(filename, 'wb') as f:
        rick.dump(payload, f)