In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
df = pd.read_csv("data/mushrooms.csv")

In [4]:
df["class"] = df["class"].apply(lambda x: 1 if "p" else 0)

In [5]:
train_set, test_set = train_test_split(df, test_size = 0.3, random_state = 10)

In [6]:
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [7]:
lab_cols = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']

In [8]:
#target pipeline

target_pipeline = Pipeline(
    steps=[
        ("ordinal_encoder", OrdinalEncoder()),
        ("pca", PCA(n_components=10))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("lab_pipeline", target_pipeline, lab_cols)
    ]
)


In [9]:
target_column = "class"

In [10]:
drop_columns = [target_column]

In [11]:
input_feature_train_df = train_set.drop(columns = drop_columns, axis = 1)
target_feature_train_df = train_set[target_column]

In [12]:
input_feature_test_df = test_set.drop(columns = drop_columns, axis = 1)
target_feature_test_df = test_set[target_column]

In [15]:
input_feature_train_arr = preprocessor.fit_transform(input_feature_train_df)
input_feature_test_arr = preprocessor.transform(input_feature_test_df)

In [16]:
train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]


In [17]:
train_arr

array([[ 6.40005874,  0.71737581,  0.77474753, ...,  0.25743174,
        -0.21809242,  1.        ],
       [-5.10338118, -3.87298142,  2.50305722, ..., -2.61325944,
         1.13597538,  1.        ],
       [-0.6933122 , -1.93861363,  0.13861589, ..., -0.42351017,
         1.81023608,  1.        ],
       ...,
       [-0.45774016,  3.15935309,  5.95314147, ..., -2.49596369,
        -1.61180861,  1.        ],
       [-2.42778227, -5.5042774 , -2.17134532, ...,  0.64643389,
         0.35123841,  1.        ],
       [-0.55061669,  1.99372197, -3.88513224, ...,  0.87545126,
        -1.54869207,  1.        ]])

In [18]:
test_arr

array([[-1.52960887,  3.01150973, -3.09623674, ..., -0.31633862,
        -1.1045539 ,  1.        ],
       [-2.11216056, -1.76907499, -2.01350501, ..., -1.23054355,
         0.23710009,  1.        ],
       [-0.62662671,  2.04344835, -2.98526961, ...,  0.34822784,
        -1.26227765,  1.        ],
       ...,
       [ 2.27413708,  0.13396777, -1.67821525, ..., -0.66244189,
         0.69143697,  1.        ],
       [-0.97865513, -1.21011499,  4.3710846 , ..., -0.24964597,
        -1.18054038,  1.        ],
       [-5.75455549,  0.70522095, -4.61199863, ..., -0.07557396,
        -0.0329173 ,  1.        ]])