In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the source CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='Data_Preproc_NoEncoding.csv')
# Show the first five rows as a quick sanity check
df.head(n=5)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,15.26,x,g,o,f,e,c,w,16.95,17.09,w,t,g,d,w
1,16.6,x,g,o,f,e,c,w,17.99,18.19,w,t,g,d,u
2,14.07,x,g,o,f,e,c,w,17.8,17.74,w,t,g,d,w
3,14.17,f,h,e,f,e,c,w,15.77,15.98,w,t,p,d,w
4,14.64,x,h,o,f,e,c,w,16.53,17.2,w,t,p,d,w


In [3]:
# Snapshot every column label of the loaded frame
feature_columns = list(df.columns)

# Keep only the columns whose dtype is `object`; these are treated as categorical
categorical_cols = [name for name in feature_columns if df[name].dtype == 'object']

print(categorical_cols)

['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-color', 'has-ring', 'ring-type', 'habitat', 'season']


In [4]:
# Pull just the categorical columns out into their own dataframe
df_cat = df.loc[:, categorical_cols]
df_cat.head(n=5)

Unnamed: 0,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-color,has-ring,ring-type,habitat,season
0,x,g,o,f,e,c,w,w,t,g,d,w
1,x,g,o,f,e,c,w,w,t,g,d,u
2,x,g,o,f,e,c,w,w,t,g,d,w
3,f,h,e,f,e,c,w,w,t,p,d,w
4,x,h,o,f,e,c,w,w,t,p,d,w


In [5]:
# Whatever is left after removing the categorical columns is the numerical dataframe
df_num = df.drop(columns=categorical_cols)
df_num.head(n=5)

Unnamed: 0,cap-diameter,stem-height,stem-width
0,15.26,16.95,17.09
1,16.6,17.99,18.19
2,14.07,17.8,17.74
3,14.17,15.77,15.98
4,14.64,16.53,17.2


In [7]:
# One hot encode the categorical columns.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output`, and 1.4
# removed the old name entirely, so try the current keyword first and fall
# back to the legacy one only when the installed release does not know it.
try:
    onehotencoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehotencoder = OneHotEncoder(sparse=False)

# With dense output requested, fit_transform returns a 2-D array with one
# 0/1 column per (feature, category) pair. Wrap it in a DataFrame that reuses
# df_cat's index so a later column-wise concat with other frames derived from
# the same source lines up row-for-row instead of relying on a default index.
# NOTE(review): the columns keep their positional integer labels (0..k-1);
# onehotencoder.get_feature_names_out(categorical_cols) would give descriptive
# names, but that changes the saved header — confirm with downstream consumers.
df_cat_onehot = pd.DataFrame(
    onehotencoder.fit_transform(df_cat),
    index=df_cat.index,
)

print("Shape of the one hot encoded categorical data: ", df_cat_onehot.shape)

Shape of the one hot encoded categorical data:  (61069, 89)




In [8]:
# Place the numerical columns and the one-hot columns side by side (column-wise join)
df_final = pd.concat(objs=[df_num, df_cat_onehot], axis='columns')
df_final.head(n=5)

Unnamed: 0,cap-diameter,stem-height,stem-width,0,1,2,3,4,5,6,...,79,80,81,82,83,84,85,86,87,88
0,15.26,16.95,17.09,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,16.6,17.99,18.19,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,14.07,17.8,17.74,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,14.17,15.77,15.98,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,14.64,16.53,17.2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Persist the merged frame as CSV, leaving the index out of the file
df_final.to_csv(path_or_buf='Data_Preproc_OneHotEncoding.csv', index=False)