In [None]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [None]:
df= sns.load_dataset("penguins")
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [95]:
num_cols = df.select_dtypes('number').columns.tolist()
print(num_cols)

['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']


In [96]:
cat_cols = df.select_dtypes('object').columns
print(cat_cols.tolist)

<bound method IndexOpsMixin.tolist of Index(['species', 'island', 'sex'], dtype='object')>


In [97]:
num_imp = SimpleImputer()
cal_imp = SimpleImputer(strategy='most_frequent')

In [98]:
df[num_cols] = num_imp.fit_transform(df[num_cols])
df.isnull().sum()

species               0
island                0
bill_length_mm        0
bill_depth_mm         0
flipper_length_mm     0
body_mass_g           0
sex                  11
dtype: int64

In [99]:
# OneHotEncoder is used for features is in catagorical form
# LabelEncoder is used for target data is in catagorical form 

In [100]:
df[['sex']] = cal_imp.fit_transform(df[['sex']])
df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [101]:
sex_enc = LabelEncoder()
df['sex'] = sex_enc.fit_transform(df['sex'])
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,0
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,1
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,0


In [102]:

cat_enc = OneHotEncoder(drop='first')
dummy_cols = cat_enc.fit_transform(df[cat_cols]).toarray()
dummy_cols

array([[0., 0., 0., 1., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 1.]], shape=(344, 5))

In [103]:
df['species'].value_counts()

species
Adelie       152
Gentoo       124
Chinstrap     68
Name: count, dtype: int64

In [104]:
cat_enc = OneHotEncoder(drop='first')
dummy_cols = cat_enc.fit_transform(df[cat_cols]).toarray()
dummy_cols

array([[0., 0., 0., 1., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 1.]], shape=(344, 5))

In [105]:
dummy_df = pd.DataFrame(dummy_cols)
dummy_df.head()

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,1.0
4,0.0,0.0,0.0,1.0,0.0


In [None]:
# Data Scaling

df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,0
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,1
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,0


In [107]:
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,-8.870812e-01,7.877425e-01,-1.422488,-0.565789,1
1,Adelie,Torgersen,-8.134940e-01,1.265563e-01,-1.065352,-0.503168,0
2,Adelie,Torgersen,-6.663195e-01,4.317192e-01,-0.422507,-1.192003,0
3,Adelie,Torgersen,-1.307172e-15,1.806927e-15,0.000000,0.000000,1
4,Adelie,Torgersen,-1.328605e+00,1.092905e+00,-0.565361,-0.941517,0
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,-1.307172e-15,1.806927e-15,0.000000,0.000000,1
340,Gentoo,Biscoe,5.294731e-01,-1.450118e+00,1.006038,0.811880,0
341,Gentoo,Biscoe,1.191758e+00,-7.380718e-01,1.506028,1.939064,1
342,Gentoo,Biscoe,2.351241e-01,-1.195816e+00,0.791756,1.250229,0


In [109]:
# Combining dummy df with df and removing categorical columns
df = pd.concat([df.drop(columns = cat_cols),dummy_df], axis = 1)
df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,0,1,2,3,4
0,-8.870812e-01,7.877425e-01,-1.422488,-0.565789,0.0,0.0,0.0,1.0,1.0
1,-8.134940e-01,1.265563e-01,-1.065352,-0.503168,0.0,0.0,0.0,1.0,0.0
2,-6.663195e-01,4.317192e-01,-0.422507,-1.192003,0.0,0.0,0.0,1.0,0.0
3,-1.307172e-15,1.806927e-15,0.000000,0.000000,0.0,0.0,0.0,1.0,1.0
4,-1.328605e+00,1.092905e+00,-0.565361,-0.941517,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
339,-1.307172e-15,1.806927e-15,0.000000,0.000000,0.0,1.0,0.0,0.0,1.0
340,5.294731e-01,-1.450118e+00,1.006038,0.811880,0.0,1.0,0.0,0.0,0.0
341,1.191758e+00,-7.380718e-01,1.506028,1.939064,0.0,1.0,0.0,0.0,1.0
342,2.351241e-01,-1.195816e+00,0.791756,1.250229,0.0,1.0,0.0,0.0,0.0
