#### Import libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Dummy Variables: Encoding Categorical Data

#### Import dataset

In [2]:
# Toy customer dataset: three categorical columns followed by two numeric ones.
data = dict(
    Gender=['Male', 'Female', 'Female', 'Male', 'Female'],
    Country=['Germany', 'France', 'Spain', 'Germany', 'Spain'],
    Purchased=['Yes', 'No', 'No', 'Yes', 'Yes'],
    Salary=[72000, 48000, 35000, 56000, 45000],
    Age=[44, 36, 20, 28, 30],
)

# Each dict key becomes a column; rows get the default 0..4 integer index.
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,Gender,Country,Purchased,Salary,Age
0,Male,Germany,Yes,72000,44
1,Female,France,No,48000,36
2,Female,Spain,No,35000,20
3,Male,Germany,Yes,56000,28
4,Female,Spain,Yes,45000,30


#### Option 1: pandas `get_dummies`

In [3]:
# One-hot encode every object-typed column of `df`; numeric columns pass through.
# drop_first=True drops the first level of each categorical to avoid
# multicollinearity (especially useful for regression models).
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 would otherwise
# return True/False columns, which does not match the output shown below or the
# integer indicators produced by the OneHotEncoder variant.
df1 = pd.get_dummies(df, drop_first=True, dtype=int)
df1

Unnamed: 0,Salary,Age,Gender_Male,Country_Germany,Country_Spain,Purchased_Yes
0,72000,44,1,1,0,1
1,48000,36,0,0,0,0
2,35000,20,0,0,1,0
3,56000,28,1,1,0,1
4,45000,30,0,0,1,1


#### Option 2: scikit-learn `OneHotEncoder`

In [6]:
from sklearn.preprocessing import OneHotEncoder

# Columns to one-hot encode; defined once so the fit and the feature names agree.
categorical_cols = ['Gender', 'Country', 'Purchased']

# drop='first' removes the first level of each feature to avoid multicollinearity.
# The `sparse=False` keyword is intentionally not used: it was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so passing it raises
# TypeError on current releases. Densifying the default sparse result with
# .toarray() behaves the same on both old and new versions.
encoder = OneHotEncoder(drop='first')
encoded_data = encoder.fit_transform(df[categorical_cols]).toarray()
feature_names = encoder.get_feature_names_out(categorical_cols)

# The encoder emits floats (0.0 / 1.0); cast to int for clean 0/1 indicators.
encoded_df = pd.DataFrame(encoded_data, columns=feature_names, dtype=int)

# Re-attach the numeric columns; reset_index keeps the row labels aligned with
# the freshly built encoded frame (which has a default 0..n-1 index).
df2 = pd.concat([df[['Salary', 'Age']].reset_index(drop=True), encoded_df], axis=1)

df2

Unnamed: 0,Salary,Age,Gender_Male,Country_Germany,Country_Spain,Purchased_Yes
0,72000,44,1,1,0,1
1,48000,36,0,0,0,0
2,35000,20,0,0,1,0
3,56000,28,1,1,0,1
4,45000,30,0,0,1,1


### Feature Scaling

#### Standard Scaler (mean 0, standard deviation 1; values mostly fall within [-3, 3])

In [17]:
from sklearn.preprocessing import StandardScaler

# Standardize only the numeric features; categorical / dummy columns do not
# need standard scaling.
scaler = StandardScaler()
scaled_X = pd.DataFrame(
    data=scaler.fit_transform(df[['Salary', 'Age']]),
    columns=['Salary', 'Age'],
)

# Place the untouched categorical columns beside the standardized numeric ones.
df3 = pd.concat(
    [
        df[['Gender', 'Country', 'Purchased']].reset_index(drop=True),
        scaled_X,
    ],
    axis=1,
)
df3

Unnamed: 0,Gender,Country,Purchased,Salary,Age
0,Male,Germany,Yes,1.679605,1.542308
1,Female,France,No,-0.258401,0.54727
2,Female,Spain,No,-1.308154,-1.442804
3,Male,Germany,Yes,0.387601,-0.447767
4,Female,Spain,Yes,-0.500652,-0.199007


#### Normalization [0, 1]

In [18]:
from sklearn.preprocessing import MinMaxScaler  # normalization, a.k.a. min/max scaling

# Rescale only the numeric features with the scaler's default settings.
scaler = MinMaxScaler()
normalized_X = pd.DataFrame(
    data=scaler.fit_transform(df[['Salary', 'Age']]),
    columns=['Salary', 'Age'],
)

# Place the untouched categorical columns beside the normalized numeric ones.
df4 = pd.concat(
    [
        df[['Gender', 'Country', 'Purchased']].reset_index(drop=True),
        normalized_X,
    ],
    axis=1,
)

df4

Unnamed: 0,Gender,Country,Purchased,Salary,Age
0,Male,Germany,Yes,1.0,1.0
1,Female,France,No,0.351351,0.666667
2,Female,Spain,No,0.0,0.0
3,Male,Germany,Yes,0.567568,0.333333
4,Female,Spain,Yes,0.27027,0.416667
