#### Import library

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#### Ordinal Encoder & Label Encoder

In [21]:
# Toy order records used to demonstrate categorical encoders.
# Every column holds string categories; each list has one entry per order (5 rows).
data = dict(
    OrderPriority=['High', 'Medium', 'High', 'Medium', 'High'],
    OrderSatisfaction=[
        'Satisfied',
        'Dissatisfied',
        'Prefer not to answer',
        'Very Satisfied',
        'Very Dissatisfied',
    ],
    Segment=['Customer', 'Ecomm', 'Customer', 'Ecomm', 'Ecomm'],
    Payment=['Credit Card', 'PayPal', 'Cash', 'Credit Card', 'Cash'],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,OrderPriority,OrderSatisfaction,Segment,Payment
0,High,Satisfied,Customer,Credit Card
1,Medium,Dissatisfied,Ecomm,PayPal
2,High,Prefer not to answer,Customer,Cash
3,Medium,Very Satisfied,Ecomm,Credit Card
4,High,Very Dissatisfied,Ecomm,Cash


In [23]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# --- Ordinal encoding: OrderPriority and OrderSatisfaction -------------------
# The position of each level in the list becomes its integer code, so the
# order in which the levels are placed MATTERS.
priority_levels = ['High', 'Medium']
satisfaction_levels = [
    'Very Dissatisfied',
    'Dissatisfied',
    'Prefer not to answer',
    'Satisfied',
    'Very Satisfied',
]

order_priority_encoded = OrdinalEncoder(categories=[priority_levels])
priority_codes = order_priority_encoded.fit_transform(df[['OrderPriority']])
df['OrderPriorityEncoded'] = priority_codes.astype(int)  # cast encoder output to integer codes

customer_satisfaction_encoded = OrdinalEncoder(categories=[satisfaction_levels])
satisfaction_codes = customer_satisfaction_encoded.fit_transform(df[['OrderSatisfaction']])
df['CustomerSatisfactionEncoded'] = satisfaction_codes.astype(int)

# --- Label encoding: Segment and Payment -------------------------------------
# These categories have no inherent ranking, so the order doesn't matter;
# the encoder picks the codes itself.
# NOTE: scikit-learn documents LabelEncoder as intended for target labels (y);
# it is applied to feature columns here purely for demonstration.
segment_encoded = LabelEncoder()
segment_codes = segment_encoded.fit_transform(df['Segment'])
df['SegmentEncoded'] = segment_codes

payment_encoded = LabelEncoder()
payment_codes = payment_encoded.fit_transform(df['Payment'])
df['PaymentMethodEncoded'] = payment_codes

df

Unnamed: 0,OrderPriority,OrderSatisfaction,Segment,Payment,OrderPriorityEncoded,CustomerSatisfactionEncoded,SegmentEncoded,PaymentMethodEncoded
0,High,Satisfied,Customer,Credit Card,0,3,0,1
1,Medium,Dissatisfied,Ecomm,PayPal,1,1,1,2
2,High,Prefer not to answer,Customer,Cash,0,2,0,0
3,Medium,Very Satisfied,Ecomm,Credit Card,1,4,1,1
4,High,Very Dissatisfied,Ecomm,Cash,0,0,1,0


#### TransactionEncoder

In [38]:
# Sample market baskets: one inner list per transaction, one string per item.
# Item labels are written without leading/trailing whitespace because each
# distinct string becomes a column name after encoding; a stray space would
# make 'GLOBE ' and 'GLOBE' two different items and break label-based lookups.
# The repeated 'MINI JIGSAW' in the second basket is kept as in the raw data.
transactions_list = [
    ['GLOBE', 'TOWELS', 'SHEET', 'NIGHT LIGHT', 'CARD GAME', 'STARS GIFT TAPE', 'PUZZLES'],
    ['RAIN HAT', 'CARD GAME', 'MINI JIGSAW', 'MINI JIGSAW', 'CHARLOTTE BAG'],
    ['LUNCH BAG RED RETROSPOT', 'LUNCH BAG', 'MINI CASES', 'FRIDGE MAGNETS', 'POLKADOTS'],
]

In [39]:
from mlxtend.preprocessing import TransactionEncoder

# Fit the encoder on the baskets, then transform the same baskets into an
# array with one row per transaction and one column per entry in TE.columns_.
TE = TransactionEncoder()
TE.fit(transactions_list)
array = TE.transform(transactions_list)

# Wrap the encoded array in a DataFrame labelled with the fitted column names.
df = pd.DataFrame(data=array, columns=TE.columns_)
df

Unnamed: 0,CARD GAME,CHARLOTTE BAG,FRIDGE MAGNETS,GLOBE,LUNCH BAG,LUNCH BAG RED RETROSPOT,MINI CASES,MINI JIGSAW,NIGHT LIGHT,POLKADOTS,PUZZLES,RAIN HAT,SHEET,STARS GIFT TAPE,TOWELS
0,True,False,False,True,False,False,False,False,True,False,True,False,True,True,True
1,True,True,False,False,False,False,False,True,False,False,False,True,False,False,False
2,False,False,True,False,True,True,True,False,False,True,False,False,False,False,False


#### Dummy Variables: Encoding Categorical Data

In [30]:
# Toy customer records: three string-valued categorical columns
# (Gender, Country, Purchased) and two integer columns (Salary, Age).
data = dict(
    Gender=['Male', 'Female', 'Female', 'Male', 'Female'],
    Country=['Germany', 'France', 'Spain', 'Germany', 'Spain'],
    Purchased=['Yes', 'No', 'No', 'Yes', 'Yes'],
    Salary=[72000, 48000, 35000, 56000, 45000],
    Age=[44, 36, 20, 28, 30],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,Gender,Country,Purchased,Salary,Age
0,Male,Germany,Yes,72000,44
1,Female,France,No,48000,36
2,Female,Spain,No,35000,20
3,Male,Germany,Yes,56000,28
4,Female,Spain,Yes,45000,30


#### Option 1

In [3]:
# One-hot encode every string column of `df` in a single call.
# drop_first=True drops the first level of each category to avoid
# multicollinearity (especially useful for regression models).
# dtype=int pins the indicator columns to 0/1 integers; without it the dtype
# depends on the installed pandas version (boolean True/False on pandas >= 2.0).
df1 = pd.get_dummies(df, drop_first=True, dtype=int)
df1

Unnamed: 0,Salary,Age,Gender_Male,Country_Germany,Country_Spain,Purchased_Yes
0,72000,44,1,1,0,1
1,48000,36,0,0,0,0
2,35000,20,0,0,1,0
3,56000,28,1,1,0,1
4,45000,30,0,0,1,1


#### Option 2

In [31]:
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ['Gender', 'Country', 'Purchased']
numeric_cols = ['Salary', 'Age']

# drop='first' removes one level per category to avoid multicollinearity.
# sparse_output=False asks for a dense array (older scikit-learn releases
# spelled this argument `sparse`).
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_data = encoder.fit_transform(df[categorical_cols])
feature_names = encoder.get_feature_names_out(categorical_cols)

# Label the encoded columns with the generated feature names and store them as ints.
encoded_df = pd.DataFrame(data=encoded_data, columns=feature_names, dtype=int)

# Put the untouched numeric columns back next to the encoded ones.
numeric_part = df[numeric_cols].reset_index(drop=True)
df2 = pd.concat([numeric_part, encoded_df], axis=1)

df2

Unnamed: 0,Salary,Age,Gender_Male,Country_Germany,Country_Spain,Purchased_Yes
0,72000,44,1,1,0,1
1,48000,36,0,0,0,0
2,35000,20,0,0,1,0
3,56000,28,1,1,0,1
4,45000,30,0,0,1,1


#### Feature Scaling

#### Standard Scaler (mean 0, standard deviation 1; values typically fall within about [-3, 3])

In [32]:
from sklearn.preprocessing import StandardScaler

# Only the numeric columns are standardised; the categorical / dummy
# variables do not need the standard scaler applied.
scaler = StandardScaler()
scaled_X = pd.DataFrame(
    data=scaler.fit_transform(df[['Salary', 'Age']]),
    columns=['Salary', 'Age'],
)

# Re-attach the categorical columns in front of the scaled numeric ones.
categorical_part = df[['Gender', 'Country', 'Purchased']].reset_index(drop=True)
df3 = pd.concat([categorical_part, scaled_X], axis=1)
df3

Unnamed: 0,Gender,Country,Purchased,Salary,Age
0,Male,Germany,Yes,1.679605,1.542308
1,Female,France,No,-0.258401,0.54727
2,Female,Spain,No,-1.308154,-1.442804
3,Male,Germany,Yes,0.387601,-0.447767
4,Female,Spain,Yes,-0.500652,-0.199007


#### Normalization [0, 1]

In [33]:
from sklearn.preprocessing import MinMaxScaler  # normalization, also known as min/max scaling

# With its default settings MinMaxScaler rescales each column to the range [0, 1].
scaler = MinMaxScaler()
normalized_X = pd.DataFrame(
    data=scaler.fit_transform(df[['Salary', 'Age']]),
    columns=['Salary', 'Age'],
)

# Re-attach the categorical columns in front of the normalized numeric ones.
categorical_part = df[['Gender', 'Country', 'Purchased']].reset_index(drop=True)
df4 = pd.concat([categorical_part, normalized_X], axis=1)

df4

Unnamed: 0,Gender,Country,Purchased,Salary,Age
0,Male,Germany,Yes,1.0,1.0
1,Female,France,No,0.351351,0.666667
2,Female,Spain,No,0.0,0.0
3,Male,Germany,Yes,0.567568,0.333333
4,Female,Spain,Yes,0.27027,0.416667
