In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# 1. One Hot Encoding

In [25]:
# Load the tab-separated toy dataset and preview its first five rows.
df=pd.read_csv("encoding.csv",delimiter="\t")
df.head(n=5)

Unnamed: 0,Score,color,target
0,12,blue,a
1,3,blue,b
2,14,red,c
3,5,red,a
4,6,green,b


In [26]:
# Hold out 20% of the rows as the test split; random_state pins the shuffle.
x_train,x_test,y_train,y_test=train_test_split(
    df[["Score","color"]],
    df["target"],
    test_size=0.2,
    random_state=42,
)

In [27]:
# Inspect the training features.
x_train

Unnamed: 0,Score,color
8,10,blue
16,1,blue
3,5,red
13,5,blue
15,17,green
17,19,blue
2,14,red
9,11,blue
18,0,red
4,6,green


In [28]:
# Inspect the test features.
x_test

Unnamed: 0,Score,color
0,12,blue
5,17,blue
11,13,red
1,3,blue


In [29]:
# Dense (non-sparse) one-hot encoder. `sparse_output` is the current name of the
# keyword formerly called `sparse`, which newer scikit-learn releases no longer
# accept; the other encoders in this notebook already use `sparse_output`.
encoder=OneHotEncoder(sparse_output=False)

In [30]:
# Learn the categories from the training colours only, then encode both splits
# with that same fitted encoder: a holds the training rows, b the test rows.
encoder.fit(x_train[['color']])
a,b=(
    pd.DataFrame(encoder.transform(part[['color']]),columns=encoder.get_feature_names_out())
    for part in (x_train,x_test)
)



In [31]:
# Replace the raw colour column with its encoded columns. The index is reset so
# the rows line up positionally with the freshly built encoded frames.
x_train_trans=pd.concat(
    [x_train.drop(columns="color").reset_index(drop=True),a],
    axis="columns",
)
x_test_trans=pd.concat(
    [x_test.drop(columns="color").reset_index(drop=True),b],
    axis="columns",
)

In [32]:
from IPython.display import display, HTML

# Stylesheet that switches the notebook's output container to a row layout,
# intended to show several displayed objects next to each other.
CSS = """
.output {
    flex-direction: row;
}
"""

# Wrap the rules in a <style> tag; the HTML object is the cell's result.
HTML(f'<style>{CSS}</style>')

In [33]:
# Compare the training features before and after one-hot encoding.
display(x_train,x_train_trans)

Unnamed: 0,Score,color
8,10,blue
16,1,blue
3,5,red
13,5,blue
15,17,green
17,19,blue
2,14,red
9,11,blue
18,0,red
4,6,green


Unnamed: 0,Score,color_blue,color_green,color_red
0,10,1.0,0.0,0.0
1,1,1.0,0.0,0.0
2,5,0.0,0.0,1.0
3,5,1.0,0.0,0.0
4,17,0.0,1.0,0.0
5,19,1.0,0.0,0.0
6,14,0.0,0.0,1.0
7,11,1.0,0.0,0.0
8,0,0.0,0.0,1.0
9,6,0.0,1.0,0.0


In [34]:
# Compare the test features before and after one-hot encoding.
display(x_test,x_test_trans)

Unnamed: 0,Score,color
0,12,blue
5,17,blue
11,13,red
1,3,blue


Unnamed: 0,Score,color_blue,color_green,color_red
0,12,1.0,0.0,0.0
1,17,1.0,0.0,0.0
2,13,0.0,0.0,1.0
3,3,1.0,0.0,0.0


### Handling unknown categories in test data

In [35]:
# Load the tab-separated dataset used for the unknown-category demo and preview it.
df=pd.read_csv("encoding-Copy1.csv",delimiter="\t")
df.head(n=5)

Unnamed: 0,Score,color,target
0,12,blue,a
1,3,blue,b
2,14,red,c
3,5,red,a
4,6,green,b


In [36]:
# Hold out 30% of the rows as the test split; random_state pins the shuffle.
x_train,x_test,y_train,y_test=train_test_split(
    df[["Score","color"]],
    df["target"],
    test_size=0.3,
    random_state=7,
)

In [37]:
# The test split contains the categories yellow and black, which do not
# appear in the training split.
display(x_train,x_test)

Unnamed: 0,Score,color
6,8,green
10,21,red
11,13,red
8,10,blue
13,5,blue
9,11,blue
12,14,green
14,6,green
7,9,green
16,1,blue


Unnamed: 0,Score,color
1,3,blue
2,14,red
5,17,blue
18,0,yellow
0,12,blue
17,19,black


In [38]:
# Dense one-hot encoder with integer output; handle_unknown is not set here.
encoder=OneHotEncoder(
    dtype="int",
    sparse_output=False,
)

In [39]:
# Fit the encoder on the training colours and show the encoded array.
encoder.fit_transform(x_train[["color"]])

array([[0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 0]])

In [40]:
# Demonstrate the failure: transforming categories that were not seen during
# fit raises ValueError with this encoder. The error is caught and printed so
# that "Restart & Run All" does not stop at this cell.
try:
    encoder.transform(x_test[["color"]])
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Found unknown categories ['black', 'yellow'] in column 0 during transform

In [44]:
#since the encoder was not told how to handle unknown categories, it raises an error.

In [45]:
# Same dense integer encoder, but unknown categories are ignored at transform time.
encoder=OneHotEncoder(
    handle_unknown="ignore",
    dtype="int",
    sparse_output=False,
)

In [46]:
# Fit the encoder on the training colours and show the encoded array.
encoder.fit_transform(x_train[["color"]])

array([[0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 0]])

In [47]:
# Encode the test colours with the encoder that was fitted on the training split.
encoder.transform(x_test[["color"]])

array([[1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 0],
       [1, 0, 0],
       [0, 0, 0]])

In [48]:
# Names of the output columns produced by the fitted encoder.
encoder.get_feature_names_out()

array(['color_blue', 'color_green', 'color_red'], dtype=object)

In [49]:
#see, it is not raising an error now. for an unknown category the encoded values are all 0s, as we can see in the 4th and 6th rows of the output above

### k-1 one hot encoding

In [50]:
#here, if a feature has k categories, we encode it into only k-1 columns.
#the k-1 columns carry all the information and are sufficient, thus avoiding a redundant column
#but this is not suitable for tree based methods
#it is only suitable for linear models
#and if there is a chance of unknown categories in the test set, don't use this, because the unknown category and the dropped category will be encoded the same way
#this avoids the dummy variable trap in linear models by removing the redundant column

In [51]:
# Reload the tab-separated toy dataset and preview its first five rows.
df=pd.read_csv("encoding.csv",delimiter="\t")
df.head(n=5)

Unnamed: 0,Score,color,target
0,12,blue,a
1,3,blue,b
2,14,red,c
3,5,red,a
4,6,green,b


In [52]:
# k-1 one-hot encoder: drop="first" removes the first category's column.
# `sparse_output` is the current name of the keyword formerly called `sparse`,
# which newer scikit-learn releases no longer accept; the other encoders in
# this notebook already use `sparse_output`.
encoder=OneHotEncoder(drop="first",sparse_output=False,dtype="int")

In [53]:
# Hold out 20% of the rows as the test split; random_state pins the shuffle.
x_train,x_test,y_train,y_test=train_test_split(
    df[["Score","color"]],
    df["target"],
    test_size=0.2,
    random_state=42,
)

In [54]:
# Learn the categories from the training colours only, then encode both splits
# with that same fitted encoder: a holds the training rows, b the test rows.
encoder.fit(x_train[['color']])
a,b=(
    pd.DataFrame(encoder.transform(part[['color']]),columns=encoder.get_feature_names_out())
    for part in (x_train,x_test)
)



In [55]:
# Replace the raw colour column with its encoded columns. The index is reset so
# the rows line up positionally with the freshly built encoded frames.
x_train_trans=pd.concat(
    [x_train.drop(columns="color").reset_index(drop=True),a],
    axis="columns",
)
x_test_trans=pd.concat(
    [x_test.drop(columns="color").reset_index(drop=True),b],
    axis="columns",
)

In [56]:
# Only two columns are needed to carry all the necessary information:
# a row with 0 in both columns stands for the dropped category, blue.
display(x_train,x_train_trans)

Unnamed: 0,Score,color
8,10,blue
16,1,blue
3,5,red
13,5,blue
15,17,green
17,19,blue
2,14,red
9,11,blue
18,0,red
4,6,green


Unnamed: 0,Score,color_green,color_red
0,10,0,0
1,1,0,0
2,5,0,1
3,5,0,0
4,17,1,0
5,19,0,0
6,14,0,1
7,11,0,0
8,0,0,1
9,6,1,0


### k-1 OHE when unknown category comes

In [77]:
# Load the tab-separated dataset used for the unknown-category demo and preview it.
df=pd.read_csv("encoding-Copy1.csv",delimiter="\t")
df.head(n=5)

Unnamed: 0,Score,color,target
0,12,blue,a
1,3,blue,b
2,14,red,c
3,5,red,a
4,6,green,b


In [78]:
# Hold out 30% of the rows as the test split; random_state pins the shuffle.
x_train,x_test,y_train,y_test=train_test_split(
    df[["Score","color"]],
    df["target"],
    test_size=0.3,
    random_state=7,
)

In [79]:
# The test split contains the categories yellow and black, which do not
# appear in the training split.
display(x_train,x_test)

Unnamed: 0,Score,color
6,8,green
10,21,red
11,13,red
8,10,blue
13,5,blue
9,11,blue
12,14,green
14,6,green
7,9,green
16,1,blue


Unnamed: 0,Score,color
1,3,blue
2,14,red
5,17,blue
18,0,yellow
0,12,blue
17,19,black


In [80]:
# k-1 encoder (first category dropped) that also ignores unknown categories,
# with dense integer output.
encoder=OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
    dtype="int",
    sparse_output=False,
)

In [81]:
# Learn the categories from the training colours only, then encode both splits
# with that same fitted encoder: a holds the training rows, b the test rows.
encoder.fit(x_train[['color']])
a,b=(
    pd.DataFrame(encoder.transform(part[['color']]),columns=encoder.get_feature_names_out())
    for part in (x_train,x_test)
)



In [82]:
# Replace the raw colour column with its encoded columns. The index is reset so
# the rows line up positionally with the freshly built encoded frames.
x_train_trans=pd.concat(
    [x_train.drop(columns="color").reset_index(drop=True),a],
    axis="columns",
)
x_test_trans=pd.concat(
    [x_test.drop(columns="color").reset_index(drop=True),b],
    axis="columns",
)

In [83]:
# Compare the test features before and after k-1 encoding.
display(x_test,x_test_trans)

Unnamed: 0,Score,color
1,3,blue
2,14,red
5,17,blue
18,0,yellow
0,12,blue
17,19,black


Unnamed: 0,Score,color_green,color_red
0,3,0,0
1,14,0,1
2,17,0,0
3,0,0,0
4,12,0,0
5,19,0,0


In [84]:
#so here the existing category blue is encoded as 0-0, and the new unknown categories yellow and black are also 0-0.
#so use k-1 encoding only when you are sure there will be no unknown categories

### OHE top categories

In [95]:
# Load the tab-separated dataset used for the top-categories demo and preview it.
df=pd.read_csv("encoding-Copy2.csv",delimiter="\t")
df.head(n=5)

Unnamed: 0,Score,color,target
0,12,blue,a
1,3,blue,b
2,14,red,c
3,5,red,a
4,6,green,b


In [96]:
# Hold out 20% of the rows as the test split; random_state pins the shuffle.
x_train,x_test,y_train,y_test=train_test_split(
    df[["Score","color"]],
    df["target"],
    test_size=0.2,
    random_state=42,
)

In [97]:
# Show the training and test features.
display(x_train,x_test)

Unnamed: 0,Score,color
8,10,blue
16,1,blue
3,5,red
13,5,pink
15,17,brown
17,19,black
2,14,red
9,11,blue
18,0,yellow
4,6,green


Unnamed: 0,Score,color
0,12,blue
5,17,blue
11,13,white
1,3,blue


In [98]:
# Count how often each colour occurs in the training split.
x_train["color"].value_counts()

green     5
blue      3
red       3
pink      1
brown     1
black     1
yellow    1
Name: color, dtype: int64

In [99]:
#here we can see there are many infrequent categories
#this can lead to overfitting; we can also see that there is an unknown category in the test set

In [100]:
# Encoder capped at four output categories per feature; unknown values use the
# "infrequent_if_exist" strategy. Output is dense and integer-typed.
encoder=OneHotEncoder(
    max_categories=4,
    handle_unknown="infrequent_if_exist",
    dtype="int",
    sparse_output=False,
)

In [101]:
# Learn the categories from the training colours only, then encode both splits
# with that same fitted encoder: a holds the training rows, b the test rows.
encoder.fit(x_train[['color']])
a,b=(
    pd.DataFrame(encoder.transform(part[['color']]),columns=encoder.get_feature_names_out())
    for part in (x_train,x_test)
)

In [102]:
# Replace the raw colour column with its encoded columns. The index is reset so
# the rows line up positionally with the freshly built encoded frames.
x_train_trans=pd.concat(
    [x_train.drop(columns="color").reset_index(drop=True),a],
    axis="columns",
)
x_test_trans=pd.concat(
    [x_test.drop(columns="color").reset_index(drop=True),b],
    axis="columns",
)

In [103]:
# Compare the training features before and after encoding.
display(x_train,x_train_trans)

Unnamed: 0,Score,color
8,10,blue
16,1,blue
3,5,red
13,5,pink
15,17,brown
17,19,black
2,14,red
9,11,blue
18,0,yellow
4,6,green


Unnamed: 0,Score,color_blue,color_green,color_red,color_infrequent_sklearn
0,10,1,0,0,0
1,1,1,0,0,0
2,5,0,0,1,0
3,5,0,0,0,1
4,17,0,0,0,1
5,19,0,0,0,1
6,14,0,0,1,0
7,11,1,0,0,0
8,0,0,0,0,1
9,6,0,1,0,0


In [105]:
# Categories that the fitted encoder treats as infrequent.
encoder.infrequent_categories_

[array(['black', 'brown', 'pink', 'yellow'], dtype=object)]

In [104]:
# Compare the test features before and after encoding.
display(x_test,x_test_trans)

Unnamed: 0,Score,color
0,12,blue
5,17,blue
11,13,white
1,3,blue


Unnamed: 0,Score,color_blue,color_green,color_red,color_infrequent_sklearn
0,12,1,0,0,0
1,17,1,0,0,0
2,13,0,0,0,1
3,3,1,0,0,0


In [None]:
#here we can see that in the training data all infrequent categories are grouped into one single category
#and when a new unknown category comes in, it is also put into that category

# 2. Ordinal Encoding

In [163]:
#used to encode categorical features as integers

In [164]:
from sklearn.preprocessing import OrdinalEncoder

In [165]:
# Load the tab-separated dataset and preview its first two rows.
df=pd.read_csv("encoding-Copy1.csv",delimiter="\t")
df.head(n=2)

Unnamed: 0,Score,color,target
0,12,blue,a
1,3,blue,b


In [166]:
# Hold out 20% of the rows as the test split; random_state pins the shuffle.
x_train,x_test,y_train,y_test=train_test_split(
    df[["Score","color"]],
    df["target"],
    test_size=0.2,
    random_state=7,
)

In [167]:
# Show the training and test features.
display(x_train,x_test)

Unnamed: 0,Score,color
0,12,blue
17,19,black
6,8,green
10,21,red
11,13,red
8,10,blue
13,5,blue
9,11,blue
12,14,green
14,6,green


Unnamed: 0,Score,color
1,3,blue
2,14,red
5,17,blue
18,0,yellow


In [168]:
# The data contains an unknown category, so the encoder is given a value (-1)
# to assign to unknown categories instead of raising an error.
# Without `categories`, the encoding follows alphabetical order by default
# (black-0, blue-1, green-2, red-3); the explicit list below sets the order
# to red-0, blue-1, green-2, black-3.
encoder=OrdinalEncoder(
    categories=[["red","blue","green","black"]],
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

In [169]:
# Fit the encoder on the training colours and store the codes in a new column.
x_train["color_encoded"]=encoder.fit_transform(x_train[["color"]])

In [170]:
# Encode the test colours with the already fitted encoder and store the codes.
x_test["color_encoded"]=encoder.transform(x_test[["color"]])

In [171]:
# Show both splits with the added color_encoded column.
display(x_train,x_test)

Unnamed: 0,Score,color,color_encoded
0,12,blue,1.0
17,19,black,3.0
6,8,green,2.0
10,21,red,0.0
11,13,red,0.0
8,10,blue,1.0
13,5,blue,1.0
9,11,blue,1.0
12,14,green,2.0
14,6,green,2.0


Unnamed: 0,Score,color,color_encoded
1,3,blue,1.0
2,14,red,0.0
5,17,blue,1.0
18,0,yellow,-1.0


In [172]:
# Remove the raw text column from both splits. The names are rebound to the
# returned frames instead of dropping in place, and errors="ignore" keeps the
# cell safe to re-run once the column has already been removed.
x_train=x_train.drop(columns="color",errors="ignore")
x_test=x_test.drop(columns="color",errors="ignore")

# 3. Label Encoder

In [189]:
#this is used to encode target variables only, with values starting from 0. ex: for a binary target the classes become 0 and 1
#its behaviour is fixed and it has no parameters. it encodes in alphabetical order, which is not a problem because this is just the target

In [190]:
from sklearn.preprocessing import LabelEncoder

In [191]:
# Load the tab-separated dataset used for the label-encoding demo and preview it.
df=pd.read_csv("encoding-Copy3.csv",delimiter="\t")
df.head(n=5)

Unnamed: 0,Score,color,target
0,12,blue,c
1,3,blue,a
2,14,red,c
3,5,red,a
4,6,green,c


In [192]:
# Hold out 20% of the rows as the test split; random_state pins the shuffle.
x_train,x_test,y_train,y_test=train_test_split(
    df[["Score","color"]],
    df["target"],
    test_size=0.2,
    random_state=7,
)

In [193]:
# Label encoder for the target column; it is constructed without arguments.
encoder=LabelEncoder()

In [194]:
# Training targets as a NumPy array, before encoding.
y_train.to_numpy()

array(['c', 'c', 'a', 'b', 'c', 'c', 'b', 'a', 'a', 'c', 'b', 'b', 'a',
       'c', 'a'], dtype=object)

In [195]:
# Test targets as a NumPy array, before encoding.
y_test.to_numpy()

array(['a', 'c', 'c', 'a'], dtype=object)

In [196]:
# Fit on the training targets and show their integer codes.
encoder.fit_transform(y_train)

array([2, 2, 0, 1, 2, 2, 1, 0, 0, 2, 1, 1, 0, 2, 0])

In [197]:
# Encode the test targets with the classes learned from the training targets.
encoder.transform(y_test)

array([0, 2, 2, 0])

## 4. Mean/Target Encoding

In [21]:
#install the dependency first if it is missing: pip install category_encoders

In [22]:
from category_encoders import TargetEncoder

In [23]:
# Read only the three columns needed for this demo and preview the first five rows.
df=pd.read_csv(
    "titanic.csv",
    usecols=["Survived","Sex","Embarked"],
)
df.head(n=5)

Unnamed: 0,Survived,Sex,Embarked
0,0,male,S
1,1,female,C
2,1,female,S
3,1,female,S
4,0,male,S


In [24]:
# Fill missing Embarked values with "S". The result is assigned back to the
# column instead of calling an in-place method on the df["Embarked"] selection:
# pandas warns about that chained pattern, and under copy-on-write it leaves
# df unchanged.
df["Embarked"]=df["Embarked"].fillna("S")

In [25]:
# Hold out 20% of the rows as the test split; random_state pins the shuffle.
x_train,x_test,y_train,y_test=train_test_split(
    df[["Sex","Embarked"]],
    df["Survived"],
    test_size=0.2,
    random_state=7,
)

In [26]:
# Target encoder applied to both categorical columns, with smoothing set to 10.
enc=TargetEncoder(
    smoothing=10,
    cols=["Sex","Embarked"],
)

In [27]:
# Fit the encoder on the training features and targets, and encode the training features.
x_train_enc=enc.fit_transform(x_train,y_train)

In [28]:
# Encode the test features with the fitted encoder; no targets are passed here.
x_test_enc=enc.transform(x_test)

In [29]:
# Show the encoded training features.
x_train_enc

Unnamed: 0,Sex,Embarked
205,0.759843,0.332039
718,0.174672,0.409406
835,0.759843,0.566175
851,0.174672,0.332039
773,0.174672,0.566175
...,...,...
579,0.174672,0.332039
502,0.759843,0.409406
537,0.759843,0.566175
196,0.174672,0.409406


## 5. Weight of Evidence

In [30]:
from category_encoders import WOEEncoder

In [32]:
# Read only the three columns needed for this demo.
df=pd.read_csv("titanic.csv",usecols=["Survived","Sex","Embarked"])
# Fill missing Embarked values with "S". The result is assigned back to the
# column instead of calling an in-place method on the df["Embarked"] selection:
# pandas warns about that chained pattern, and under copy-on-write it leaves
# df unchanged.
df["Embarked"]=df["Embarked"].fillna("S")
df.head()

Unnamed: 0,Survived,Sex,Embarked
0,0,male,S
1,1,female,C
2,1,female,S
3,1,female,S
4,0,male,S


In [33]:
# Hold out 20% of the rows as the test split; random_state pins the shuffle.
x_train,x_test,y_train,y_test=train_test_split(
    df[["Sex","Embarked"]],
    df["Survived"],
    test_size=0.2,
    random_state=7,
)

In [36]:
# Weight-of-evidence encoder applied to both categorical columns.
enc=WOEEncoder(cols=["Sex","Embarked"])

In [37]:
# Fit the encoder on the training features and targets, and encode the training features.
x_train_enc=enc.fit_transform(x_train,y_train)

In [38]:
# Encode the test features with the fitted encoder; no targets are passed here.
x_test_enc=enc.transform(x_test)

In [39]:
# Show the encoded training features.
x_train_enc

Unnamed: 0,Sex,Embarked
205,1.612998,-0.223776
718,-1.070813,0.119452
835,1.612998,0.734638
851,-1.070813,-0.223776
773,-1.070813,0.734638
...,...,...
579,-1.070813,-0.223776
502,1.612998,0.119452
537,1.612998,0.734638
196,-1.070813,0.119452
