In [1]:
import seaborn as sns


In [2]:
# Load the 'tips' example dataset through seaborn's dataset loader and bind it to `df`.
df = sns.load_dataset('tips')

In [3]:
# Preview the first rows of the frame to check the column layout.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## Encoding Techniques
1. Nominal encoding, i.e. one-hot encoding (OHE)
2. Ordinal and label encoding
3. Target-guided ordinal encoding

In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [5]:
# Toy frame with one nominal (unordered) categorical column, used for the
# one-hot encoding demo. This rebinds `df`.
df = pd.DataFrame(
    data=dict(
        color=['red', 'blue', 'green', 'green', 'red', 'blue'],
    ),
)

In [6]:
df

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red
5,blue


In [7]:
# Count how often each distinct row occurs; with a single column this is the
# frequency of each colour.
df.value_counts()

color
blue     2
green    2
red      2
Name: count, dtype: int64

In [8]:
# One-hot encoder created with no arguments, i.e. scikit-learn's default settings.
encoder = OneHotEncoder()

In [9]:
# Learn the categories of the 'color' column, then encode that same column.
# The double brackets select a one-column DataFrame (2-D input) rather than a Series.
encoded = encoder.fit(df[['color']]).transform(df[['color']])

In [10]:
# Convert the sparse one-hot result to a dense array so the 0/1 pattern is visible.
encoded.toarray()

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [11]:
# Show the encoder output exactly as returned: a sparse matrix, not a dense array.
encoded

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6 stored elements and shape (6, 3)>

In [12]:
# Names of the generated output columns: one per category, prefixed with the
# input column name.
encoder.get_feature_names_out()

array(['color_blue', 'color_green', 'color_red'], dtype=object)

In [13]:
# Wrap the dense one-hot matrix in a DataFrame whose columns carry the
# encoder-generated feature names.
df1 = pd.DataFrame(
    data=encoded.toarray(),
    columns=encoder.get_feature_names_out(),
)

In [14]:
df1

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


In [15]:
# Put the original 'color' column and its one-hot columns side by side.
# Concatenation along the column axis aligns rows on the index.
df2 = pd.concat(objs=[df, df1], axis='columns')

In [16]:
df2

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0


## Label Encoding

In [17]:
from sklearn.preprocessing import LabelEncoder

In [22]:
# Rebuild the colour frame with two extra categories ('black', 'white') so the
# label-encoding demo has five distinct values. This rebinds `df`.
df = pd.DataFrame.from_dict(
    {
        'color': [
            'red', 'blue', 'green', 'green',
            'red', 'blue', 'black', 'white',
        ],
    }
)
df

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red
5,blue
6,black
7,white


In [23]:
# Frequency of each colour in the extended frame (five distinct values).
df.value_counts()

color
blue     2
red      2
green    2
black    1
white    1
Name: count, dtype: int64

In [24]:
# Label encoder: assigns one integer code per distinct label.
# NOTE(review): scikit-learn documents LabelEncoder as a transformer for target
# labels (y); for input feature columns OrdinalEncoder is the documented choice.
labl_encoder = LabelEncoder()

In [25]:
# Fit on the 'color' Series (1-D input) and return its integer codes. The result
# is only displayed, not stored. In the output below the codes follow the
# alphabetical order of the labels (black=0, blue=1, green=2, red=3, white=4).
labl_encoder.fit_transform(df['color'])

array([3, 1, 2, 2, 3, 1, 0, 4])

## Ordinal Encoding

In [26]:
from sklearn.preprocessing import OrdinalEncoder

In [27]:
# Toy frame with one ordered categorical column for the ordinal-encoding demo.
df_size = pd.DataFrame(
    data=dict(
        size=['small', 'medium', 'large', 'medium', 'small', 'large'],
    ),
)

In [28]:
df_size

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [31]:
# Frequency of each size label.
df_size.value_counts()

size  
large     2
medium    2
small     2
Name: count, dtype: int64

In [33]:
# Pass the category order explicitly so the codes follow the intended ranking
# small < medium < large. The outer list holds one category list per input column.
ordinal_encodr = OrdinalEncoder(
    categories=[
        ['small', 'medium', 'large'],
    ],
)

In [34]:
# Fit on the one-column 'size' frame (2-D selection) and encode it in one step.
# NOTE(review): the name `ord` shadows Python's built-in ord() for the rest of the
# session. Later cells read this name, so it is left unchanged here.
ord = ordinal_encodr.fit_transform(df_size[['size']])

In [35]:
ord

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

In [36]:
# Wrap the encoded column in a one-column DataFrame. The column label keeps its
# original spelling ('orninal_encoder') because later displayed outputs use it.
fd1 = pd.DataFrame(
    data=ord,
    columns=['orninal_encoder'],
)

In [38]:
fd1

Unnamed: 0,orninal_encoder
0,0.0
1,1.0
2,2.0
3,1.0
4,0.0
5,2.0


In [39]:
df_size

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [40]:
# Show each size label next to its ordinal code. Column-wise concatenation
# aligns rows on the index.
df3 = pd.concat(objs=[df_size, fd1], axis='columns')

In [41]:
df3

Unnamed: 0,size,orninal_encoder
0,small,0.0
1,medium,1.0
2,large,2.0
3,medium,1.0
4,small,0.0
5,large,2.0


## Target Guided Ordinal Encoding
It is a technique used to encode categorical variables based on their relationship with the target variable. This encoding technique is useful when we have a categorical variable with a large number of unique categories, and we want to use this variable as a feature in our machine learning model.

In Target Guided Ordinal Encoding, we replace each category in the categorical variable with a numerical value based on the mean or median of the target variable for that category. This creates a monotonic relationship between the categorical variable and the target variable, which can improve the predictive power of our model. To avoid target leakage, compute these per-category statistics on the training data only and then apply the resulting mapping to the validation and test data.

In [42]:
import pandas as pd

# Sample data for target-guided encoding: 'city' is the categorical feature and
# 'price' is the numeric target. This rebinds `df`.
df = pd.DataFrame(
    dict(
        city=['New York', 'London', 'Paris', 'Tokyo', 'New York', 'Paris'],
        price=[200, 150, 300, 250, 180, 320],
    )
)

In [43]:
df

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,180
5,Paris,320


In [46]:
# With two columns, value_counts() counts distinct (city, price) pairs; every
# pair in this sample occurs exactly once.
df.value_counts()

city      price
London    150      1
New York  180      1
          200      1
Paris     300      1
          320      1
Tokyo     250      1
Name: count, dtype: int64

In [44]:
# Average target value (price) per city, as a plain {city: mean} lookup dict.
mean_price = (
    df.groupby('city')['price']
    .agg('mean')
    .to_dict()
)
mean_price

{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [47]:
# Replace each city with its mean price via the lookup dict and store the result
# in a new 'city_encoded' column (this modifies `df` in place).
df['city_encoded']=df['city'].map(mean_price)

In [48]:
df

Unnamed: 0,city,price,city_encoded
0,New York,200,190.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokyo,250,250.0
4,New York,180,190.0
5,Paris,320,310.0


In [49]:
# seaborn is imported again here, so this cell does not rely on the earlier import.
import seaborn as sns

# Load the 'tips' dataset and display it; the result is not assigned to a name.
sns.load_dataset('tips')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
