In [None]:
'''
Data Encoding:
Definition:
Data encoding is the process of converting categorical data into a numerical format 
that can be used by machine learning algorithms. 
This is essential because most algorithms require numerical input and cannot handle categorical data directly.

Types of Encoding:
1. Nominal/OHE Encoding
2. Label and Ordinal Encoding
3. Target Guided Ordinal Encoding
'''

In [None]:
'''
Nominal/OHE Encoding:
- One-Hot Encoding (OHE) is used for nominal categorical variables.
- Nominal categorical variables do not have an inherent order (i.e., they are just labels).
- It creates binary columns for each category, allowing the model to treat them as separate features.
- Eg: If we have a feature 'Color' with categories 'Red', 'Green', and 'Blue', OHE will create three binary columns:

    Color_Red | Color_Green | Color_Blue
    - Each column will have a value of 1 if the observation belongs to that category, and 0 otherwise.
    For example:
    Color_Red | Color_Green | Color_Blue
    1          | 0            | 0
    0          | 1            | 0
    0          | 0            | 1
    Here 'Color_Red' is 1, indicating the observation is 'Red', while the others are 0 and so on.

- When not to use OHE:
    - OHE is not suitable for high cardinality features (features with many unique categories) as it can lead to a 
        large number of columns, increasing the dimensionality of the dataset and potentially causing overfitting.
    - In such cases, other encoding methods like Label Encoding or Target Guided Ordinal Encoding may be more appropriate.
    - OHE produces a sparse matrix (most entries are zero), which can be memory inefficient for
        high cardinality features and can contribute to overfitting.
    (Sparse matrix is a matrix in which most of the elements are zero,
    and it is often used to represent high-dimensional data efficiently. Eg: In OHE, if we have 1000 unique categories,
    we will have 1000 columns, and most of them will be zero for each observation, leading to a sparse matrix.
     col1 | col2 | col3
     0     | 1    | 0
     0     | 0    | 1
     1     | 0    | 0)

'''

In [1]:
# OHE for preprocessing categorical data
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Small demo frame holding a single nominal feature, 'Color'.
df = pd.DataFrame(
    ['Red', 'Green', 'Blue', 'Red', 'Green'],
    columns=['Color'],
)
df.head()

Unnamed: 0,Color
0,Red
1,Green
2,Blue
3,Red
4,Green


In [11]:
# sparse_output=False makes the encoder return a dense array rather than a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)
# Learn the categories present in 'Color', then encode that same column.
# The double brackets keep the selection 2-D (a one-column DataFrame).
encoder.fit(df[['Color']])
encoded_data = encoder.transform(df[['Color']])
encoded_data

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [13]:
# Wrap the encoded array in a DataFrame, labelling the columns with the
# names generated by the fitted encoder (e.g. 'Color_Blue').
encoded_df = pd.DataFrame(
    data=encoded_data,
    columns=encoder.get_feature_names_out(),
)
encoded_df.head()

Unnamed: 0,Color_Blue,Color_Green,Color_Red
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0


In [15]:
# Encode a new observation with the already-fitted encoder.
# The encoder was fitted on a DataFrame, so pass a DataFrame with the same column name:
# a bare nested list carries no feature names and makes scikit-learn emit an
# "X does not have valid feature names" warning. The encoded result is unchanged.
encoder.transform(pd.DataFrame({'Color': ['Red']}))



array([[0., 0., 1.]])

In [16]:
# Show the original column next to its one-hot columns (frames are aligned on the row index).
pd.concat(objs=[df, encoded_df], axis='columns').head()

Unnamed: 0,Color,Color_Blue,Color_Green,Color_Red
0,Red,0.0,0.0,1.0
1,Green,0.0,1.0,0.0
2,Blue,1.0,0.0,0.0
3,Red,0.0,0.0,1.0
4,Green,0.0,1.0,0.0


In [19]:
import seaborn as sns
# Load seaborn's 'tips' example dataset for demonstration.
# It mixes numeric columns with categorical ones (sex, smoker, day, time),
# which makes it a convenient input for the encoders shown in this notebook.
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [27]:
# One-hot encode two nominal columns in a single pass. The dense result has
# one indicator column per (feature, category) pair: 2 for 'sex' + 4 for 'day'.
one_hot_encoder = OneHotEncoder(sparse_output=False)
encoded_tips = one_hot_encoder.fit_transform(tips.loc[:, ['sex', 'day']])
encoded_tips

array([[1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 1.]], shape=(244, 6))

In [28]:
# Turn the encoded array into a labelled frame; the encoder supplies column
# names of the form '<feature>_<category>' (e.g. 'sex_Female', 'day_Sun').
encoded_tips_df = pd.DataFrame(
    data=encoded_tips,
    columns=one_hot_encoder.get_feature_names_out(),
)
encoded_tips_df.head()

Unnamed: 0,sex_Female,sex_Male,day_Fri,day_Sat,day_Sun,day_Thur
0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0


In [None]:
'''
Label and Ordinal Encoding:
- Label Encoding is used for ordinal categorical variables, which have a meaningful order.
- It assigns a unique integer to each category based on their order.
- For example, if we have a feature 'Size' with categories 'Small', 'Medium', and 'Large', Label Encoding will assign:
    Size_Small | Size_Medium | Size_Large
    0          | 1            | 2
    - Here, 'Small' is encoded as 0, 'Medium' as 1, and 'Large' as 2.
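    - This encoding preserves the order of the categories, which is important for ordinal variables.
    - Note: scikit-learn's LabelEncoder does not know the semantic order; it numbers the categories in sorted (alphabetical) order, giving 'Large' = 0, 'Medium' = 1, 'Small' = 2. To obtain the mapping shown above, the order must be specified explicitly (e.g. OrdinalEncoder with the categories argument).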
- When not to use Label Encoding:
    - Label Encoding is not suitable for nominal categorical variables, as it can introduce an unintended ordinal relationship.
    - For example, if we encode 'Red', 'Green', and 'Blue' as 0, 1, and 2 respectively, the model might interpret 'Green' (1) as being "between" 'Red' (0) and 'Blue' (2), which is incorrect.
    - In such cases, One-Hot Encoding or Target Guided Ordinal Encoding should be used instead.

'''

In [35]:
# code for Label Encoding
from sklearn.preprocessing import LabelEncoder
# Create a sample DataFrame with ordinal categorical data
df_ordinal = pd.DataFrame({
    'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small']
})
# Initialize the LabelEncoder.
# NOTE: LabelEncoder takes no ordering argument. It numbers the classes in sorted
# (alphabetical) order, so here Large=0, Medium=1, Small=2 -- not the semantic
# Small < Medium < Large order. Use OrdinalEncoder(categories=[...]) when the
# order must be controlled.
label_encoder = LabelEncoder()
# Fit and transform the 'Size' column
df_ordinal['Size_Encoded'] = label_encoder.fit_transform(df_ordinal['Size'])
df_ordinal.head()

Unnamed: 0,Size,Size_Encoded
0,Small,2
1,Medium,1
2,Large,0
3,Medium,1
4,Small,2


In [36]:
# Reuse the fitted encoder on a new value: 'Small' maps to 2 because the
# classes were numbered alphabetically (Large=0, Medium=1, Small=2).
label_encoder.transform(['Small'])

array([2])

In [None]:
'''
Ordinal Encoding:
- Ordinal Encoding is similar to Label Encoding but explicitly preserves the order of categories.
- It is used for ordinal categorical variables where the order matters.
- For example, if we have a feature 'Rating' with categories 'Poor', 'Fair', 'Good', and 'Excellent', Ordinal Encoding will assign:
    Rating_Poor | Rating_Fair | Rating_Good | Rating_Excellent
    0            | 1           | 2           | 3
    - Here, 'Poor' is encoded as 0, 'Fair' as 1, 'Good' as 2, and 'Excellent' as 3.
- When not to use Ordinal Encoding:
    - Ordinal Encoding is not suitable for nominal categorical variables, as it can introduce an unintended ordinal relationship.
    - For example, if we encode 'Red', 'Green', and 'Blue' as 0, 1, and 2 respectively, the model might interpret 'Green' (1) as being "between" 'Red' (0) and 'Blue' (2), which is incorrect.
    - In such cases, One-Hot Encoding or Target Guided Ordinal Encoding should be used instead.
'''

In [38]:
# Code for Ordinal Encoding
from sklearn.preprocessing import OrdinalEncoder

# Sample ordinal data. Note: the column is named 'Size' although it stores rating labels.
df_ordinal = pd.DataFrame(
    ['Poor', 'Fair', 'Good', 'Excellent', 'Good', 'Fair', 'Poor'],
    columns=['Size'],
)
# List the categories from lowest to highest so the integer codes follow that
# order: Poor=0, Fair=1, Good=2, Excellent=3.
ordinal_encoder = OrdinalEncoder(
    categories=[['Poor', 'Fair', 'Good', 'Excellent']],
)
# Fit on the one-column frame and store the resulting codes next to the labels.
df_ordinal['Size_Encoded'] = ordinal_encoder.fit_transform(df_ordinal[['Size']])
df_ordinal.head()

Unnamed: 0,Size,Size_Encoded
0,Poor,0.0
1,Fair,1.0
2,Good,2.0
3,Excellent,3.0
4,Good,2.0


In [None]:
'''
Target Guided Ordinal Encoding:
- Target Guided Ordinal Encoding is a technique that encodes categorical variables based on their relationship 
    with the target variable.
- It assigns a numerical value to each category based on the mean or median of the target variable for each category.
- This encoding is particularly useful when the categorical variable has a strong relationship with the target variable.
- For example, if we have a feature 'Category' with categories 'A', 'B', and 'C', and a target variable 'Sales', 
    we can calculate the mean sales for each category:
    Category_A | Category_B | Category_C
    100        | 200        | 300
    - Then, we can encode the categories as follows:
        Category_A | Category_B | Category_C
        0.33       | 0.67       | 1.00
    - Here, 'A' is encoded as 0.33, 'B' as 0.67, and 'C' as 1.00: each category's mean sales divided by the largest category mean (300), so the encoded values keep the ordering of the means.

- When not to use Target Guided Ordinal Encoding:
    - This encoding is not suitable for nominal categorical variables, as it assumes a relationship with the target variable.
    - If the categorical variable does not have a meaningful relationship with the target variable, this encoding can lead to misleading results.
    - In such cases, One-Hot Encoding or Label Encoding should be used instead.
'''

In [59]:
# Target Guided Ordinal Encoding code
import numpy as np

# Toy data: two 'Sales' observations for each of the categories A, B and C.
df_categorical = pd.DataFrame(dict(
    Category=['A', 'B', 'C', 'A', 'B', 'C'],
    Sales=[100, 200, 300, 150, 250, 350],
))
df_categorical

Unnamed: 0,Category,Sales
0,A,100
1,B,200
2,C,300
3,A,150
4,B,250
5,C,350


In [60]:
# Average the target ('Sales') within each category; reset_index() turns the
# group key back into an ordinary 'Category' column.
mean_sales = (
    df_categorical
    .groupby('Category')['Sales']
    .mean()
    .reset_index()
)
mean_sales

Unnamed: 0,Category,Sales
0,A,125.0
1,B,225.0
2,C,325.0


In [62]:
# Replace every category label with that category's mean 'Sales',
# using a label -> mean lookup built from mean_sales.
df_categorical['Category_Encoded'] = df_categorical['Category'].map(
    dict(zip(mean_sales['Category'], mean_sales['Sales']))
)
df_categorical

Unnamed: 0,Category,Sales,Category_Encoded
0,A,100,125.0
1,B,200,225.0
2,C,300,325.0
3,A,150,125.0
4,B,250,225.0
5,C,350,325.0
