# Contents
- [Clubbing](#Clubbing)
    - [Infrequent n Categories](#Infrequent-n-Categories)
- [Encoding](#Encoding)
- [Label Encoding](#Label-Encoding)
    - [Ordinal Encoding](#Ordinal-Encoding)
    - [Frequency Encoding](#Frequency-Encoding)
    - [Target Encoding](#Target-Encoding)
- [Hash Function](#Hash-Function)

___
[Back to the top](#Contents)

# Clubbing

In [37]:
import pandas as pd

# Toy frame with nine distinct single-letter categories.
df = pd.DataFrame({'category': list('ABCDEFGHI')})
df

Unnamed: 0,category
0,A
1,B
2,C
3,D
4,E
5,F
6,G
7,H
8,I


In [39]:
# Keep 'A', 'B' and 'C' as they are; club every other category into 'others'.
df['category_new'] = df['category'].where(df['category'].isin(['A', 'B', 'C']), 'others')
df

Unnamed: 0,category,category_new
0,A,A
1,B,B
2,C,C
3,D,others
4,E,others
5,F,others
6,G,others
7,H,others
8,I,others


#### `Infrequent n Categories`

In [58]:
import pandas as pd

def combine_infrequent_n_categories(dataframe, column_name, infrequent_n):
    """
    Combines the `infrequent_n` least frequent categories in a column into an "Other" category.

    The result is written to a new column named `<column_name>_new`; the original
    column keeps its values. The new column is added to `dataframe` itself (the
    input is modified in place) and that same object is returned.

    Parameters:
        dataframe (pd.DataFrame): The input dataframe.
        column_name (str): The name of the categorical column to process.
        infrequent_n (int): The number of least frequent categories to combine into "Other".

    Returns:
        pd.DataFrame: The input dataframe with the `<column_name>_new` column added.
    """
    column = dataframe[column_name]

    # Calculate the frequency of each category.
    # value_counts() skips missing values by default, so NaN is never selected
    # as an infrequent category and stays NaN in the new column.
    category_counts = column.value_counts()

    # Identify the infrequent n categories.
    # When several categories share the same count, nsmallest() keeps the ones
    # that come first in `category_counts` (its default keep='first').
    infrequent_categories = category_counts.nsmallest(infrequent_n).index

    # Replace the infrequent categories with "Other" in one vectorised step
    # instead of calling a Python lambda for every row.
    is_infrequent = column.isin(infrequent_categories)
    dataframe[f'{column_name}_new'] = column.mask(is_infrequent, "Other")

    return dataframe

# Example usage
data = {'City': ['NY', 'LA', 'SF', 'NY', 'Chicago', 'SF', 'LA', 'Boston', 'LA', 'Austin']}
# Sort alphabetically so identical cities sit next to each other in the printout
df = pd.DataFrame(data).sort_values(by='City')

# Fold the 2 least frequent categories into "Other"
df = combine_infrequent_n_categories(df, column_name='City', infrequent_n=2)
print(df)


      City City_new
9   Austin    Other
7   Boston    Other
4  Chicago  Chicago
1       LA       LA
6       LA       LA
8       LA       LA
0       NY       NY
3       NY       NY
2       SF       SF
5       SF       SF


___
[Back to the top](#Contents)

# Encoding
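- Dummy encoding (`drop_first=True`): one indicator column per level except the first, which acts as the baseline
- One-hot encoding (`drop_first=False`): one indicator column for every level
- [Not important] Effect coding: same as dummy encoding, but the baseline level is coded as -1 instead of 0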

In [26]:
# Three states with three listings each; every state maps to exactly one city.
df = pd.DataFrame({
    'State': ['CA'] * 3 + ['NY'] * 3 + ['WA'] * 3,
    'City': ['SF'] * 3 + ['NYC'] * 3 + ['Seattle'] * 3,
    'Rent': [3999, 4000, 4001, 3499, 3500, 3501, 2499, 2500, 2501],
})

df

Unnamed: 0,State,City,Rent
0,CA,SF,3999
1,CA,SF,4000
2,CA,SF,4001
3,NY,NYC,3499
4,NY,NYC,3500
5,NY,NYC,3501
6,WA,Seattle,2499
7,WA,Seattle,2500
8,WA,Seattle,2501


In [27]:
import pandas as pd

def create_dummies(dataframe, column_name, retain_original=False, drop_first=True):
    """
    Adds indicator columns for a categorical column, placed directly after it.

    The indicator columns come from `pd.get_dummies` and are named
    `<column_name>_<level>`. They are inserted immediately to the right of
    `column_name`, in the order `pd.get_dummies` returns them. The input
    dataframe is modified in place and the same object is returned.

    Parameters:
        dataframe (pd.DataFrame): The input dataframe.
        column_name (str): The name of the categorical column to process.
        retain_original (bool): If True, keep `column_name`; otherwise drop it.
        drop_first (bool): If True, omit the first level (dummy encoding).
                           If False, keep every level (one-hot encoding).

    Returns:
        pd.DataFrame: The input dataframe with the indicator columns added.
    """
    indicators = pd.get_dummies(dataframe[column_name], prefix=column_name, drop_first=drop_first)

    # Position of the source column; the indicators go at anchor+1, anchor+2, ...
    anchor = dataframe.columns.get_loc(column_name)
    for offset, indicator_name in enumerate(indicators.columns, start=1):
        dataframe.insert(anchor + offset, indicator_name, indicators[indicator_name])

    if retain_original:
        return dataframe

    dataframe.drop(column_name, axis=1, inplace=True)
    return dataframe

# Example usage
# drop_first=False keeps an indicator column for every level (one-hot encoding);
# drop_first=True omits the first level, which becomes the baseline (dummy encoding).
df = create_dummies(df, 'State', retain_original=False, drop_first=False)  # One-hot encoding, all levels
df = create_dummies(df, 'City', retain_original=False, drop_first=True)  # Dummy encoding, drop first level
df

Unnamed: 0,State_CA,State_NY,State_WA,City_SF,City_Seattle,Rent
0,True,False,False,True,False,3999
1,True,False,False,True,False,4000
2,True,False,False,True,False,4001
3,False,True,False,False,False,3499
4,False,True,False,False,False,3500
5,False,True,False,False,False,3501
6,False,False,True,False,True,2499
7,False,False,True,False,True,2500
8,False,False,True,False,True,2501


___
[Back to the top](#Contents)

# Label Encoding

#### `Ordinal Encoding`

In [40]:
# Explicit rank for every size, so the integers follow the natural ordering
# small < medium < large < extra_large.
category_to_number = dict(small=1, medium=2, large=3, extra_large=4)

df = pd.DataFrame(
    {'category': ['small', 'medium', 'large', 'small', 'extra_large', 'medium']}
)
df['category_new'] = df['category'].map(category_to_number)
df


Unnamed: 0,category,category_new
0,small,1
1,medium,2
2,large,3
3,small,1
4,extra_large,4
5,medium,2


#### `Frequency Encoding`

In [43]:
# Sample data: 'small' and 'medium' occur twice, the other sizes once.
df = pd.DataFrame(
    {'category': 'small medium large small extra_large medium'.split()}
)

def frequency_encode(dataframe, column_name):
    """
    Adds a column holding, for each row, how often that row's category occurs.

    The counts are written to a new column named `<column_name>_freq_encoded`.
    The input dataframe is modified in place and the same object is returned.

    Parameters:
        dataframe (pd.DataFrame): The input dataframe.
        column_name (str): The name of the categorical column to encode.

    Returns:
        pd.DataFrame: The input dataframe with the count column added.
    """
    source = dataframe[column_name]
    encoded_name = column_name + '_freq_encoded'

    # Number of occurrences per category, looked up for every row
    occurrences = source.value_counts()
    dataframe[encoded_name] = source.map(occurrences)

    return dataframe

# Add the count-based encoding next to the original column and display the result.
df = frequency_encode(dataframe=df, column_name='category')
df

Unnamed: 0,category,category_freq_encoded
0,small,2
1,medium,2
2,large,1
3,small,2
4,extra_large,1
5,medium,2


#### `Target Encoding`

In [50]:
def target_encode(dataframe, column_name, target_column, statistic='mean'):
    """
    Adds a column holding a per-category statistic of the target variable.

    Rows are grouped by `column_name`, the chosen statistic of `target_column`
    is computed for each group, and every row receives the value of its group in
    a new column named `<column_name>_<statistic>_encoded`. The statistic is
    computed over all rows of `dataframe`. The input dataframe is modified in
    place and the same object is returned.

    Parameters:
        dataframe (pd.DataFrame): The input dataframe.
        column_name (str): The name of the categorical column to encode.
        target_column (str): The name of the target column.
        statistic (str): The statistic to use: 'mean', 'median' or 'std'.

    Returns:
        pd.DataFrame: The input dataframe with the encoded column added.

    Raises:
        ValueError: If `statistic` is not one of the supported values.
    """
    supported_statistics = ('mean', 'median', 'std')
    if statistic not in supported_statistics:
        raise ValueError("Invalid statistic. Supported values are 'mean', 'median', and 'std'.")

    # Call the groupby method that has the same name as the requested statistic
    target_by_category = dataframe.groupby(column_name)[target_column]
    agg_stat = getattr(target_by_category, statistic)()

    # Look up each row's category in the per-category result
    encoded_name = column_name + f'_{statistic}_encoded'
    dataframe[encoded_name] = dataframe[column_name].map(agg_stat)

    return dataframe

# Ten observations over three categories with a binary target.
data = {
    'category': list('ABACBACCCB'),
    'target': [1, 0, 1, 1, 0, 1, 1, 0, 0, 0],
}
# Sort by category so rows of the same category are shown together
df = pd.DataFrame(data).sort_values('category')

df = target_encode(df, column_name='category', target_column='target', statistic='mean')
df


Unnamed: 0,category,target,category_mean_encoded
0,A,1,1.0
2,A,1,1.0
5,A,1,1.0
1,B,0,0.0
4,B,0,0.0
9,B,0,0.0
3,C,1,0.5
6,C,1,0.5
7,C,0,0.5
8,C,0,0.5


___
[Back to the top](#Contents)

# Hash Function
- Converts categorical data into numerical data using a hash function.
- Efficient for high-cardinality categorical fields with limited memory.
- May result in hash collisions, where multiple categories map to the same value.
- **Rarely used** in business applications; mostly omitted.


In [52]:
import pandas as pd
import hashlib

def hash_encode(dataframe, column_name, num_buckets=10):
    """
    Encodes a categorical column using a hash function.

    Each value is converted to a string, hashed with MD5, and reduced modulo
    `num_buckets`. The result is written to a new column named
    `<column_name>_hash_encoded`. The input dataframe is modified in place and
    the same object is returned. Different categories can land in the same
    bucket (hash collisions).

    Parameters:
        dataframe (pd.DataFrame): The input dataframe.
        column_name (str): The name of the categorical column to encode.
        num_buckets (int): The number of buckets for the hash function.
                           Categories will be mapped to integers in the range [0, num_buckets-1].

    Returns:
        pd.DataFrame: The input dataframe with the hashed encoding column added.

    Raises:
        ValueError: If `num_buckets` is not positive.
    """
    # A zero modulus raises ZeroDivisionError and a negative one yields values
    # outside [0, num_buckets-1], so reject both before touching the dataframe.
    if num_buckets <= 0:
        raise ValueError("num_buckets must be a positive integer.")

    def hash_function(value):
        # MD5 is used only as a deterministic hash: unlike the built-in hash()
        # on strings, its result does not change between interpreter runs.
        digest = hashlib.md5(str(value).encode()).hexdigest()
        return int(digest, 16) % num_buckets

    # Apply the hash function to the column
    dataframe[column_name + '_hash_encoded'] = dataframe[column_name].apply(hash_function)
    return dataframe

# Example usage: six distinct categories hashed into five buckets,
# so at least two categories have to share a bucket.
data = {'Category': list('ABCDEFACED')}
df = hash_encode(pd.DataFrame(data), column_name='Category', num_buckets=5)
df

Unnamed: 0,Category,Category_hash_encoded
0,A,0
1,B,3
2,C,0
3,D,0
4,E,2
5,F,0
6,A,0
7,C,0
8,E,2
9,D,0
