# Encoding High-Cardinality Categorical Data Using Empirical Bayes 
Article: Daniele Micci-Barreca, "A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems", ACM SIGKDD Explorations Newsletter 3(1), 2001.
    
[Source: Kaggle kernel by rakhlin for the Two Sigma Connect: Rental Listing Inquiries competition](https://www.kaggle.com/rakhlin/two-sigma-connect-rental-listing-inquiries/another-python-version-of-it-is-lit-by-branden)

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from itertools import product

# Read the training and test listings, then report each frame's (rows, columns).
X_train, X_test = (pd.read_json(json_path) for json_path in ('train.json', 'test.json'))
print(X_train.shape, X_test.shape, sep='\n')

(49352, 15)
(74659, 14)


In [2]:
# Encode the target as integers (low=0, medium=1, high=2), add one 0/1 indicator
# column per class (pred_0, pred_1, pred_2), and use each indicator's mean as that
# class's prior probability.
interest_codes = {'low': 0, 'medium': 1, 'high': 2}
# Series.map plus an explicit cast is used instead of DataFrame.replace: replace only
# yielded an integer column through implicit object downcasting, which pandas has
# deprecated. Any label missing from interest_codes maps to NaN and makes the cast
# raise instead of slipping through silently.
X_train = X_train.assign(
    interest_level=X_train['interest_level'].map(interest_codes).astype(int)
)
target_indicators = pd.get_dummies(X_train['interest_level'], prefix='pred').astype(int)
X_train = X_train.join(target_indicators)
prior_0, prior_1, prior_2 = X_train[['pred_0', 'pred_1', 'pred_2']].mean()

In [3]:
# Show the integer target next to its indicator columns for the first two rows.
X_train.head(2)[['interest_level', 'pred_0', 'pred_1', 'pred_2']]

Unnamed: 0,interest_level,pred_0,pred_1,pred_2
10,1,0,1,0
10000,0,1,0,0


In [4]:
# Print the three class priors, one per line.
print(prior_0, prior_1, prior_2, sep='\n')

0.694683092884
0.227528772897
0.0777881342195


In [5]:
def hcc_encode(X_train, X_test, variable, target, prior_prob, k, f=1, g=1, r_k=None, update_df=None):
    """Encode a categorical column as a smoothed per-level mean of ``target``.

    For each level of ``variable`` found in ``X_train`` the encoding blends the
    level's mean of ``target`` with ``prior_prob``::

        lambda(n) = 1 / (g + exp((k - n) / f))
        encoded   = lambda(n) * level_mean + (1 - lambda(n)) * prior_prob

    where ``n`` is the number of ``X_train`` rows having that level. Rows of
    ``X_test`` whose level does not occur in ``X_train`` receive ``prior_prob``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Rows used to compute the per-level counts and means.
    X_test : pandas.DataFrame
        Rows to encode; must contain ``variable``.
    variable : str
        Name of the categorical column to encode.
    target : str
        Name of the numeric column in ``X_train`` whose mean is taken.
    prior_prob : float
        Value blended in for small levels and used for unseen levels.
    k : float
        Level count at which the exponent is zero (with ``g=1`` the blend
        weight is then exactly 0.5).
    f : float, default 1
        Divisor of ``k - n`` in the exponent; larger values flatten the
        transition between prior and level mean.
    g : float, default 1
        Constant added to the exponential in the denominator of ``lambda``.
    r_k : float or None, default None
        If truthy, every encoded value is multiplied by an independent draw
        from ``uniform(1 - r_k, 1 + r_k)`` taken from NumPy's global RNG.
    update_df : pandas.DataFrame or None, default None
        Frame that receives the result; defaults to ``X_test``.

    Returns
    -------
    None
        The encoding is written in place to column ``hcc_<variable>_<target>``
        of ``update_df`` (created as NaN if missing), matched on index labels.
    """
    hcc_name = '_'.join(['hcc', variable, target])

    # List-style aggregation yields columns named 'size' and 'mean'. The dict form
    # {'size': 'size', 'mean': 'mean'} on a SeriesGroupBy is rejected by pandas >= 1.0.
    stats = X_train.groupby(variable)[target].agg(['size', 'mean'])
    weight = 1 / (g + np.exp((k - stats['size']) / f))
    level_encoding = weight * stats['mean'] + (1 - weight) * prior_prob

    # Look up each test row's level; levels unseen in X_train fall back to the prior.
    encoded = X_test[variable].map(level_encoding).fillna(prior_prob).rename(hcc_name)
    if r_k:
        # Multiplicative uniform noise; not part of the original paper.
        encoded = encoded * np.random.uniform(1 - r_k, 1 + r_k, len(X_test))

    if update_df is None:
        update_df = X_test
    if hcc_name not in update_df.columns:
        update_df[hcc_name] = np.nan
    # DataFrame.update aligns on index labels, so only the rows present in X_test
    # are overwritten when update_df is a larger frame.
    update_df.update(encoded)
    return

In [6]:
# Seed NumPy's global RNG so the multiplicative noise (r_k) added to the training
# encodings below is the same on every Restart & Run All.
np.random.seed(0)

skf = StratifiedKFold(5)
# Every (categorical column, (target indicator, prior)) combination to encode.
attributes = product(('building_id', 'manager_id'),
                     zip(('pred_1', 'pred_2'), (prior_1, prior_2)))
for variable, (target, prior) in attributes:
    # Test set: statistics come from the full training frame, no noise.
    hcc_encode(X_train, X_test, variable, target, prior, k=5, r_k=None)
    # Training set: each fold is encoded from the statistics of the other folds and
    # written back into X_train, with a small amount of noise.
    for train, test in skf.split(np.zeros(len(X_train)), X_train['interest_level']):
        hcc_encode(X_train.iloc[train], X_train.iloc[test], variable, target, prior, k=5, r_k=0.01, update_df=X_train)

In [15]:
# Show the last seven columns (target indicators and the new hcc_* features)
# for the first five training rows.
X_train.head().iloc[:, -7:]

Unnamed: 0,pred_0,pred_1,pred_2,hcc_building_id_pred_1,hcc_building_id_pred_2,hcc_manager_id_pred_1,hcc_manager_id_pred_2
10,0,1,0,0.21863,0.07347387,0.283531,0.0
10000,1,0,0,0.112172,1.751504e-07,0.012516,0.0
100004,0,0,1,0.46198,0.0938732,0.354482,0.031366
100007,1,0,0,0.095932,0.01185935,0.151803,0.084121
100013,1,0,0,0.059042,0.02285231,0.000565,0.000192
