## Getting ready

In [11]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

In [12]:
import string
import random

def random_id(length=8):
    """Return a random identifier made of lowercase letters and digits.

    Parameters
    ----------
    length : int, default 8
        Number of characters to draw. A value of zero (or less) yields an
        empty string.

    Returns
    -------
    str
        ``length`` characters, each drawn independently with
        ``random.choice`` from ``a-z0-9``.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

# Synthetic frame with three high-cardinality string columns of 500 rows each:
# 'high_cat_1' holds 2-character ids, 'high_cat_2' 3-character ids and
# 'high_cat_3' 4-character ids, all generated by random_id.
example = pd.DataFrame({
    f'high_cat_{position}': [random_id(length=position + 1) for _ in range(500)]
    for position in (1, 2, 3)
})

In [13]:
# Display the synthetic frame of random string identifiers built in the previous cell.
example

Unnamed: 0,high_cat_1,high_cat_2,high_cat_3
0,ax,4n9,dwsu
1,kh,cez,3jbp
2,5g,z4v,jao8
3,xa,jkl,70zv
4,55,ko7,3zkr
...,...,...,...
495,ae,42l,96ol
496,7v,2g9,3vtn
497,zm,yyd,5ndi
498,cl,7pd,hok2


## How to do it

In [14]:
class LEncoder(BaseEstimator, TransformerMixin):
    """Label-encode every column of a DataFrame with its own dictionary.

    For each column, ``fit`` builds a mapping from the observed values to
    consecutive integer codes and adds two reserved entries:

    * ``'_nan'`` - the code used for missing values (they are replaced by the
      string ``'_nan'`` before encoding);
    * ``'_unk'`` - the code used by ``transform`` for values that were not
      present when the encoder was fitted.

    Attributes
    ----------
    encoders : dict
        Maps each column name to its ``{value: code}`` dictionary.
    dictionary_size : list of int
        Number of entries of each column's dictionary (reserved entries
        included), in column order.
    unk : int
        ``'_unk'`` code of the last fitted column; ``-1`` before fitting.
        Per-column codes are available as ``encoders[col]['_unk']``.
    """

    def __init__(self):
        self.encoders = dict()
        self.dictionary_size = list()
        self.unk = -1

    def fit(self, X, y=None, **fit_params):
        """Learn one value-to-code dictionary per column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Columns to encode; accessed positionally with ``iloc`` and by
            name through ``X.columns``.
        y : ignored
        **fit_params : ignored

        Returns
        -------
        LEncoder
            The fitted encoder itself.
        """
        # Start from a clean slate so that fitting the same instance again
        # does not keep appending to the results of a previous fit.
        self.encoders = dict()
        self.dictionary_size = list()
        self.unk = -1

        for col in range(X.shape[1]):
            le = LabelEncoder()
            le.fit(X.iloc[:, col].fillna('_nan'))
            # classes_ holds the sorted unique values, so a value's position
            # in it is exactly the code LabelEncoder.transform would return.
            le_dict = {label: code for code, label in enumerate(le.classes_)}

            # Codes are contiguous from 0, hence len(le_dict) is always the
            # next unused one. Giving the reserved entries fresh codes keeps
            # them from colliding with a real category, and also works when
            # the column contributed no classes at all.
            le_dict.setdefault('_nan', len(le_dict))
            le_dict.setdefault('_unk', len(le_dict))

            self.unk = le_dict['_unk']
            self.dictionary_size.append(len(le_dict))
            col_name = X.columns[col]
            self.encoders[col_name] = le_dict

        return self

    def transform(self, X, y=None, **fit_params):
        """Encode each column of ``X`` with the dictionary learned for it.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose column names were all present during ``fit``.
        y : ignored
        **fit_params : ignored

        Returns
        -------
        list of pandas.Series
            One ``int32`` Series per column, in column order, named after the
            column and carrying a fresh default index.

        Raises
        ------
        KeyError
            If a column name of ``X`` has no fitted dictionary.
        """
        output = list()
        for col in range(X.shape[1]):
            col_name = X.columns[col]
            le_dict = self.encoders[col_name]
            unk_code = le_dict['_unk']
            filled = X.iloc[:, col].fillna('_nan')
            # Values never seen during fit fall back to the '_unk' code.
            codes = [le_dict.get(value, unk_code) for value in filled]
            output.append(pd.Series(codes, name=col_name, dtype=np.int32))
        return output

    def fit_transform(self, X, y=None, **fit_params):
        """Fit the encoder on ``X`` and return the encoding of ``X``."""
        self.fit(X, y, **fit_params)
        return self.transform(X)

## How it works

In [15]:
# Fit the encoder on the example frame and encode it in one step.
# The result is a list with one int32 Series per column of `example`.
le = LEncoder()
le.fit_transform(example)

[0      111
 1      228
 2       58
 3      377
 4       50
       ... 
 495    106
 496     74
 497    403
 498    134
 499    167
 Name: high_cat_1, Length: 500, dtype: int32,
 0       73
 1      160
 2      476
 3      273
 4      285
       ... 
 495     68
 496     33
 497    472
 498    103
 499     77
 Name: high_cat_2, Length: 500, dtype: int32,
 0      212
 1       42
 2      284
 3      102
 4       50
       ... 
 495    141
 496     49
 497     79
 498    272
 499    120
 Name: high_cat_3, Length: 500, dtype: int32]

## There's more...

In [16]:
# Number of entries in each column's label dictionary, in column order;
# the counts include the reserved '_nan' and '_unk' entries.
le.dictionary_size

[409, 493, 502]