In [1]:
from sklearn.base import TransformerMixin
import pandas as pd
import numpy as np

In [2]:
# from sklearn.base import TransformerMixin

class OrdinalEncoder(TransformerMixin):
    """Encode categorical DataFrame columns as ordinal numbers.

    During ``fit`` the encoder learns, for every encoded column, the relative
    frequency of each distinct value and assigns ordinals in the order
    returned by ``Series.value_counts`` (the most frequent value gets 0; the
    order among equally frequent values is whatever pandas returns).
    Missing values are counted as a category of their own and stored under
    the key ``-999``; a genuine ``-999`` in the data therefore shares that
    category.

    If an unknown value is encountered during ``transform`` it is imputed
    from what was learned during ``fit`` according to
    ``impute_probabilities``.

    Parameters
    ----------
    cols : iterable of column labels, optional
        Columns to encode. When omitted, every column of the frame passed
        to ``fit`` is encoded (and ``cols`` is set to that list).
    retain_na : bool, default True
        If True, cells that were missing in the input of ``transform`` are
        missing (NaN) in its output. If False, they receive the ordinal
        learned for missing values, or are imputed like any unknown value
        when no missing value was seen during ``fit``.
    impute_probabilities : {'NaN', 'prob', 'mode'}, default 'NaN'
        How to encode a value not seen during ``fit``: ``'prob'`` draws an
        ordinal at random using the learned frequencies, ``'mode'`` uses the
        ordinal of the most frequent value, anything else yields NaN.

    Attributes
    ----------
    vals_probs_dict : dict
        ``{column: {value: relative frequency}}`` learned by ``fit``.
    vals_range_dict : dict
        ``{column: {value: ordinal}}`` learned by ``fit``.
    is_nan : DataFrame of bool
        Missing-value mask of the frame most recently passed to
        ``transform``.
    """

    # Key under which missing values are stored in the learned dictionaries.
    _NA_SENTINEL = -999

    def __init__(self, cols=None, retain_na=True, impute_probabilities='NaN'):
        # Remember whether the columns must be inferred, so that a later
        # refit on a different frame picks up that frame's columns.
        self._infer_cols = cols is None
        self.cols = None if cols is None else list(cols)
        self.retain_na = retain_na
        self.impute_probabilities = impute_probabilities  # one of: NaN, prob, mode

    @staticmethod
    def _is_missing(val):
        """Return True only for scalar missing values (NaN, None, NaT, NA)."""
        missing = pd.isnull(val)
        # pd.isnull returns an array for list-like cells; those are not
        # "missing" in the scalar sense.
        return bool(missing) if isinstance(missing, (bool, np.bool_)) else False

    def fit(self, X, y=None):
        """Learn value frequencies and ordinal codes for each encoded column.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; must contain every column in ``cols``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        if getattr(self, '_infer_cols', self.cols is None):
            self.cols = list(X.columns)

        self.vals_probs_dict = {}
        self.vals_range_dict = {}
        for colname in self.cols:
            # Replace missing cells by the sentinel value by value instead of
            # DataFrame.fillna, which fails on dtypes that cannot hold an int
            # and would also touch columns that are not being encoded.
            keys = [self._NA_SENTINEL if self._is_missing(val) else val
                    for val in X[colname]]
            val_counts = pd.Series(keys, dtype=object).value_counts()

            # relative frequency of every value
            val_probs = val_counts / val_counts.sum()
            self.vals_probs_dict[colname] = val_probs.to_dict()

            # ordinal codes follow the value_counts order
            self.vals_range_dict[colname] = dict(
                zip(val_counts.index, range(len(val_counts))))

        return self

    def transform(self, X):
        """Replace the values of the encoded columns by their ordinals.

        Columns not listed in ``cols`` are returned unchanged.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to encode; must contain every column in ``cols``.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X`` with the encoded columns replaced.
        """
        X_ = X.copy()
        self.is_nan = X_.isnull()

        for colname in self.cols:
            probs = self.vals_probs_dict[colname]
            # First key with the highest frequency; None when nothing was
            # learned for this column (e.g. fit on an empty frame).
            most_common_key = max(probs, key=probs.get) if probs else None

            encoded = X_[colname].apply(
                lambda val: self._search_fun(colname, val, most_common_key))
            if self.retain_na:
                # mask() upcasts integer codes to float when needed, instead
                # of writing NaN in place into an integer column.
                encoded = encoded.mask(self.is_nan[colname])
            X_[colname] = encoded

        return X_

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the encoded copy of ``X``."""
        return self.fit(X, y=y).transform(X)

    def _search_fun(self, colname, val, most_common_key):
        """Return the ordinal for ``val`` in column ``colname``.

        Missing values are looked up under the sentinel key. Values not seen
        during ``fit`` are imputed according to ``impute_probabilities``;
        ``most_common_key`` is the value whose ordinal is used in ``'mode'``
        imputation.
        """
        codes = self.vals_range_dict[colname]
        key = self._NA_SENTINEL if self._is_missing(val) else val
        try:
            return codes[key]
        except KeyError:
            pass

        if not codes:
            # Nothing was learned for this column, so nothing to impute from.
            return np.nan
        if self.impute_probabilities == 'prob':
            probs = self.vals_probs_dict[colname]
            # Pair every ordinal with its own probability by key rather than
            # relying on two dicts iterating in the same order.
            known = list(codes)
            return np.random.choice([codes[k] for k in known],
                                    p=[probs[k] for k in known])
        if self.impute_probabilities == 'mode':
            return codes[most_common_key]
        return np.nan

In [3]:
# Toy training frame: two categorical columns, each with one missing value.
df_train = pd.DataFrame({
    'a': ['foo', 'bar', 'bar', np.nan],
    'b': ['do', 're', np.nan, 'mi'],
})

In [4]:
# Fit the encoder on the training frame and display the encoded result.
# With the default cols=None every column of df_train is encoded, and with
# the default retain_na=True the missing cells come back as NaN.
oe = OrdinalEncoder()
oe.fit_transform(df_train)

Unnamed: 0,a,b
0,1.0,2.0
1,0.0,3.0
2,0.0,
3,,0.0


In [5]:
# Toy test frame: 'bez' in column 'a' does not occur in df_train.
df_test = pd.DataFrame({
    'a': ['foo', 'bez', np.nan],
    'b': ['do', 're', np.nan],
})
df_test

Unnamed: 0,a,b
0,foo,do
1,bez,re
2,,


In [6]:
# Encode the test frame with the already fitted encoder. 'bez' was not seen
# during fit, so with the default impute_probabilities='NaN' it becomes NaN.
oe.transform(df_test)

Unnamed: 0,a,b
0,1.0,2.0
1,,3.0
2,,
