In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from LabelEncoder import LabelEncoder
from sklearn.preprocessing import LabelEncoder as skLE
import numpy as np
import pandas as pd

In [3]:
# Labels used for fitting: repeated city names followed by non-city entries
# of mixed type (the string "-1", the float -1.0, the int -999 and NaN).
y_train = [
    "paris", "paris", "tokyo",
    "-1", -1.0, -999, np.nan,
]

# Labels for transform that all also occur in y_train.
y_test1 = [
    "paris", "tokyo", "paris",
    -999, np.nan,
]

# Labels for transform that include two values ("tel aviv", "amsterdam")
# which do not occur in y_train.
y_test2 = [
    "paris", "paris", "tokyo",
    "tel aviv", "amsterdam",
    -999, np.nan, "-1",
]

## Goal is to transform categorical entries (strings) to ordinal ones (ints)

In [4]:
skle = skLE()

In [5]:
skle.fit_transform(y_train)

array([4, 4, 5, 0, 1, 2, 3])

In [6]:
skle.classes_

array(['-1', '-1.0', '-999', 'nan', 'paris', 'tokyo'],
      dtype='|S5')

In [7]:
skle.transform(y_test1)

array([4, 5, 4, 2, 3])

## Problem: the sklearn transformer raises an error if it encounters new labels during transform.

In [8]:
# y_test2 contains labels that skle did not see during fit, so transform
# raises ValueError. Catch and print it so the failure is still shown but a
# Restart & Run All continues past this cell.
try:
    skle.transform(y_test2)
except ValueError as err:
    print("ValueError: %s" % err)

ValueError: y contains new labels: ['amsterdam' 'tel aviv']

## Solution: use a transformer that can deal with new data.

### When training and test data are the same, LabelEncoder fit_transform plays the same role as sklearn fit_transform (the integer code assigned to each class differs, as the outputs show)

In [9]:
le = LabelEncoder()

In [10]:
f = le.fit_transform(y_train)
f

array([0, 0, 1, 3, 2, 4, 5])

In [11]:
# Show the Python type of every encoded element, one per line.
# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
for e in f:
    print(type(e))

<type 'numpy.int64'>
<type 'numpy.int64'>
<type 'numpy.int64'>
<type 'numpy.int64'>
<type 'numpy.int64'>
<type 'numpy.int64'>
<type 'numpy.int64'>


In [12]:
le.train_classes

0    paris
1    tokyo
2       -1
3       -1
4     -999
5      NaN
dtype: object

In [13]:
le.test_classes

0     -999
1       -1
2      NaN
3       -1
4    paris
5    tokyo
dtype: object

### When fit and transform are done with different data, by default all unknown values encountered during transform are assigned the single int value n, where n is the number of known values (i.e. one more than the largest known code).

In [28]:
# Re-display the three input lists for reference.
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(y_train)
print(y_test1)
print(y_test2)

['paris', 'paris', 'tokyo', '-1', -1.0, -999, nan]
['paris', 'tokyo', 'paris', -999, nan]
['paris', 'paris', 'tokyo', 'tel aviv', 'amsterdam', -999, nan, '-1']


In [29]:
le = LabelEncoder()

In [30]:
le.fit(y_train)

In [31]:
le.transform(y_test2)

y contains new labels!


array([0, 0, 1, 6, 6, 4, 5, 3])

In [32]:
le.train_classes

0    paris
1    tokyo
2       -1
3       -1
4     -999
5      NaN
dtype: object

In [33]:
le.test_classes

0         -999
1          NaN
2           -1
3    amsterdam
4        paris
5     tel aviv
6        tokyo
dtype: object

In [34]:
le.unknown_classes

3    amsterdam
5     tel aviv
dtype: object

### Alternatively, one can assign the most common class encountered during the fit to unknown values encountered during the transform

In [35]:
# Re-display the three input lists for reference.
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(y_train)
print(y_test1)
print(y_test2)

['paris', 'paris', 'tokyo', '-1', -1.0, -999, nan]
['paris', 'tokyo', 'paris', -999, nan]
['paris', 'paris', 'tokyo', 'tel aviv', 'amsterdam', -999, nan, '-1']


In [36]:
le = LabelEncoder(use_mode=True)

In [37]:
le.fit(y_train)

In [38]:
le.most_common_class

0    paris
dtype: object

In [40]:
le.transform(y_test2)

y contains new labels!


array([0, 0, 1, 0, 0, 4, 5, 3])

In [41]:
le.unknown_classes

3    amsterdam
5     tel aviv
dtype: object

### Or drop unknown values when encountered

In [42]:
# Fresh encoder configured to drop labels that were not seen during fit.
le = LabelEncoder(drop_unknown_classes=True)
le.fit(y_train)
# y_test2 has 8 entries, two of which are absent from y_train; the result
# shown below has only 6 codes, i.e. the unknown entries are removed.
le.transform(y_test2)

y contains new labels!


array([0, 0, 1, 4, 5, 3])

### One can also retain nan values in both training and test data

In [43]:
# Re-display the three input lists for reference.
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(y_train)
print(y_test1)
print(y_test2)

['paris', 'paris', 'tokyo', '-1', -1.0, -999, nan]
['paris', 'tokyo', 'paris', -999, nan]
['paris', 'paris', 'tokyo', 'tel aviv', 'amsterdam', -999, nan, '-1']


In [44]:
le = LabelEncoder(nan_classes=[np.nan, -999, '-1'], retain_nan=True)

In [45]:
le.nan_classes

[nan, -999, '-1']

In [46]:
le.fit_transform(y_train)

array([0, 0, 1, '-1', 2, -999, nan], dtype=object)

In [47]:
le.train_classes

0    paris
1    tokyo
2       -1
dtype: object

In [48]:
f = le.transform(y_test2)
f

y contains new labels!


array([0, 0, 1, 3, 3, -999, nan, '-1'], dtype=object)

In [49]:
# Show the Python type of every element of the transformed output; with
# retain_nan=True the retained values keep their original types.
# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
for e in f:
    print(type(e))

<type 'int'>
<type 'int'>
<type 'int'>
<type 'int'>
<type 'int'>
<type 'int'>
<type 'float'>
<type 'str'>


In [50]:
le.test_classes

3    amsterdam
4        paris
5     tel aviv
6        tokyo
dtype: object

#### with use_mode=True

In [51]:
le = LabelEncoder(nan_classes=[np.nan, -999, '-1'], use_mode=True, retain_nan=True)

In [52]:
le.fit(y_train)

In [53]:
le.transform(y_test1)

array([0, 1, 0, -999, nan], dtype=object)

In [54]:
le.transform(y_test2)

y contains new labels!


array([0, 0, 1, 0, 0, -999, nan, '-1'], dtype=object)

#### with drop_unknown_classes=True

In [55]:
# Combine retain_nan with drop_unknown_classes: values listed in nan_classes
# are kept as-is in the output, while labels not seen during fit are removed.
le = LabelEncoder(nan_classes=[np.nan, -999, '-1'], retain_nan=True, 
                  drop_unknown_classes=True)
le.fit(y_train)
# 8 inputs in y_test2 -> 6 outputs below ("tel aviv" and "amsterdam" dropped).
le.transform(y_test2)

y contains new labels!


array([0, 0, 1, -999, nan, '-1'], dtype=object)

### Inverse transform also works as expected

In [64]:
le = LabelEncoder(nan_classes=[np.nan, -999, '-1'], use_mode=False, retain_nan=True)

In [65]:
le.fit(y_train)
f = le.transform(y_test2)
f

y contains new labels!


array([0, 0, 1, 3, 3, -999, nan, '-1'], dtype=object)

In [66]:
le.unknown_classes

3    amsterdam
5     tel aviv
dtype: object

In [67]:
le.inverse_transform(f)

vector contains unknown indices!


array(['paris', 'paris', 'tokyo', 'unknown', 'unknown', -999, nan, '-1'], dtype=object)

In [68]:
# Same round trip with retain_nan=False: every entry is encoded as an int,
# and inverse_transform maps the codes back, reporting labels that were not
# seen during fit as 'unknown' (see the output below).
le = LabelEncoder(nan_classes=[np.nan, -999, '-1'], use_mode=False, retain_nan=False)
le.fit(y_train)
f = le.transform(y_test2)
# Single-argument print(...) gives the same output on Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print(f)
print(le.inverse_transform(f))

y contains new labels!
[0 0 1 6 6 4 5 3]
vector contains unknown indices!
['paris' 'paris' 'tokyo' 'unknown' 'unknown' -999 nan '-1']
