In [202]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [203]:
from LabelEncoder import LabelEncoder
from sklearn.preprocessing import LabelEncoder as skLE
import numpy as np
import pandas as pd

In [263]:
# Training labels: a deliberately messy mix of str, float, int and NaN entries.
# NOTE(review): the recorded outputs of the lower-numbered cells (In [206]-In [248])
# show an additional 'foo' element in y_train, so they were produced by an earlier
# version of this cell (this one is In [263]). Re-run the notebook top to bottom
# to bring those outputs back in sync with this definition.
y_train = ["paris", "paris", "tokyo", '-1', -1.0, -999, np.nan]
# Test labels that all occur in y_train as well.
y_test1 = ["paris", "tokyo", "paris", -999, np.nan]
# Test labels that include values absent from y_train ("tel aviv", "amsterdam").
y_test2 = ["paris", "paris", "tokyo", "tel aviv", "amsterdam", -999, np.nan, '-1']

## Goal is to transform categorical entries (strings) to ordinal ones (ints)

In [205]:
skle = skLE()

In [206]:
skle.fit_transform(y_train)

array([5, 5, 6, 0, 1, 2, 3, 4])

In [207]:
skle.classes_

array(['-1', '-1.0', '-999', 'foo', 'nan', 'paris', 'tokyo'],
      dtype='|S5')

In [208]:
skle.transform(y_test1)

array([5, 6, 5, 2, 4])

## Problem: sklearn's LabelEncoder raises an error if, during transform, it encounters labels it did not see during fit.

In [19]:
# The failure is the point of this cell: sklearn's encoder rejects labels it did
# not see during fit. Catch the error and print it so the demonstration stays
# visible without aborting a "Restart & Run All".
try:
    skle.transform(y_test2)
except ValueError as err:
    print("ValueError: %s" % err)

ValueError: y contains new labels: ['amsterdam' 'tel aviv']

## Solution: use a transformer that can deal with new data.

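### When training and test data are the same, LabelEncoder's fit_transform is equivalent to sklearn's fit_transform: every distinct label gets its own integer code (the order in which codes are assigned can differ, as the outputs below show)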

In [211]:
le = LabelEncoder()

In [213]:
# Fit on y_train and encode it in one step; the result is kept in `f` so the
# following cell can inspect the type of each encoded element.
f = le.fit_transform(y_train)
# Bare expression: display the encoded array as the cell output.
f

array([5, 5, 6, 3, 1, 0, 4, 2])

In [214]:
# Show the type of every encoded element. The single-argument print(...) form
# prints the same text as the Python 2 print statement and also parses on
# Python 3 (where the repr reads <class '...'> instead of <type '...'>).
for e in f:
    print(type(e))

<type 'numpy.int64'>
<type 'numpy.int64'>
<type 'numpy.int64'>
<type 'numpy.int64'>
<type 'numpy.int64'>
<type 'numpy.int64'>
<type 'numpy.int64'>
<type 'numpy.int64'>


In [215]:
le.train_classes

0     -999
1       -1
2      NaN
3       -1
4      foo
5    paris
6    tokyo
dtype: object

In [216]:
le.test_classes

0     -999
1       -1
2      NaN
3       -1
4      foo
5    paris
6    tokyo
dtype: object

### When fit and transform are done with different data, by default all unknown values encountered during transform are assigned a single int code that is one greater than the largest code assigned during fit (7 in the example below, where the known classes are coded 0–6).

In [217]:
# Echo the three label lists for reference, one per line. The single-argument
# print(...) form prints the same text on Python 2 and also parses on Python 3.
for labels in (y_train, y_test1, y_test2):
    print(labels)

['paris', 'paris', 'tokyo', '-1', -1.0, -999, 'foo', nan]
['paris', 'tokyo', 'paris', -999, nan]
['paris', 'paris', 'tokyo', 'tel aviv', 'amsterdam', -999, nan, '-1']


In [218]:
le = LabelEncoder()

In [219]:
le.fit(y_train)

In [220]:
le.transform(y_test2)

y contains new labels!


array([5, 5, 6, 7, 7, 0, 2, 3])

In [221]:
le.train_classes

0     -999
1       -1
2      NaN
3       -1
4      foo
5    paris
6    tokyo
dtype: object

In [222]:
le.test_classes

0         -999
1          NaN
2           -1
3    amsterdam
4        paris
5     tel aviv
6        tokyo
dtype: object

In [223]:
le.unknown_classes

3    amsterdam
5     tel aviv
dtype: object

### Alternatively, one can assign the most common class encountered during the fit to unknown values encountered during the transform

In [236]:
# Echo the three label lists for reference, one per line. The single-argument
# print(...) form prints the same text on Python 2 and also parses on Python 3.
for labels in (y_train, y_test1, y_test2):
    print(labels)

['paris', 'paris', 'tokyo', '-1', -1.0, -999, 'foo', nan]
['paris', 'tokyo', 'paris', -999, nan]
['paris', 'paris', 'tokyo', 'tel aviv', 'amsterdam', -999, nan, '-1']


In [243]:
le = LabelEncoder(use_mode=True)

In [244]:
le.fit(y_train)

In [245]:
le.most_common_class

5    paris
dtype: object

In [246]:
le.transform(y_test2)

y contains new labels!


array([5, 5, 6, 5, 5, 0, 2, 3])

In [247]:
le.unknown_classes

3    amsterdam
5     tel aviv
dtype: object

### Or drop unknown values when encountered

In [248]:
# drop_unknown_classes=True: labels that were not seen during fit are removed
# from the transformed output instead of being encoded, so the result can be
# shorter than the input (the recorded output has 6 codes for the 8 labels in
# y_test2).
le = LabelEncoder(drop_unknown_classes=True)
le.fit(y_train)
# Bare expression: display the encoded array as the cell output.
le.transform(y_test2)

y contains new labels!


array([5, 5, 6, 0, 2, 3])

### One can also retain nan values (the entries listed in `nan_classes`) unencoded in both training and test data

In [264]:
# Echo the three label lists for reference, one per line. The single-argument
# print(...) form prints the same text on Python 2 and also parses on Python 3.
for labels in (y_train, y_test1, y_test2):
    print(labels)

['paris', 'paris', 'tokyo', '-1', -1.0, -999, nan]
['paris', 'tokyo', 'paris', -999, nan]
['paris', 'paris', 'tokyo', 'tel aviv', 'amsterdam', -999, nan, '-1']


In [265]:
le = LabelEncoder(nan_classes=[np.nan, -999, '-1'], retain_nan=True)

In [266]:
le.nan_classes

[nan, -999, '-1']

In [267]:
le.fit_transform(y_train)

array([4, 4, 5, '-1', 1, -999, nan], dtype=object)

In [268]:
le.train_classes

1       -1
4    paris
5    tokyo
dtype: object

In [269]:
# Encode y_test2, which contains labels absent from y_train; the result is kept
# in `f` so the following cell can inspect the type of each element.
f = le.transform(y_test2)
# Bare expression: display the encoded array as the cell output.
f

y contains new labels!


array([4, 4, 5, 6, 6, -999, nan, '-1'], dtype=object)

In [270]:
# Show the type of every element of the transformed vector; with retain_nan=True
# the recorded output mixes int codes with the retained float/str entries.
# The single-argument print(...) form prints the same text as the Python 2 print
# statement and also parses on Python 3 (where the repr reads <class '...'>).
for e in f:
    print(type(e))

<type 'int'>
<type 'int'>
<type 'int'>
<type 'int'>
<type 'int'>
<type 'int'>
<type 'float'>
<type 'str'>


In [271]:
le.test_classes

3    amsterdam
4        paris
5     tel aviv
6        tokyo
dtype: object

#### with use_mode=True

In [272]:
le = LabelEncoder(nan_classes=[np.nan, -999, '-1'], use_mode=True, retain_nan=True)

In [273]:
le.fit(y_train)

In [274]:
le.transform(y_test1)

array([4, 5, 4, -999, nan], dtype=object)

In [275]:
le.transform(y_test2)

y contains new labels!


array([4, 4, 5, 4, 4, -999, nan, '-1'], dtype=object)

#### with drop_unknown_classes=True

In [276]:
# Combine retain_nan with drop_unknown_classes: in the recorded output the
# unseen labels ("tel aviv", "amsterdam") are dropped, while the entries listed
# in nan_classes pass through unchanged.
le = LabelEncoder(nan_classes=[np.nan, -999, '-1'], retain_nan=True, 
                  drop_unknown_classes=True)
le.fit(y_train)
# Bare expression: display the encoded array as the cell output.
le.transform(y_test2)

y contains new labels!


array([4, 4, 5, -999, nan, '-1'], dtype=object)

### Inverse transform also works as expected

In [329]:
le = LabelEncoder(nan_classes=[np.nan, -999, '-1'], use_mode=True, retain_nan=True)

In [330]:
# Fit on the training labels, then encode y_test2 (which contains unseen
# labels); `f` is the input to inverse_transform in the following cell.
le.fit(y_train)
f = le.transform(y_test2)
# Bare expression: display the encoded array as the cell output.
f

y contains new labels!


array([4, 4, 5, 4, 4, -999, nan, '-1'], dtype=object)

In [331]:
le.inverse_transform(f)

vector contains unknown indices!


array(['paris', 'paris', 'tokyo', 'paris', 'paris', -999, nan, '-1'], dtype=object)

In [333]:
le = LabelEncoder(nan_classes=[np.nan, -999, '-1'], use_mode=False, retain_nan=True)

In [334]:
# Round trip with use_mode=False, retain_nan=True: encode y_test2, then map the
# codes back to labels. In the recorded output the code given to unseen labels
# comes back as 'unknown'.
le.fit(y_train)
f = le.transform(y_test2)
# Single-argument print(...) prints the same text on Python 2 and also parses
# on Python 3.
print(f)
print(le.inverse_transform(f))

y contains new labels!
[4 4 5 6 6 -999 nan '-1']
vector contains unknown indices!
['paris' 'paris' 'tokyo' 'unknown' 'unknown' -999 nan '-1']


In [335]:
# Round trip with use_mode=False, retain_nan=False: in the recorded output the
# nan_classes entries are encoded to ints like any other label, unseen labels
# share one extra code, and inverse_transform maps that code to 'unknown'.
le = LabelEncoder(nan_classes=[np.nan, -999, '-1'], use_mode=False, retain_nan=False)
le.fit(y_train)
f = le.transform(y_test2)
# Single-argument print(...) prints the same text on Python 2 and also parses
# on Python 3.
print(f)
print(le.inverse_transform(f))

y contains new labels!
[4 4 5 6 6 0 2 3]
vector contains unknown indices!
['paris' 'paris' 'tokyo' 'unknown' 'unknown' -999 nan '-1']


In [337]:
# Round trip with use_mode=True, retain_nan=False: unseen labels are encoded as
# the most common training class, so they come back as that class.
# NOTE(review): in the recorded output this configuration prints a pandas Series
# (index column shown) rather than the ndarray printed by the other
# configurations - confirm whether that return type is intended.
le = LabelEncoder(nan_classes=[np.nan, -999, '-1'], use_mode=True, retain_nan=False)
le.fit(y_train)
f = le.transform(y_test2)
# Single-argument print(...) prints the same text on Python 2 and also parses
# on Python 3.
print(le.inverse_transform(f))

y contains new labels!
4    paris
4    paris
5    tokyo
4    paris
4    paris
0     -999
2      NaN
3       -1
dtype: object
