In [1]:
import pandas as pd

In [2]:
# Toy flower dataset: one categorical feature ('color'), two numeric features
# ('height', 'petals') and the numeric target ('days').
flowers = pd.DataFrame(
    dict(
        color=['red', 'green', 'red', 'green', 'red', 'green', 'red', 'green', 'blue', 'blue'],
        height=[4, 9, 4, 8, 4, 7, 4, 7.5, 20, 19],
        petals=[3, 9, 1, 8, 1, 10, 2, 8, 50, 47],
        days=[6, 16, 7, 15, 8, 17, 5, 12, 40, 45],
    )
)
flowers

Unnamed: 0,color,height,petals,days
0,red,4.0,3,6
1,green,9.0,9,16
2,red,4.0,1,7
3,green,8.0,8,15
4,red,4.0,1,8
5,green,7.0,10,17
6,red,4.0,2,5
7,green,7.5,8,12
8,blue,20.0,50,40
9,blue,19.0,47,45


In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Hold out 20% of the rows for testing: 'days' is the target, every other column is a feature.
# `columns='days'` replaces the positional axis argument (`drop('days', 1)`), which pandas
# deprecated and pandas 2.0 no longer accepts.
X_train, X_test, y_train, y_test = train_test_split(flowers.drop(columns='days'), flowers['days'],
                                                    test_size=.2, random_state=40)

In [5]:
from sklearn.preprocessing import OneHotEncoder

In [6]:
# Encoder whose transform returns a dense NumPy array rather than a sparse matrix.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name — confirm the installed version before upgrading and switch the keyword then.
ohe = OneHotEncoder(sparse=False)

In [7]:
# Double brackets select a one-column DataFrame; this is the input handed to the encoder below.
X_train[['color']]

Unnamed: 0,color
8,blue
1,green
2,red
9,blue
0,red
5,green
7,green
6,red


In [8]:
# Learn the color categories from the training split only.
ohe.fit(X_train[['color']])

OneHotEncoder(sparse=False)

In [9]:
# One-hot encode the training colors: each row has a single 1, and the three output
# columns correspond to blue, green and red in that order.
ohe.transform(X_train[['color']])

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [10]:
# Inspect the held-out test rows before encoding them; only 'red' and 'green' occur here.
X_test

Unnamed: 0,color,height,petals
4,red,4.0,1
3,green,8.0,8


In [15]:
# ------------- Advantage 1-------------------------
# The encoder learned three color categories from the training data, whereas calling
# pd.get_dummies on the test rows alone would miss the third category (only two colors occur there).
# Check that the fitted encoder still produces all three columns for the test data.

In [13]:
# Reuse the encoder fitted on the training split: the output keeps all three columns
# even though 'blue' never occurs in the test rows.
ohe.transform(X_test[['color']])

array([[0., 0., 1.],
       [0., 1., 0.]])

In [14]:
# Same encoding wrapped in a DataFrame for readability. Columns are positional (0, 1, 2) and
# the index restarts at 0, so X_test's original row labels (4, 3) are not carried over.
pd.DataFrame(ohe.transform(X_test[['color']]))

Unnamed: 0,0,1,2
0,0.0,0.0,1.0
1,0.0,1.0,0.0


In [16]:
# ---------------------Advantage 2---------------------------

In [17]:
# What happens if the data contains a category, such as 'pink', that the encoder never saw during fit?


In [18]:
# Same toy dataset as before, except that row 3 now has the color 'pink' instead of 'green'.
flowers_new = pd.DataFrame(
    dict(
        color=['red', 'green', 'red', 'pink', 'red', 'green', 'red', 'green', 'blue', 'blue'],
        height=[4, 9, 4, 8, 4, 7, 4, 7.5, 20, 19],
        petals=[3, 9, 1, 8, 1, 10, 2, 8, 50, 47],
        days=[6, 16, 7, 15, 8, 17, 5, 12, 40, 45],
    )
)
flowers_new

Unnamed: 0,color,height,petals,days
0,red,4.0,3,6
1,green,9.0,9,16
2,red,4.0,1,7
3,pink,8.0,8,15
4,red,4.0,1,8
5,green,7.0,10,17
6,red,4.0,2,5
7,green,7.5,8,12
8,blue,20.0,50,40
9,blue,19.0,47,45


In [19]:
# Split the 'pink' dataset the same way as before (20% test, same random_state).
# `columns='days'` replaces the positional axis argument (`drop('days', 1)`), which pandas
# deprecated and pandas 2.0 no longer accepts.
X_1, X_2, y_1, y_2 = train_test_split(flowers_new.drop(columns='days'), flowers_new['days'],
                                      test_size=.2, random_state=40)

In [20]:
# Encoder for the 'pink' experiment; handle_unknown='ignore' lets transform accept
# categories that were not present during fit.
enc1 = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [21]:
# Training colors of the new split: 'pink' is not among them.
X_1['color']

8     blue
1    green
2      red
9     blue
0      red
5    green
7    green
6      red
Name: color, dtype: object

In [22]:
# Test rows of the new split: row 3 carries the unseen color 'pink'.
X_2

Unnamed: 0,color,height,petals
4,red,4.0,1
3,pink,8.0,8


In [23]:
# Fit on the training colors and encode them in one step (three categories are learned).
enc1.fit_transform(X_1[['color']])

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [24]:
# --------------- the test data contains an unseen value: 'pink' ---------------

We can pass the `handle_unknown` parameter to control how the encoder treats categories, such as 'pink', that appear only in the test data.

In [46]:
# Rebuild the encoder with handle_unknown='error' so an unseen category raises at transform time.
enc1 = OneHotEncoder(sparse=False, handle_unknown='error')

In [47]:
# Refit the strict encoder on the training colors; the training encoding itself is unchanged.
enc1.fit_transform(X_1[['color']])

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [48]:
# Try to encode the test rows with handle_unknown='error'.
# The ValueError is the point of this cell, so it is caught and displayed instead of being
# left uncaught: an uncaught exception would stop "Restart & Run All" at this cell.
try:
    enc1.transform(X_2[['color']])
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Found unknown categories ['pink'] in column 0 during transform

In [49]:
# The error reports that 'pink' was not among the categories seen during fit,
# so the encoder has no dummy column for it.
#------------------- BOOM-------------------


In [50]:
# Instead of raising, let the encoder ignore unseen categories: they are encoded as an all-zero row.

In [51]:
# Back to handle_unknown='ignore': unseen categories no longer raise during transform.
enc1 = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [52]:
# Refit on the training colors (blue, green, red).
enc1.fit_transform(X_1[['color']])

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [53]:
# 'red' encodes normally; the unseen 'pink' becomes an all-zero row instead of an error.
enc1.transform(X_2[['color']])

array([[0., 0., 1.],
       [0., 0., 0.]])

In [55]:
# Same result as a DataFrame: the second row ('pink') is zero in every column.
pd.DataFrame(enc1.transform(X_2[['color']]))

Unnamed: 0,0,1,2
0,0.0,0.0,1.0
1,0.0,0.0,0.0


In [54]:
# --------------------------- NO ERROR---------------------------------

# Real world example

In [56]:
# Load the income dataset from a relative path and preview the first rows.
# NOTE: the cells below address columns as ' income' and ' workclass' and show values such as
# ' Private', i.e. header names and category values carry a leading space in this file.
df = pd.read_csv('data/income_evaluation.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [58]:
# Split the income data 80/20 with ' income' as the target.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which previously held the flowers split.
# `columns=' income'` replaces the positional axis argument (`drop(' income', 1)`), which pandas
# deprecated and pandas 2.0 no longer accepts.
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=' income'),
                                                    df[' income'], test_size=0.2, random_state=40)

In [59]:
# Distinct workclass values in the training split.
# NOTE(review): ' ?' presumably marks missing entries — confirm against the dataset description.
X_train[' workclass'].unique()

array([' Private', ' ?', ' Local-gov', ' State-gov', ' Self-emp-not-inc',
       ' Self-emp-inc', ' Federal-gov', ' Never-worked', ' Without-pay'],
      dtype=object)

In [60]:
# Encoder with default settings; its output is converted with .toarray() in the cells below.
o1 = OneHotEncoder()


In [61]:
# Fit on the training workclass column and convert the result to a dense array for display.
o1.fit_transform(X_train[[' workclass']]).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [62]:
# Wrap the encoded array in a DataFrame to inspect it: one column per workclass category (9).
# fit_transform is called again here, so the encoder is refitted on the same training data.
pd.DataFrame(o1.fit_transform(X_train[[' workclass']]).toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
26043,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
26044,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
26045,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26046,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [63]:
# apply the already-fitted encoder to X_test

In [64]:
# Encode the test split with the categories learned from X_train (transform only, no refit).
o1.transform(X_test[[' workclass']]).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [65]:
# --------------------- doing it for multiple cols---------------------

In [66]:
# Dense-output encoder used to encode several columns at once.
o2 = OneHotEncoder(sparse=False)

In [67]:
# Encode two columns together: the dummy columns of workclass come first, followed by
# the dummy columns of occupation.
o2.fit_transform(X_train[[' workclass', ' occupation']])

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [68]:
# DataFrame view of the combined encoding: 24 unnamed columns (9 workclass + 15 occupation).
pd.DataFrame(o2.fit_transform(X_train[[' workclass', ' occupation']]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
26044,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
26045,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26046,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
# Names of the generated columns: x0_* belong to the first input column (workclass),
# x1_* to the second (occupation).
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out() — confirm the installed version before upgrading.
o2.get_feature_names()

array(['x0_ ?', 'x0_ Federal-gov', 'x0_ Local-gov', 'x0_ Never-worked',
       'x0_ Private', 'x0_ Self-emp-inc', 'x0_ Self-emp-not-inc',
       'x0_ State-gov', 'x0_ Without-pay', 'x1_ ?', 'x1_ Adm-clerical',
       'x1_ Armed-Forces', 'x1_ Craft-repair', 'x1_ Exec-managerial',
       'x1_ Farming-fishing', 'x1_ Handlers-cleaners',
       'x1_ Machine-op-inspct', 'x1_ Other-service',
       'x1_ Priv-house-serv', 'x1_ Prof-specialty', 'x1_ Protective-serv',
       'x1_ Sales', 'x1_ Tech-support', 'x1_ Transport-moving'],
      dtype=object)

In [70]:
# Lift pandas' column-display limit (a global setting) so wide frames are shown
# without '...' truncation.
pd.set_option('display.max_columns', None)

In [71]:
# Same encoding, now labelled with the encoder's feature names instead of 0..23.
pd.DataFrame(o2.fit_transform(X_train[[' workclass', ' occupation']]), columns=o2.get_feature_names())

Unnamed: 0,x0_ ?,x0_ Federal-gov,x0_ Local-gov,x0_ Never-worked,x0_ Private,x0_ Self-emp-inc,x0_ Self-emp-not-inc,x0_ State-gov,x0_ Without-pay,x1_ ?,x1_ Adm-clerical,x1_ Armed-Forces,x1_ Craft-repair,x1_ Exec-managerial,x1_ Farming-fishing,x1_ Handlers-cleaners,x1_ Machine-op-inspct,x1_ Other-service,x1_ Priv-house-serv,x1_ Prof-specialty,x1_ Protective-serv,x1_ Sales,x1_ Tech-support,x1_ Transport-moving
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
26044,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
26045,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26046,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Fitting and transforming all the categorical columns in one go

In [73]:
# Names of the object-dtype (categorical) columns of X_train, in their original order.
categ = [name for name, kind in X_train.dtypes.items() if kind == 'O']

In [74]:
# Dense-output encoder for all categorical columns.
o3 = OneHotEncoder(sparse=False)


In [75]:
# Fit and encode every categorical column in a single call.
o3.fit_transform(X_train[categ])


array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [76]:
# Revisit the workclass categories: nine distinct values.
X_train[' workclass'].unique()

array([' Private', ' ?', ' Local-gov', ' State-gov', ' Self-emp-not-inc',
       ' Self-emp-inc', ' Federal-gov', ' Never-worked', ' Without-pay'],
      dtype=object)

In [77]:
# Size of the full encoding: 102 dummy columns for the 26048 training rows.
pd.DataFrame(o3.fit_transform(X_train[categ])).shape

(26048, 102)

In [78]:
# ------------------------ How to avoid the dummy variable trap ---------------------------

# With drop='first', the first category of each of the 8 categorical features is dropped, so 8 columns are removed and we are left with 102 - 8 = 94 columns.

In [79]:
# drop='first' omits the first category of each feature, removing the redundant dummy column.
o4 = OneHotEncoder(sparse=False, drop='first')

In [80]:
# Fit and encode all categorical columns with the reduced (drop-first) encoding.
o4.fit_transform(X_train[categ])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [81]:
# 94 columns remain: eight fewer than the 102 produced without drop='first'.
pd.DataFrame(o4.fit_transform(X_train[categ])).shape

(26048, 94)