In [10]:
# http://analyticsbot.tk/2018/06/encoding-categorical-variables-in-python/


# imports
import pandas as pd
import numpy as np
import string, random
from sklearn.ensemble import RandomForestRegressor

# Seed every RNG the data generator draws from: NumPy supplies x1, x2 and y,
# while the stdlib `random` module supplies x3 and x4. Seeding NumPy alone
# leaves the two categorical columns different on every run.
SEED = 1
np.random.seed(SEED)
random.seed(SEED)

num_rows = 100 # number of observations
categories = ['heavy', 'light'] # categories

# generate the data
# Dict values are evaluated in order, so NumPy draws happen as x1, x2, y and
# stdlib draws happen as x3, x4.
df = pd.DataFrame({
    'x1': np.random.normal(size=num_rows),                                   # continuous feature
    'x2': np.random.randint(0, 2, num_rows),                                 # binary integer feature
    'x3': [random.choice(string.ascii_lowercase) for _ in range(num_rows)],  # lowercase-letter category
    'x4': [random.choice(categories) for _ in range(num_rows)],              # two-level category
    'y': 100 * np.random.normal(size=num_rows),                              # target
})

In [2]:
# Sanity check: (rows, columns) of the generated frame.
df.shape

(100, 5)

In [3]:
# Preview the first five rows of the generated data.
df.head()

Unnamed: 0,x1,x2,x3,x4,y
0,1.624345,1,c,light,52.057634
1,-0.611756,0,s,light,-114.434139
2,-0.528172,0,o,heavy,80.186103
3,-1.072969,1,f,light,4.65673
4,0.865408,0,q,heavy,-18.656977


In [4]:
# x3 and x4 come back as `object` (string) columns; the rest are numeric.
df.dtypes # data types of the columns in dataframe generated

x1    float64
x2      int64
x3     object
x4     object
y     float64
dtype: object

In [5]:
# Separate features from the target. Pop 'y' from the copy, not from `df`:
# popping from `df` leaves the target inside X (leakage) and strips 'y' from
# `df`, which breaks every later `X = df.copy()` / `X.pop('y')` cell.
X = df.copy()
y = X.pop('y') # take out the target variable

In [6]:
rf = RandomForestRegressor() # initialize a randomforestregressor object

# Fitting on the raw frame is expected to fail: x3 and x4 still hold strings,
# which cannot be converted to floats. Catch the error and show its message so
# the notebook still runs to completion under "Restart & Run All".
try:
    rf.fit(X, y) # use the fit method
except ValueError as err:
    print("ValueError:", err)

# stack trace of the error

ValueError: could not convert string to float: 'heavy'

In [12]:
#1 – LabelEncoder

from sklearn.preprocessing import LabelEncoder

X = df.copy()

# Fit one encoder per column. Re-fitting a single shared encoder overwrites
# its learned classes, so only the last column fitted could be decoded again.
label_encoders = {}
for col in ['x4', 'x3']:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

# Previously the single `le` ended up fitted on x3; keep that binding in case
# other cells still refer to it.
le = label_encoders['x3']

In [13]:
# Inspect the encoded frame: x3 and x4 now hold integer labels.
X.head()

Unnamed: 0,x1,x2,x3,x4,y
0,1.624345,1,1,1,52.057634
1,-0.611756,0,5,1,-114.434139
2,-0.528172,0,25,1,80.186103
3,-1.072969,1,22,1,4.65673
4,0.865408,0,5,0,-18.656977


In [14]:
# Split the target column off the encoded frame, then train on what remains.
y = X.pop('y')
rf.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [15]:
#2 – Treat as Category. Use the Category codes


X = df.copy()

# Cast the string columns to an ordered categorical dtype. This changes the
# dtype only; the values are still the original string labels.
X["x3"] = X["x3"].astype('category').cat.as_ordered()
X["x4"] = X["x4"].astype('category').cat.as_ordered()

y = X.pop('y')

# Because the columns still contain strings, this fit is expected to raise.
# Catch the error and show its message so the notebook keeps running under
# "Restart & Run All".
try:
    rf.fit(X, y)
except ValueError as err:
    print("ValueError:", err)

ValueError: could not convert string to float: 'light'

In [16]:
# The categorical columns still display their string labels.
X.head()

Unnamed: 0,x1,x2,x3,x4
0,1.624345,1,b,light
1,-0.611756,0,f,light
2,-0.528172,0,z,light
3,-1.072969,1,w,light
4,0.865408,0,f,heavy


In [17]:
# Swap each categorical column for the integer code of its category.
for col in ("x3", "x4"):
    X[col] = X[col].astype('category').cat.codes

X.head()

Unnamed: 0,x1,x2,x3,x4
0,1.624345,1,1,1
1,-0.611756,0,5,1
2,-0.528172,0,25,1
3,-1.072969,1,22,1
4,0.865408,0,5,0


In [18]:
# Refit now that x3 and x4 hold integer category codes instead of strings.
rf.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [19]:
# x3 and x4 were already replaced by their integer category codes in the
# earlier encoding cell. Re-encoding those dense codes maps every value to
# itself, so the duplicated conversion is dropped and the frame is only shown.
X.head()

Unnamed: 0,x1,x2,x3,x4
0,1.624345,1,1,1
1,-0.611756,0,5,1
2,-0.528172,0,25,1
3,-1.072969,1,22,1
4,0.865408,0,5,0


In [20]:
#3 – One Hot Encoding

# Expand both categorical columns into indicator columns, dropping the first
# level of each so the remaining indicators are not redundant.
categorical_cols = ['x3', 'x4']
X = pd.get_dummies(df.copy(), columns=categorical_cols, drop_first=True)

X.head()

Unnamed: 0,x1,x2,y,x3_b,x3_c,x3_d,x3_e,x3_f,x3_g,x3_h,...,x3_r,x3_s,x3_t,x3_u,x3_v,x3_w,x3_x,x3_y,x3_z,x4_light
0,1.624345,1,52.057634,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,-0.611756,0,-114.434139,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,-0.528172,0,80.186103,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,-1.072969,1,4.65673,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,0.865408,0,-18.656977,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Split the target column off the one-hot encoded frame, then train on the rest.
y = X.pop('y')
rf.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)