This Jupyter notebook is based on "Chapter 5: Categorical Variables: Counting Eggs in the Age of Robotic Chickens" from the book "Feature Engineering for Machine Learning".

# One Hot Encoding and Dummy Encoding

In [23]:
import pandas as pd
from sklearn import linear_model

In [24]:
# Toy rental data: three listings for each of three cities.
df = pd.DataFrame(
    {
        'City': ['SF'] * 3 + ['NYC'] * 3 + ['Seattle'] * 3,
        'Rent': [
            3999, 4000, 4001,  # SF
            3499, 3500, 3501,  # NYC
            2499, 2500, 2501,  # Seattle
        ],
    }
)

In [25]:
# One-hot encode `City`: one float indicator column per city, prefixed
# with "city". The encoded frame feeds the linear regression fitted in the
# next cell.

one_hot_df = pd.get_dummies(
    df,
    columns=['City'],
    prefix='city',
    dtype=float,
)
one_hot_df

Unnamed: 0,Rent,city_NYC,city_SF,city_Seattle
0,3999,0.0,1.0,0.0
1,4000,0.0,1.0,0.0
2,4001,0.0,1.0,0.0
3,3499,1.0,0.0,0.0
4,3500,1.0,0.0,0.0
5,3501,1.0,0.0,0.0
6,2499,0.0,0.0,1.0
7,2500,0.0,0.0,1.0
8,2501,0.0,0.0,1.0


In [26]:
# Regress Rent on every column except the target itself.
# `fit` returns the estimator, so construction and fitting can be chained.
model = linear_model.LinearRegression().fit(
    one_hot_df[one_hot_df.columns.difference(['Rent'])],
    one_hot_df['Rent'],
)

model.intercept_


3333.3333333333335

In [27]:
# Global mean of the target, for comparison with the intercept above.
one_hot_df.loc[:, 'Rent'].mean()

3333.3333333333335

With one-hot encoding, the intercept term represents the global mean of the target
variable: the fitted intercept above equals the mean of `Rent`.

In [28]:
# Dummy encode `City`: same as one-hot, but the first category's indicator
# column is dropped (`drop_first=True`). The encoded frame feeds the linear
# regression fitted in the next cell.

dummy_df = pd.get_dummies(
    df,
    columns=['City'],
    prefix='city',
    drop_first=True,
    dtype=float,
)
dummy_df

Unnamed: 0,Rent,city_SF,city_Seattle
0,3999,1.0,0.0
1,4000,1.0,0.0
2,4001,1.0,0.0
3,3499,0.0,0.0
4,3500,0.0,0.0
5,3501,0.0,0.0
6,2499,0.0,1.0
7,2500,0.0,1.0
8,2501,0.0,1.0


In [29]:
# Regress Rent on the dummy columns (every column except the target).
# `fit` returns the estimator, so construction and fitting can be chained.
model = linear_model.LinearRegression().fit(
    dummy_df[dummy_df.columns.difference(['Rent'])],
    dummy_df['Rent'],
)

model.intercept_


3500.0

# Effect Coding

In [35]:
# Effect coding starts from the dummy-coded frame. Work on a copy so that
# `dummy_df` is left unchanged.
effect_df = dummy_df.copy()

# Rows where every dummy column is 0 belong to the reference category (the
# one whose column `drop_first=True` removed). Effect coding marks those
# rows with -1 in every dummy column instead of 0.
effect_cols = ['city_SF', 'city_Seattle']
is_reference_row = (effect_df[effect_cols] == 0).all(axis=1)
effect_df.loc[is_reference_row, effect_cols] = -1

# Use a fresh estimator, as the one-hot and dummy cells do, so this cell does
# not depend on (or overwrite the fit of) a `model` left over from earlier.
model = linear_model.LinearRegression()
model.fit(effect_df[effect_df.columns.difference(['Rent'])], effect_df['Rent'])

model.intercept_


3333.3333333333335

# Dealing with Large Categorical Variables

Large categorical variables are particularly common in transactional records. For
instance, many web services track users using an ID, which is a categorical variable
with hundreds to hundreds of millions of values, depending on the number of unique
users of the service. The IP address of an internet transaction is another example of a
large categorical variable.

## 1. Feature Hashing

In [68]:
from category_encoders.hashing import HashingEncoder
import pandas as pd
from sklearn.datasets import fetch_openml
# Fetch the "house_prices" dataset from OpenML; `as_frame=True` requests
# pandas objects for the features and the target.
bunch = fetch_openml(name="house_prices", as_frame=True)

# Keep a small subset of columns so the examples stay readable.
display_cols = [
    "Id",
    "MSSubClass",
    "MSZoning",
    "LotFrontage",
    "YearBuilt",
    "Heating",
    "CentralAir",
]
X = pd.DataFrame(bunch.data, columns=bunch.feature_names).loc[:, display_cols]
y = bunch.target

  warn(


In [69]:
# Display the selected feature columns.
X

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,YearBuilt,Heating,CentralAir
0,1,60,RL,65.0,2003,GasA,Y
1,2,20,RL,80.0,1976,GasA,Y
2,3,60,RL,68.0,2001,GasA,Y
3,4,70,RL,60.0,1915,GasA,Y
4,5,60,RL,84.0,2000,GasA,Y
...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,1999,GasA,Y
1456,1457,20,RL,85.0,1978,GasA,Y
1457,1458,70,RL,66.0,1941,GasA,Y
1458,1459,20,RL,68.0,1950,GasA,Y


In [74]:
# Report how many distinct values each object-dtype (string) column holds.
print("Count of Unique elements in Categorical columns")
for col, n_unique in X.select_dtypes(include='object').nunique().items():
    print(f"{col}:{n_unique}")

Count of Unique elements in Categorical columns
MSZoning:5
Heating:6
CentralAir:2


In [75]:
# `Id` is numeric, so it is not picked up by the object-dtype loop; count it separately.
print(f"Id:{X['Id'].nunique()}")

Id:1460


In [70]:
# Display the target series.
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [76]:
# Hash-encode only the high-cardinality `Id` column; every other column is
# passed through unchanged.
he = HashingEncoder(cols=['Id'])
he.fit(X, y)

# In the output shown for this notebook, `Id` is replaced by the hashed
# columns col_0 ... col_7.
numeric_dataset = he.transform(X)

  elif pd.api.types.is_categorical_dtype(cols):


In [77]:
# Display the frame after hashing `Id`.
numeric_dataset

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,MSSubClass,MSZoning,LotFrontage,YearBuilt,Heating,CentralAir
0,0,0,0,1,0,0,0,0,60,RL,65.0,2003,GasA,Y
1,0,0,0,0,1,0,0,0,20,RL,80.0,1976,GasA,Y
2,0,0,0,1,0,0,0,0,60,RL,68.0,2001,GasA,Y
3,0,0,0,0,1,0,0,0,70,RL,60.0,1915,GasA,Y
4,0,0,0,0,0,1,0,0,60,RL,84.0,2000,GasA,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1,0,0,0,0,0,0,0,60,RL,62.0,1999,GasA,Y
1456,0,0,0,0,0,0,1,0,20,RL,85.0,1978,GasA,Y
1457,0,0,0,1,0,0,0,0,70,RL,66.0,1941,GasA,Y
1458,0,0,0,1,0,0,0,0,20,RL,68.0,1950,GasA,Y


## 2. Bin Counting

See "Example 5-6. Bin-counting example" from the book "Feature Engineering for Machine Learning".