In [18]:
from sklearn.preprocessing import OneHotEncoder
from matplotlib.colors import cnames
import random
import numpy as np
import pandas as pd

# ___One Hot Encoding___
------------

In [2]:
# What to do when you have categorical values for features or labels?
# Many basic examples use numeric features and labels for convenience but real world datasets often comprise numerical and categorical features 
# and/or labels.
# This is where one-hot encoding comes in.

In [4]:
# Let's say that we have a dataset where the label is a categorical variable => colours.

# Many ML models cannot take categorical inputs directly without any preprocessing.
# e.g. Ordinary least squares regression, logistic regression, SVMs

# But some can, e.g. Decision trees and tree based predictors.
# Sometimes the categories are mapped (encoded) to unique numbers, for ease of computation.

In [5]:
# One widely used solution in dealing with categorical variables is called one-hot encoding.
# This takes in a single categorical variable and turns it into a vector of binary values.
# The vector has n columns, where n is the number of unique classes in the categorical feature.
# The elements of each row indicate, in the form of a binary mask, which class that data point belongs to.
# Each row is a binary array with n elements, of which exactly one is 1.

In [20]:
# Draw 1000 colour names (with replacement) from matplotlib's named colours.
# A dedicated, seeded generator makes the sample reproducible on every run
# without touching the global `random` state.
COLOR_SEED = 42
color_rng = random.Random(COLOR_SEED)
color_names = list(cnames)  # built once instead of once per draw

# Reshaped to a single column, shape (1000, 1), before being handed to the encoder.
colors = np.array([color_rng.choice(color_names) for _ in range(1000)]).reshape(-1, 1)

In [29]:
# Fit on the colour column and encode it in one pass. With default settings the
# result is a sparse matrix, so it is densified into a regular NumPy array afterwards.
color_encoder = OneHotEncoder()
onehot_sparse = color_encoder.fit_transform(colors)
onehot_encoding = onehot_sparse.toarray()

In [45]:
# Ask the encoder for a dense NumPy array directly (sparse_output = False),
# which makes the extra .toarray() call unnecessary.
# NOTE: `sparse_output` needs scikit-learn >= 1.2; older releases name this parameter `sparse`.

OneHotEncoder(sparse_output = False).fit_transform(colors)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [30]:
# (number of samples, number of distinct colours found by the encoder)
onehot_encoding.shape

(1000, 148)

In [23]:
# 1000 x 148 matrix

# 1000 rows (one per sample) x 148 columns (one per colour)

In [25]:
# Total number of named colours in matplotlib's `cnames` lookup.
len(cnames)

148

In [34]:
# For every row, the column index of its single hot (non-zero) entry,
# i.e. the integer class id the encoder assigned to that sample's colour.
np.argwhere(onehot_encoding)[:, 1]

array([ 55,  28, 119,  28, 131,  96,  72, 108,   3,  88,  91,  48, 134,
        96,  71,  46, 103,  33, 138, 118, 126,  89,  96,  80,  20,  97,
        43,  46,  56,  77,  41,  32,  15,  88,  18,  71,  73,  42,  33,
        73,  78,  21,  23,  89,  43,  32,  80, 137,  57,  75,  93,   0,
        48,  92,  18, 135, 125,  44,  49, 114,  32,  62, 127, 132, 104,
       101,  76,  32,   9,  96,  62, 100, 102, 132,   0,  27,  57,  62,
       109,  66, 130,  38,  77,  80,   4,   6, 108,  26,  52,  82,   0,
        29,  99,  19,  23, 136, 124, 142,  74,  72,  27, 140,  52,  72,
        38,  23, 105,  57,  49, 109, 123,  42, 128, 129,  33,  57,  90,
        90,   6,  66,  10, 127,  30,  98,  82,  49,  85,  30,  65,  36,
        80,   3,  65,  32,  31, 137,  79,  73,  42,   1,  25,  96, 146,
        62,  45,  46, 147, 136, 121,   6,  99,  84, 125, 105, 109, 132,
        42, 110,  50,  69,  60,  27,  53, 101, 126, 131,  53, 138, 106,
        55, 103,  87, 126, 130,  95,  98,  41, 145,  38,  15,  5

In [36]:
# Collapse the single-column 2-D array back to 1-D and wrap it in a pandas Series
# so the same colours can be encoded with pd.get_dummies.
cseries = pd.Series(colors.flatten())

In [39]:
# pandas equivalent of one-hot encoding: one indicator column per distinct colour,
# labelled with the colour name.
pd.get_dummies(cseries)

Unnamed: 0,aliceblue,antiquewhite,aqua,aquamarine,azure,beige,bisque,black,blanchedalmond,blue,...,teal,thistle,tomato,turquoise,violet,wheat,white,whitesmoke,yellow,yellowgreen
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
997,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [40]:
# Emit 0/1 integers instead of booleans. Passing `dtype` lets get_dummies build the
# int8 frame directly rather than creating a boolean frame and converting it afterwards.
pd.get_dummies(cseries, dtype = np.int8)

Unnamed: 0,aliceblue,antiquewhite,aqua,aquamarine,azure,beige,bisque,black,blanchedalmond,blue,...,teal,thistle,tomato,turquoise,violet,wheat,white,whitesmoke,yellow,yellowgreen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
# Column totals: how many of the samples fall into each colour.
color_dummies = pd.get_dummies(cseries).astype(np.int8)
color_dummies.sum(axis = "index")

aliceblue       4
antiquewhite    5
aqua            5
aquamarine      6
azure           4
               ..
wheat           4
white           7
whitesmoke      6
yellow          9
yellowgreen     9
Length: 148, dtype: int64

In [43]:
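# In statistics, this is also called dummy encoding or 1-of-K encoding.
# (Strictly speaking, dummy coding usually drops one of the K columns to avoid perfectly collinear features.)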

In [44]:
# When the categorical variable is a feature, we could use OneHotEncoder from sklearn.preprocessing.
# By default this returns a sparse matrix, because the encoded output is mostly zeroes.

In [62]:
# Two toy categorical features: sex, and age bucketed into decades (0-9).
sex = ["Male", "Female"]
age_in_decades = list(range(10))

# A dedicated, seeded generator keeps the 20 sampled rows identical across runs
# without altering the global `random` state.
FEATURE_SEED = 42
feature_rng = random.Random(FEATURE_SEED)

# 20 rows of [sex, age]; with so few rows, some age values may never be drawn.
features = [[feature_rng.choice(sex), feature_rng.choice(age_in_decades)] for _ in range(20)]

In [63]:
# Peek at the first 10 sampled [sex, age] rows.
features[:10]

[['Male', 2],
 ['Female', 2],
 ['Female', 1],
 ['Female', 6],
 ['Female', 5],
 ['Female', 8],
 ['Male', 0],
 ['Male', 6],
 ['Female', 0],
 ['Female', 9]]

In [64]:
# Now we have a list of lists ->
# Where the inner list consists of two categorical variables -> sex: categorical nominal, age (in decades): categorical ordinal (treated here as nominal by the encoder)

In [65]:
# Fit only: learn the categories present in each of the two feature columns.
# sparse_output = False makes the later transform() calls return dense arrays.
ohencoder = OneHotEncoder(sparse_output = False).fit(features)

In [66]:
# Categories learned per input column, in sorted order. Only values seen during fit are
# listed, so an age that was never sampled gets no category (and no output column).
ohencoder.categories_

[array(['Female', 'Male'], dtype=object),
 array([0, 1, 2, 4, 5, 6, 7, 8, 9], dtype=object)]

In [67]:
# One row per sample: the sex indicator columns followed by the age indicator columns.
ohencoder.transform(features)

array([[0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 0., 0., 0.

In [68]:
# the first two elements in the rows represent the sex

In [69]:
# Columns 0-1 are the sex indicators, ordered as in categories_[0] (Female, Male).
ohencoder.transform(features)[:, :2]

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [70]:
# the remaining elements represent ages in decades: one column per age value seen during fit
# (at most 10; fewer when some ages were never sampled, e.g. the 9 columns in the recorded output)

ohencoder.transform(features)[:, 2:]

array([[0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [71]:
# What if the encoder encounters a class it never saw during the training phase?
# Start from a shallow copy so that adding rows to `test` leaves `features` untouched.

test = features.copy()

In [73]:
# Build the test set in one idempotent step: re-running this cell always yields the
# original rows plus exactly one row holding an unseen sex and an unseen age,
# whereas an in-place append would add another copy of that row on every re-run.
test = features + [["Shemale", 11]]

In [74]:
# The encoder was fitted without handle_unknown, so an unseen category raises.
# Catch and show the error: the failure is the point of this cell, but an
# uncaught exception would stop "Restart Kernel -> Run All" right here.
try:
    ohencoder.transform(test)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Found unknown categories ['Shemale'] in column 0 during transform

In [76]:
# To overcome this, refit with handle_unknown = "ignore": categories that were not
# seen during fit are then encoded as all zeroes instead of raising an error.

ohencoder = OneHotEncoder(sparse_output = False, handle_unknown = "ignore").fit(features)

In [77]:
# Note that this did not throw an error!
# The row holding the unseen categories simply comes out as all zeroes.

ohencoder.transform(test)

array([[0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 0., 0., 0.

In [None]:
# For the 21st row, where the sex was Shemale and the age was 11, we have all zeroes.

## ___Applying One-Hot Encoders Properly___
----------------

In [78]:
# 1) Split the data into train and test before fitting the encoder.
# 2) Fit the encoder on the training set.
# 3) Use .transform() method of the encoder to transform the training set and test set.
#    This ensures that dummy variables match across the train-test split and all categories (classes) get representation.
# 4) If the test set contains classes not seen during training, either map them to a new category or ignore them and create an all-zero row.

In [None]:
# If the label is a categorical variable,
# We'll use LabelBinarizer