# One-Hot Encoding in Scikit-learn

* Convert categorical data into numerical data automatically

# Intuition 

* Prepare your categorical data using LabelEncoder()

* Apply OneHotEncoder() on your new DataFrame in step 1

In [1]:
# Import library
import numpy as np
import pandas as pd

In [2]:
pwd

'C:\\Users\\anura\\Machine_Learning'

In [3]:
# load dataset

data = pd.read_csv(r"C:\Users\anura\Desktop\Data_set\TitanicDataset\titanic_data.csv ")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Check original shape
data.shape

(891, 12)

In [5]:
# Check the data format
data.dtypes 

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
# limit to categorical data using df.select_dtypes()
data = data.select_dtypes(include=[object])

data.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [7]:
# Check original shape
data.shape

(891, 5)

In [8]:
# Check the data format
data.dtypes 

Name        object
Sex         object
Ticket      object
Cabin       object
Embarked    object
dtype: object

In [9]:
# Import preprocessing from sklearn

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [10]:
# view columns using df.columns

data.columns

Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')

In [11]:
# Create a LabelEncoder object and fit it to each feature in data

# encode labels with value between 0 and n_classes-1

le = preprocessing.LabelEncoder()

In [12]:
# Fit and Transform 
# use df.apply() to apply le.fit_transform to all columns

df = data.astype(str).apply(le.fit_transform)
df.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,108,1,523,147,2
1,190,0,596,81,0
2,353,0,669,147,2
3,272,0,49,55,2
4,15,1,472,147,2


In [13]:
# Create a oneHotEncoder object, and fit it to all of data

#  Instantiate

enc = preprocessing.OneHotEncoder()

# Fit
enc.fit(df)

# Transform

onehotlabels = enc.transform(df).toarray()
onehotlabels.shape

# as you can see, you've the same number of rows 891
# but now you've so many more columns due to how we changed all the categorical data into numerical data


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


(891, 1726)

In [14]:
onehotlabels

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [15]:
type(onehotlabels)

numpy.ndarray

# OneHotEncoder

* Encode categorical integer features using a one-hot aka one-of-K scheme.

* The input to this transformer should be a matrix of integers, denoting the values taken on by categorical (discrete) features. 

* The output will be a sparse matrix where each column corresponds to one possible value of one feature.

* It is assumed that input features take on values in the range [0, n_values).
                                                                 
* This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels.                                                              