In [31]:
 # Load the watermark IPython extension, then stamp the notebook with the
 # author name and the date it was last updated.
 %load_ext watermark
 %watermark -a 'Ankur Wasnik' -u -d

Author: Ankur Wasnik

Last updated: 2021-01-14



# Handling categorical data
There are two types of categorical data: <br>
<li> Ordinal data -> categorical data that can be sorted or ordered
<br><li> Nominal data -> categorical data that cannot be ordered.
<br>We have to deal with both types of categorical data.

In [1]:
#import libraries
import pandas as pd
import numpy as np

In [4]:
# Toy dataset, one row per item: a nominal feature (color), an ordinal
# feature (size), a numeric feature (price) and a string class label.
df = pd.DataFrame(
    [
        ('green', 'M', 10.1, 'class2'),
        ('red', 'L', 13.5, 'class1'),
        ('blue', 'XL', 15.3, 'class3'),
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
print(df)

   color size  price classlabel
0  green    M   10.1     class2
1    red    L   13.5     class1
2   blue   XL   15.3     class3


Here, color is nominal data
<br> size is ordinal data

## Dealing with Ordinal data
Our tool for dealing with ordinal data is to map each category to an integer value that preserves its order.


In [6]:
# Ordinal encoding: every size label gets an integer that keeps the
# ordering M < L < XL. Key order (XL, L, M) is kept as written.
mapping = dict(XL=3, L=2, M=0)

# Swap the string labels in the 'size' column for their integer codes.
df = df.assign(size=df['size'].map(mapping))
print(df)

   color  size  price classlabel
0  green     0   10.1     class2
1    red     2   13.5     class1
2   blue     3   15.3     class3


Here we applied the mapping to the size column and replaced its values with integers. You can compare this with the dataframe above.

In [10]:
# Inverse mapping: pair each integer code with its size label so the
# encoding can be undone.
inv_mapping = dict(zip(mapping.values(), mapping.keys()))
print(inv_mapping)

{3: 'XL', 2: 'L', 0: 'M'}


In [11]:
# Translate the integer size codes back to their labels and store them
# in a new 'inv_mapping' column.
df = df.assign(inv_mapping=df['size'].map(inv_mapping))

In [12]:
# Show the frame again: 'inv_mapping' now sits next to the encoded 'size' column.
print(df)

   color  size  price classlabel inv_mapping
0  green     0   10.1     class2           M
1    red     2   13.5     class1           L
2   blue     3   15.3     class3          XL


We have added a column named 'inv_mapping' that shows the original size labels recovered with the inverse mapping.

## Encoding class labels
Many machine learning libraries require that class labels are encoded as integer values.

In [13]:
# Collect the distinct class labels with np.unique and number them 0, 1, 2, ...
unique_labels = np.unique(df['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
print(class_mapping)

{'class1': 0, 'class2': 1, 'class3': 2}


In [14]:
# Show the string class labels first, then replace them with the integer
# codes from class_mapping.
print('Categorical class label \n', df['classlabel'])
df = df.assign(classlabel=df['classlabel'].map(class_mapping))
print(df)

Categorical class label 
 0    class2
1    class1
2    class3
Name: classlabel, dtype: object
   color  size  price  classlabel inv_mapping
0  green     0   10.1           1           M
1    red     2   13.5           0           L
2   blue     3   15.3           2          XL


You can see the classlabel column has changed to integer values

## Label Encoding using sklearn

In [17]:
from sklearn.preprocessing import LabelEncoder
# Rebuild the toy dataset so this section starts from the raw string labels.
df = pd.DataFrame(
    [
        ('green', 'M', 10.1, 'class2'),
        ('red', 'L', 13.5, 'class1'),
        ('blue', 'XL', 15.3, 'class3'),
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
print(df)


   color size  price classlabel
0  green    M   10.1     class2
1    red    L   13.5     class1
2   blue   XL   15.3     class3
[1 0 2]


In [20]:
# Fit the encoder on the string class labels and convert them to integer
# codes in a single call.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['classlabel'].values)

# Show each original label next to the code assigned to it.
print(pd.DataFrame({'classlabel': df['classlabel'].values}).assign(class_encoding=y))

  classlabel  class_encoding
0     class2               1
1     class1               0
2     class3               2


In [21]:
# Inverse encoding: turn the integer codes back into the original class
# labels and show both side by side.
inv_encoding = label_encoder.inverse_transform(y)
print(pd.DataFrame({'class encoding': y}).assign(inverse_encoding=inv_encoding))

   class encoding inverse_encoding
0               1           class2
1               0           class1
2               2           class3


## Dealing with Nominal features
Our tool for dealing with nominal features is one-hot encoding.

## One-Hot Encoding using sklearn

In [22]:
from sklearn.preprocessing import OneHotEncoder
# Rebuild the toy dataset; 'color' is the nominal categorical feature
# that gets one-hot encoded below.
df = pd.DataFrame(
    [
        ('green', 'M', 10.1, 'class2'),
        ('red', 'L', 13.5, 'class1'),
        ('blue', 'XL', 15.3, 'class3'),
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
print(df)


   color size  price classlabel
0  green    M   10.1     class2
1    red    L   13.5     class1
2   blue   XL   15.3     class3


In [29]:
# Reshape the color values into a single-column 2-D array before
# handing them to the encoder.
oh_encoder = OneHotEncoder()
color_column = df['color'].values.reshape(-1, 1)
oh_encoded_colors = oh_encoder.fit_transform(color_column)

# The result prints as (row, column) coordinates of its non-zero entries.
print(oh_encoded_colors)

  (0, 1)	1.0
  (1, 2)	1.0
  (2, 0)	1.0


## One Hot Encoding using Pandas

In [30]:
# One-hot encode the string column 'color'; the numeric 'price' column is
# passed through unchanged.
# dtype=int keeps the dummy columns as 0/1 integers. Without it,
# pandas >= 2.0 returns True/False booleans instead.
pd.get_dummies(df[['color','price']], dtype=int)

Unnamed: 0,price,color_blue,color_green,color_red
0,10.1,0,1,0
1,13.5,0,0,1
2,15.3,1,0,0


Here the get_dummies method one-hot encodes only the columns with string values and leaves the rest unchanged.

# Do check other machine learning tutorials @ankurwasnik GITHUB