## How to Handle Categorical Variable

#### Importing the libraries

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### I) One Hot Encoding

In [32]:
dataset=pd.read_csv('titanic.csv')
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [33]:
dataset['Sex'].unique()

array(['male', 'female'], dtype=object)

In [34]:
Sex_converted_variable=pd.get_dummies(dataset['Sex'],drop_first=True).head()

In [35]:
Sex_converted_variable

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1


In [36]:
dataset=pd.concat([dataset,Sex_converted_variable],axis=1)
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1.0


In [37]:
dataset.drop('Sex',axis=1,inplace=True)

In [38]:
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,0.0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,0.0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,1.0


### II) One Hot Encoding with Multiple Categories

In [39]:
dataset=pd.read_csv('mercendez.csv',usecols=['X0','X1','X2','X3','X4','X5','X6'])
dataset.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [40]:
## Check for unique labels in each column
for col in dataset.columns:
    print(len(dataset[col].unique()))

47
27
44
7
4
29
12


In [41]:
dataset.X1.value_counts()

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
w      52
z      46
u      37
e      33
m      32
t      31
h      29
y      23
f      23
j      22
n      19
k      17
p       9
g       6
q       3
ab      3
d       3
Name: X1, dtype: int64

In [42]:
list_top_10=dataset.X1.value_counts().sort_values(ascending=False).head(10).index
list_top_10

Index(['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o'], dtype='object')

In [43]:
for category in list_top_10:
    dataset[category]=np.where(dataset['X1']==category,1,0)
dataset.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o
0,k,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,k,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0


In [44]:
dataset.drop('X1',axis=1,inplace=True)
dataset.head()

Unnamed: 0,X0,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o
0,k,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,k,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0


### III) Ordinal Number Encoding

In [45]:
data={'Temperature':['Hot','Cold','Very Hot','Warm','Hot','Warm','Warm','Hot','Hot','Cold']}
dataset=pd.DataFrame(data,columns=['Temperature'])
dataset.head()

Unnamed: 0,Temperature
0,Hot
1,Cold
2,Very Hot
3,Warm
4,Hot


In [46]:
mapping_dictionary_value={'Cold':1,'Warm':2,'Hot':3,'Very Hot':4}
dataset['Temperature_Ordinal']=dataset.Temperature.map(mapping_dictionary_value)
dataset

Unnamed: 0,Temperature,Temperature_Ordinal
0,Hot,3
1,Cold,1
2,Very Hot,4
3,Warm,2
4,Hot,3
5,Warm,2
6,Warm,2
7,Hot,3
8,Hot,3
9,Cold,1


### Count or Frequency Encoding

In [47]:
dataset = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None) 
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [48]:
columns=[1,3,5,6,7,8,9,13]
dataset[columns].head()

Unnamed: 0,1,3,5,6,7,8,9,13
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [49]:
dataset=dataset[columns]
dataset.columns=['Employment','Degree','Status','Designation','Family_job','Race','Sex','Country']
dataset.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [50]:
for col in dataset.columns[:]:
    print(col,':',len(dataset[col].unique()),'labels')

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
Family_job : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


In [51]:
country_map=dataset['Country'].value_counts().to_dict()
country_map

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' Greece': 29,
 ' France': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Cambodia': 19,
 ' Trinadad&Tobago': 19,
 ' Thailand': 18,
 ' Laos': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Hungary': 13,
 ' Honduras': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

In [52]:
dataset['Country']=dataset['Country'].map(country_map)
dataset.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95


### Target Guided Ordinal Encoding

In [53]:
dataset=pd.read_csv('titanic.csv',usecols=['Cabin','Survived'])
dataset.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [54]:
dataset['Cabin'].fillna('Missing',inplace=True)
dataset.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [55]:
dataset['Cabin']=dataset['Cabin'].astype(str).str[0]
dataset.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [56]:
ordinal_index=dataset.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [57]:
ordinal_label={k:i for i,k in enumerate(ordinal_index,0)}
ordinal_label

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [58]:
dataset['Cabin_ordinal_label']=dataset['Cabin'].map(ordinal_label)
dataset.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_label
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


### Mean Ordinal Encoding

In [59]:
mean_ordinal=dataset.groupby(['Cabin'])['Survived'].mean().to_dict()
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [60]:
dataset['Cabin_mean_ordinal']=dataset['Cabin'].map(mean_ordinal)
dataset.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_label,Cabin_mean_ordinal
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


### Probability Ratio Encoding

In [62]:
prob=dataset.groupby(['Cabin'])['Survived'].mean()
prob_df=pd.DataFrame(prob)
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [64]:
prob_df['Died']=1-prob_df['Survived']
prob_df

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


In [65]:
prob_df['Probability Ratio']=prob_df['Survived']/prob_df['Died']
prob_df

Unnamed: 0_level_0,Survived,Died,Probability Ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0
F,0.615385,0.384615,1.6
G,0.5,0.5,1.0
M,0.299854,0.700146,0.428274
T,0.0,1.0,0.0


In [67]:
prob_encod_dictionary=prob_df['Probability Ratio'].to_dict()
prob_encod_dictionary

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [69]:
dataset['Cabin_probabilty_ratio']=dataset['Cabin'].map(prob_encod_dictionary)
dataset.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_label,Cabin_mean_ordinal,Cabin_probabilty_ratio
0,0,M,1,0.299854,0.428274
1,1,C,4,0.59322,1.458333
2,1,M,1,0.299854,0.428274
3,1,C,4,0.59322,1.458333
4,0,M,1,0.299854,0.428274
