In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Handling Categorical Variables (encoding)

## 1. Nominal Data

### (i) One hot encoding

In [32]:
data=pd.read_csv('data_sets/titanic_train.csv',usecols=['Sex','Embarked','Survived'])
data.head()

Unnamed: 0,Survived,Sex,Embarked
0,0,male,S
1,1,female,C
2,1,female,S
3,1,female,S
4,0,male,S


In [33]:
data.isnull().sum()

Survived    0
Sex         0
Embarked    2
dtype: int64

In [34]:
data['Embarked'].dropna().unique()

array(['S', 'C', 'Q'], dtype=object)

In [35]:
data=pd.get_dummies(data,drop_first=True)
data.head()

Unnamed: 0,Survived,Sex_male,Embarked_Q,Embarked_S
0,0,1,0,1
1,1,0,0,0
2,1,0,0,1
3,1,0,0,1
4,0,1,0,1


### (ii) One hot encoding (many categories)

In [76]:
mercedes=pd.read_csv('data_sets/mercedes_train.csv')
mercedes.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [77]:
data=mercedes[['X1','X2','X3']]
data.head()

Unnamed: 0,X1,X2,X3
0,v,at,a
1,t,av,e
2,w,n,c
3,t,n,f
4,v,n,f


In [78]:
#define a function which take the data set and column name and adds " n " columns to data
def many_encoder(data,variable,n):
    #first store to 10 values in a list
    l=data['X1'].value_counts()[0:n].sort_values(ascending=False).index
    for i in l:
        data[variable+'_'+i]=np.where(data[variable]==i,1,0)
        

In [79]:
data['X1'].value_counts()[0:10].sort_values(ascending=False).index

Index(['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o'], dtype='object')

In [80]:
n=10 #here n is max no. of categories we want
for i in data.columns:
    many_encoder(data,i,n)
data.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[variable+'_'+i]=np.where(data[variable]==i,1,0)


Unnamed: 0,X1,X2,X3,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,...,X3_aa,X3_s,X3_b,X3_l,X3_v,X3_r,X3_i,X3_a,X3_c,X3_o
0,v,at,a,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,t,av,e,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,n,c,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,t,n,f,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,v,n,f,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,b,e,c,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,r,e,f,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,l,as,f,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,s,as,e,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,b,aq,c,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### (iii) Mean Encoding

In [102]:
data=pd.read_csv('data_sets/titanic_train.csv',usecols=['Sex','Cabin','Survived'])
data.head()

Unnamed: 0,Survived,Sex,Cabin
0,0,male,
1,1,female,C85
2,1,female,
3,1,female,C123
4,0,male,


In [103]:
#take mean of each category in Cabin
data=data.dropna()
data['Cabin_0']=data['Cabin'].astype(str).str[0]
data.head()

Unnamed: 0,Survived,Sex,Cabin,Cabin_0
1,1,female,C85,C
3,1,female,C123,C
6,0,male,E46,E
10,1,female,G6,G
11,1,female,C103,C


In [104]:
mean_replace=data.groupby(['Cabin_0'])['Survived'].mean().to_dict()
mean_replace

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'T': 0.0}

In [107]:
data['Cabin_mean_encoded']=data['Cabin_0'].map(mean_replace)
data.head()

Unnamed: 0,Survived,Sex,Cabin,Cabin_0,Cabin_mean_encoded
1,1,female,C85,C,0.59322
3,1,female,C123,C,0.59322
6,0,male,E46,E,0.75
10,1,female,G6,G,0.5
11,1,female,C103,C,0.59322


# 2. Ordinal Data

### (i) Label Encoding

In [5]:
# lets generate 15 days data
import pandas as pd
import datetime

In [6]:
today_date=datetime.datetime.today()
today_date

datetime.datetime(2021, 5, 22, 11, 50, 35, 915854)

In [27]:
date_data=[today_date-datetime.timedelta(x) for x in range(1,16)]
dates=pd.DataFrame(date_data,columns=['date'])
dates.head()

Unnamed: 0,date
0,2021-05-21 11:50:35.915854
1,2021-05-20 11:50:35.915854
2,2021-05-19 11:50:35.915854
3,2021-05-18 11:50:35.915854
4,2021-05-17 11:50:35.915854


In [46]:
dates['weekday']=dates.date.dt.weekday
#in below mapping either we kan assign names to weekdays starting from 0=monday or we can directly assign the impotrance 
# but here impotance is already assigned if taking sunday as most important
dates['weekday_name']=dates.weekday.map({0:'Monday',1:'Tuesday',2:'Wednesday',3:'Thursday',4:'Friday',5:'Saturday',6:'Sunday'})
dates.head()

Unnamed: 0,date,weekday,weekday_name
0,2021-05-21 11:50:35.915854,4,Friday
1,2021-05-20 11:50:35.915854,3,Thursday
2,2021-05-19 11:50:35.915854,2,Wednesday
3,2021-05-18 11:50:35.915854,1,Tuesday
4,2021-05-17 11:50:35.915854,0,Monday


In [50]:
# if you want to assign values according to you
weights={'Monday':6,'Tuesday':5,'Wednesday':4,'Thursday':3,'Friday':2,'Saturday':1,'Sunday':1}
dates['self_assigned']=dates.weekday_name.map(weights)
dates.head()

Unnamed: 0,date,weekday,weekday_name,self_assigned
0,2021-05-21 11:50:35.915854,4,Friday,2
1,2021-05-20 11:50:35.915854,3,Thursday,3
2,2021-05-19 11:50:35.915854,2,Wednesday,4
3,2021-05-18 11:50:35.915854,1,Tuesday,5
4,2021-05-17 11:50:35.915854,0,Monday,6


### (ii) Target Guided Ordinal Encoding

In [69]:
data=pd.read_csv('data_sets/titanic_train.csv',usecols=['Cabin','Survived'])
data.isnull().sum()

Survived      0
Cabin       687
dtype: int64

In [71]:
data=data.dropna()
print('these are out uniqe categories in Cabin column :\n')
print(data.Cabin.astype(str).str[0].unique())

these are out uniqe categories in Cabin column :

['C' 'E' 'G' 'D' 'A' 'B' 'F' 'T']


In [77]:
data['Cabin_init']=data.Cabin.astype(str).str[0]
l=data.groupby(['Cabin_init'])['Survived'].mean().sort_values(ascending=True).index
l # in ascending order

Index(['T', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin_init')

In [79]:
ordinal_dict={}
val=0
for i in l:
    ordinal_dict[i]=val
    val+=1
ordinal_dict

{'T': 0, 'A': 1, 'G': 2, 'C': 3, 'F': 4, 'B': 5, 'E': 6, 'D': 7}

In [84]:
data['Cabin_encoded']=data['Cabin_init'].map(ordinal_dict)
data.head()

Unnamed: 0,Survived,Cabin,Cabin_init,Cabin_encoded
1,1,C85,C,3
3,1,C123,C,3
6,0,E46,E,6
10,1,G6,G,2
11,1,C103,C,3
