In [1]:
import pandas as pd
import numpy as np
import datetime

# Label Encoding

In [2]:
today_date = datetime.datetime.today()

In [3]:
today_date

datetime.datetime(2021, 5, 17, 22, 16, 48, 120008)

In [4]:
today_date - datetime.timedelta(1)

datetime.datetime(2021, 5, 16, 22, 16, 48, 120008)

In [5]:
days = [today_date - datetime.timedelta(x) for x in range(15)]

In [52]:
data = pd.DataFrame(days,columns=['datetime'])

In [53]:
data.head()

Unnamed: 0,datetime
0,2021-05-17 12:01:59.679180
1,2021-05-16 12:01:59.679180
2,2021-05-15 12:01:59.679180
3,2021-05-14 12:01:59.679180
4,2021-05-13 12:01:59.679180


In [64]:
data['weekday'] = data['datetime'].dt.day_name()

In [80]:
data

Unnamed: 0,datetime,weekday
0,2021-05-17 12:01:59.679180,Monday
1,2021-05-16 12:01:59.679180,Sunday
2,2021-05-15 12:01:59.679180,Saturday
3,2021-05-14 12:01:59.679180,Friday
4,2021-05-13 12:01:59.679180,Thursday
5,2021-05-12 12:01:59.679180,Wednesday
6,2021-05-11 12:01:59.679180,Tuesday
7,2021-05-10 12:01:59.679180,Monday
8,2021-05-09 12:01:59.679180,Sunday
9,2021-05-08 12:01:59.679180,Saturday


In [82]:
dic = {'Monday':1, 'Tuesday':2, 'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6, 'Sunday':7}

In [83]:
data['weekday_ordinal'] = data['weekday'].map(dic)

In [84]:
data.head()

Unnamed: 0,datetime,weekday,weekday_ordinal
0,2021-05-17 12:01:59.679180,Monday,1
1,2021-05-16 12:01:59.679180,Sunday,7
2,2021-05-15 12:01:59.679180,Saturday,6
3,2021-05-14 12:01:59.679180,Friday,5
4,2021-05-13 12:01:59.679180,Thursday,4


# Count or Frequency Encoding

In [4]:
df = pd.read_csv('https://raw.githubusercontent.com/krishnaik06/Feature-Engineering-Live-sessions/master/mercedes.csv',usecols=['X1','X2','X3'])

In [97]:
df.head()

Unnamed: 0,X1,X2,X3
0,v,at,a
1,t,av,e
2,w,n,c
3,t,n,f
4,v,n,f


In [98]:
for c in df.columns:
    print(c, ':', len(df[c].unique()))

X1 : 27
X2 : 44
X3 : 7


# Count

In [101]:
X2_map = df['X2'].value_counts().to_dict()

In [102]:
df['X2'] = df['X2'].map(X2_map)

In [103]:
df.head()

Unnamed: 0,X1,X2,X3
0,v,6,a
1,t,4,e
2,w,137,c
3,t,137,f
4,v,137,f


# Frequency

In [109]:
X1_map = (df.groupby('X1').size()/len(df)).to_dict()

In [110]:
df['X1'] = df['X1'].map(X1_map)

In [111]:
df.head()

Unnamed: 0,X1,X2,X3
0,0.096935,6,a
1,0.007365,4,e
2,0.012354,137,c
3,0.007365,137,f
4,0.096935,137,f


# Target Guided Ordinal Encoding

In [31]:
df = pd.read_csv('https://raw.githubusercontent.com/krishnaik06/Feature-Engineering-Live-sessions/master/titanic.csv',usecols=['Cabin','Survived'])

In [32]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [33]:
df.Cabin.fillna('Missing',inplace=True)

In [34]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [35]:
df['Cabin'] = df.Cabin.str[0]

In [36]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [140]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [145]:
labels = df.groupby('Cabin')['Survived'].mean().sort_values().index

In [147]:
labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [155]:
labels_dict = {k:i for i,k in enumerate(labels,0)}

In [158]:
df.Cabin = df.Cabin.map(labels_dict)

In [159]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,1
1,1,4
2,1,1
3,1,4
4,0,1


# Mean Encoding

In [188]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [189]:
labels = df.groupby('Cabin')['Survived'].mean().to_dict()

In [191]:
df.Cabin = df.Cabin.map(labels)

In [192]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,0.299854
1,1,0.59322
2,1,0.299854
3,1,0.59322
4,0,0.299854


# Probability Ratio Encoding

In [37]:
prob_df = df.groupby('Cabin')['Survived'].mean()

In [38]:
prob_df = pd.DataFrame(prob_df)
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [39]:
prob_df['Died'] = 1 - prob_df

In [40]:
prob_df

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


In [41]:
prob_df['Probability_ratio'] = prob_df['Survived'] / prob_df['Died']

In [42]:
probability_encoded = prob_df['Probability_ratio'].to_dict()

In [46]:
df['Cabin_ecoded'] = df['Cabin'].map(probability_encoded)

In [47]:
df

Unnamed: 0,Survived,Cabin,Cabin_ecoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
...,...,...,...
886,0,M,0.428274
887,1,B,2.916667
888,0,M,0.428274
889,1,C,1.458333
