In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## One Hot Encoding

In [2]:
df = pd.read_csv('train.csv', usecols=['Sex'])

In [3]:
pd.get_dummies(df, drop_first=True).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [4]:
df = pd.read_csv('train.csv', usecols=['Embarked'])
df.dropna(inplace=True)
# we are dropping just for this tutorial

In [5]:
pd.get_dummies(df, drop_first=True).head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


### Performing Onehotencoding with many categories in a feature

In [6]:
df = pd.read_csv('merc_train.csv', usecols=['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'])

In [7]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [8]:
# Number of Categories in each feature
for i in df.columns:
    print(f"{i} has {len(df[i].unique())} categories")

X0 has 47 categories
X1 has 27 categories
X2 has 44 categories
X3 has 7 categories
X4 has 4 categories
X5 has 29 categories
X6 has 12 categories


##### For features with so many categories, one way to deal with them is just to take the top ten categories and ignore the rest of the categories:

In [9]:
# Here we will just take 'X1'
top_10 = list(df.X1.value_counts().head(10).index)

In [10]:
print("The top ten features are :")
top_10

The top ten features are :


['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [11]:
# Now we will encode only the top 10 features present in the column 'X1':
for category in top_10:
    df['X1_' + category] = np.where(df['X1']==category, 1, 0)
    

In [12]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o
0,k,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,k,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0


## Ordinal Number Encoding - 
An Ordinal Number is a number that tells us a position of something in a list

In [13]:
import datetime

In [14]:
today_date = datetime.datetime.today()

In [16]:
data = [today_date - datetime.timedelta(x) for x in range(0, 15)]

In [22]:
df = pd.DataFrame(data)
df.columns = ["Days"]

In [23]:
df.head()

Unnamed: 0,Days
0,2020-07-22 15:49:28.057170
1,2020-07-21 15:49:28.057170
2,2020-07-20 15:49:28.057170
3,2020-07-19 15:49:28.057170
4,2020-07-18 15:49:28.057170


In [37]:
df['Weekday'] = df["Days"].dt.day_name()

In [38]:
df.head()

Unnamed: 0,Days,Weekday
0,2020-07-22 15:49:28.057170,Wednesday
1,2020-07-21 15:49:28.057170,Tuesday
2,2020-07-20 15:49:28.057170,Monday
3,2020-07-19 15:49:28.057170,Sunday
4,2020-07-18 15:49:28.057170,Saturday


In [39]:
day_dict = dict(zip(list(df['Weekday'].unique()), list(range(1,8))))

In [40]:
day_dict

{'Wednesday': 1,
 'Tuesday': 2,
 'Monday': 3,
 'Sunday': 4,
 'Saturday': 5,
 'Friday': 6,
 'Thursday': 7}

In [41]:
df['Weekday_ordinal'] = df['Weekday'].map(day_dict)

In [54]:
df.head(10)

Unnamed: 0,Days,Weekday,Weekday_ordinal
0,2020-07-22 15:49:28.057170,Wednesday,1
1,2020-07-21 15:49:28.057170,Tuesday,2
2,2020-07-20 15:49:28.057170,Monday,3
3,2020-07-19 15:49:28.057170,Sunday,4
4,2020-07-18 15:49:28.057170,Saturday,5
5,2020-07-17 15:49:28.057170,Friday,6
6,2020-07-16 15:49:28.057170,Thursday,7
7,2020-07-15 15:49:28.057170,Wednesday,1
8,2020-07-14 15:49:28.057170,Tuesday,2
9,2020-07-13 15:49:28.057170,Monday,3


## Count or Frequency Encoding 
In this method we will simply replace the Category by the frequency of the category
For example if in a feature India appears 12 times and US appears 10 times we will replace India by 12 and US by 10 resp.

##### Advantages:
1. Easy to implement 
2. We are not increasing any feature space ie.we are not adding extra features.

##### Disadvantages:
1. It will give same weightage if the frequency/count of two or more categories is same.

In [93]:
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None,index_col=None)

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [94]:
columns = [1, 3, 5, 6, 7, 8, 9, 13]
df = df[columns]

In [95]:
df.columns = ['employment', 'degree', 'status', 'designation', 'relationship', 'race', 'sex', 'country']

In [96]:
df.head()

Unnamed: 0,employment,degree,status,designation,relationship,race,sex,country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [97]:
df['country'] = np.where(df['country'] == ' ?', ' Others', df['country'])

In [98]:
for feature in df.columns:
    print(f"{feature} --- {len(df[feature].unique())} labels")

employment --- 9 labels
degree --- 16 labels
status --- 7 labels
designation --- 15 labels
relationship --- 6 labels
race --- 5 labels
sex --- 2 labels
country --- 42 labels


In [99]:
country_map = df.country.value_counts().to_dict()

In [100]:
df['country'] = df['country'].map(country_map)

In [101]:
df.head(20)

Unnamed: 0,employment,degree,status,designation,relationship,race,sex,country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,29170
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,81
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,29170
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170


## Target Guided Ordinal Encoding 
- Ordering the labels according to the target
- Replace the label by joint probability of being 1 or 0

In [3]:
df = pd.read_csv('train.csv', usecols=['Cabin', 'Survived'])

In [4]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [5]:
df['Cabin'].fillna('Missing', inplace=True)

In [6]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [8]:
# We are going to consider the block of the cabin ie. Just the first leter of the cabin

df['Cabin'] = df['Cabin'].astype(str).str[0]

In [13]:
df.Cabin.value_counts()

M    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: Cabin, dtype: int64

In [15]:
# Finding percentage of survival based on Cabin:
df.groupby(['Cabin'])['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [18]:
# we have sorted the values so that the algorithm understands the probability of survival(our target)
# and we will encode them accordingly 

ordinal_labels = df.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [21]:
# mapping the ordinal labels to integers
label_dict = {k:value for value, k in enumerate(ordinal_labels)}

In [22]:
label_dict

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [24]:
# Now mapping these encoded values to the dataset
df['Cabin'] = df['Cabin'].map(label_dict)

In [25]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,1
1,1,4
2,1,1
3,1,4
4,0,1


## Mean Encoding
Similar to frequency/count encoding but instead of replaing label by count we will replace them by the mean value according to the target feature.

##### This can sometimes lead to overfitting.

In [27]:
df = pd.read_csv('train.csv', usecols=['Survived', 'Cabin'])

In [28]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [29]:
df.fillna('Missing', inplace=True)

In [31]:
df['Cabin'] = df['Cabin'].astype(str).str[0]

In [32]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [35]:
label_means = df.groupby(['Cabin'])['Survived'].mean().to_dict()

In [36]:
df['Cabin'] = df['Cabin'].map(label_means)

In [37]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,0.299854
1,1,0.59322
2,1,0.299854
3,1,0.59322
4,0,0.299854


## Probability Ratio Encoding 
- We find the probability ratio of target value with respect to the categorical feature which we want to encode and replace the categories with the probability ratio of the same.

- For example, in the titanic dataset we have taken probability ratio of survived with respect to the categorical feature 'Cabin' and replaced it with the probability ratio.

In [29]:
df = pd.read_csv("train.csv", usecols=['Cabin', 'Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [30]:
df['Cabin'].fillna('Missing', inplace=True)

In [31]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [32]:
df['Cabin'] = df['Cabin'].astype(str).str[0]

In [33]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [34]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [35]:
prob = df.groupby(['Cabin'])['Survived'].mean()
prob

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [36]:
prob_df = pd.DataFrame(prob)
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [37]:
prob_df['Died'] = 1 - prob_df['Survived']

In [38]:
prob_df

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


In [39]:
prob_ratio = [(x/y) for x, y in zip(prob_df['Survived'],prob_df['Died'])]

In [40]:
prob_ratio_dict = {key:value for key, value in zip(prob_df.index, prob_ratio)}

In [41]:
prob_ratio_dict

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [42]:
df['Cabin'] = df['Cabin'].map(prob_ratio_dict)

In [43]:
df.head(20)

Unnamed: 0,Survived,Cabin
0,0,0.428274
1,1,1.458333
2,1,0.428274
3,1,1.458333
4,0,0.428274
5,0,0.428274
6,0,3.0
7,0,0.428274
8,1,0.428274
9,1,0.428274
