In [1]:
#1.One Hot Encoding - variables/features with many categories

#https://www.kaggle.com/aditya1702/mercedes-benz-data-exploration/data

import pandas as pd
import numpy as np

df = pd.read_csv('E:/datafiles/mercedesbenz.csv', usecols=['X1', 'X2','X3','X4','X5','X6'])
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [2]:
for col in df:
    print(df[col].unique())

['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab']
['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar']
['a' 'e' 'c' 'f' 'd' 'b' 'g']
['d' 'b' 'c' 'a']
['u' 'y' 'x' 'h' 'g' 'f' 'j' 'i' 'd' 'c' 'af' 'ag' 'ab' 'ac' 'ad' 'ae'
 'ah' 'l' 'k' 'n' 'm' 'p' 'q' 's' 'r' 'v' 'w' 'o' 'aa']
['j' 'l' 'd' 'h' 'i' 'a' 'g' 'c' 'k' 'e' 'f' 'b']


In [3]:
df['X1'].unique()

array(['v', 't', 'w', 'b', 'r', 'l', 's', 'aa', 'c', 'a', 'e', 'h', 'z',
       'j', 'o', 'u', 'p', 'n', 'i', 'y', 'd', 'f', 'm', 'k', 'g', 'q',
       'ab'], dtype=object)

In [4]:
len(df['X1'].unique())

27

In [5]:
df['X2'].unique()

array(['at', 'av', 'n', 'e', 'as', 'aq', 'r', 'ai', 'ak', 'm', 'a', 'k',
       'ae', 's', 'f', 'd', 'ag', 'ay', 'ac', 'ap', 'g', 'i', 'aw', 'y',
       'b', 'ao', 'al', 'h', 'x', 'au', 't', 'an', 'z', 'ah', 'p', 'am',
       'j', 'q', 'af', 'l', 'aa', 'c', 'o', 'ar'], dtype=object)

In [6]:
len(df['X2'].unique())

44

In [7]:
# let's have a look at how many labels each variable/feature has

for col in df.columns:
    print(col, ': ', len(df[col].unique()), ' labels')

X1 :  27  labels
X2 :  44  labels
X3 :  7  labels
X4 :  4  labels
X5 :  29  labels
X6 :  12  labels


In [8]:
df.shape

(4209, 6)

In [9]:
#let's examine how many columns we will obtain after using one hot encoding on these variables

#We can observe that from with just 6 categorical features we are getting 117 features with 
#the help of one hot encoding.

pd.get_dummies(df, drop_first=True).shape

(4209, 117)

In [10]:
#KDD Cup Orange Challenge
#What can we do instead?

#http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf In the winning solution of the KDD 2009 cup:
#"Winning the KDD Cup Orange Challenge with Ensemble Selection

#The authors limit onehotencoding to the 10 most frequent labels of the variable.This means that they would
#make one binary variable for each of the 10 most frequent labels only.This is equivalent to grouping all
#other labels under a new category,that in this case will be dropped.Thus,the 10 new dummy variables 
#indicate if one of the 10 most frequent labels is present (1) or not (0) for a particular observation.

#1.One Hot Encoding of top Variables

#Advantages:-
#1.Straightforward to implement
#2.Doesnot require hours of variable exploration
#3.Doesnot expand massively the feature space(number of columns in the dataset)

#Disadvantages:-
#1.Doesnot add any information that may make the variable more predictive
#2.Doesnot keep the information of the ignored labels

#Because it is not usual that categorical variables have a few dominating categories and the remaining
#labels add mostly noise,this is a quite simple and straightforward approach that may be useful on many
#occasions

#It is worth noting that the top_10 variables is totally arbitrary number.We can also take top_5,top_20

#How can we do that in python?
#let's find the top 10 most frequent categories for the variable X2

df.X2.value_counts().sort_values(ascending=False).head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [11]:
# let's make a list with the most frequent categories of the variable i.e, top_10_labels

top_10_labels = [x for x in df.X2.value_counts().sort_values(ascending=False).head(10).index]
top_10_labels

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [12]:
#Now we make the 10 binary variables

for label in top_10_labels:
        df[label] = np.where(df['X2']==label, 1, 0)
        
df[['X2']+top_10_labels].head(10)        

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [13]:
# get whole set of dummy variables, for all the categorical variables
# function to create the dummy variables for the most frequent labels
# we can vary the number of most frequent labels that we encode

def one_hot_encoding_top_10(df, X2, top_10_labels):
    for label in top_10_labels:
        df[X2+'_'+label] = np.where(df[X2]==label, 1, 0)    

In [14]:
# encode X2 into the 10 most frequent categories

one_hot_encoding_top_10(df, 'X2', top_10_labels)
df.head(10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,as,ae,ai,m,...,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,b,e,c,d,g,h,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,r,e,f,d,f,h,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,l,as,f,d,f,j,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,s,as,e,d,f,i,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,b,aq,c,d,f,a,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
#Now find the top most frequent categories for 'X1'

top_10_labels = [x for x in df.X1.value_counts().sort_values(ascending=False).head(10).index]

#Now to create the 10 most frequent dummy variables for 'X1'

one_hot_encoding_top_10(df, 'X1', top_10_labels)
df.head(10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,as,ae,ai,m,...,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,b,e,c,d,g,h,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
6,r,e,f,d,f,h,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,l,as,f,d,f,j,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
8,s,as,e,d,f,i,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,b,aq,c,d,f,a,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [16]:
#2.Count or Frequency Encoding
#High Cardinality

#Another way to refer to variables that have a multitude of categories, is to call them variables with 
#high cardinality.

#If we have categorical variables containing many multiple labels or high cardinality,then by using the 
#one hot encoding, we will expand the feature space dramatically.

#One approach that is heavily used in Kaggle competitions, is to replace each label of the categorical 
#variable by the count, this is the amount of times each label appears in the dataset. Or the frequency, 
#this is the percentage of observations within that category. The 2 are equivalent.

#There are some advantages and disadvantages that we will discuss now

#Advantages:-
#1.It is very simple to implement
#2.Does not increase the feature dimensional space or Not increasing feature space

#Disadvantages:-
#1.If some of the labels have the same count, then they will be replaced with the same count and they will
#loose some valuable information.
#2.Adds somewhat arbitrary numbers, and therefore weights to the different labels, that may not be related
#to their predictive power
#3.It will provide same weight if the frequencies are same

#Let's see how this work

import pandas as pd
import numpy as np

# let's open the mercedes benz dataset for demonstration
# Download the dataset from the below link
#https://www.kaggle.com/aditya1702/mercedes-benz-data-exploration/data

df = pd.read_csv('E:/datafiles/mercedesbenz.csv', usecols=['X1', 'X2'])
df.head()

Unnamed: 0,X1,X2
0,v,at
1,t,av
2,w,n
3,t,n
4,v,n


In [17]:
df.shape

(4209, 2)

In [18]:
#One hot Encoding

pd.get_dummies(df, drop_first=True).shape

(4209, 69)

In [19]:
len(df['X1'].unique())

27

In [20]:
len(df['X2'].unique())

44

In [21]:
# let's have a look at how many labels

for col in df.columns[0:]:
    print(col, ': ', len(df[col].unique()), ' labels')

X1 :  27  labels
X2 :  44  labels


In [22]:
# let's obtain the counts for each one of the labels in variable X2
# let's capture this in a dictionary that we can use to re-map the labels

df.X2.value_counts().to_dict()

{'as': 1659,
 'ae': 496,
 'ai': 415,
 'm': 367,
 'ak': 265,
 'r': 153,
 'n': 137,
 's': 94,
 'f': 87,
 'e': 81,
 'aq': 63,
 'ay': 54,
 'a': 47,
 't': 29,
 'k': 25,
 'i': 25,
 'b': 21,
 'ao': 20,
 'ag': 19,
 'z': 19,
 'd': 18,
 'ac': 13,
 'g': 12,
 'ap': 11,
 'y': 11,
 'x': 10,
 'aw': 8,
 'h': 6,
 'at': 6,
 'al': 5,
 'an': 5,
 'q': 5,
 'ah': 4,
 'p': 4,
 'av': 4,
 'au': 3,
 'o': 1,
 'af': 1,
 'aa': 1,
 'l': 1,
 'ar': 1,
 'am': 1,
 'c': 1,
 'j': 1}

In [23]:
# And now let's replace each label in X2 by its count
# first we make a dictionary that maps each label to the counts

df_frequency_map = df.X2.value_counts().to_dict()

In [24]:
df.head(30)

Unnamed: 0,X1,X2
0,v,at
1,t,av
2,w,n
3,t,n
4,v,n
5,b,e
6,r,e
7,l,as
8,s,as
9,b,aq


In [25]:
#Now we replace X2 labels in the dataset df
#Now to apply this whole dictionary mapping 'df_frequency_map' into 'X2' and using the map function
#By applying changes to the varaible 'X2' all the labels in 'X2' are converted from categories to count

df.X2 = df.X2.map(df_frequency_map)
df.head()

Unnamed: 0,X1,X2
0,v,6
1,t,4
2,w,137
3,t,137
4,v,137


In [26]:
#3.Ordinal numbering encoding or Label Encoding
#Ordinal categorical variables

#Ordinal data is a categorical, statistical data type where the variables have natural, ordered categories
#and the distances between the categories is not known.

#Categorial divided into two types 1.Nominal 2.Ordinal
#1.Nominal---> which doesnot have any order eg:-pen,pencil,book,cow,dog,cat
#2.Ordinal---> which will have specific order eg:-1.Excellent,2.Good,3.Bad,4.Fantastic,5.Okay,6.Doesn'tLike

#For example:
#Student's grade in an exam (A, B, C or Fail).
#Educational level, with the categories: Elementary school, High school, College graduate, PhD 
#ranked from 1 to 4.

#When the categorical variables are ordinal, the most straight forward best approach is to replace the 
#labels by some ordinal number based on the ranks

#Ordinal Measurement Advantages:-
#Ordinal measurement is normally used for surveys and questionnaires. Statistical analysis is applied to 
#the responses once they are collected to place the people who took the survey into the various categories.
#The data is then compared to draw inferences and conclusions about the whole surveyed population with
#regard to the specific variables. The advantage of using ordinal measurement is ease of collation and 
#categorization. If you ask a survey question without providing the variables, the answers are likely to 
#be so diverse they cannot be converted to statistics.

#With Respect to Machine Learning:-
#1.Keeps the semantical information of the variable (human readable content)
#2.Straightforward

#Ordinal Measurement Disadvantages:-
#The same characteristics of ordinal measurement that create its advantages also create certain 
#disadvantages.The responses are often so narrow in relation to the question that they create or magnify 
#bias that is not factored into the survey.For example, on the question about satisfaction with the 
#governor, people might be satisfied with his job performance but upset about a recent sex scandal. 
#The survey question might lead respondents to state their dissatisfaction about the scandal,in spite of
#satisfaction with his job performance -- but the statistical conclusion will not differentiate.

#With Respect to Machine Learning:-
#1.Does not add machine learning valuable information

import pandas as pd
import datetime

In [27]:
# create a variable with dates, and from that extract the weekday
# I create a list of dates with 20 days difference from today
# and then transform it into a datafame

df_base = datetime.datetime.today()
df_date_list = [df_base - datetime.timedelta(days=x) for x in range(0, 20)]
df = pd.DataFrame(df_date_list)
df.columns = ['day']
df

Unnamed: 0,day
0,2021-05-26 15:36:00.110421
1,2021-05-25 15:36:00.110421
2,2021-05-24 15:36:00.110421
3,2021-05-23 15:36:00.110421
4,2021-05-22 15:36:00.110421
5,2021-05-21 15:36:00.110421
6,2021-05-20 15:36:00.110421
7,2021-05-19 15:36:00.110421
8,2021-05-18 15:36:00.110421
9,2021-05-17 15:36:00.110421


In [28]:
# extract the week day name

df['day_of_week'] = df['day'].dt.day_name()
df.head()

Unnamed: 0,day,day_of_week
0,2021-05-26 15:36:00.110421,Wednesday
1,2021-05-25 15:36:00.110421,Tuesday
2,2021-05-24 15:36:00.110421,Monday
3,2021-05-23 15:36:00.110421,Sunday
4,2021-05-22 15:36:00.110421,Saturday


In [29]:
#Engineer categorical variable by ordinal number replacement
#Now creating a dictionary weekday_map and assigning numbers to all days
#Now to apply this whole dictionary mapping 'weekday_map' into 'day_of_week' by using the map function

weekday_map = {'Monday':1,
               'Tuesday':2,
               'Wednesday':3,
               'Thursday':4,
               'Friday':5,
               'Saturday':6,
               'Sunday':7
}

df['day_ordinal'] = df.day_of_week.map(weekday_map)
df.head(20)

Unnamed: 0,day,day_of_week,day_ordinal
0,2021-05-26 15:36:00.110421,Wednesday,3
1,2021-05-25 15:36:00.110421,Tuesday,2
2,2021-05-24 15:36:00.110421,Monday,1
3,2021-05-23 15:36:00.110421,Sunday,7
4,2021-05-22 15:36:00.110421,Saturday,6
5,2021-05-21 15:36:00.110421,Friday,5
6,2021-05-20 15:36:00.110421,Thursday,4
7,2021-05-19 15:36:00.110421,Wednesday,3
8,2021-05-18 15:36:00.110421,Tuesday,2
9,2021-05-17 15:36:00.110421,Monday,1


In [30]:
#4.Target Guided Ordinal Encoding

#1.Ordering the labels according to the target
#2.Replace the labels by the joint probability of being 1 or 0

import pandas as pd
df = pd.read_csv('E:/datafiles/titanic.csv', usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Cabin,Survived
0,,0
1,C85,1
2,,1
3,C123,1
4,,0


In [31]:
#We are replacing the NaN values of Cabin with the 'Missing' keyword

df['Cabin'].fillna('Missing',inplace=True)

In [32]:
#Now we are converting the 'Cabin' feature to string and taking first index value[0] in 'Cabin' column

df['Cabin'] = df['Cabin'].astype(str).str[0]

In [33]:
df['Cabin'].astype(str).str[0]

0      M
1      C
2      M
3      C
4      M
      ..
886    M
887    B
888    M
889    C
890    M
Name: Cabin, Length: 891, dtype: object

In [34]:
df.head()

Unnamed: 0,Cabin,Survived
0,M,0
1,C,1
2,M,1
3,C,1
4,M,0


In [35]:
#To see all the unique features in 'Cabin' column

df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [36]:
#Doing groupby on 'Cabin' based on 'Survived'

df.groupby(['Cabin'])['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [37]:
#We are sorting the values based on 'Survives' and giving an index to sorted values

df.groupby(['Cabin'])['Survived'].mean().sort_values().index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [38]:
#Storing the above information in a ordinal_labels

ordinal_labels=df.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [39]:
#We are mapping these ordinal encoding labels to a number

enumerate(ordinal_labels,0)

<enumerate at 0x2186b161600>

In [40]:
#We are assigining the ordinal_labels with a rank attached to it based on their index
#k--->ordinal_labels and i----> index/range starts from 0

ordinal_labels2 = {k:i for i,k in enumerate(ordinal_labels,0)}
ordinal_labels2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [41]:
#Now we are mapping these Cabin_ordinal_labels with Cabin column
#Based on the target column 'Survived' we have assigned the ranks in 'Cabin' column to find Survived number

df['Cabin_ordinal_labels'] = df['Cabin'].map(ordinal_labels2)
df.head()

Unnamed: 0,Cabin,Survived,Cabin_ordinal_labels
0,M,0,1
1,C,1,4
2,M,1,1
3,C,1,4
4,M,0,1


In [42]:
#5.Mean Encoding

#Now we are creating a variable mean_ordinal and assigning 'Survived' mean values with 'Cabin' labels

mean_ordinal = df.groupby(['Cabin'])['Survived'].mean().to_dict()

In [43]:
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [44]:
#Now we are mapping these mean_ordinal_encode with Cabin column
#Based on the target column 'Survived' we have assigned the mean in 'Cabin' column to find Survived number

df['mean_ordinal_encode']=df['Cabin'].map(mean_ordinal)
df.head()

Unnamed: 0,Cabin,Survived,Cabin_ordinal_labels,mean_ordinal_encode
0,M,0,1,0.299854
1,C,1,4,0.59322
2,M,1,1,0.299854
3,C,1,4,0.59322
4,M,0,1,0.299854


In [45]:
#Advantages:-
#1.It captures information with in the label therefore rendering more predictive features
#2.It creates a monotonic relationship betwwen variable and target

#Disadvantages:-
#1.It prones or leads to overfitting