# Ordinal Number Encoding  

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import datetime

In [3]:
# Capture the current local date and time. Every date below is derived from this value,
# so the displayed outputs change with the day the notebook is run.
today_date = datetime.datetime.today()

In [4]:
# Show the captured timestamp.
today_date

datetime.datetime(2020, 10, 30, 16, 45, 30, 699742)

In [5]:
# Subtracting a one-day timedelta gives the same clock time on the previous day.
today_date - datetime.timedelta(days=1)

datetime.datetime(2020, 10, 29, 16, 45, 30, 699742)

In [6]:
# Build 15 timestamps, one per day, counting backwards from today (offsets 0 through 14).
days = [today_date - datetime.timedelta(days=offset) for offset in range(15)]

In [7]:
# One-column frame holding the generated timestamps.
df = pd.DataFrame({'days': days})

df

Unnamed: 0,days
0,2020-10-30 16:45:30.699742
1,2020-10-29 16:45:30.699742
2,2020-10-28 16:45:30.699742
3,2020-10-27 16:45:30.699742
4,2020-10-26 16:45:30.699742
5,2020-10-25 16:45:30.699742
6,2020-10-24 16:45:30.699742
7,2020-10-23 16:45:30.699742
8,2020-10-22 16:45:30.699742
9,2020-10-21 16:45:30.699742


In [8]:
# Derive the weekday name (e.g. 'Friday') from each timestamp via the .dt accessor.
df['weekday'] = df['days'].dt.day_name()

In [9]:
# Preview the first 10 rows, now including the weekday name.
df.head(10)

Unnamed: 0,days,weekday
0,2020-10-30 16:45:30.699742,Friday
1,2020-10-29 16:45:30.699742,Thursday
2,2020-10-28 16:45:30.699742,Wednesday
3,2020-10-27 16:45:30.699742,Tuesday
4,2020-10-26 16:45:30.699742,Monday
5,2020-10-25 16:45:30.699742,Sunday
6,2020-10-24 16:45:30.699742,Saturday
7,2020-10-23 16:45:30.699742,Friday
8,2020-10-22 16:45:30.699742,Thursday
9,2020-10-21 16:45:30.699742,Wednesday


In [10]:
# Ordinal rank for each weekday: Monday -> 1 through Sunday -> 7.
dictionary = {
    day_name: rank
    for rank, day_name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        start=1,
    )
}

In [11]:
# Echo the mapping to confirm its contents.
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [12]:
# Replace each weekday name with its rank from `dictionary`;
# a name that is not a key of the mapping would come out as NaN.
df['weekday_ordinal'] = df['weekday'].map(dictionary)

In [13]:
# Preview the first rows with the new ordinal column.
df.head()

Unnamed: 0,days,weekday,weekday_ordinal
0,2020-10-30 16:45:30.699742,Friday,5
1,2020-10-29 16:45:30.699742,Thursday,4
2,2020-10-28 16:45:30.699742,Wednesday,3
3,2020-10-27 16:45:30.699742,Tuesday,2
4,2020-10-26 16:45:30.699742,Monday,1


# Count or Frequency Encoding

In [14]:
# Load the UCI Adult training data. The file has no header row, so the columns are
# numbered 0-14. Its fields are separated by ", " (comma followed by a space);
# skipinitialspace=True drops that space so labels read 'Private' rather than ' Private'.
train_set = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
    skipinitialspace=True,
)
train_set.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [16]:
# Summarise dtypes and non-null counts; the object-dtype columns are the categorical ones.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       32561 non-null  int64 
 1   1       32561 non-null  object
 2   2       32561 non-null  int64 
 3   3       32561 non-null  object
 4   4       32561 non-null  int64 
 5   5       32561 non-null  object
 6   6       32561 non-null  object
 7   7       32561 non-null  object
 8   8       32561 non-null  object
 9   9       32561 non-null  object
 10  10      32561 non-null  int64 
 11  11      32561 non-null  int64 
 12  12      32561 non-null  int64 
 13  13      32561 non-null  object
 14  14      32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [18]:
# Positions of the object-dtype (categorical) feature columns to keep;
# column 14, also object-dtype, is left out.
columns = [1, 3, 5, 6, 7, 8, 9, 13]

In [19]:
# Keep only the selected columns. The explicit .copy() makes train_set an independent
# frame, so assigning to one of its columns later does not raise SettingWithCopyWarning.
train_set = train_set[columns].copy()

In [20]:
# Give the numbered columns descriptive names, in the same order as the `columns` selection.
train_set.columns=['Employment','Degree','Status','Designation','family_job','Race','Sex','Country']

In [21]:
# Preview the renamed categorical columns.
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [23]:

# Report the cardinality (number of distinct labels) of every categorical column.
for feature in train_set.columns:
    distinct_labels = train_set[feature].unique()
    label_count = len(distinct_labels)
    print(feature, ":", label_count, 'labels')

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
family_job : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


In [24]:
# Count how often each country label occurs, then expose the counts as a label -> count dict.
country_counts = train_set['Country'].value_counts()
country_map = country_counts.to_dict()

In [25]:
# Frequency-encode Country: replace each label with the number of rows that carry it.
# assign() builds a new frame instead of writing into the column subset selected earlier,
# which is the write that raised SettingWithCopyWarning.
# NOTE: the original labels are overwritten, so re-running this cell would look the counts
# themselves up in country_map and generally produce NaN.
train_set = train_set.assign(Country=train_set['Country'].map(country_map))
train_set.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,29170
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,81
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,29170
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170


##### Advantages
- Easy to use
- Does not increase the feature space

##### Disadvantages
- Labels that occur with the same frequency receive the same encoded value, so the model can no longer tell them apart.





# Target Guided Ordinal Encoding
Ordering the labels according to the target

Rank the labels by the mean of the target (the probability of the target being 1) and replace each label with its rank.

In [27]:
# Load only the two columns needed for this section; rows without a cabin are read as NaN.
df=pd.read_csv('titanic.csv', usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [30]:
# Label missing cabins explicitly so they form their own category.
# The result is assigned back: fillna(inplace=True) on the df['Cabin'] selection is chained
# assignment, which pandas warns about and which leaves df unchanged under Copy-on-Write.
df['Cabin'] = df['Cabin'].fillna('Missing')

In [31]:
# Confirm that the empty cabins now read 'Missing'.
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [32]:
# Keep only the first character of each cabin value (e.g. 'C85' -> 'C', 'Missing' -> 'M').
df['Cabin'] = df['Cabin'].astype(str).str[0]

In [33]:
# Preview the single-letter cabin labels.
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [34]:
# List the distinct cabin letters that remain after truncation.
df['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [38]:
# Mean of Survived per cabin letter, i.e. the survival rate of each group.
df.groupby('Cabin')['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [41]:
# Cabin letters ordered from the lowest to the highest survival rate.
df.groupby('Cabin')['Survived'].mean().sort_values().index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [43]:
# Survival rate per cabin letter, then the letters sorted by that rate (ascending).
survival_rate_by_cabin = df.groupby(['Cabin'])['Survived'].mean()
ordinal_labels = survival_rate_by_cabin.sort_values().index
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [44]:
# Number the labels 0, 1, 2, ... in order of increasing survival rate.
ordinal_labels2 = dict(zip(ordinal_labels, range(len(ordinal_labels))))
ordinal_labels2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [45]:
# Target-guided ordinal encoding: replace each cabin letter with its rank from ordinal_labels2.
df['Cabin_ordinal_labels']=df['Cabin'].map(ordinal_labels2)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


# Mean Encoding

In [46]:
# Per-label mean of the target as a plain dict (cabin letter -> survival rate).
# NOTE(review): the means are computed on the whole frame; if part of this data is later
# used for evaluation, the encoding leaks the target — confirm how the data is split.
mean_ordinal=df.groupby(['Cabin'])['Survived'].mean().to_dict()

In [47]:
# Echo the label -> mean mapping.
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [48]:
# Mean encoding: replace each cabin letter with the mean of Survived for that letter.
df['mean_ordinal_encode']=df['Cabin'].map(mean_ordinal)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


### Advantage
It captures information about the target within the label, thereby rendering more predictive features.

### Disadvantage
It can lead to overfitting.

# Probability Ratio Encoding

In [49]:
# Reload the two raw columns so this section starts again from the unmodified data.
df=pd.read_csv('titanic.csv', usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [50]:
# Label missing cabins explicitly so they form their own category.
df['Cabin'] = df['Cabin'].fillna('Missing')

In [52]:
# Confirm that the empty cabins now read 'Missing'.
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [55]:
# Keep only the first character of each cabin value (e.g. 'C85' -> 'C', 'Missing' -> 'M').
df['Cabin'] = df['Cabin'].astype(str).str[0]

In [56]:
# Preview the single-letter cabin labels.
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [57]:
# List the distinct cabin letters that remain after truncation.
df['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [60]:
# Survival rate per cabin letter (mean of the Survived column within each group).
prob = df.groupby('Cabin')['Survived'].mean()

In [65]:
# Wrap the Series in a DataFrame so further columns can be added next to 'Survived'.
prob_df = pd.DataFrame(prob)

In [66]:
# Show the per-cabin survival rates.
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [67]:
# Complement of the survival rate: the share of each group that did not survive.
prob_df['Died'] = 1-prob_df['Survived']

In [68]:
# Show survival and death rates side by side.
prob_df

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


In [69]:
# Ratio of survival rate to death rate for each cabin letter.
# A group in which everyone survived has Died == 0, and the division then yields inf.
# The column name is spelled 'probaility_ratio'; later cells look it up under this exact key.
prob_df['probaility_ratio'] = prob_df['Survived']/prob_df['Died']

In [70]:
# Show the rates together with the derived ratio.
prob_df

Unnamed: 0_level_0,Survived,Died,probaility_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0
F,0.615385,0.384615,1.6
G,0.5,0.5,1.0
M,0.299854,0.700146,0.428274
T,0.0,1.0,0.0


In [72]:
# Plain dict mapping each cabin letter to its survived/died ratio.
probability_encoding = prob_df['probaility_ratio'].to_dict()

In [73]:
# Probability-ratio encoding: replace each cabin letter with its survived/died ratio.
df['cabin_encoding'] = df['Cabin'].map(probability_encoding)

In [74]:
# Display the encoded frame; pandas abbreviates the middle rows of a long frame with '...'.
df

Unnamed: 0,Survived,Cabin,cabin_encoding
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
...,...,...,...
886,0,M,0.428274
887,1,B,2.916667
888,0,M,0.428274
889,1,C,1.458333
