-----------------------HANDLING CATEGORICAL VARIABLES-----------------------------------

# 1. ONE HOT ENCODING

In [11]:
import pandas as pd

data = pd.read_csv('titanic.csv',usecols=['Sex'])
data.head(10)

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male
5,male
6,male
7,male
8,female
9,female


In [12]:
## getting the dummies
pd.get_dummies(data,drop_first=True).head(10)

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1
5,1
6,1
7,1
8,0
9,0


In [13]:
data2 = pd.read_csv('titanic.csv',usecols=['Embarked'])
data2

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S
...,...
886,S
887,S
888,S
889,C


In [14]:
## fetch all the types of categories
data2['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [15]:
data2.dropna(inplace=True)

In [16]:
## getting dummy variables for Embarked feature
pd.get_dummies(data2,drop_first=True).head(10)

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
5,1,0
6,0,1
7,0,1
8,0,1
9,0,0


# 2. ONE-HOT-ENCODING WITH MANY CATEGORIES IN A FEATURE

In [18]:
data3 = pd.read_csv('mercedes.csv',usecols=['X0','X1','X2','X3','X4','X5','X6'])
data3.head(10)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d
5,t,b,e,c,d,g,h
6,al,r,e,f,d,f,h
7,o,l,as,f,d,f,j
8,w,s,as,e,d,f,i
9,j,b,aq,c,d,f,a


In [20]:
data3.columns

Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'], dtype='object')

In [25]:
data3['X0'].unique()

array(['k', 'az', 't', 'al', 'o', 'w', 'j', 'h', 's', 'n', 'ay', 'f', 'x',
       'y', 'aj', 'ak', 'am', 'z', 'q', 'at', 'ap', 'v', 'af', 'a', 'e',
       'ai', 'd', 'aq', 'c', 'aa', 'ba', 'as', 'i', 'r', 'b', 'ax', 'bc',
       'u', 'ad', 'au', 'm', 'l', 'aw', 'ao', 'ac', 'g', 'ab'],
      dtype=object)

In [26]:
## Getting no. of caetgories inside each feature

for feature in data3.columns:
    print("{} feature has {} unique categories".format(feature,len(data3[feature].unique())))

X0 feature has 47 unique categories
X1 feature has 27 unique categories
X2 feature has 44 unique categories
X3 feature has 7 unique categories
X4 feature has 4 unique categories
X5 feature has 29 unique categories
X6 feature has 12 unique categories


In [29]:
## 10 most frequent categories inside X1 feature
data3.X1.value_counts().sort_values(ascending=False).head(10).index

Index(['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o'], dtype='object')

In [32]:
frequent_10 = data3.X1.value_counts().sort_values(ascending=False).head(10).index
frequent_10 = list(frequent_10)
frequent_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [33]:
import numpy as np

## setting 10 most frequent categories as 1 otherwise as 0 for 'X1' and simultaneously adding the 10 categories as features

for category in frequent_10:
    data3[category] = np.where(data3['X1']==category,1,0)

In [34]:
frequent_10.append('X1')
frequent_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o', 'X1']

In [36]:
data3[frequent_10]

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,s
4205,0,0,0,0,0,0,0,0,0,1,o
4206,0,0,0,0,1,0,0,0,0,0,v
4207,0,0,0,0,0,1,0,0,0,0,r


Similary for all the other features we can find the most 10 frequent Categories. and add them as performed above.

## 3. ORDINAL NUMBER ENCODING

In [59]:
import datetime as d

In [60]:
present_date = d.datetime.today()
present_date

datetime.datetime(2022, 2, 22, 0, 21, 32, 591326)

Python timedelta() function is present under datetime library which is generally used for calculating differences in dates and also can be used for date manipulations in Python. It is one of the easiest ways to perform date manipulations.

Syntax : datetime.timedelta(days=0, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=0, weeks=0)

Returns : Date

In [61]:
present_date - d.timedelta(3)   ## past date before 3 days

datetime.datetime(2022, 2, 19, 0, 21, 32, 591326)

In [62]:
## compute date-time for a given range and storing in a list

days = [present_date-d.timedelta(x) for x in range(0,15)] 
days

[datetime.datetime(2022, 2, 22, 0, 21, 32, 591326),
 datetime.datetime(2022, 2, 21, 0, 21, 32, 591326),
 datetime.datetime(2022, 2, 20, 0, 21, 32, 591326),
 datetime.datetime(2022, 2, 19, 0, 21, 32, 591326),
 datetime.datetime(2022, 2, 18, 0, 21, 32, 591326),
 datetime.datetime(2022, 2, 17, 0, 21, 32, 591326),
 datetime.datetime(2022, 2, 16, 0, 21, 32, 591326),
 datetime.datetime(2022, 2, 15, 0, 21, 32, 591326),
 datetime.datetime(2022, 2, 14, 0, 21, 32, 591326),
 datetime.datetime(2022, 2, 13, 0, 21, 32, 591326),
 datetime.datetime(2022, 2, 12, 0, 21, 32, 591326),
 datetime.datetime(2022, 2, 11, 0, 21, 32, 591326),
 datetime.datetime(2022, 2, 10, 0, 21, 32, 591326),
 datetime.datetime(2022, 2, 9, 0, 21, 32, 591326),
 datetime.datetime(2022, 2, 8, 0, 21, 32, 591326)]

In [63]:
import pandas as pd

data = pd.DataFrame(days,columns=['DAY'])
data

Unnamed: 0,DAY
0,2022-02-22 00:21:32.591326
1,2022-02-21 00:21:32.591326
2,2022-02-20 00:21:32.591326
3,2022-02-19 00:21:32.591326
4,2022-02-18 00:21:32.591326
5,2022-02-17 00:21:32.591326
6,2022-02-16 00:21:32.591326
7,2022-02-15 00:21:32.591326
8,2022-02-14 00:21:32.591326
9,2022-02-13 00:21:32.591326


## NOTE::
    The syntax we are trying to use data['Day'].dt.weekday_name is for PANDAS <= 0.22 (pandas.pydata.org/pandas-docs/version/0.22.0/generated/…) For newer pandas versions you should use data['Date'].dt. day_name()

In [64]:
## pandas.Series.dt.weekday_name ---> for old version pandas
data['weekday'] = data['DAY'].dt.weekday_name
data

AttributeError: 'DatetimeProperties' object has no attribute 'weekday_name'

In [66]:
data['Weekday']  = data['DAY'].dt.day_name()
data

Unnamed: 0,DAY,Weekday
0,2022-02-22 00:21:32.591326,Tuesday
1,2022-02-21 00:21:32.591326,Monday
2,2022-02-20 00:21:32.591326,Sunday
3,2022-02-19 00:21:32.591326,Saturday
4,2022-02-18 00:21:32.591326,Friday
5,2022-02-17 00:21:32.591326,Thursday
6,2022-02-16 00:21:32.591326,Wednesday
7,2022-02-15 00:21:32.591326,Tuesday
8,2022-02-14 00:21:32.591326,Monday
9,2022-02-13 00:21:32.591326,Sunday


In [67]:
dictionary = {'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7}
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [68]:
data['Weekday_ordinal'] = data['Weekday'].map(dictionary)
data

Unnamed: 0,DAY,Weekday,Weekday_ordinal
0,2022-02-22 00:21:32.591326,Tuesday,2
1,2022-02-21 00:21:32.591326,Monday,1
2,2022-02-20 00:21:32.591326,Sunday,7
3,2022-02-19 00:21:32.591326,Saturday,6
4,2022-02-18 00:21:32.591326,Friday,5
5,2022-02-17 00:21:32.591326,Thursday,4
6,2022-02-16 00:21:32.591326,Wednesday,3
7,2022-02-15 00:21:32.591326,Tuesday,2
8,2022-02-14 00:21:32.591326,Monday,1
9,2022-02-13 00:21:32.591326,Sunday,7


# 4. COUNT/FREQUENCY ENCODING

In [71]:
train_set = pd.read_csv('adult.data')
train_set.head(10)

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
5,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
6,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
7,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
8,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
9,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K


In [72]:
## correcting the above dataframe
train_set = pd.read_csv('adult.data',index_col=None,header=None)
train_set.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [74]:
columns=[1,3,5,6,7,8,9,13]

train_set = train_set[columns]
train_set.head(10)

Unnamed: 0,1,3,5,6,7,8,9,13
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,United-States
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,Jamaica
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,United-States
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States


In [75]:
## giving meaningful names to columns/features instead of indices
train_set.columns = train_set.columns=['Employment','Degree','Marital_Status','Designation','family_job','Race','Sex','Country']
train_set.head(10)

Unnamed: 0,Employment,Degree,Marital_Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,United-States
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,Jamaica
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,United-States
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States


In [82]:
train_set.shape

(32561, 8)

In [76]:
## counting no. of categories inside each feature
for feature in train_set.columns:
    print(feature," : ",len(train_set[feature].unique())," labels")

Employment  :  9  labels
Degree  :  16  labels
Marital_Status  :  7  labels
Designation  :  15  labels
family_job  :  6  labels
Race  :  5  labels
Sex  :  2  labels
Country  :  42  labels


In [81]:
## counting frequency of each category inside 'Country' feature 
country_map = train_set['Country'].value_counts().to_dict()
country_map

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' France': 29,
 ' Greece': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Cambodia': 19,
 ' Trinadad&Tobago': 19,
 ' Laos': 18,
 ' Thailand': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Hungary': 13,
 ' Honduras': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

In [83]:
## Replace country Names with their frequencies
train_set['Country']  = train_set['Country'].map(country_map)
train_set.head(10)

Unnamed: 0,Employment,Degree,Marital_Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,29170
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,81
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,29170
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170


## Advantages
    Easy To Use
    Not increasing feature space 

## Disadvantages
    It will provide same weight if the frequencies are same

# 5.  Target Guided Ordinal Encoding
    Ordering the labels according to the target
    Replace the labels by the joint probability of being 1 or 0

In [102]:
import pandas as pd

data = pd.read_csv('titanic.csv',usecols=['Cabin','Survived'])
data.head(20)

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,
5,0,
6,0,E46
7,0,
8,1,
9,1,


In [103]:
## fill nan values with 'missing' label

data['Cabin'].fillna('Missing',inplace=True)
data.head(20)

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing
5,0,Missing
6,0,E46
7,0,Missing
8,1,Missing
9,1,Missing


In [104]:
data['Cabin'] = data['Cabin'].astype(str).str[0]
data.head(20)

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M
5,0,M
6,0,E
7,0,M
8,1,M
9,1,M


In [105]:
data['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [106]:
data.groupby(['Cabin'])['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [107]:
data.groupby(['Cabin'])['Survived'].mean().sort_values().index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [108]:
ordinal_targeted_label =  data.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_targeted_label

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [109]:
ordinal_targeted_labels = {k:i for i,k in enumerate(ordinal_targeted_label,0)}
ordinal_targeted_labels

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [110]:
data['Cabin_ordinal_labels'] = data['Cabin'].map(ordinal_targeted_labels)
data.head(20)

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1
5,0,M,1
6,0,E,7
7,0,M,1
8,1,M,1
9,1,M,1


# 6. Mean Encoding

In [111]:
mean_ordinal = data.groupby(['Cabin'])['Survived'].mean().to_dict()
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [112]:
data['mean_ordinal_encode'] = data['Cabin'].map(mean_ordinal)
data.head(20)

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854
5,0,M,1,0.299854
6,0,E,7,0.75
7,0,M,1,0.299854
8,1,M,1,0.299854
9,1,M,1,0.299854


# PROBABILITY RATIO ENCODING

In [116]:
import pandas as pd

data = pd.read_csv('titanic.csv',usecols=['Cabin','Survived'])
data

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,
...,...,...
886,0,
887,1,B42
888,0,
889,1,C148


In [118]:
### Replacing
data['Cabin'].fillna('Missing',inplace=True)
data.head(10)

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing
5,0,Missing
6,0,E46
7,0,Missing
8,1,Missing
9,1,Missing


In [119]:
data['Cabin'].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [121]:
data['Cabin']=data['Cabin'].astype(str).str[0]
data.head(10)

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M
5,0,M
6,0,E
7,0,M
8,1,M
9,1,M


In [122]:
data.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [124]:
prob_data = data.groupby(['Cabin'])['Survived'].mean()
prob_data

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [125]:
prob_data = pd.DataFrame(prob_data)
prob_data

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [126]:
prob_data['Died']  = 1-prob_data['Survived']
prob_data

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


In [127]:
prob_df = prob_data
prob_df['Probability_ratio']=prob_df['Survived']/prob_df['Died']
prob_df.head()

Unnamed: 0_level_0,Survived,Died,Probability_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [128]:
probability_encoded = prob_df['Probability_ratio'].to_dict()
probability_encoded

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [130]:
data['Cabin_encoded']=data['Cabin'].map(probability_encoded)
data.head(20)

Unnamed: 0,Survived,Cabin,Cabin_encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
5,0,M,0.428274
6,0,E,3.0
7,0,M,0.428274
8,1,M,0.428274
9,1,M,0.428274
