In [1]:
#Probability Ratio Encoding:-

#Probability Ratio Encoding is similar to Weight of Evidence (WoE); the only difference is that the
#plain ratio of the good and bad probabilities is used, whereas WoE takes the logarithm of a good/bad
#ratio. For each label, we calculate the mean of target=1, that is, the probability of the target being 1
#( P(1) ), and the probability of the target being 0 ( P(0) = 1 - P(1) ). The encoded value is P(1)/P(0).

#1.Probability of Survived based on Cabin ---> Categorical Feature
#2.Probability of Not Survived ---> 1 - prob(Survived)
#3.Probability ratio ---> prob(Survived)/prob(Not Survived)
#4.Dictionary to map each Cabin label to its probability ratio
#5.Replace the categorical feature with the mapped ratio

In [2]:
import pandas as pd

# Location of the Titanic CSV on the original author's machine; adjust it for your environment.
titanic_csv_path = 'E:/datafiles/titanic.csv'
# Only the two columns used in this encoding walkthrough are read.
wanted_columns = ['Cabin', 'Survived']

df = pd.read_csv(titanic_csv_path, usecols=wanted_columns)
df.head()

Unnamed: 0,Cabin,Survived
0,,0
1,C85,1
2,,1
3,C123,1
4,,0


In [3]:
# Replace the NaN values in the 'Cabin' column with the placeholder label 'Missing'.
# The result is assigned back instead of calling fillna(..., inplace=True) on df['Cabin']:
# the in-place call on a selected column emits a FutureWarning in pandas 2.x and, under
# Copy-on-Write (the default from pandas 3.0), no longer updates df at all.

df['Cabin'] = df['Cabin'].fillna('Missing')
df.head()

Unnamed: 0,Cabin,Survived
0,Missing,0
1,C85,1
2,Missing,1
3,C123,1
4,Missing,0


In [4]:
# List every distinct label present in the 'Cabin' column.

cabin_labels = df['Cabin'].unique()
cabin_labels

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [5]:
# Keep only the leading character of each 'Cabin' value, so 'C85' becomes 'C' and 'Missing' becomes 'M'.

cabin_as_text = df['Cabin'].astype(str)
df['Cabin'] = cabin_as_text.str.get(0)
df.head()

Unnamed: 0,Cabin,Survived
0,M,0
1,C,1
2,M,1
3,C,1
4,M,0


In [6]:
# Distinct labels now present in the 'Cabin' column.
df['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [7]:
# Survival rate per 'Cabin' label: the mean of the 'Survived' column within each group.

prob_df = df.groupby('Cabin')['Survived'].mean()

In [8]:
# Wrap the per-cabin survival rates in a DataFrame so further columns can be added to it.

prob_df = pd.DataFrame(data=prob_df)
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [9]:
# Probability of not surviving for each 'Cabin' label.
# The two outcomes are complementary (Survived + Died = 1), so Died = 1 - Survived.

survival_rate = prob_df['Survived']
prob_df['Died'] = 1 - survival_rate
prob_df.head()

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [10]:
# Probability ratio for each 'Cabin' label: P(Survived) / P(Died).
# A label whose members all survived has P(Died) == 0; dividing by it would silently
# yield inf (pandas does not raise), and that inf would then be mapped into the
# encoded feature. Fail loudly instead so such a label is handled deliberately.

no_death_labels = prob_df.index[prob_df['Died'] == 0].tolist()
if no_death_labels:
    raise ValueError(
        f"P(Died) is 0 for Cabin labels {no_death_labels}; the probability ratio is undefined"
    )

prob_df['Probability_ratio'] = prob_df['Survived'] / prob_df['Died']
prob_df.head()

Unnamed: 0_level_0,Survived,Died,Probability_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [11]:
# Turn the ratio column into a plain {cabin label: probability ratio} lookup dictionary.

ratio_by_cabin = prob_df.loc[:, 'Probability_ratio']
probability_encoded = ratio_by_cabin.to_dict()

In [12]:
# Look up each row's 'Cabin' label in probability_encoded and store the result
# in a new 'Cabin_encoded' column.

encoded_cabin = df['Cabin'].map(probability_encoded)
df['Cabin_encoded'] = encoded_cabin
df.head(20)

Unnamed: 0,Cabin,Survived,Cabin_encoded
0,M,0,0.428274
1,C,1,1.458333
2,M,1,0.428274
3,C,1,1.458333
4,M,0,0.428274
5,M,0,0.428274
6,E,0,3.0
7,M,0,0.428274
8,M,1,0.428274
9,M,1,0.428274
