## One Hot Encoding

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
dataset = pd.read_csv('titanic.csv')
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
dataset.Sex.unique()

array(['male', 'female'], dtype=object)

In [6]:
Sex_converted_variable = pd.get_dummies(dataset['Sex'],drop_first=True).astype('int').head()

## pd.get_dummies will create a new column for the 'male' category
# drop_first=True drops the female category, assuming its the first one in alphabetical order

In [7]:
Sex_converted_variable

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1


In [9]:
dataset = pd.concat([dataset,Sex_converted_variable],axis=1)
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male,male.1
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1.0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0.0,0.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1.0,1.0


In [18]:
dataset.drop('Sex',axis=1,inplace=True)
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,0.0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,0.0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,1.0


## One Hot Enconding with multiple categories

In [61]:

dataset= pd.read_csv('mercendez.csv',usecols= ['X0','X1','X2','X3','X4','X5','X6'])
dataset.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [62]:
# Checking the unique labels in each column
dataset.columns

Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'], dtype='object')

In [63]:
for i in dataset.columns:
    print(len(dataset[i].unique()))

47
27
44
7
4
29
12


In [64]:
list_top_10 = dataset.X1.value_counts().sort_values(ascending=False).head(10).index
list_top_10

# value_counts(normalize = False, sort= True, ascending=False,bins=None,dropna=False)
# value_counts() returns a series containing counts of unique value

Index(['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o'], dtype='object', name='X1')

In [65]:
## we will create new column for each element in top 10 and it will 1 if the value in x1 column matches the category

In [66]:
for category in list_top_10:
    dataset[category]= np.where(dataset['X1']==category,1,0)
dataset.head()

# np.where(condition, value_if_true, value_if_false)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o
0,k,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,k,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0


In [67]:
# Now we will drop X1
dataset.drop('X1',axis=1,inplace = True)
dataset.head()

Unnamed: 0,X0,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o
0,k,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,k,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0


## Ordinal Number Encoding

In [11]:
data={'Temperature':['Hot','Cold','Very Hot','Warm','Hot','Warm','Warm','Hot','Hot','Cold']}
dataset = pd.DataFrame(data, columns=['Temperature'])
dataset.head()

Unnamed: 0,Temperature
0,Hot
1,Cold
2,Very Hot
3,Warm
4,Hot


In [12]:
mapping_value = {'Cold':1,'Warm':2,'Hot':3,'Very Hot':4}
dataset['Temperature_Ordinal']= dataset.Temperature.map(mapping_value)
dataset


# we will use .map for mapping the values we have given for the words like hot,cold,warm

Unnamed: 0,Temperature,Temperature_Ordinal
0,Hot,3
1,Cold,1
2,Very Hot,4
3,Warm,2
4,Hot,3
5,Warm,2
6,Warm,2
7,Hot,3
8,Hot,3
9,Cold,1


## Count or Frequency Encoding

In [75]:
#3 As an example. If India appears 56 times in the country column and 
#America appears 49 times, we replace India with 56 and America with 49
#in the country column.


In [104]:
import numpy as np
import pandas as pd

In [105]:
dataset = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None) 
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [107]:
columns=[1,3,5,6,7,8,9,13]
dataset[columns].head()

Unnamed: 0,1,3,5,6,7,8,9,13
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [108]:
dataset = dataset[columns]
dataset.columns=['Employment','Degree','Status','Designation','Family_job','Race','Sex','Country']
dataset.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [109]:
for col in dataset.columns:
    print(col,':',len(dataset[col].unique()),'labels')

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
Family_job : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


In [112]:
country_map = dataset['Country'].value_counts()
country_map

# To convert the given data into dictionary
a = country_map.to_dict()
a


{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' France': 29,
 ' Greece': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Cambodia': 19,
 ' Trinadad&Tobago': 19,
 ' Laos': 18,
 ' Thailand': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Honduras': 13,
 ' Hungary': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

In [113]:
dataset['Country'] = dataset['Country'].map(a)
dataset.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95


## Target Guiding Ordinal Encoding

In [114]:
import numpy as np
import pandas as pd

In [115]:
dataset = pd.read_csv('titanic.csv',usecols =['Cabin','Survived'])
dataset.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [116]:
dataset['Cabin'].fillna('Missing',inplace=True)
dataset.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [118]:
dataset['Cabin'] = dataset['Cabin'].astype(str).str[0]
dataset.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [123]:
ordinal_index = dataset.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

AttributeError: 'Index' object has no attribute 'to_dict'

In [124]:
ordinal_label = {k:i for i,k in enumerate(ordinal_index,0)}
ordinal_label

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [125]:
dataset['Cabin_ordinal_label'] = dataset['Cabin'].map(ordinal_label)

In [126]:
dataset.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_label
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


## Mean Ordinal Encoding

In [129]:
mean_ordinal = dataset.groupby(['Cabin'])['Survived'].mean().to_dict()
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [131]:
dataset['Cabin_mean_ordinal']= dataset['Cabin'].map(mean_ordinal)
dataset.head(10)

Unnamed: 0,Survived,Cabin,Cabin_ordinal_label,Cabin_mean_ordinal
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854
5,0,M,1,0.299854
6,0,E,7,0.75
7,0,M,1,0.299854
8,1,M,1,0.299854
9,1,M,1,0.299854


## Probability Ratio Encoding



In [4]:
prob= dataset.groupby(['Cabin'])['Survived'].mean()
prob_df = pd.DataFrame(prob)
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A10,0.0
A14,0.0
A16,1.0
A19,0.0
A20,1.0
...,...
F33,1.0
F38,0.0
F4,1.0
G6,0.5


In [5]:
prob_df['Died'] = 1- prob_df['Survived']
prob_df

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A10,0.0,1.0
A14,0.0,1.0
A16,1.0,0.0
A19,0.0,1.0
A20,1.0,0.0
...,...,...
F33,1.0,0.0
F38,0.0,1.0
F4,1.0,0.0
G6,0.5,0.5


In [6]:
prob_df['Probability Ratio']= prob_df['Survived']/prob_df['Died']
prob_df
                                      

Unnamed: 0_level_0,Survived,Died,Probability Ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A10,0.0,1.0,0.0
A14,0.0,1.0,0.0
A16,1.0,0.0,inf
A19,0.0,1.0,0.0
A20,1.0,0.0,inf
...,...,...,...
F33,1.0,0.0,inf
F38,0.0,1.0,0.0
F4,1.0,0.0,inf
G6,0.5,0.5,1.0


In [7]:
prob_encod_dict = prob_df['Probability Ratio'].to_dict()
prob_encod_dict

{'A10': 0.0,
 'A14': 0.0,
 'A16': inf,
 'A19': 0.0,
 'A20': inf,
 'A23': inf,
 'A24': 0.0,
 'A26': inf,
 'A31': inf,
 'A32': 0.0,
 'A34': inf,
 'A36': 0.0,
 'A5': 0.0,
 'A6': inf,
 'A7': 0.0,
 'B101': inf,
 'B102': 0.0,
 'B18': inf,
 'B19': 0.0,
 'B20': inf,
 'B22': 1.0,
 'B28': inf,
 'B3': inf,
 'B30': 0.0,
 'B35': inf,
 'B37': 0.0,
 'B38': 0.0,
 'B39': inf,
 'B4': inf,
 'B41': inf,
 'B42': inf,
 'B49': inf,
 'B5': inf,
 'B50': inf,
 'B51 B53 B55': 1.0,
 'B57 B59 B63 B66': inf,
 'B58 B60': 1.0,
 'B69': inf,
 'B71': 0.0,
 'B73': inf,
 'B77': inf,
 'B78': inf,
 'B79': inf,
 'B80': inf,
 'B82 B84': 0.0,
 'B86': 0.0,
 'B94': 0.0,
 'B96 B98': inf,
 'C101': inf,
 'C103': inf,
 'C104': inf,
 'C106': inf,
 'C110': 0.0,
 'C111': 0.0,
 'C118': 0.0,
 'C123': 1.0,
 'C124': 0.0,
 'C125': inf,
 'C126': inf,
 'C128': 0.0,
 'C148': inf,
 'C2': 1.0,
 'C22 C26': 0.49999999999999994,
 'C23 C25 C27': 1.0,
 'C30': 0.0,
 'C32': inf,
 'C45': inf,
 'C46': 0.0,
 'C47': inf,
 'C49': 0.0,
 'C50': inf,
 'C52': i

In [8]:
dataset['Cabin_probablity_ratio'] = dataset['Cabin'].map(prob_encod_dict)
