In [49]:
import pandas as pd

In [50]:
import numpy as np

In [71]:
# Load the raw dataset and preview its first five rows.
df = pd.read_csv('risk2.csv')
df.head(5)

Unnamed: 0,Age,Experience,Rank,Nationality,Go
0,36,10.0,9,UK,NO
1,42,12.0,4,USA,NO
2,23,4.0,6,N,NO
3,52,,4,USA,NO
4,43,21.0,8,USA,YES


In [72]:
# Count missing values per column.
df.isna().sum()

Age            0
Experience     2
Rank           0
Nationality    0
Go             0
dtype: int64

# Filling the missing values

In [73]:
# Candidate fill values for the missing Experience entries.
mean = df['Experience'].mean()
median = df['Experience'].median()

In [74]:
# Display the mean of the Experience column computed in the previous cell.
mean

9.636363636363637

In [75]:
# Display the median of the Experience column; this is the value used for filling.
median

10.0

In [76]:
# Impute missing Experience values with the column median.
df['Experience'] = df['Experience'].fillna(median)

In [77]:
# Inspect the Experience column after filling.
df['Experience']

0     10.0
1     12.0
2      4.0
3     10.0
4     21.0
5     14.0
6      3.0
7     14.0
8     13.0
9     10.0
10     3.0
11     3.0
12     9.0
Name: Experience, dtype: float64

In [78]:
# Re-check missing values per column after the fill.
df.isna().sum()

Age            0
Experience     0
Rank           0
Nationality    0
Go             0
dtype: int64

# Encoding

# Manual encoding

In [79]:
# Preview the frame before encoding the Go column.
df.head(5)

Unnamed: 0,Age,Experience,Rank,Nationality,Go
0,36,10.0,9,UK,NO
1,42,12.0,4,USA,NO
2,23,4.0,6,N,NO
3,52,10.0,4,USA,NO
4,43,21.0,8,USA,YES


In [80]:
# List the distinct labels present in the Go column.
df['Go'].unique()

array(['NO', 'YES'], dtype=object)

In [81]:
# Manual encoding of the Go column: NO -> 0, YES -> 1.
# The explicit cast keeps the column integer-typed; recent pandas versions
# deprecate the silent object -> int downcast that replace() used to perform.
df['Go'] = df['Go'].replace({'NO': 0, 'YES': 1}).astype('int64')

In [82]:
# Inspect the encoded Go column.
df['Go']

0     0
1     0
2     0
3     0
4     1
5     0
6     1
7     1
8     1
9     1
10    0
11    1
12    1
Name: Go, dtype: int64

# Label encoding

In [83]:
from sklearn.preprocessing import LabelEncoder

In [84]:
# Encoder instance; fitted on the Nationality column in a later cell.
label = LabelEncoder()

In [85]:
# Preview the frame before label-encoding Nationality.
df.head(5)

Unnamed: 0,Age,Experience,Rank,Nationality,Go
0,36,10.0,9,UK,0
1,42,12.0,4,USA,0
2,23,4.0,6,N,0
3,52,10.0,4,USA,0
4,43,21.0,8,USA,1


In [86]:
# Replace the Nationality strings with the integer codes learned by the encoder.
# NOTE(review): this overwrites the column in place, so re-running the cell
# re-fits the encoder on the already-encoded integers.
df['Nationality'] = label.fit_transform(df['Nationality'])

In [87]:
# Preview the frame with Nationality label-encoded.
df.head(5)

Unnamed: 0,Age,Experience,Rank,Nationality,Go
0,36,10.0,9,1,0
1,42,12.0,4,2,0
2,23,4.0,6,0,0
3,52,10.0,4,2,0
4,43,21.0,8,2,1


# One-hot encoding

In [88]:
# Reload the raw CSV so this section starts from the unmodified data
# (earlier cells overwrote columns of df in place).
df = pd.read_csv('risk2.csv')
df.head(5)

Unnamed: 0,Age,Experience,Rank,Nationality,Go
0,36,10.0,9,UK,NO
1,42,12.0,4,USA,NO
2,23,4.0,6,N,NO
3,52,,4,USA,NO
4,43,21.0,8,USA,YES


In [90]:
# One-hot encode Nationality. drop_first=True removes the first category level
# ('N' here), leaving UK and USA indicator columns.
# dtype=int pins 0/1 integer indicators: pandas >= 2.0 otherwise returns
# boolean (True/False) columns by default.
dummy_variable = pd.get_dummies(df['Nationality'], drop_first=True, dtype=int)

In [91]:
# Preview the indicator columns.
dummy_variable.head(5)

Unnamed: 0,UK,USA
0,1,0
1,0,1
2,0,0
3,0,1
4,0,1


In [92]:
# The source frame is unchanged by get_dummies; Nationality is still present.
df.head(5)

Unnamed: 0,Age,Experience,Rank,Nationality,Go
0,36,10.0,9,UK,NO
1,42,12.0,4,USA,NO
2,23,4.0,6,N,NO
3,52,,4,USA,NO
4,43,21.0,8,USA,YES


In [93]:
# Drop the original Nationality column; columns= is the keyword form of axis=1.
new_df = df.drop(columns='Nationality')

In [94]:
# Preview the frame without the Nationality column.
new_df.head(5)

Unnamed: 0,Age,Experience,Rank,Go
0,36,10.0,9,NO
1,42,12.0,4,NO
2,23,4.0,6,NO
3,52,,4,NO
4,43,21.0,8,YES


In [95]:
# Append the indicator columns side by side with the remaining features.
df = pd.concat([new_df, dummy_variable], axis='columns')

In [96]:
# Preview the combined frame with the one-hot columns attached.
df.head(5)

Unnamed: 0,Age,Experience,Rank,Go,UK,USA
0,36,10.0,9,NO,1,0
1,42,12.0,4,NO,0,1
2,23,4.0,6,NO,0,0
3,52,,4,NO,0,1
4,43,21.0,8,YES,0,1


In [97]:
# Median of Experience, recomputed on the reloaded data.
median = df['Experience'].median()

In [98]:
# Impute missing Experience values with the median.
df['Experience'] = df['Experience'].fillna(median)

In [101]:
# Manual encoding of the Go column: NO -> 0, YES -> 1.
# The explicit cast keeps the column integer-typed; recent pandas versions
# deprecate the silent object -> int downcast that replace() used to perform.
df['Go'] = df['Go'].replace({'NO': 0, 'YES': 1}).astype('int64')

In [102]:
 df

Unnamed: 0,Age,Experience,Rank,Go,UK,USA
0,36,10.0,9,0,1,0
1,42,12.0,4,0,0,1
2,23,4.0,6,0,0,0
3,52,10.0,4,0,0,1
4,43,21.0,8,1,0,1
5,44,14.0,5,0,1,0
6,66,3.0,7,1,0,0
7,35,14.0,9,1,1,0
8,52,13.0,7,1,0,0
9,35,10.0,9,1,0,0


# Ordinal encoding

In [103]:
# Reload the raw CSV so this section starts from the unmodified data.
df = pd.read_csv('risk2.csv')
df.head(5)

Unnamed: 0,Age,Experience,Rank,Nationality,Go
0,36,10.0,9,UK,NO
1,42,12.0,4,USA,NO
2,23,4.0,6,N,NO
3,52,,4,USA,NO
4,43,21.0,8,USA,YES


In [104]:
# Impute missing Experience values with the column median.
median = df['Experience'].median()
df['Experience'] = df['Experience'].fillna(median)

In [105]:
from sklearn.preprocessing import OrdinalEncoder

In [106]:
# List the distinct Nationality labels to build the category order from.
df['Nationality'].unique()

array(['UK', 'USA', 'N'], dtype=object)

In [107]:
# Explicit category order handed to the ordinal encoder.
c_list = [
    'UK',
    'USA',
    'N',
]

In [108]:
# Encoder for one feature column; its categories are given explicitly by c_list
# (one inner list per encoded column).
ordinal = OrdinalEncoder(categories=[c_list])

In [111]:
# Fit on, and encode, the Nationality column; it is selected as a one-column
# DataFrame because the encoder expects 2-D input.
encoded_value = ordinal.fit_transform(df.loc[:, ['Nationality']])

In [113]:
# Wrap the encoded array in a labelled frame for display.
pd.DataFrame(data=encoded_value, columns=['Nationality'])

Unnamed: 0,Nationality
0,0.0
1,1.0
2,2.0
3,1.0
4,1.0
5,0.0
6,2.0
7,0.0
8,2.0
9,2.0
