## 1. Read data file.

In [3]:
import numpy as np
import pandas as pd
# Load the raw dataset from the CSV file next to the notebook and preview it.
DATA_PATH = "data.csv"
data = pd.read_csv(DATA_PATH)
data.head(n=5)

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,Blood Types
0,3,15.0,45,250000,A
1,1,5.0,30,200000,B
2,2,10.0,38,150000,AB
3,1,,36,180000,O


## 2. Clean the missing data using median approach

In [5]:
# Replace missing "Working Experience(years)" values with the column median.
#
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form acts
# on an intermediate object, emits a FutureWarning on pandas 2.x, and does not
# update `data` at all once Copy-on-Write is enabled.
experience_col = "Working Experience(years)"
experience_median = data[experience_col].median()  # median() skips NaN by default
data[experience_col] = data[experience_col].fillna(experience_median)
data.head()

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,Blood Types
0,3,15.0,45,250000,A
1,1,5.0,30,200000,B
2,2,10.0,38,150000,AB
3,1,10.0,36,180000,O


## 3. Finding the Correlation of Number of Kids with Age and with Working Experience

In [20]:
# Pearson correlation coefficient between number of kids and age.
data["Number of Kids"].corr(data["Age"], method="pearson")

0.9147673836616229

In [22]:
# Pearson correlation coefficient between number of kids and years of working experience.
data["Number of Kids"].corr(data["Working Experience(years)"], method="pearson")

0.8528028654224418

==> Age is more strongly correlated with Number of Kids (r ≈ 0.91) than Working Experience is (r ≈ 0.85).

## 4. One Hot Vectors

In [10]:
# Encode each blood type as an integer code. With sort=False the codes follow
# the order of first appearance; `categories` holds the matching labels.
blood_types_encoded, categories = data["Blood Types"].factorize(sort=False)
blood_types_encoded

array([0, 1, 2, 3])

In [11]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the integer blood-type codes.
#
# The encoder is left at its default (sparse) output and densified with
# .toarray() instead of passing `sparse=False`: that keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4 (renamed to `sparse_output`), so this
# form runs unchanged on both old and new releases.
encoder = OneHotEncoder()
# reshape(-1, 1) turns the 1-D code array into the single-column 2-D input
# that the encoder expects.
blood_type_cat_1hot = encoder.fit_transform(blood_types_encoded.reshape(-1, 1)).toarray()
blood_type_cat_1hot

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [12]:
# Preview the frame again before building the dummy columns with pandas.
data.head(n=5)

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,Blood Types
0,3,15.0,45,250000,A
1,1,5.0,30,200000,B
2,2,10.0,38,150000,AB
3,1,10.0,36,180000,O


In [13]:
# Expand "Blood Types" into one indicator column per blood type, named by the
# bare category label (empty prefix and separator).
#
# dtype=int is set explicitly: from pandas 2.0 the indicator columns default to
# bool (True/False), whereas the rest of this notebook treats them as 0/1.
one_hot = pd.get_dummies(
    data,
    columns=["Blood Types"],
    drop_first=False,
    prefix="",
    prefix_sep="",
    dtype=int,
)
one_hot

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,A,AB,B,O
0,3,15.0,45,250000,1,0,0,0
1,1,5.0,30,200000,0,0,1,0
2,2,10.0,38,150000,0,1,0,0
3,1,10.0,36,180000,0,0,0,1


In [14]:
# Remove the original categorical column; its one-hot indicators are joined
# back onto the frame in the next cell.
data = data.drop(columns=["Blood Types"])
data

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary
0,3,15.0,45,250000
1,1,5.0,30,200000
2,2,10.0,38,150000
3,1,10.0,36,180000


In [15]:
# Attach the four blood-type indicator columns in the order A, B, AB, O
# with a single index-aligned join.
blood_type_columns = ["A", "B", "AB", "O"]
data = data.join(one_hot[blood_type_columns])
data

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,A,B,AB,O
0,3,15.0,45,250000,1,0,0,0
1,1,5.0,30,200000,0,1,0,0
2,2,10.0,38,150000,0,0,1,0
3,1,10.0,36,180000,0,0,0,1


## 5. Scaling the data

In [17]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of `data` with StandardScaler and wrap the
# resulting array back into a DataFrame.
#
# Column labels and the index are taken from `data` itself rather than from a
# hand-typed positional list, so a change in column order or count cannot
# silently mislabel the scaled values. Only the bare dummy names are renamed
# to their descriptive "Blood Type X" form.
# NOTE(review): the one-hot indicator columns are standardized together with
# the numeric features — confirm this is intended.
blood_type_labels = {
    "A": "Blood Type A",
    "B": "Blood Type B",
    "AB": "Blood Type AB",
    "O": "Blood Type O",
}

scaler = StandardScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
    index=data.index,
).rename(columns=blood_type_labels)
data_scaled

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,Blood Type A,Blood Type B,Blood Type AB,Blood Type O
0,1.507557,1.414214,1.446956,1.510966,1.732051,-0.57735,-0.57735,-0.57735
1,-0.904534,-1.414214,-1.353604,0.137361,-0.57735,1.732051,-0.57735,-0.57735
2,0.301511,0.0,0.140028,-1.236245,-0.57735,-0.57735,1.732051,-0.57735
3,-0.904534,0.0,-0.23338,-0.412082,-0.57735,-0.57735,-0.57735,1.732051


In [18]:
# Show the standardized values rounded to two decimal places for readability.
data_scaled.round(2)

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,Blood Type A,Blood Type B,Blood Type AB,Blood Type O
0,1.51,1.41,1.45,1.51,1.73,-0.58,-0.58,-0.58
1,-0.9,-1.41,-1.35,0.14,-0.58,1.73,-0.58,-0.58
2,0.3,0.0,0.14,-1.24,-0.58,-0.58,1.73,-0.58
3,-0.9,0.0,-0.23,-0.41,-0.58,-0.58,-0.58,1.73
