# Data preprocessing

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw dataset from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data/Data.csv')

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,City,Age,Sex,Smoke,HappinessIndex,Healthy
0,Mumbai,24.0,Male,Yes,241.0,Yes
1,London,80.0,Female,No,928.0,No
2,NewYork,38.0,Male,Yes,,Yes
3,NewYork,22.0,Female,Yes,786.0,Yes
4,NewYork,36.0,Male,Yes,967.0,Yes


In [4]:
# Remove the 'Sex' column from the frame (modifies `data` in place).
data.drop(columns=['Sex'], inplace=True)

In [5]:
# Split the frame positionally: the first four columns become the feature
# matrix X, the fifth column becomes the target vector y.
X, y = data.iloc[:, :4].values, data.iloc[:, 4].values

In [6]:
# Inspect the raw feature matrix taken from the first four columns of `data`.
X

array([['Mumbai', 24.0, 'Yes', 241.0],
       ['London', 80.0, 'No', 928.0],
       ['NewYork', 38.0, 'Yes', nan],
       ['NewYork', 22.0, 'Yes', 786.0],
       ['NewYork', 36.0, 'Yes', 967.0],
       ['London', nan, 'Yes', 665.0],
       ['Mumbai', 17.0, 'No', 293.0],
       ['NewYork', 28.0, 'No', 494.0],
       ['Mumbai', 45.0, 'No', 707.0],
       ['London', 29.0, 'Yes', 599.0]], dtype=object)

In [7]:
# Inspect the raw target vector taken from the fifth column of `data`.
y

array(['Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No'],
      dtype=object)

In [8]:
from sklearn.impute import SimpleImputer

In [9]:
# Tip: run `SimpleImputer?` in a cell to view its docstring.

In [10]:
# Imputer that replaces NaN entries with the mean of their column.
imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)

In [11]:
# Fill missing values in the two numeric columns of X (positions 1 and 3)
# with their respective column means.
# The imputer is fitted once on both columns together: means are computed per
# column, so the filled values are the same as fitting each column separately,
# but the fitted imputer now keeps the statistics for BOTH columns and can be
# reused to transform new data consistently.
X[:, [1, 3]] = imputer.fit_transform(X[:, [1, 3]])

# Categorical Data

In [12]:
# Inspect X after mean imputation of columns 1 and 3 (NaNs should be gone).
X

array([['Mumbai', 24.0, 'Yes', 241.0],
       ['London', 80.0, 'No', 928.0],
       ['NewYork', 38.0, 'Yes', 631.1111111111111],
       ['NewYork', 22.0, 'Yes', 786.0],
       ['NewYork', 36.0, 'Yes', 967.0],
       ['London', 35.44444444444444, 'Yes', 665.0],
       ['Mumbai', 17.0, 'No', 293.0],
       ['NewYork', 28.0, 'No', 494.0],
       ['Mumbai', 45.0, 'No', 707.0],
       ['London', 29.0, 'Yes', 599.0]], dtype=object)

In [13]:
# y is shown again for reference; the imputation step only wrote to X.
y

array(['Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No'],
      dtype=object)

In [14]:
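# Yes -> 1, No -> 0 (LabelEncoder assigns integer codes in sorted order)
# City: London -> 0, Mumbai -> 1, NewYork -> 2 via LabelEncoder, then one 0/1 column per city via OneHotEncoder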

In [15]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [16]:
# Two independent label encoders: le_x for feature columns, le_y for the target.
le_x, le_y = LabelEncoder(), LabelEncoder()

In [17]:
# Replace the string categories with integer codes, in place.
# Each column gets its own encoder: re-fitting one encoder on a second column
# overwrites its learned classes, which would make it impossible to map the
# first column's codes back to their labels afterwards.
X[:, 0] = le_x.fit_transform(X[:, 0])        # column 0: le_x keeps this mapping
le_smoke = LabelEncoder()
X[:, 2] = le_smoke.fit_transform(X[:, 2])    # column 2: dedicated encoder
# Target labels are encoded with their own encoder so predictions can be
# decoded later with le_y.inverse_transform.
y = le_y.fit_transform(y)

In [18]:
# Inspect X after label-encoding columns 0 and 2 to integer codes.
X

array([[1, 24.0, 1, 241.0],
       [0, 80.0, 0, 928.0],
       [2, 38.0, 1, 631.1111111111111],
       [2, 22.0, 1, 786.0],
       [2, 36.0, 1, 967.0],
       [0, 35.44444444444444, 1, 665.0],
       [1, 17.0, 0, 293.0],
       [2, 28.0, 0, 494.0],
       [1, 45.0, 0, 707.0],
       [0, 29.0, 1, 599.0]], dtype=object)

In [19]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 of X and leave every other column untouched.
# The column list may name several positions (e.g. [0, 1, 3]) if more than
# one column needs encoding.
ct = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(categories='auto'), [0]),
    ],
    remainder='passthrough',
)
X = ct.fit_transform(X)

# Stand-alone encoder instance; not used by the cells visible in this notebook.
ohe = OneHotEncoder(categories='auto')

In [20]:
# Inspect X after one-hot encoding column 0; the other columns are passed through.
X

array([[0.0, 1.0, 0.0, 24.0, 1, 241.0],
       [1.0, 0.0, 0.0, 80.0, 0, 928.0],
       [0.0, 0.0, 1.0, 38.0, 1, 631.1111111111111],
       [0.0, 0.0, 1.0, 22.0, 1, 786.0],
       [0.0, 0.0, 1.0, 36.0, 1, 967.0],
       [1.0, 0.0, 0.0, 35.44444444444444, 1, 665.0],
       [0.0, 1.0, 0.0, 17.0, 0, 293.0],
       [0.0, 0.0, 1.0, 28.0, 0, 494.0],
       [0.0, 1.0, 0.0, 45.0, 0, 707.0],
       [1.0, 0.0, 0.0, 29.0, 1, 599.0]], dtype=object)

# Splitting data into train and test sets

In [21]:
# Machine Learning = Training + Testing

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Hold out 30% of the rows for testing.
# random_state pins the shuffle so the split (and everything derived from it)
# is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=42
)

# Feature scaling (standardization)

In [24]:
from sklearn.preprocessing import StandardScaler

In [25]:
# Scaler for the feature matrix: centers each column and scales it to unit
# variance (the defaults, spelled out explicitly).
sc_x = StandardScaler(copy=True, with_mean=True, with_std=True)

In [26]:
# Learn the scaling parameters (per-column mean and standard deviation) from
# the training set only, and apply them to the training data.
X_train = sc_x.fit_transform(X_train)
# Apply the SAME training-set parameters to the test set. Calling
# fit_transform here would re-fit the scaler on the test data, so train and
# test would be scaled differently and test statistics would leak into
# preprocessing.
X_test = sc_x.transform(X_test)