## Data Pre-processing Template

#### Import the libraries

In [168]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

#### Import the Dataset

In [169]:
dataset = pd.read_csv('../Dataset/Data.csv')
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [170]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [171]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

#### Missing Data

In [172]:
from sklearn.impute import MissingIndicator
indicator = MissingIndicator().fit_transform(X)
indicator

array([[False, False],
       [False, False],
       [False, False],
       [False, False],
       [False,  True],
       [False, False],
       [ True, False],
       [False, False],
       [False, False],
       [False, False]])

In [173]:
dataset[indicator]

Unnamed: 0,Country,Age,Salary,Purchased
4,Germany,40.0,,Yes
6,Spain,,52000.0,No


In [174]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [175]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

#### Encoding Categorical Data

In [176]:
from sklearn.preprocessing import OneHotEncoder
oneHotEncoder = OneHotEncoder()
oneHotEncoder.fit(X[:, 0].reshape(-1, 1))
Encoded_Country = oneHotEncoder.transform(X[:, 0].reshape(-1, 1)).toarray()
Encoded_Country

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [177]:
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
labelEncoder_y = labelEncoder.fit_transform(y)
labelEncoder_y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [178]:
# Copy dataset to a new one
dataset_transformed =  dataset.copy()
dataset_transformed

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [179]:
pd.get_dummies(dataset_transformed['Country'])

Unnamed: 0,France,Germany,Spain
0,True,False,False
1,False,False,True
2,False,True,False
3,False,False,True
4,False,True,False
5,True,False,False
6,False,False,True
7,True,False,False
8,False,True,False
9,True,False,False


In [180]:
dataset_transformed['Purchased_labeled'] = labelEncoder_y

dataset_transformed = pd.concat([dataset_transformed, pd.get_dummies(dataset_transformed['Country'], dtype=int)], axis=1)

dataset_transformed

Unnamed: 0,Country,Age,Salary,Purchased,Purchased_labeled,France,Germany,Spain
0,France,44.0,72000.0,No,0,1,0,0
1,Spain,27.0,48000.0,Yes,1,0,0,1
2,Germany,30.0,54000.0,No,0,0,1,0
3,Spain,38.0,61000.0,No,0,0,0,1
4,Germany,40.0,,Yes,1,0,1,0
5,France,35.0,58000.0,Yes,1,1,0,0
6,Spain,,52000.0,No,0,0,0,1
7,France,48.0,79000.0,Yes,1,1,0,0
8,Germany,50.0,83000.0,No,0,0,1,0
9,France,37.0,67000.0,Yes,1,1,0,0


#### Splitting Data into Training Set and Test Set

In [181]:
dataset_transformed.drop(['Country', 'Purchased'], axis=1, inplace=True)
dataset_transformed

Unnamed: 0,Age,Salary,Purchased_labeled,France,Germany,Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


In [182]:
dataset_transformed.rename(columns={'Purchased_labeled':'Purchased'}, inplace=True)

In [183]:
y = dataset_transformed['Purchased']
y

0    0
1    1
2    0
3    0
4    1
5    1
6    0
7    1
8    0
9    1
Name: Purchased, dtype: int32

In [186]:
X = dataset_transformed.drop(['Purchased'], axis=1)
X

Unnamed: 0,Age,Salary,France,Germany,Spain
0,44.0,72000.0,1,0,0
1,27.0,48000.0,0,0,1
2,30.0,54000.0,0,1,0
3,38.0,61000.0,0,0,1
4,40.0,,0,1,0
5,35.0,58000.0,1,0,0
6,,52000.0,0,0,1
7,48.0,79000.0,1,0,0
8,50.0,83000.0,0,1,0
9,37.0,67000.0,1,0,0


In [187]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

In [189]:
X

array([[4.40000000e+01, 7.20000000e+04, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [2.70000000e+01, 4.80000000e+04, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00],
       [3.00000000e+01, 5.40000000e+04, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00],
       [3.80000000e+01, 6.10000000e+04, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00],
       [4.00000000e+01, 6.37777778e+04, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00],
       [3.50000000e+01, 5.80000000e+04, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [3.87777778e+01, 5.20000000e+04, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00],
       [4.80000000e+01, 7.90000000e+04, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [5.00000000e+01, 8.30000000e+04, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00],
       [3.70000000e+01, 6.70000000e+04, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00]])

In [191]:
y = y.to_numpy()
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [192]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [193]:
X_train

array([[4.00000000e+01, 6.37777778e+04, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00],
       [3.70000000e+01, 6.70000000e+04, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [2.70000000e+01, 4.80000000e+04, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00],
       [3.87777778e+01, 5.20000000e+04, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00],
       [4.80000000e+01, 7.90000000e+04, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [3.80000000e+01, 6.10000000e+04, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00],
       [4.40000000e+01, 7.20000000e+04, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [3.50000000e+01, 5.80000000e+04, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00]])

In [194]:
X_test

array([[3.0e+01, 5.4e+04, 0.0e+00, 1.0e+00, 0.0e+00],
       [5.0e+01, 8.3e+04, 0.0e+00, 1.0e+00, 0.0e+00]])

#### Feature Scaling

In [206]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
x_test = sc.transform(X_test)

In [207]:
X_train.round(2)

array([[ 0.26,  0.12, -1.  ,  2.65, -0.77],
       [-0.25,  0.46,  1.  , -0.38, -0.77],
       [-1.98, -1.53, -1.  , -0.38,  1.29],
       [ 0.05, -1.11, -1.  , -0.38,  1.29],
       [ 1.64,  1.72,  1.  , -0.38, -0.77],
       [-0.08, -0.17, -1.  , -0.38,  1.29],
       [ 0.95,  0.99,  1.  , -0.38, -0.77],
       [-0.6 , -0.48,  1.  , -0.38, -0.77]])

In [208]:
X_test.round(2)

array([[3.0e+01, 5.4e+04, 0.0e+00, 1.0e+00, 0.0e+00],
       [5.0e+01, 8.3e+04, 0.0e+00, 1.0e+00, 0.0e+00]])