# Data Preprocessing

Data preprocessing is an important step for machine learning tasks.


## Step 1. Loading libraries

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

## Step 2. Loading dataset

In [73]:
# read 'Data.csv' (path is relative to the notebook's working directory) into a DataFrame
dataset = pd.read_csv('Data.csv')

In [8]:
# preview the data - head() shows the top 5 rows by default

dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


## Step 3. Analyse data

In [17]:
# see complete dataset (it has only 10 rows, so showing everything is still readable)
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [34]:
# see statistical values (count, mean, std, min, quartiles, max) of the numeric columns;
# count is 9 rather than 10 for both Age and Salary because each has one missing value
dataset.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [37]:
# find total number of rows and columns, returned as a (rows, columns) tuple
dataset.shape

(10, 4)

## Step 4. Create X and y

In [99]:
# X holds the independent variables: all columns except the last one,
# extracted as a NumPy array
X = dataset.iloc[:,:-1].values

In [22]:
# y holds the dependent variable: the last column (Purchased), extracted as a NumPy array
y = dataset.iloc[:,-1].values

In [23]:
# print X - an object array mixing Country strings with numeric Age/Salary (nan where missing)
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [24]:
# print y - the 'Yes'/'No' labels of the Purchased column
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Step 5. Handling missing values

Common ways to handle missing values:

1. Remove the observation (row) that contains the missing value:

   `dataset.dropna(inplace=True)`

2. Replace missing values with the mean of the column.
3. Replace missing values with the median or the most frequent value of the column.

In [40]:
# loading library for handling missing data

from sklearn.impute import SimpleImputer

In [45]:
# create an object of the SimpleImputer class that fills NaN entries with the column mean
# (strategy could also be 'median' or 'most_frequent')

imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

In [114]:
# fit imputer on the numeric columns of X (index 1 = Age, index 2 = Salary)
# so that it learns the mean of each of those columns
imputer.fit(X[:,1:3])

SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)

In [115]:
# replace NaN values by the column means learned during fit

X[:,1:3] = imputer.transform(X[:,1:3]) # only the NUMERIC columns (Age, Salary) are passed; Country (column 0) is text


In [116]:
# check X
# NOTE(review): the saved output of this cell shows Country already encoded as integers,
# which suggests it was last run after the label-encoding step - re-run the notebook
# top to bottom to refresh the output.

X # no NaN values should remain in the Age and Salary columns

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

## Step 6. Encode categorical data

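- Categorical data takes its values from a fixed set of labels; for nominal categories such as `Country`, the order of the labels has no significance.
- Most machine learning algorithms work only with numbers, so text labels need to be converted to numeric values.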


In [54]:
# import library

from sklearn.preprocessing import LabelEncoder

In [117]:
# create a LabelEncoder object; it maps each distinct label to an integer code
label_enc = LabelEncoder()

In [118]:
# Apply on the first column of X (Country): replace the country names with integer codes
X[:,0] = label_enc.fit_transform(X[:,0])

In [119]:
# Check X - the Country column now holds the codes 0 (France), 1 (Germany), 2 (Spain)
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [120]:
# apply on the dependent variable, using a separate encoder object so that
# label_enc keeps the mapping it learned for the Country column
label_enc_y = LabelEncoder()
y = label_enc_y.fit_transform(y)

In [121]:
# check y - 'No' is encoded as 0 and 'Yes' as 1
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Step 7. Use of OneHotEncoder

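One-hot encoding converts one categorical column into multiple binary (0/1) columns, one per category. This prevents a model from reading a false ordering (e.g. Spain > Germany > France) into the integer codes produced by the label encoder.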


In [61]:
#import library
from sklearn.preprocessing import OneHotEncoder

In [122]:
# create object
onehot_enc = OneHotEncoder(categorical_features = [0])

In [123]:
X = onehot_enc.fit_transform(X).toarray()

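Because the `Country` column was already converted to integer codes with `LabelEncoder` in the previous step, the `OneHotEncoder` can be applied to that column directly. The one-hot columns come first in the result, followed by `Age` and `Salary`.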


In [124]:
X

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

## Step 8. Split data into training and test data 

In [130]:
# importing libraries
from sklearn.model_selection import train_test_split

In [132]:
# split data set into two parts - 80% is training data and 20% is testing data (test_size = 0.2);
# random_state = 0 fixes the shuffling so the same split is produced on every run
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [135]:
# check values: print the shape of each of the four splits, one per line
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(8, 5)
(2, 5)
(8,)
(2,)


## Step 9. Feature Scaling

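Many machine learning algorithms rely on Euclidean distances. Because the distance is computed from squared differences, a feature with large values (such as Salary) dominates the result over a feature with small values (such as Age).

We can mitigate this impact by using feature scaling, which brings all features onto a comparable scale.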


In [142]:
#importing library

from sklearn.preprocessing import StandardScaler

In [143]:
# creating a StandardScaler object; it standardizes each feature to zero mean and unit variance

sc_x = StandardScaler()

In [144]:
# fit the scaler on the training data only, then scale the training data with it
X_train = sc_x.fit_transform(X_train)

In [145]:
# scale the test data with the statistics learned from the training data (no re-fitting)
X_test = sc_x.transform(X_test)

In [146]:
# check the scaled training data
X_train

array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])

In [147]:
# check the scaled test data
X_test

array([[-1.        ,  2.64575131, -0.77459667, -1.45882927, -0.90166297],
       [-1.        ,  2.64575131, -0.77459667,  1.98496442,  2.13981082]])

#### ------------------- End of the Document -----------------------------------------------------