# Data Preprocessing

### Import Libraries

In [4]:
import numpy as np
import pandas as pd

## Importing the dataset

In [5]:
dataset = pd.read_excel('Book.xlsx')

In [6]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,no
1,Spain,27.0,48000.0,yes
2,Germany,30.0,54000.0,no
3,Spain,38.0,61000.0,no
4,Germany,40.0,,yes
5,France,35.0,58000.0,yes
6,Spain,,52000.0,no
7,France,48.0,79000.0,yes
8,Germany,50.0,83000.0,no
9,France,37.0,67000.0,yes


In [7]:
x = dataset.iloc[:,:-1]
y = dataset.iloc[:,-1]

In [8]:
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [9]:
y

0     no
1    yes
2     no
3     no
4    yes
5    yes
6     no
7    yes
8     no
9    yes
Name: Purchased, dtype: object

## Taking Care of Missing Data

In [10]:
from sklearn.impute import SimpleImputer

In [11]:
imputer = SimpleImputer(missing_values = np.nan,strategy = 'mean')

In [12]:
imputer.fit(x.iloc[:,1:3])

In [13]:
x.iloc[:,1:3] = imputer.transform(x.iloc[:,1:3])

In [14]:
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


# Encoding Categorial Data

## Independent Variable

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [16]:
ct = ColumnTransformer(transformers = [('encoder',OneHotEncoder(),[0])],remainder = 'passthrough')

In [17]:
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [18]:
x = np.array(ct.fit_transform(x))

In [19]:
x

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

# Dependent Variable

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
le = LabelEncoder()

In [22]:
y = le.fit_transform(y)

In [23]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

# Dataset Split

In [24]:
from sklearn.model_selection import train_test_split

In [30]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 42)

In [31]:
x_train

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04]])

In [32]:
x_test

array([[0.0e+00, 1.0e+00, 0.0e+00, 5.0e+01, 8.3e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.7e+01, 4.8e+04]])

In [33]:
y_train

array([1, 0, 1, 0, 1, 1, 0, 0])

In [34]:
y_test

array([0, 1])

# Feature Scaleing

In [37]:
from sklearn.preprocessing import StandardScaler

In [38]:
sc = StandardScaler()

In [39]:
x_train[:,3:] = sc.fit_transform(x_train[:,3:])

In [41]:
x_test[:,3:] = sc.fit_transform(x_test[:,3:])

In [42]:
x_train

array([[ 1.        ,  0.        ,  0.        , -0.7529426 , -0.62603778],
       [ 1.        ,  0.        ,  0.        ,  1.00845381,  1.01304295],
       [ 1.        ,  0.        ,  0.        ,  1.79129666,  1.83258331],
       [ 0.        ,  1.        ,  0.        , -1.73149616, -1.09434656],
       [ 1.        ,  0.        ,  0.        , -0.36152118,  0.42765698],
       [ 0.        ,  1.        ,  0.        ,  0.22561096,  0.05040824],
       [ 0.        ,  0.        ,  1.        , -0.16581046, -0.27480619],
       [ 0.        ,  0.        ,  1.        , -0.01359102, -1.32850095]])

In [43]:
x_test

array([[ 0.,  1.,  0.,  1.,  1.],
       [ 0.,  0.,  1., -1., -1.]])