#  Data Preprocessing Tools


### Import libraries

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


### Import dataset

In [9]:
dataset = pd.read_csv("Data.csv")

# Feature matrix: every row (the ":" before the comma) and every column except
# the last one (the ":-1" after the comma), converted to a NumPy array.
x = dataset.iloc[:, :-1].to_numpy()

# Target / dependent variable: every row of the last column only.
y = dataset.iloc[:, -1]

In [10]:
# Show the raw feature matrix; the nan entries are the missing values handled in the next section.
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [11]:
# Show the target column (the "Purchased" Series of Yes/No labels).
print(y)

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


### Handling Missing Values

In [12]:
from sklearn.impute import SimpleImputer

# Replace the NaN entries in columns 1 and 2 with the mean of each column's
# available values; fit_transform learns the means and applies them in one call.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

In [None]:
# Show the feature matrix after imputation; the former nan entries now hold the column means.
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding  Categorical Variables

### Encoding  Independent Categorical Variables

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 and pass every other column through unchanged.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
)
x = np.array(ct.fit_transform(x))


In [14]:
print(x)
# The country column is now three one-hot (0/1) columns, one per distinct country.

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding Dependent Categorical Variables

In [15]:
from sklearn.preprocessing import LabelEncoder

# fit learns the label-to-integer mapping (No -> 0, Yes -> 1);
# transform applies it, replacing y with the encoded values.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [16]:
# Show the encoded target: an integer array with No mapped to 0 and Yes mapped to 1.
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting Dataset into Train and Test data

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and keep the remaining 80% for training;
# random_state is fixed so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [20]:
x_train
# Training split: 80% of the data, i.e. 8 of the 10 rows.

array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [21]:
x_test
# Test split: 20% of the data, i.e. 2 of the 10 rows.

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

## Feature Scaling

In [24]:
from sklearn.preprocessing import StandardScaler

# Standardize only columns 3 onward (the numeric features); the one-hot columns
# 0-2 are left as they are. The scaler is fitted on x_train alone, and the same
# fitted scaler is then applied to x_test.
sc = StandardScaler()
sc.fit(x_train[:, 3:])
x_train[:, 3:] = sc.transform(x_train[:, 3:])
x_test[:, 3:] = sc.transform(x_test[:, 3:])

In [25]:
# Show the training features after scaling: columns 3 and 4 are standardized, the one-hot columns are unchanged.
x_train

array([[0.0, 0.0, 1.0, -0.19159184384578554, -1.0781259408412427],
       [0.0, 1.0, 0.0, -0.014117293757057846, -0.07013167641635404],
       [1.0, 0.0, 0.0, 0.5667085065333239, 0.6335624327104546],
       [0.0, 0.0, 1.0, -0.3045301939022487, -0.307866172742979],
       [0.0, 0.0, 1.0, -1.901801144700799, -1.4204636155515822],
       [1.0, 0.0, 0.0, 1.1475343068237056, 1.2326533634535488],
       [0.0, 1.0, 0.0, 1.4379472069688966, 1.5749910381638883],
       [1.0, 0.0, 0.0, -0.7401495441200352, -0.5646194287757336]],
      dtype=object)

In [26]:
# Show the test features after scaling.
# NOTE(review): the saved output for this cell still shows unscaled values in columns 3 and 4,
# although the scaling cell transforms x_test; the output looks stale. Re-run the notebook to confirm.
x_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)