# Here we starting all the process of data preprocessing.

### IMPORTING THE LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


### IMPORTING THE DATASET

In [10]:
dt=pd.read_csv('data.csv')
dt

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [19]:
# Syntax= df.iloc[row_position, column_position]
x=dt.iloc[ :,:-1].values
y=dt.iloc[:,-1].values

In [20]:
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [21]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


### HANDLING MISSING DATA

- Here we are going to use scikit learn to handle missing data.
- The calss we are going to use from scikit learn is simple imputer. First we are going to import the simple imputer than going to create the instance of that class, this object will allow us to exactly replace this missing salary here by average of the salaries and then we will have an updated dataset.

In [22]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy='mean') # this is just an object 
# Now applying this to our matrix of features.
imputer.fit(x[:,1:3])
x[:,1:3] = imputer.transform(x[:,1:3])

In [23]:
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


### ENCODING CATEGORICAL DATA 

- It will be difficult for machine learning model to compute some correlation between these columns, the features and the outcomes(dependent variable) and therefore we will have to turn these strings, these categories into numbers.
- Here we are going to perform one hot encoding. To do this we are going to use two classes first one is the column transformer class from the composed module of scikit learn library and the second class is one hot encoder class from the preprocessing modules of the same scikit learn library.
- CT use below is basically column transformation

In [26]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct= ColumnTransformer(transformers = [('encoder',OneHotEncoder(), [0])], remainder='passthrough') ## Not yet connected to our matrix of features.
x = np.array(ct.fit_transform(x))

In [27]:
print(x)

[[0.0 1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 0.0 35.0 58000.0]
 [1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 0.0 37.0 67000.0]]


In [28]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y= le.fit_transform(y)

In [29]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


## SPLITTING DATASET INTO TRAINING AND TESTING DATA  

In [30]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1) 

In [31]:
print(x_train)

[[1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 0.0 35.0 58000.0]]


In [32]:
print(x_test)

[[1.0 0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 0.0 37.0 67000.0]]


In [33]:
print(y_train)

[0 1 0 0 1 1 0 1]


In [34]:
print(y_test)

[0 1]


## Feature Scaling

- Feature scaling helps us to put all the features on the same scale
- For some of the machine learning model inorder to avoid some features to be dominated by other features we need feature scaling.It is not required to apply feature scaling in every machine learning model , it is required in some of them only where the value of variables vary a lot.
- There are two main feature scaling techniques:-
- Standardization-> Consist of subtracting each value of your feature by mean of all the values of the feature and then deviding by standard deviation which is square root of the variance. And this will put all the values of the feature between aroung -3 and +3. When you apply this transformation on all the features of your dataset then all the features will take value between around -3 and +3.
- Normalization -> Consist of subtracting each value of your feature by minimum value of the feature and then deviding by the difference between maximum value of feature and minimum value of feature, All the values of our feature will become between 0 and 1.
- Normalization is recommended when you have a normal distribution in most of your features.
- Standardization actually works well all the time.

In [35]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:,3:] =sc.fit_transform(x_train[:,3:])
x_test[:,3:]=sc.transform(x_test[:,3:])

In [36]:
print(x_train)

[[1.0 0.0 0.0 1.2909944487358056 -0.19159184384578545 -1.0781259408412425]
 [1.0 0.0 1.0 -0.7745966692414834 -0.014117293757057777
  -0.07013167641635372]
 [0.0 1.0 0.0 -0.7745966692414834 0.566708506533324 0.633562432710455]
 [1.0 0.0 0.0 1.2909944487358056 -0.30453019390224867
  -0.30786617274297867]
 [1.0 0.0 0.0 1.2909944487358056 -1.9018011447007988 -1.420463615551582]
 [0.0 1.0 0.0 -0.7745966692414834 1.1475343068237058 1.232653363453549]
 [1.0 0.0 1.0 -0.7745966692414834 1.4379472069688968 1.5749910381638885]
 [0.0 1.0 0.0 -0.7745966692414834 -0.7401495441200351 -0.5646194287757332]]


In [37]:
print(x_test)

[[1.0 0.0 1.0 -0.7745966692414834 -1.4661817944830124 -0.9069571034860727]
 [0.0 1.0 0.0 -0.7745966692414834 -0.44973664397484414 0.2056403393225306]]
