# Data Preprocessing in Machine Learning
## Example 1
Salary and purchase details from different countries

#### Importing required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer  #Handling missing Value
from sklearn.preprocessing import LabelEncoder  #Handling Categorical Data
from sklearn.model_selection import train_test_split  #split the data into train and test
from sklearn.preprocessing import StandardScaler  #feature scaling

#### 1) Getting the dataset and importing it:

In [2]:
# Load the raw dataset. The path is a raw string so the Windows backslashes
# are taken literally instead of being parsed as (invalid) escape sequences
# such as "\A" or "\d", which newer Python versions warn about.
data = pd.read_csv(r'D:\AIML\ML\data\Data.csv')

In [3]:
# Preview the first five rows of the dataset
data.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns. A count below the number of rows points at missing values.
data.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [5]:
# Count the missing (NaN) entries in every column
data.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [6]:
# Extract the independent variables: every column except the last one.
# .copy() gives x its own data, so the in-place edits made to x in later
# cells cannot write through to `data` or raise SettingWithCopyWarning.
x = data.iloc[:, :-1].copy()
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [7]:
# Extract the dependent variable: the last column (Purchased).
# Using -1 mirrors the `:-1` slice used for x, so features and target
# always partition the columns between them.
y = data.iloc[:, -1]
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

#### 2) Deal with missing data:

In [8]:
# Fill each missing value in the numeric columns (everything after Country)
# with the mean of its column.
# NOTE: the means are computed over all rows, including the ones that are
# later held out as the test set.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_values = imputer.fit_transform(x.iloc[:, 1:])
x.iloc[:, 1:] = imputed_values
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


#### 3) Encoding Categorical data:

In [9]:
# Label-encode Country into integer codes (one code per distinct country).
# assign() builds a new frame with the replaced column instead of writing
# through `x.iloc[:, 0] = ...`, which is the assignment pandas warns about
# when the new values have a different dtype than the existing column.
# NOTE: these integer codes imply an ordering between countries; the
# one-hot columns built in the next section replace this column.
le = LabelEncoder()
x = x.assign(Country=le.fit_transform(x['Country']))
x

  x.iloc[:,0] = le.fit_transform(x.iloc[:,0])


Unnamed: 0,Country,Age,Salary
0,0,44.0,72000.0
1,2,27.0,48000.0
2,1,30.0,54000.0
3,2,38.0,61000.0
4,1,40.0,63777.777778
5,0,35.0,58000.0
6,2,38.777778,52000.0
7,0,48.0,79000.0
8,1,50.0,83000.0
9,0,37.0,67000.0


#### 4) One Hot Encoding:

In [10]:
# One-hot encode Country with get_dummies.
# drop_first=True drops the first category, leaving one indicator column
# per remaining country; the dropped category is the all-zeros row.
# dtype=int pins 0/1 integer indicators: without it, newer pandas versions
# return True/False columns instead.

dummy = pd.get_dummies(data.Country, drop_first=True, dtype=int)
dummy

Unnamed: 0,Germany,Spain
0,0,0
1,0,1
2,1,0
3,0,1
4,1,0
5,0,0
6,0,1
7,0,0
8,1,0
9,0,0


In [11]:
# Append the one-hot columns to the right of x; rows are aligned on the index
merged = pd.concat((x, dummy), axis='columns')
merged

Unnamed: 0,Country,Age,Salary,Germany,Spain
0,0,44.0,72000.0,0,0
1,2,27.0,48000.0,0,1
2,1,30.0,54000.0,1,0
3,2,38.0,61000.0,0,1
4,1,40.0,63777.777778,1,0
5,0,35.0,58000.0,0,0
6,2,38.777778,52000.0,0,1
7,0,48.0,79000.0,0,0
8,1,50.0,83000.0,1,0
9,0,37.0,67000.0,0,0


In [12]:
# The label-encoded Country column is superseded by the one-hot columns,
# so remove it from the feature matrix.
x = merged.drop(columns=['Country'])
x

Unnamed: 0,Age,Salary,Germany,Spain
0,44.0,72000.0,0,0
1,27.0,48000.0,0,1
2,30.0,54000.0,1,0
3,38.0,61000.0,0,1
4,40.0,63777.777778,1,0
5,35.0,58000.0,0,0
6,38.777778,52000.0,0,1
7,48.0,79000.0,0,0
8,50.0,83000.0,1,0
9,37.0,67000.0,0,0


In [13]:
# Encode the target labels as integers (the output below shows No -> 0, Yes -> 1).
# NOTE: this refits the same `le` object that was fitted on Country earlier,
# so after this cell `le` holds the Purchased classes, not the countries.
y = le.fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

#### 5) Splitting the Dataset into the Training set and Test set

In [14]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle
# so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [15]:
# Inspect the training features produced by the split
x_train

Unnamed: 0,Age,Salary,Germany,Spain
4,40.0,63777.777778,1,0
9,37.0,67000.0,0,0
1,27.0,48000.0,0,1
6,38.777778,52000.0,0,1
7,48.0,79000.0,0,0
3,38.0,61000.0,0,1
0,44.0,72000.0,0,0
5,35.0,58000.0,0,0


#### 6) Feature Scaling

In [16]:
# Standardise the two numeric columns (Age, Salary) to zero mean and unit
# variance; the 0/1 indicator columns are left untouched.
# NOTE: only x_train is scaled in this cell.
ss = StandardScaler()
scaled_numeric = ss.fit_transform(x_train.iloc[:, :2])
x_train.iloc[:, :2] = scaled_numeric
x_train

Unnamed: 0,Age,Salary,Germany,Spain
4,0.263068,0.123815,1,0
9,-0.253501,0.461756,0,0
1,-1.975398,-1.530933,0,1
6,0.052614,-1.11142,0,1
7,1.640585,1.720297,0,0
3,-0.081312,-0.167514,0,1
0,0.951826,0.986148,0,0
5,-0.597881,-0.482149,0,0
