# Data Preprocessing Tools

## Importing the libraries

In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [19]:
# Load the raw table, then separate the independent variables (every column
# except the last) into X and the dependent variable (last column) into y.
dataset = pd.read_csv('Data.csv')
features = dataset.iloc[:, :-1]
target = dataset.iloc[:, -1]
X, y = features.values, target.values

In [20]:
# Inspect the feature matrix X.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [0]:
# Inspect the dependent variable vector y.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [21]:

from sklearn.impute import SimpleImputer

# Replace missing entries in columns 1 and 2 of X (age and salary) with the
# mean of the respective column.
numeric_cols = slice(1, 3)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, numeric_cols])
X[:, numeric_cols] = imputer.transform(X[:, numeric_cols])

In [22]:
# Inspect X following the imputation step.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [0]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Replace the categorical column at index 0 with dummy (one-hot) columns.
# Each transformer entry is a triple:
#   (step name, object that performs the transformation, column indices to apply it to)
encoder_step = ('encoder', OneHotEncoder(), [0])
# remainder='passthrough' keeps all other columns as they are; otherwise only
# the newly created dummy columns would be kept.
ct = ColumnTransformer(transformers=[encoder_step], remainder='passthrough')
# Wrap the result in np.array so that X is a NumPy array for the later steps.
encoded = ct.fit_transform(X)
X = np.array(encoded)

In [0]:
# Inspect X following the one-hot encoding of column 0.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [56]:
from sklearn.preprocessing import LabelEncoder

# Encode the string labels in y as integers.
le = LabelEncoder()
label_model = le.fit(y)
# The encoded values must be assigned back to y: the following cells print
# and split y and expect integer labels. With this assignment commented out,
# a fresh "Restart & Run All" leaves y as the raw strings.
y = label_model.transform(y)
# Learned classes; the position of a class in this array is its integer code.
label_model.classes_

array(['No', 'Yes'], dtype=object)

In [0]:
# Inspect the dependent variable vector y.
print(y)

[0 1 0 0 1 1 0 1 0 1]


In [62]:
# Small hand-made example records (six rows, six columns).
# The last `gender` entry is a missing value. It is spelled np.nan because the
# np.NaN alias was removed in NumPy 2.0.
data = {
    "city": ["spain", "germany", "france", "spain", "germany", "france"],
    "age": [89, 77, 56, 67, 66, 77],
    "gender": ["female", "male", "male", "male", "female", np.nan],
    'review': ['good', "good", "bad", "good", "bad", "good"],
    'education': ["UG", "PG", "PHD", "PG", "UG", "PHD"],
    'Purchase': ["yes", "yes", "no", "yes", "no", "yes"],
}

In [63]:
# Build a DataFrame from the `data` dict and display it.
df=pd.DataFrame(data)
df

Unnamed: 0,city,age,gender,review,education,Purchase
0,spain,89,female,good,UG,yes
1,germany,77,male,good,PG,yes
2,france,56,male,bad,PHD,no
3,spain,67,male,good,PG,yes
4,germany,66,female,bad,UG,no
5,france,77,,good,PHD,yes


In [69]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the first column of df (city). drop='first' omits the dummy
# column of the first category.
# The encoder keeps its default sparse output and the result is densified with
# .toarray(). The former `sparse=False` keyword was renamed to `sparse_output`
# in scikit-learn 1.2 and removed in 1.4, so this form runs on old and new
# versions alike.
one_hot_encoder = OneHotEncoder(drop='first')
city_column = df.iloc[:, 0].values.reshape(-1, 1)  # the encoder expects 2-D input
one_hot_encoder.fit(city_column)
x = one_hot_encoder.transform(city_column).toarray()
x



array([[0., 1.],
       [1., 0.],
       [0., 0.],
       [0., 1.],
       [1., 0.],
       [0., 0.]])

In [44]:
from sklearn.impute import SimpleImputer

# Replace the missing entry in the gender column with the most frequent value
# of that column. gender is column index 2 of df (index 1 is age, which has no
# missing values and is not what the displayed result shows).
g = df.iloc[:, 2].values
# np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# The imputer expects 2-D input, hence the reshape to a single column.
imputer.fit(g.reshape(-1, 1))

g = imputer.transform(g.reshape(-1, 1))
g

array([['female'],
       ['male'],
       ['male'],
       ['male'],
       ['female'],
       ['male']], dtype=object)

In [51]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order, one list per encoded column:
#   review:    bad < good
#   education: UG < PG < PHD
oe = OrdinalEncoder(categories=[['bad','good'],['UG','PG','PHD']])
# Select review and education (column indices 3 and 4 of df) so the columns
# line up with the two category lists above. The slice 2:4 would pick gender
# and review instead, whose values are not in these lists.
data_ord = df.iloc[:, 3:5]
model = oe.fit(data_ord)

In [53]:
# Fit the encoder on data_ord and show the category order it holds for each column.
model=oe.fit(data_ord)
model.categories_

[array(['bad', 'good'], dtype=object),
 array(['UG', 'PG', 'PHD'], dtype=object)]

## Splitting the dataset into the Training set and Test set

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set and keep 80% as the training set.
# random_state is fixed at 1 so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [0]:
# Training-set features.
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [0]:
# Test-set features.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [0]:
# Training-set labels.
print(y_train)

[0 1 0 0 1 1 0 1]


In [0]:
# Test-set labels.
print(y_test)

[0 1]


## Feature Scaling

Feature scaling is needed because features such as age and salary are on very different scales: the salary values are much larger than the age values, so the two cannot be compared directly and the salary column would dominate the ML model.
<b>It has to be done after the train/test split</b>
because feature scaling computes the mean and standard deviation of the data, and we do not want the test data to contribute to the mean and standard deviation used for scaling the training data. Otherwise information from the test set leaks into training (data leakage).
Hence treat the training and test data as separate data sets and scale them only after the train/test split.

2 types of feature scaling
1. Standardization --- most values will fall roughly in the range -3 to +3
2. Normalization (min-max) ------ all values will be in the range 0 to 1

Example:
 if we are using linear regression
 
 $v = b_1 + b_2 x_1 + b_3 x_2$
 
 if the $x_1$ values are big and the $x_2$ values are small, then $b_2 x_1$ will dominate the answer. Hence we apply feature scaling.
 
 <b>Feature scaling need not be applied to the dummy variables generated for the city column, because these values are already in range.</b>
 If we scale the dummy variables we lose information, because the 0s and 1s are what tell us the country name.


In [0]:
from sklearn.preprocessing import StandardScaler

# Only the columns from index 3 onward are scaled; columns 0-2 (the dummy
# columns) are left untouched.
sc = StandardScaler()
# Fit the scaler on the training rows and scale them in one step.
train_scaled = sc.fit_transform(X_train[:, 3:])
X_train[:, 3:] = train_scaled
# The test rows are only transformed: the scaler was already fitted on the
# training data, and the same calculation is applied to the test data.
test_scaled = sc.transform(X_test[:, 3:])
X_test[:, 3:] = test_scaled

In [0]:
# Training-set features following the scaling step.
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [0]:
# Test-set features following the scaling step.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
