# Data Preprocessing

## importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_csv('Data.csv')
display(dataset.head())

X_df = dataset.iloc[:, :-1] # every rows, every columns except last
display(X_df.head())

y_df = dataset.iloc[:, 3] # every rows, only last column
display(y_df.head())

X = dataset.iloc[:, :-1].values
y = y_df.values

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,


0     No
1    Yes
2     No
3     No
4    Yes
Name: Purchased, dtype: object

## Taking care of missing data

In [3]:
from sklearn.impute import SimpleImputer
missingvalues = SimpleImputer(missing_values = np.nan, strategy = 'mean', verbose = 0)
missingvalues = missingvalues.fit(X[:, 1:3])
X[:, 1:3]=missingvalues.transform(X[:, 1:3])


X_df2 = pd.DataFrame(X)

X_df2.columns = X_df.columns

display(X_df2.head())

Unnamed: 0,Country,Age,Salary
0,France,44,72000.0
1,Spain,27,48000.0
2,Germany,30,54000.0
3,Spain,38,61000.0
4,Germany,40,63777.8


## Encoding categorical data

### Encoding the independant variable

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelencoder_X = LabelEncoder() #assign numerical values to categorical values (ex: countries, cities...)
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# creates column for each categorical value, each row where categorical value applies --> 1, else 0
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

X_df3 = round(pd.DataFrame(X), 0)
X_df3.columns = ['France','Spain','Germany', X_df.columns[1], X_df.columns[2]]
display(X_df3.head())

Unnamed: 0,France,Spain,Germany,Age,Salary
0,1,0,0,44,72000.0
1,0,0,1,27,48000.0
2,0,1,0,30,54000.0
3,0,0,1,38,61000.0
4,0,1,0,40,63777.8


### Encoding the dependant variable

In [5]:
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y) #assign numerical values to boolean values (ex: yes/no, true/false...)

y_df2 = round(pd.DataFrame(y), 0)
y_df2.columns = [dataset.columns[-1]]
display(y_df2.head())

Unnamed: 0,Purchased
0,0
1,1
2,0
3,0
4,1


## Splitting the dataset into the Training set and Test set

Training Dataset: The sample of data used to fit the model.
The actual dataset that we use to train the model (weights and biases in the case of Neural Network). 
The model sees and learns from this data.

Test Dataset: The sample of data used to provide an unbiased evaluation of a final model fit on the training dataset.
provides the gold standard used to evaluate the model.
only used once a model is completely trained
contains carefully sampled data that spans the various classes that the model would face, when used in the real world.

Split dataset into training set (X_train, y_train) and test set (X_test, y_test), 
test_size = proportion of the dataset to include in the test split


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2) # 20% in test set

## Feature Scaling

some algorithms requires features to be normalized as important to have values on same scale for acurate results

Obviously age and estimated salary features  have different units, one in years, the other in $

Standardization involves rescaling the features such that 

-->they have the properties of a standard normal distribution with a mean of zero and a standard deviation of one

In [7]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)