# Data Preprocessing Tools

## Importing the libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


## Importing the dataset

In [5]:
# Load the raw data. Every column except the last is a feature (used to
# predict), and the last column is the dependent variable (what we predict).
dataset = pd.read_csv('Data.csv')

# Matrix of features: all rows, every column but the last one.
feature_frame = dataset.iloc[:, :-1]
X = feature_frame.values

# Dependent variable vector: all rows, last column only.
target_series = dataset.iloc[:, -1]
y = target_series.values

## Taking care of missing data

In [7]:
from sklearn.impute import SimpleImputer

# Replace each missing entry (NaN) with the mean of the observed values
# in the same column.
# NOTE: the means are computed over every row of X, i.e. before the
# train/test split performed later in this notebook.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# The imputer expects numeric columns only, so it is applied to all rows of
# columns 1 and 2 (the slice 1:3 stops before index 3). fit_transform learns
# the column means and returns the filled-in values, which are written back
# into the same two columns of X.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])



In [8]:
# Show X after imputation to confirm the missing entries were filled in.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [9]:
# One-hot encode the categorical column (column 0).
# Encoding categories as ordered integers (0, 1, 2, ...) may lead a model to
# interpret an ordering or relationship between them that does not exist.
# It is better to turn each category into its own binary (0/1) column:
# this is one-hot encoding.
# A variable with only two outcomes can be replaced by 0 and 1 without problems.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# transformers: (name, transformer, column indices) -- what to apply and where.
# remainder='passthrough': keep the columns that are not transformed.
# sparse_threshold=0: always return a dense array. With the default (0.3), a
#   low-density result is returned as a SciPy sparse matrix, which np.array()
#   would wrap in a 0-d object array instead of converting to a 2-D array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

# fit_transform fits and transforms in one call, so no separate fit is needed.
# np.array() ensures X is a NumPy array for the later steps.
X = np.array(ct.fit_transform(X))

In [10]:
# Show X after one-hot encoding: the encoded 0/1 columns come first,
# followed by the passed-through columns.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [11]:

from sklearn.preprocessing import LabelEncoder

# LabelEncoder works on a single vector, so only y needs to be passed in.
# Fit and transform exactly once; the returned integer codes replace the
# original labels in y.
le = LabelEncoder()
y = le.fit_transform(y)

In [12]:
# Show the dependent variable after label encoding (integer class codes).
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

#### Feature Scaling AFTER splitting dataset
The training set is used to train the model on existing observations. The test set is used to evaluate the model on new observations (emulating future data).

Feature scaling consists of rescaling the features so that they all take values on the same scale.

Why after? The test set is supposed to be a brand-new set on which you evaluate your model, not something you are supposed to work on for training. If we apply scaling before the split, the mean and standard deviation are computed from all rows, including those that end up in the test set, so information about the test set leaks into training. _Information leakage_

In [13]:
from sklearn.model_selection import train_test_split

# Inputs: the matrix of features (X) and the dependent variable vector (y).
# test_size=0.2 holds out 20% of the observations (the recommended 80:20 split).
# random_state only fixes the random shuffling so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [14]:
# Show the training features produced by the split.
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [15]:
# Show the held-out test features produced by the split.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [16]:
# Show the training labels; they line up row by row with X_train.
print(y_train)

[0 1 0 0 1 1 0 1]


In [17]:
# Show the test labels; they line up row by row with X_test.
print(y_test)

[0 1]


## Feature Scaling

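The two main techniques:
- Standardisation: $x' = \frac{x - \mu}{\sigma}$, which puts most values roughly between -3 and 3.
    - Works well all the time. A good catch-all.
- Normalisation: $x' = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$, which puts values between 0 and 1.
    - Recommended when you have a normal distribution in most of your features.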



In [18]:
from sklearn.preprocessing import StandardScaler

# Do we apply feature scaling to the dummy variables? No. The point of
# standardisation is to bring values into the same range, and the one-hot
# columns (indices 0-2) are already 0/1; scaling them would only make them
# worse. Only the columns from index 3 onward are scaled.
sc = StandardScaler()

# Fit on the training rows only, then scale them.
train_numeric_scaled = sc.fit_transform(X_train[:, 3:])
X_train[:, 3:] = train_numeric_scaled

# For the test set ONLY transform, reusing the statistics learned above.
test_numeric_scaled = sc.transform(X_test[:, 3:])
X_test[:, 3:] = test_numeric_scaled

In [21]:
# Show the training features after scaling: the one-hot columns are
# unchanged and the remaining columns are standardised.
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [20]:
# Show the test features after applying the scaler fitted on the training set.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
