# Preprocessing with sklearn
 Workalong with Machine Learning and Statistics module.  
 https://scikit-learn.org/stable/modules/preprocessing.html

In [1]:
import pandas as pd
import sklearn.preprocessing as pre
import numpy as np

In [2]:
# Load the iris dataset (four measurements plus a class label per flower)
# directly from its GitHub-hosted CSV.
IRIS_URL = "https://raw.githubusercontent.com/ianmcloughlin/datasets/master/iris.csv"
df = pd.read_csv(IRIS_URL)
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [3]:
# Keep only the four numeric measurement columns (everything before
# 'class') as the input to pre-processing.
x = df.iloc[:, :4]
x

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


**One-off scaling**

In [17]:
# One-off standardisation with sklearn.preprocessing.scale:
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
# This rescales x in a single call but keeps no fitted scaler object, so
# the same scaling cannot be replayed on validation / testing datasets -
# prefer the fit-then-transform approach in the following cells for that.
scaled_values = pre.scale(x)
# The scaled result carries no column labels, so re-attach them from x.
xscale = pd.DataFrame(scaled_values, columns=x.columns)
xscale

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
5,-0.537178,1.939791,-1.169714,-1.052180
6,-1.506521,0.788808,-1.340227,-1.183812
7,-1.021849,0.788808,-1.283389,-1.315444
8,-1.748856,-0.362176,-1.340227,-1.315444
9,-1.143017,0.098217,-1.283389,-1.447076


**Fitting and transforming**

In [5]:
# StandardScaler learns its scaling parameters once so that the identical
# transformation can be reapplied later; fit() returns the scaler itself.
scaler = pre.StandardScaler().fit(x)
# Show the per-column mean and scale (standard deviation) learned from x
scaler.mean_, scaler.scale_

(array([5.84333333, 3.05733333, 3.758     , 1.19933333]),
 array([0.82530129, 0.43441097, 1.75940407, 0.75969263]))

In [6]:
# Apply the fitted scaler to x, then rebuild a dataframe carrying the
# original column names.
standardised = scaler.transform(x)
xscale = pd.DataFrame(standardised, columns=x.columns)
xscale

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
5,-0.537178,1.939791,-1.169714,-1.052180
6,-1.506521,0.788808,-1.340227,-1.183812
7,-1.021849,0.788808,-1.283389,-1.315444
8,-1.748856,-0.362176,-1.340227,-1.315444
9,-1.143017,0.098217,-1.283389,-1.447076


In [7]:
# The fitted scaler can be reapplied to future data
# (for testing / validating / using purposes).
# The scaler was fitted on a dataframe, so the new sample is passed as a
# dataframe with the same column names: recent scikit-learn versions warn
# ("X does not have valid feature names") when handed a bare array here.
new_sample = pd.DataFrame([[1.0, 0.5, 10.0, 4.1]], columns=x.columns)
scaler.transform(new_sample)

array([[-5.86856386, -5.88689864,  3.54779219,  3.81821089]])

**Output Values**

In [8]:
# Pull the class labels out for encoding. Selecting with a list of column
# names keeps the result a one-column dataframe rather than a series.
y = df.loc[:, ['class']]
y

Unnamed: 0,class
0,setosa
1,setosa
2,setosa
3,setosa
4,setosa
5,setosa
6,setosa
7,setosa
8,setosa
9,setosa


In [9]:
# OneHotEncoder turns each category into its own 0/1 indicator column
# (the categorical counterpart of the scalers used on the numeric data).
encoder = pre.OneHotEncoder()
# fit_transform() learns the categories and encodes y in one step;
# toarray() converts the encoded result to a dense array for display.
yencoded = encoder.fit_transform(y)
yencoded.toarray()

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0

In [18]:
# The encoding is reversible: map the one-hot rows back to class labels.
ydecoded = encoder.inverse_transform(yencoded)
ydecoded

array([['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['versicolor'],
       ['versicolor'],
    

**Whitening**  
Removes the correlations between the features and rescales each one to unit variance.

In [11]:
# Pairwise Pearson correlations between the raw (un-whitened) features
x.corr(method="pearson")

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.11757,0.871754,0.817941
sepal_width,-0.11757,1.0,-0.42844,-0.366126
petal_length,0.871754,-0.42844,1.0,0.962865
petal_width,0.817941,-0.366126,0.962865,1.0


In [12]:
import sklearn.decomposition as dec

In [19]:
# PCA - Principal Component Analysis.
# n_components is set to the number of input variables (4), and
# whiten=True enables whitening of the projected data.
pca = dec.PCA(n_components=4, whiten=True).fit(x)
whitened = pca.transform(x)
# NOTE(review): the columns reuse the original feature names, but each one
# now holds a principal component rather than that measurement.
xwhite = pd.DataFrame(whitened, columns=x.columns)
xwhite
# The whitened values are no longer in the original measurement units.
# NOTE(review): pca.inverse_transform is documented to reverse whitening as
# well, so with all 4 components kept it should map back to x - confirm.

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-1.305338,0.648369,-0.099817,-0.014654
1,-1.319935,-0.359309,-0.752573,-0.641421
2,-1.404967,-0.294244,0.064007,-0.129341
3,-1.335109,-0.646140,0.112849,0.489524
4,-1.327023,0.663304,0.322103,0.396788
5,-1.109222,1.504884,0.603153,0.156755
6,-1.371677,-0.181605,0.922164,0.311836
7,-1.277141,0.331668,-0.078236,0.293406
8,-1.403699,-1.173960,0.074232,0.173233
9,-1.299809,-0.230959,-0.706690,0.364640


In [20]:
# After whitening nothing should be correlated: off-diagonal entries ~ 0.
# Round to 6 decimals rather than to whole numbers - integer rounding would
# also display any correlation below 0.5 as 0, so it would prove nothing.
xwhite.corr().round(6)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,0.0,0.0,0.0
sepal_width,0.0,1.0,-0.0,0.0
petal_length,0.0,-0.0,1.0,-0.0
petal_width,0.0,0.0,-0.0,1.0


In [15]:
# Whitening also standardises: every column mean should be ~ 0.
# Round to 6 decimals so only floating-point noise is hidden
# (integer rounding would show any mean below 0.5 as 0).
xwhite.mean().round(6)

sepal_length   -0.0
sepal_width    -0.0
petal_length   -0.0
petal_width    -0.0
dtype: float64

In [16]:
# Whitened columns should each have a standard deviation of 1.
# Round to 6 decimals; integer rounding would show any value between
# roughly 0.5 and 1.5 as 1 and so could not confirm unit variance.
xwhite.std().round(6)

sepal_length    1.0
sepal_width     1.0
petal_length    1.0
petal_width     1.0
dtype: float64

## End 