# Chapter 02 - Scale Machine Learning Data

In [12]:
# Import modules
from ch01_load_and_convert_data import  load_csv, str_column_to_float
from math import sqrt

### Normalize data

* Normalization here refers to rescaling an input variable to the range between 0 and 1.

In [2]:
# Collect the [min, max] pair of every column
def dataset_minmax(dataset):
    """Return one ``[min, max]`` pair per column of ``dataset``.

    The number of columns is read from the first row, so ``dataset`` must
    hold at least one row, and every row must provide a value at each of
    those column positions.
    """
    def column(index):
        # All values found at position ``index`` across the rows
        return [record[index] for record in dataset]

    n_columns = len(dataset[0])
    return [[min(values), max(values)] for values in map(column, range(n_columns))]

In [4]:
# Testing the function
# Small 2x2 example: column 0 holds 50 and 20, column 1 holds 30 and 90
dataset = [[50, 30], [20, 90]]
print(dataset)

# One [min, max] pair is produced per column
minmax = dataset_minmax(dataset)
print(minmax)

[[50, 30], [20, 90]]
[[20, 50], [30, 90]]


The calculation to normalize a single value for a column is:

scaled value = $\frac{value - min}{max - min}$

In [6]:
# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    """Rescale every value of ``dataset`` in place to the range 0-1.

    Each value is replaced by ``(value - min) / (max - min)``, using the
    ``[min, max]`` pair stored at the same column index in ``minmax``.

    Args:
        dataset: list of rows (lists of numbers); modified in place.
        minmax: list of ``[min, max]`` pairs, one per column.

    Returns:
        None. The rows of ``dataset`` are updated in place.
    """
    for row in dataset:
        for i, value in enumerate(row):
            low, high = minmax[i][0], minmax[i][1]
            spread = high - low
            # A constant column (max == min) has no spread to rescale;
            # map it to 0.0 instead of raising ZeroDivisionError.
            row[i] = (value - low) / spread if spread != 0 else 0.0

In [7]:
# Testing the functions with a small dataset
dataset = [[50, 30], [20, 90]]
print(dataset)

# Per-column [min, max] pairs, computed before any rescaling
minmax = dataset_minmax(dataset)
print(minmax)

# normalize_dataset rewrites the rows of `dataset` in place, so the
# print below shows the rescaled values
normalize_dataset(dataset, minmax)
print(dataset)

[[50, 30], [20, 90]]
[[20, 50], [30, 90]]
[[1.0, 0.0], [0.0, 1.0]]


In [9]:
# Normalizing the diabetes dataset
# NOTE(review): load_csv and str_column_to_float come from
# ch01_load_and_convert_data; presumably load_csv returns a list of rows of
# strings and str_column_to_float converts one column in place - confirm there.
filename = './data/pima-indians-diabetes.csv'
dataset = load_csv(filename)

# Visualizing the shape of the dataset
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset), len(dataset[0])))

# Convert string columns to float
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

print("Before the normalization")
print(dataset[0])

# Calculate the min and max for each column
minmax = dataset_minmax(dataset)

# Normalize columns (rows of `dataset` are rescaled in place)
normalize_dataset(dataset, minmax)
print("After the normalization")
print(dataset[0])

Loaded data file ./data/pima-indians-diabetes.csv with 768 rows and 9 columns
Before the normalization
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
After the normalization
[0.35294117647058826, 0.7437185929648241, 0.5901639344262295, 0.35353535353535354, 0.0, 0.5007451564828614, 0.23441502988898377, 0.48333333333333334, 1.0]


### Standardize Data

Standardization is a rescaling technique that centers the distribution of the data on the value 0 and scales its standard deviation to the value 1.

* The mean and standard deviation need to be known prior to scaling.

In [15]:
# Compute the arithmetic mean of every column
def column_means(dataset):
    """Return a list holding the mean of each column of ``dataset``.

    The column count is read from the first row; each column total is
    divided by the number of rows (as a float).
    """
    row_count = float(len(dataset))
    averages = []
    for index in range(len(dataset[0])):
        # Sum the values found at this position across all rows
        total = sum(record[index] for record in dataset)
        averages.append(total / row_count)
    return averages

In [13]:
# Compute the sample standard deviation of every column
def column_stdevs(dataset, means):
    """Return a list holding the standard deviation of each column.

    For every column, the squared differences between each value and the
    matching entry of ``means`` are summed, divided by the number of rows
    minus 1, and the square root of that quotient is taken. The column
    count is read from the first row of ``dataset``.
    """
    divisor = float(len(dataset) - 1)
    deviations = []
    for index in range(len(dataset[0])):
        # Sum of squared distances from the column mean
        squared_total = sum([(record[index] - means[index]) ** 2 for record in dataset])
        deviations.append(sqrt(squared_total / divisor))
    return deviations

In [16]:
# Testing the functions with a small dataset
dataset = [[50, 30], [20, 90], [30, 50]]
print(dataset)

# The means are computed first because column_stdevs takes them as input
means = column_means(dataset)
stdevs = column_stdevs(dataset, means)

print(means)
print(stdevs)

[[50, 30], [20, 90], [30, 50]]
[33.333333333333336, 56.666666666666664]
[15.275252316519467, 30.550504633038933]


The calculation to standardize a given value is as follows:

$\text{standardized value}_i = \frac{\text{value}_i - \text{mean}}{\text{stdev}}$

In [17]:
# standardize dataset
def standardize_dataset(dataset, means, stdevs):
    """Standardize every value of ``dataset`` in place.

    Each value is replaced by ``(value - mean) / stdev``, using the entries
    of ``means`` and ``stdevs`` found at the same column index.

    Args:
        dataset: list of rows (lists of numbers); modified in place.
        means: list with one mean per column.
        stdevs: list with one standard deviation per column.

    Returns:
        None. The rows of ``dataset`` are updated in place.
    """
    for row in dataset:
        for i, value in enumerate(row):
            centered = value - means[i]
            # A zero standard deviation means the column is constant;
            # map it to 0.0 instead of raising ZeroDivisionError.
            row[i] = centered / stdevs[i] if stdevs[i] != 0 else 0.0

In [18]:
# Testing the functions with a small dataset
dataset = [[50, 30], [20, 90], [30, 50]]
print(dataset)

# Estimate mean and standard deviation
# (both are computed from the raw values, before the dataset is modified)
means = column_means(dataset)
stdevs = column_stdevs(dataset, means)
print(means)
print(stdevs)

# Standardize dataset
# standardize_dataset rewrites the rows of `dataset` in place
standardize_dataset(dataset, means, stdevs)
print(dataset)

[[50, 30], [20, 90], [30, 50]]
[33.333333333333336, 56.666666666666664]
[15.275252316519467, 30.550504633038933]
[[1.0910894511799618, -0.8728715609439694], [-0.8728715609439697, 1.091089451179962], [-0.21821789023599253, -0.2182178902359923]]


In [19]:
# Standardize the diabetes dataset
# NOTE(review): load_csv and str_column_to_float come from
# ch01_load_and_convert_data; presumably load_csv returns a list of rows of
# strings and str_column_to_float converts one column in place - confirm there.
filename = './data/pima-indians-diabetes.csv'
dataset = load_csv(filename)

# Visualizing the shape of the dataset
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset), len(dataset[0])))

# Convert string columns to float
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

# This cell standardizes the data, so the label says so
print("Before the standardization")
print(dataset[0])

# Estimate mean and standard deviation
means = column_means(dataset)
stdevs = column_stdevs(dataset, means)

# Standardize dataset (rows of `dataset` are rescaled in place)
standardize_dataset(dataset, means, stdevs)
print(dataset[0])

Loaded data file ./data/pima-indians-diabetes.csv with 768 rows and 9 columns
Before the normalization
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
[0.6395304921176576, 0.8477713205896718, 0.14954329852954296, 0.9066790623472505, -0.692439324724129, 0.2038799072674717, 0.468186870229798, 1.4250667195933604, 1.3650063669598067]


### When to Normalize and Standardize

* Standardization assumes that our data follows a normal distribution, or is close to normal. When our data has this characteristic, standardization is the best method.
* If our data is not normally distributed, normalization is the best method.

### Future works

To research and implement:

* Normalization that permits a configurable range, such as -1 to 1 and more.
* Standardization that permits a configurable spread, such as 1, 2 or more standard deviations
from the mean.
* Exponential transforms such as logarithm, square root and exponents.
* Power transforms such as Box-Cox for fixing the skew in normally distributed data.