In [1]:
# Loading and scaling datasets
# Self-paced exercises followed from the following book :
# Machine Learning Algorithms from Scratch with Python (by Jason Brownlee)
# This notebook demonstrates sample code from chapters 1 and 2

from csv import reader

# Load a CSV file
def load_csv(filename):
    file = open(filename, 'r')
    lines = reader(file)
    dataset = list(lines)
    return dataset

# Load dataset
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns.'.format(filename, len(dataset), len(dataset[0])))

# Note : This function will load empty lines as valid rows of data. This is undesirable behavior and we can improve the function
# to properly take care of empty data rows.

Loaded data file pima-indians-diabetes.csv with 768 rows and 9 columns.


In [4]:
from csv import reader

# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Load Pima Indians Diabetes dataset
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns.'.format(filename, len(dataset), len(dataset[0])))

# Note: The reader function in csv module loads data columns as string values. To enable further processing of data,
# we need to convert values to float. The following output shows raw data format we get from using csv module.
print(dataset[0])

# We can use our simple function to improve runtime data format. This can be done using our simple function.
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
print(dataset[0])


Loaded data file pima-indians-diabetes.csv with 768 rows and 9 columns.
['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1']
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]


In [5]:
from csv import reader

# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Note: We're now using Iris dataset, which has class labels in the last column. Because numeric values are preferred,
# we defined a simple function that maps each unique class label to an integer. We now load and process Iris dataset.

# Load Iris dataset
filename = 'iris.csv'
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns.'.format(filename, len(dataset), len(dataset[0])))

# Convert string columns to float (here, we must process feature columns only; i.e. columns 1 to 4)
for i in range(4):
    str_column_to_float(dataset, i)

# Our dataset looks good, but class labels are still in string form, as shown in the following output
print(dataset[0])

# Convert class labels to nominal integer values
lookup = str_column_to_int(dataset, 4)

# The Iris dataset is now better prepared for further processing in ML algorithms
print(dataset[0])
print(lookup)


Loaded data file iris.csv with 150 rows and 5 columns.
[5.1, 3.5, 1.4, 0.2, 'Iris-setosa']
[5.1, 3.5, 1.4, 0.2, 2]
{'Iris-versicolor': 0, 'Iris-virginica': 1, 'Iris-setosa': 2}


In [6]:
# Note: In practice, we'll always use proven tools and libraries for common ML tasks like loading and cleaning data.
# The above sample codes are for demonstration purpose only. Common ML stack libraries, like Numpy and Pandas are much
# more robust and flexible to use in Data Preparation stage of the machine learning workflow.

# Research Notes :
# There are many areas that require additional research and experiment, such as these important considerations :
# 1. Other sources of data (like SQL tables and views, NoSQL collections, Graph databases, Streaming data, etc.)
# 2. Mapping input data to a format suitable for later stages
# 3. Common data cleansing concerns (missing data, invalid data, malformed data, etc.)

# Chapter 1 of JB book is now complete.

In [7]:
# Chapter 2  : Scale Machine Learning Data
# Book title : Machine Learning Algorithms from Scratch with Python (by Jason Brownlee)

# Note: Many ML algorithms need input (and output) data to fall inside a limited range (usually : 0 to 1 or -1 to 1).
# This is called scaling the data.
# There are two basic techniques for scaling data, namely : Normalization and Standardization

# **** Normalization ****
# This technique requires minimum and maximum values for each feature in the dataset.
# It's pretty easy to prepare for normalization. We need a function to calculate min-max values for all features.
# We also need a function to rescale a dataset to normal form (i.e. normalizing each column in each row)

# Find the min and max values for each column in the given dataset
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        min_value = min(col_values)
        max_value = max(col_values)
        minmax.append([min_value, max_value])
    return minmax

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])


In [10]:
# Note: We're going to use functions from chapter 1 to load Diabetes dataset, convert values to float and rescale dataset
# It's good to think about the whole process as a pipeline with different stages. Here, we have the following stages in our pipeline :
# Stage 1 - Data Acquisition : Import data from external source (i.e. CSV data file)
# Stage 2 - Data Preparation : Convert string values to floating point numbers
# Stage 3 - Data Preparation : Rescale dataset columns using Normalization technique

from csv import reader

# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Find the min and max values for each column in the given dataset
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        min_value = min(col_values)
        max_value = max(col_values)
        minmax.append([min_value, max_value])
    return minmax

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

# Sample pipeline demo
# Stage 1 - Data Acquisition : Import data from external source (i.e. CSV data file)
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns.'.format(filename, len(dataset), len(dataset[0])))

# Stage 2 - Data Preparation : Convert string values to floating point numbers
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
print(dataset[0])

# Stage 3 - Data Preparation : Rescale dataset columns using Normalization technique
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)
print(dataset[0])


Loaded data file pima-indians-diabetes.csv with 768 rows and 9 columns.
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
[0.35294117647058826, 0.7437185929648241, 0.5901639344262295, 0.35353535353535354, 0.0, 0.5007451564828614, 0.23441502988898377, 0.48333333333333334, 1.0]


In [11]:
# **** Standardization ****
# This technique requires special statistical values; namely, Mean and Standard Deviation, for each feature in the dataset.
# We need to implement 3 functions to support standardization technique. These functions are summarized below :
# 1. column_means   : Calculates mean value for each column in dataset
# 2. column_stddevs : Calculates standard deviation for each column in dataset
# 3. standardize_dataset : Rescales all feature values in a dataset using standardization formula

from math import sqrt

# Calculate column means
def column_means(dataset):
    means = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        means[i] = sum(col_values) / float(len(dataset))
    return means

# Calculate column standard deviations
def column_stddevs(dataset, means):
    stddevs = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        variance = [pow(row[i] - means[i], 2) for row in dataset]
        stddevs[i] = sum(variance)
    stddevs = [sqrt(x / float(len(dataset) - 1)) for x in stddevs]
    return stddevs

# Standardize dataset
def standardize_dataset(dataset, means, stddevs):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - means[i]) / stddevs[i]


In [13]:
# Note: We're going to use functions from chapter 1 to load Diabetes dataset, convert values to float and rescale dataset
# We have the same stages as we had in normalization demo :
# Stage 1 - Data Acquisition : Import data from external source (i.e. CSV data file)
# Stage 2 - Data Preparation : Convert string values to floating point numbers
# Stage 3 - Data Preparation : Rescale dataset columns using Standardization technique

from csv import reader
from math import sqrt

# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Calculate column means
def column_means(dataset):
    means = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        means[i] = sum(col_values) / float(len(dataset))
    return means

# Calculate column standard deviations
def column_stddevs(dataset, means):
    stddevs = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        variance = [pow(row[i] - means[i], 2) for row in dataset]
        stddevs[i] = sum(variance)
    stddevs = [sqrt(x / float(len(dataset) - 1)) for x in stddevs]
    return stddevs

# Standardize dataset
def standardize_dataset(dataset, means, stddevs):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - means[i]) / stddevs[i]

# Sample pipeline demo
# Stage 1 - Data Acquisition : Import data from external source (i.e. CSV data file)
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns.'.format(filename, len(dataset), len(dataset[0])))

# Stage 2 - Data Preparation : Convert string values to floating point numbers
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
print(dataset[0])

# Stage 3 - Data Preparation : Rescale dataset columns using Standardization technique
means = column_means(dataset)
stddevs = column_stddevs(dataset, means)
standardize_dataset(dataset, means, stddevs)
print(dataset[0])


Loaded data file pima-indians-diabetes.csv with 768 rows and 9 columns.
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
[0.6395304921176576, 0.8477713205896718, 0.14954329852954296, 0.9066790623472505, -0.692439324724129, 0.2038799072674717, 0.468186870229798, 1.4250667195933604, 1.3650063669598067]


In [None]:
# Note: Normal and Standard scaling are the two basic techniques for handling large variations in numeric data.
# There are many more transforms that we may need to apply, depending on useful insight we can get from basic data analysis.
# Statistical analysis and summaries, as well as many available visualizations, can give us this valuable insight.
# Preparatory stages in a typical machine learning workflow require a lot of knowledge and experience that can only be gained
# by research and practice.

# Chapter 2 of JB book is now complete.