# Function approximation

* Inputs == Predictors == Independent variables == features
* Outputs == responses == Dependent variables


* The last column of a Dataset always corresponds to the class label

# Input

In [9]:
from csv import reader

def load_csv(filename):
    with open(filename, 'r') as file:
        lines = reader(file)
        dataset = list(lines)
    dataset = [ row for row in dataset if row ]
    return dataset
    
def str_column_to_float(dataset, column):     
    """ Convert string numerical entries to floats. """
    for row in dataset:                                                     
        row[column] = float(row[column].strip())
                                         
def str_column_to_int(dataset, column):    
    """ Convert string label entries to ints. """
    class_values = [row[column] for row in dataset]                         
    unique = set(class_values)                                              
    lookup = dict()                                                         
    for i, value in enumerate(unique):                                      
        lookup[value] = i                                               
    for row in dataset:                                                     
        row[column] = lookup[row[column]]                               
    return lookup

In [10]:
filename = '../datasets/pima-indians-diabetes.csv'             
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset), len(dataset[0])))

print(dataset[0])
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i) 
print(dataset[0])



filename = '../datasets/iris.csv' 
dataset = load_csv(filename)
print('\n\nLoaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset), len(dataset[0])))

print(dataset[0])  
for i in range(4):               
    str_column_to_float(dataset, i) 
print(dataset[0])

lookup = str_column_to_int(dataset, 4)  
print(lookup)

Loaded data file ../datasets/pima-indians-diabetes.csv with 768 rows and 9 columns
['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1']
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]


Loaded data file ../datasets/iris.csv with 150 rows and 5 columns
['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
[5.1, 3.5, 1.4, 0.2, 'Iris-setosa']
{'Iris-virginica': 0, 'Iris-versicolor': 1, 'Iris-setosa': 2}


# Scaling

Scale of input and output to be equivalent.

Standardization assumes your data conforms to a normal distribution.
Normalization is more sensitive to outliers.

In [16]:
def dataset_minmax(dataset):
    minmax = []
    for i in range(len(dataset[0])): # for each feature
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append( [value_min, value_max] )
    return minmax

def normalize_dataset(dataset):
    """ Scaled value = value - min / max - min """
    minmax = dataset_minmax(dataset)
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i]-minmax[i][0]) / (minmax[i][1]-minmax[i][0])
    return dataset

In [18]:
# normalize 
filename = '../datasets/pima-indians-diabetes.csv'             
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset), len(dataset[0])))
# convert strings to float
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i) 
print(dataset[0])
dataset = normalize_dataset(dataset)                                              
print(dataset[0])

Loaded data file ../datasets/pima-indians-diabetes.csv with 768 rows and 9 columns
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
[0.35294117647058826, 0.7437185929648241, 0.5901639344262295, 0.35353535353535354, 0.0, 0.5007451564828614, 0.23441502988898377, 0.48333333333333334, 1.0]


In [21]:
from math import sqrt

def column_means(dataset):
    means = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        means[i] = sum(col_values) / float(len(dataset))
    return means

def column_stdevs(dataset):
    """ var = sum( value - mean )**2 / . """
    means = column_means(dataset)
    stdevs = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        valriance = [ (row[i]-means[i])**2.0 for row in dataset ]
        stdevs[i] = sum(valriance)
    stdevs = [ sqrt(x/float(len(dataset)-1)) for x in stdevs ]
    return stdevs, means

def standardize_dataset(dataset):
    stdevs, means = column_stdevs(dataset)
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - means[i]) / stdevs[i]
    return dataset

In [22]:
# standardize
filename = '../datasets/pima-indians-diabetes.csv'             
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset), len(dataset[0])))
# convert strings to float
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i) 
print(dataset[0])                                                                                                      
# standardize dataset                                                           
dataset = standardize_dataset(dataset)                                     
print(dataset[0])

Loaded data file ../datasets/pima-indians-diabetes.csv with 768 rows and 9 columns
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
[0.6395304921176576, 0.8477713205896718, 0.14954329852954296, 0.9066790623472505, -0.692439324724129, 0.2038799072674717, 0.468186870229798, 1.4250667195933604, 1.3650063669598067]


# Resampling Methods

If multiple algorithms are compared the same train test split should be used for consistent comparison.

$k$-fold cross validation helps reduce noise of performance estimates.
Here, the algorithm is trained and evaluated $k$ times and the performance is summarized by taking the mean of the performance score.

Train on $k$-1 folds and evaluate on the $k$th one.
Then repeat so that each of the $k$ groups is given an opportunity to be used as a test set.

**A quick way to check if the fold sizes are representative is to calculate summary statistics (i.e., mean and standard deviation) and see how much the values differ from the statistics of the entire set.**



In [24]:
import os
from time import time
from random import seed
from random import randrange

def train_test_split(dataset, split=0.60):
    seed( os.getpid() * time())
    train = []
    train_size = split * len(dataset)
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append( dataset_copy.pop(index) )
    return train, dataset_copy

In [28]:
dataset = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]                   
train, test = train_test_split(dataset)                                         
print(train)                                                                    
print(test)

[[7], [1], [8], [3], [10], [5]]
[[2], [4], [6], [9]]


In [78]:
def cross_validation_split(dataset, n_folds=3):
    seed( os.getpid() * time())
    dataset_split = []
    dataset_copy = list(dataset)
    fold_size = len(dataset) // n_folds
    for i in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append( dataset_copy.pop(index) )
        dataset_split.append( fold )
    return dataset_split

In [79]:
dataset = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]                   
folds = cross_validation_split(dataset)                                      
for fold in folds:
    print(fold)

[[4], [1], [6]]
[[2], [10], [8]]
[[5], [3], [9]]


# Evaluation Metrics

In [80]:
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual))

In [81]:
# Test accuracy
actual    = [0,0,0,0,0,1,1,1,1,1]
predicted = [0,1,0,0,0,1,0,1,1,1]
accuracy = accuracy_metric(actual, predicted)
print(accuracy)

0.8


In [82]:
def confusion_matrix(actual, predicted):
    unique = list(set(actual))
    matrix = [[] for i in range(len(unique))]
    for i in range(len(unique)):
        matrix[i] = [0 for i in range(len(unique))]
        
    lookup = dict()
    for i, value in enumerate(unique): # assign id to each label
        lookup[value] = i
    for i in range(len(actual)):
        x = lookup[actual[i]]
        y = lookup[predicted[i]]
        matrix[x][y] += 1
    return unique, matrix

def print_confusion_matrix(actual, predicted):
    unique, matrix = confusion_matrix(actual, predicted)
    print(' '*3, end='')
    for i in unique:
        print(i, end='  ')
    print()
    for i in range(len(matrix)):
        print(unique[i], matrix[i])

In [83]:
# Test confusion matrix with integers
actual    = [0,0,0,0,0,1,1,1,1,1]
predicted = [0,1,1,0,0,1,0,1,1,1]
print_confusion_matrix(actual, predicted)

   0  1  
0 [3, 2]
1 [1, 4]


In [84]:
def mae_metric(actual, predicted):    
    """ Mean Absolute Error"""
    sum_error = 0.0                                                         
    for i in range(len(actual)):                                            
        sum_error += abs(predicted[i] - actual[i])                      
    return sum_error / float(len(actual))

def rmse_metric(actual, predicted):                                             
    sum_error = 0.0                                                         
    for i in range(len(actual)):                                            
        prediction_error = predicted[i] - actual[i]                     
        sum_error += (prediction_error ** 2)                            
    mean_error = sum_error / float(len(actual))                             
    return sqrt(mean_error)

# Baselines

In [85]:
def random_algorithm(train, test):                                              
    output_values = [row[-1] for row in train]                              
    unique = list(set(output_values))                                       
    predicted = []                                           
    for row in test:                                                        
        index = randrange(len(unique))                                  
        predicted.append(unique[index])                                 
    return predicted                                                        
                                                                                
seed(1)                                                                         
train = [[0], [1], [0], [1], [0], [1]]                                          
test = [[None], [None], [None], [None]]                                         
predictions = random_algorithm(train, test)                                     
print(predictions)

[0, 0, 1, 0]


In [86]:
def zero_rule_algorithm_classification(train, test):                            
    output_values = [row[-1] for row in train]                              
    prediction = max(set(output_values), key=output_values.count)           
    predicted = [prediction for i in range(len(train))]                     
    return predicted                                                        
                                                                                
seed(1)                                                                         
train = [['0'], ['0'], ['0'], ['0'], ['1'], ['1']]                              
test = [[None], [None], [None], [None]]                                         
predictions = zero_rule_algorithm_classification(train, test)                   
print(predictions)

['0', '0', '0', '0', '0', '0']


In [87]:
def zero_rule_algorithm_regression(train, test):                                
    output_values = [row[-1] for row in train]                              
    prediction = sum(output_values) / float(len(output_values))             
    predicted = [prediction for i in range(len(test))]                      
    return predicted                                                        
                                                                                
seed(1)                                                                         
train = [[10], [15], [12], [15], [18], [20]]                                    
test = [[None], [None], [None], [None]]                                         
predictions = zero_rule_algorithm_regression(train, test)                       
print(predictions)

[15.0, 15.0, 15.0, 15.0]


# Test Harness

## Train Test Split

In [88]:
def evaluate_algorithm_ttsplit(dataset, algorithm, split, *args):    
    # Train Test Split
    train, test = train_test_split(dataset, split)                          
    test_set = []                                                      
    for row in test:                                                        
        row_copy = list(row)     
        # delete class label
        row_copy[-1] = None                                             
        test_set.append(row_copy)
    # Fit
    predicted = algorithm(train, test_set, *args)                           
    actual = [row[-1] for row in test] 
    # Measure
    accuracy = accuracy_metric(actual, predicted)                           
    return accuracy 

In [89]:
filename = '../datasets/pima-indians-diabetes.csv'             
dataset = load_csv(filename)
# convert strings to float
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i) 

# evaluate algorithm                                                            
split = 0.6                                                                     
accuracy = evaluate_algorithm_ttsplit(dataset, zero_rule_algorithm_classification, split)
print('Accuracy: {}'.format(accuracy))    

Accuracy: 0.6547231270358306


## K-fold learning

In [93]:
def evaluate_algorithm_kfold(dataset, algorithm, n_folds, *args):  
    # K folds
    folds = cross_validation_split(dataset, n_folds)                        
    scores = []                                                       
    for fold in folds:                                                      
        train_set = list(folds)                                         
        train_set.remove(fold)                                          
        train_set = sum(train_set, [])                                  
        test_set = list()
        # prep test set for each iteration
        for row in fold:                                                
            row_copy = list(row)                                    
            test_set.append(row_copy)                               
            row_copy[-1] = None           
        # fit
        predicted = algorithm(train_set, test_set, *args)               
        actual = [row[-1] for row in fold]                              
        accuracy = accuracy_metric(actual, predicted)                   
        scores.append(accuracy)                                         
    return scores

In [94]:
filename = '../datasets/pima-indians-diabetes.csv'             
dataset = load_csv(filename)
# convert strings to float
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i) 

# evaluate algorithm                                                            
n_folds = 5
accuracy = evaluate_algorithm_kfold(dataset, zero_rule_algorithm_classification, n_folds)
print('Accuracy: {}'.format(accuracy))  

Accuracy: [0.6797385620915033, 0.6339869281045751, 0.6666666666666666, 0.6209150326797386, 0.6470588235294118]


In [95]:
# Save ypour work

In [96]:
!jupyter nbconvert --to script Getting_Started.ipynb

[NbConvertApp] Converting notebook Getting_Started.ipynb to script
[NbConvertApp] Writing 14501 bytes to Getting_Started.py
