In [None]:
from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
# Apply matplotlib's 'fivethirtyeight' style sheet to plots drawn after this point.
plots.style.use('fivethirtyeight')

## Functions

In [None]:
def standard_units(x):
    """Convert the values in x to standard units.

    Each value is re-expressed as its deviation from the mean of x,
    divided by the standard deviation of x (as computed by np.std)."""
    deviations = x - np.mean(x)
    spread = np.std(x)
    return deviations / spread

def distance(point1, point2):
    """Compute the Euclidean distance between two points.

    point1 and point2 are arrays holding the coordinates of each point.
    The result is the square root of the sum of the squared
    coordinate-wise differences."""
    offsets = point1 - point2
    squared_total = np.sum(offsets ** 2)
    return np.sqrt(squared_total)

def all_distances(training, new_point):
    """Returns an array of distances
    between each point in the training set
    and the new point (which is a row of attributes).

    training: table with a 'Class' column plus the attribute columns.
    new_point: iterable of attribute values, in the same order as the
        attribute columns of training.
    """
    # Only the attribute columns take part in the distance; the label is dropped.
    attributes = training.drop('Class')
    # Convert new_point once, outside the per-row helper. This avoids
    # rebuilding the same array for every training row, and keeps the result
    # correct if new_point is a one-shot iterable that could only be
    # consumed a single time.
    new_point_array = np.array(list(new_point))

    def distance_from_point(row):
        # row is one training row of attribute values.
        return distance(new_point_array, np.array(list(row)))

    # apply() with no column labels calls the helper on each row of the table.
    return attributes.apply(distance_from_point)

def table_with_distances(training, new_point):
    """Return the training table extended with a 'Distance' column.

    The new column holds, for each training row, its distance
    from new_point as computed by all_distances."""
    distances = all_distances(training, new_point)
    return training.with_column('Distance', distances)

def closest(training, new_point, k):
    """Return the k training rows nearest to new_point.

    The training table is augmented with a 'Distance' column, sorted by
    that column in increasing order, and its first k rows are returned
    (the 'Distance' column is kept in the result)."""
    return (
        table_with_distances(training, new_point)
        .sort('Distance')
        .take(np.arange(k))
    )

def majority(topkclasses):
    """Return the majority label (1 or 0) in the 'Class' column.

    Returns 1 only when rows labelled 1 strictly outnumber rows
    labelled 0; a tie therefore yields 0."""
    def count_of(label):
        # Number of rows whose 'Class' value equals the given label.
        return topkclasses.where('Class', are.equal_to(label)).num_rows

    return 1 if count_of(1) > count_of(0) else 0

def classify(training, new_point, k):
    """Predict the class of new_point by a vote among its k nearest
    training rows.

    The k closest rows are found with closest(), reduced to their
    'Class' column, and the winning label is chosen by majority()."""
    neighbor_classes = closest(training, new_point, k).select('Class')
    return majority(neighbor_classes)

def evaluate_accuracy(training, test, k):
    """Return the fraction of rows in the test table whose predicted
    class matches their actual 'Class' value.

    Each test row is classified from its attributes alone (the 'Class'
    column is dropped first) using classify() with the given
    training table and k."""
    features_only = test.drop('Class')
    # One True/False flag per test row: did the prediction match the label?
    is_correct = [
        classify(training, features_only.row(index), k) == test.column('Class').item(index)
        for index in np.arange(test.num_rows)
    ]
    return sum(is_correct) / test.num_rows

## Counterfeit Currency

In [None]:
# Load the banknote data from 'banknote.csv' (path relative to the working directory).
banknotes = Table.read_table('banknote.csv')
# Show the loaded table as the cell output.
banknotes

In [None]:
# Count the number of rows for each distinct 'Class' value.
banknotes.group('Class')

### Standardize the data

In [None]:
# Standardize data: build a new table in which each of the three wavelet
# attributes is converted to standard units, so that every attribute
# contributes on the same scale to the distance calculation.

banknotes_SU = Table().with_columns(
    'WaveletVar SU', standard_units(banknotes.column('WaveletVar')),
    'WaveletSkew SU', standard_units(banknotes.column('WaveletSkew')),
    'WaveletCurt SU', standard_units(banknotes.column('WaveletCurt')),
    # The class label is copied through unchanged.
    'Class', banknotes.column('Class'))

# Show the standardized table as the cell output.
banknotes_SU

### Split data into training and testing tables

In [None]:
# Total number of rows available to split between training and testing.
banknotes.num_rows

In [None]:
# 70%/30% for Training/Testing:
# 70% of the row count gives the approximate size of the training set.

banknotes.num_rows * 0.70

In [None]:
# Shuffle the standardized rows by sampling without replacement, then split
# them 70%/30% into training and testing tables.
shuffled_banknotes_SU = banknotes_SU.sample(with_replacement=False)
# Derive the split points from the table itself instead of hard-coding row
# indices, so the split stays valid if the dataset size changes. For a
# 1372-row table this gives the same 960/412 split as the original
# hard-coded indices.
num_rows_total = shuffled_banknotes_SU.num_rows
num_rows_train = int(num_rows_total * 0.70)
banknotes_SU_train = shuffled_banknotes_SU.take(np.arange(num_rows_train))
banknotes_SU_test  = shuffled_banknotes_SU.take(np.arange(num_rows_train, num_rows_total))

In [None]:
# Count the rows of each 'Class' value that ended up in the test set.
banknotes_SU_test.group('Class')

### Classify data

In [None]:
# Keep only the attribute columns of the test set (the label is dropped).
attributes = banknotes_SU_test.drop('Class')

# Classify a single test row (index 7) using its 9 nearest training rows.
classify(banknotes_SU_train, attributes.row(7), 9)

How to choose K? That's complicated...

### Determine the accuracy of the Classifier

In [None]:
# Proportion of test rows classified correctly with k = 9.
evaluate_accuracy(banknotes_SU_train, banknotes_SU_test, 9)

This accuracy will depend on the randomly selected training and testing datasets. Let's assume that the accuracy is 99%.

### If we test a random bill and classify it as counterfeit, what is the probability that it is _actually_ counterfeit?

P(counterfeit) = 0.0001    
P(Not counterfeit) = 0.9999

__Assume that our accuracy is the same for detecting both counterfeit and non-counterfeit bills__

P(test+ | counterfeit) = 0.99    
P(test- | counterfeit) = 0.01    

P(test+ | not counterfeit) = 0.01    
P(test- | not counterfeit) = 0.99   

In [None]:
# Bayes' rule for P(counterfeit | test+):
#   numerator:   P(counterfeit) * P(test+ | counterfeit)
#   denominator: P(counterfeit) * P(test+ | counterfeit)
#                + P(not counterfeit) * P(test+ | not counterfeit)
(0.0001 * 0.99) / ((0.0001 * 0.99) + (0.9999 * 0.01)) 