# Lecture 37: Evaluating the Accuracy of Classifiers

In [1]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Functions from last time

In [2]:
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    pt1, pt2 : array-like of numbers
        Coordinates of the two points. Both are converted with
        ``np.asarray``, so lists and tuples work as well as arrays.

    Returns
    -------
    numpy.float64
        Square root of the sum of the squared coordinate differences.
    """
    # np.asarray lets plain sequences be subtracted element-wise.
    difference = np.asarray(pt1) - np.asarray(pt2)
    # np.sum reduces inside NumPy instead of looping over the elements in
    # Python; axis=0 matches what the built-in sum did (reduce along the
    # first axis).
    return np.sqrt(np.sum(difference ** 2, axis=0))

In [3]:
def row_distance(row1, row2):
    """Distance between two table rows that hold only numbers"""
    # Each row becomes an array so that distance() can subtract them
    # element by element.
    return distance(np.array(row1), np.array(row2))

In [4]:
def distances(training, example):
    """
    Compute the distance between example and every row in training.

    training: table with a 'Class' column; all of its other columns are
        treated as attributes.
    example: row of attribute values, paired by position with the
        attribute columns of training.

    Return training augmented with a 'Distance_to_ex' column that holds,
    for each row, its distance to example.
    """
    attributes_only = training.drop('Class')

    # Collect every distance first and build the array once: calling
    # np.append inside a loop copies the whole array on each iteration.
    row_distances = np.array(
        [row_distance(row, example) for row in attributes_only.rows],
        dtype=float,
    )

    return training.with_column('Distance_to_ex', row_distances)

In [5]:
def closest(training, example, k):
    """
    Return a table holding the k rows of training that are
    nearest to example, smallest distance first
    """
    with_distances = distances(training, example)
    nearest_first = with_distances.sort('Distance_to_ex')
    first_k = np.arange(k)  # row indices 0 .. k-1
    return nearest_first.take(first_k)

## Returning to Google Science Fair example

In [6]:
# Data: load the patient table and drop its ID column
patients = Table.read_table('breast-cancer.csv').drop('ID')
# Attribute columns only (everything except the 'Class' label)
attributes = patients.drop('Class')
attributes.show(3)  # display the first 3 rows

Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
5,1,1,1,2,1,3,1,1
5,4,4,5,7,10,3,2,1
3,1,1,1,2,2,3,1,1


In [7]:
# Test our closest function
example = attributes.row(21)
closest(patients.exclude(21), example, 5)

Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class,Distance_to_ex
8,4,4,5,4,7,7,8,2,0,4.3589
10,5,7,4,4,10,8,9,1,1,4.47214
7,4,4,3,4,10,6,9,1,1,5.09902
10,3,6,2,3,5,4,10,2,1,5.38516
10,5,5,6,3,10,7,9,2,1,5.38516


In [8]:
# Count up the classes
closest(patients.exclude(21), example, 5).group('Class').sort('count', descending=True)

Class,count
1,4
0,1


In [9]:
def majority_class(topk):
    """
    Return the class that appears most often in the 'Class' column of topk
    """
    class_counts = topk.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    # Column 0 of the grouped table holds the class labels.
    labels = most_common_first.column(0)
    return labels.item(0)

In [10]:
def classify(training, example, k):
    """
    Predict the class of example: the majority class among
    its k nearest neighbors in training
    """
    neighbors = closest(training, example, k)
    return majority_class(neighbors)

## Test our classifier

In [11]:
classify(patients.exclude(21), example, 5)

1

In [12]:
patients.take(21)

Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
10,5,5,3,6,7,7,10,1,1


In [13]:
new_example = attributes.row(10)
classify(patients.exclude(10), new_example, 5)

0

In [14]:
patients.take(10)

Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
1,1,1,1,1,1,3,1,1,0


In [15]:
another_example = attributes.row(15)
classify(patients.exclude(15), another_example, 5)

0

In [16]:
patients.take(15)

Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
7,4,6,4,6,1,4,3,1,1


## Review of the Steps ##

- `distance(pt1, pt2)`: Returns the distance between the arrays `pt1` and `pt2`
- `row_distance(row1, row2)`: Returns the distance between the rows `row1` and `row2`
- `distances(training, example)`: Returns a table that is `training` with an additional column `'Distance_to_ex'` that contains the distance between `example` and each row of `training`
- `closest(training, example, k)`: Returns a table of the rows corresponding to the k smallest distances 
- `majority_class(topk)`: Returns the majority class in the `'Class'` column
- `classify(training, example, k)`: Returns the predicted class of `example` based on a `k` nearest neighbors classifier using the historical sample `training`

## Accuracy of a Classifier ##

In [None]:
patients.num_rows

In [None]:
# NOTE(review): no random seed is set in the visible cells, so the split
# (and the accuracies computed from it) will differ from run to run.
shuffled = patients.sample(with_replacement=False) # Randomly permute the rows
# Hard-coded split: the first 342 shuffled rows form the training set and
# rows 342 through 682 form the test set.
# NOTE(review): this assumes patients has 683 rows — confirm against patients.num_rows
training_set = shuffled.take(np.arange(342))
test_set  = shuffled.take(np.arange(342, 683))

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the proportion of examples in the test set whose
    predicted class equals their actual class"""
    test_attributes = test.drop('Class')
    actual_classes = test.column('Class')
    num_rows = test.num_rows
    # Each comparison is True/False; summing them counts the correct ones.
    num_correct = sum(
        classify(training, test_attributes.row(i), k) == actual_classes.item(i)
        for i in np.arange(num_rows)
    )
    return num_correct / num_rows

In [None]:
evaluate_accuracy(training_set, test_set, 5)

In [None]:
evaluate_accuracy(training_set, test_set, 3)

In [None]:
evaluate_accuracy(training_set, test_set, 11)

In [None]:
evaluate_accuracy(training_set, test_set, 1)

## Updating Probabilities ##

In [None]:
# Simulated class of 100 students: 60% second years and 40% third years.
n = 100
second = round(0.6 * n)
third = round(0.4 * n)

# Half of the second years and 80% of the third years have declared a major.
second_declared = round(0.5 * second)
second_undeclared = round(0.5 * second)
third_declared = round(0.8 * third)
third_undeclared = round(0.2 * third)

year = np.array(second * ['Second'] + third * ['Third'])
# Same row order as year: second years first, then third years.
major = np.array(
    second_declared * ['Declared'] + second_undeclared * ['Undeclared'] +
    third_declared * ['Declared'] + third_undeclared * ['Undeclared']
)

students = Table().with_columns('Year', year, 'Major', major)

In [None]:
students.show(3)

In [None]:
students.pivot('Major', 'Year')

In [None]:
# Verify: 60% of students are Second years, 40% are Third years
60 / (60 + 40)

In [None]:
# Verify: 50% of Second years have Declared
30 / 60

In [None]:
# Verify: 80% of Third years have Declared
32 / 40

In [None]:
# Chance of second year, given that they have declared
# P(second year | declared)

30 / 62

In [None]:
# P(third year | declared)

32 / 62

## Tree Diagram Calculation

In [None]:
# P(second year | declared), from tree diagram

(0.6 * 0.5) / (0.6 * 0.5 + 0.4 * 0.8)

## Decisions ##

In [None]:
def create_population(prior_disease_prob, n, sensitivity=0.99, false_positive_rate=0.05):
    """Build a population of about n people and count them by status and test result.

    Parameters
    ----------
    prior_disease_prob : float
        Proportion of the population that has the disease.
    n : number
        Population size; the group sizes are n times each proportion, rounded.
    sensitivity : float, optional
        Proportion of people with the disease who test positive (default 0.99).
    false_positive_rate : float, optional
        Proportion of people without the disease who test positive (default 0.05).

    Returns
    -------
    Table
        Pivot of 'Test Result' against 'Status': one row per status, one
        column per test result, cells holding the number of people.

    Raises
    ------
    ValueError
        If sensitivity or false_positive_rate is outside [0, 1].
    """
    for name, rate in (('sensitivity', sensitivity),
                       ('false_positive_rate', false_positive_rate)):
        if not 0 <= rate <= 1:
            raise ValueError(name + ' must be between 0 and 1')

    disease = round(n * prior_disease_prob)
    no_disease = round(n * (1 - prior_disease_prob))

    true_positives = round(disease * sensitivity)
    false_positives = round(no_disease * false_positive_rate)
    # Negatives are the remainder of each group. Rounding positives and
    # negatives independently can give counts that do not add up to the
    # group size, which would leave the two columns with different lengths.
    false_negatives = disease - true_positives
    true_negatives = no_disease - false_positives

    status = np.array(['Disease'] * disease + ['No disease'] * no_disease)
    # Same row order as status: people with the disease first.
    result = np.array(
        ['Test +'] * true_positives + ['Test -'] * false_negatives +
        ['Test +'] * false_positives + ['Test -'] * true_negatives
    )

    t = Table().with_columns(
        'Status', status,
        'Test Result', result
    )
    return t.pivot('Test Result', 'Status')

In [None]:
create_population(1/1000, 100000)

In [None]:
99 / (99 + 4995)

In [None]:
#P(disease | tested +)

# = P(disease & tested +) / P(tested +)

(0.001 * 0.99) / (0.001*0.99 + 0.999*0.05)

## Subjective Probabilities

In [None]:
# P(disease | tested +)

# = P(disease & tested +) / P(tested +)

# if prior probability of disease is 1/10

(0.1 * 0.99) / (0.1*0.99 + 0.9*0.05)

In [None]:
create_population(1/10, 10000)

In [None]:
990/(990+450)

In [None]:
# P(disease | tested +)
# if prior probability of disease is 0.5

(0.5 * 0.99) / (0.5*0.99 + 0.5*0.05)

In [None]:
create_population(0.5, 10000)

In [None]:
4950/(4950+250)