#### Alexa Andrews and Jeffrey Mulderink  
#### Group name: aa_jm_knn   
#### Project title: Improved kNN classifier


Our data set was looking to predict whether a person would default on their credit card payment based

In [13]:
import utils
import numpy as np
import math
import copy
import random

header, table = utils.open_csv_with_header("default_of_credit_card_clients.csv")

np.random.shuffle(table)
table = table[:250]

In [5]:
def get_random_attribute_subset(table, header, num_values):
    '''
        Returns a copy table with a random columns removed
        Param table: A table to remove attributes from
        Param header: The attribute names
        Param num_values: The number of attributes to keep
        Returns: A tuple with the first item being the table with num_values random attibutes
                and the second item a list of the names of the attributes it chose
    '''
    smaller_table = copy.deepcopy(table)
    num_attributes = len(smaller_table[0])
    indices_to_remove = random.sample(range(0, num_attributes-1), num_attributes-num_values) 
    indices_to_remove.sort(reverse=True)
    for c in indices_to_remove:
        for r, _ in enumerate(smaller_table):
            del smaller_table[r][c] 
        
    attributes_kept = [header[i] for i in range(num_attributes) if i not in indices_to_remove]
        
    return smaller_table, attributes_kept

This following cell tests how different k values impact the performace of a kNN classifier. We were surprised by how large K was for optimal results. These forms of classifier evaluation all tended to be highest in the upper 60 to 100 range, after which they would drop off. 

In [15]:
def create_kNN_classifier_vary_k(table, start_k=9, end_k=99, step=6, measurement='a'):
    '''
        This function uses stratified cross fold validation to test different k values for a table.
        It can return measurements of accuracy ('a'), recall('r'), precision('p'), or F-measure('f')
        Param table: A table to test kNN on
        Param start_k: The minimum k value to test.
        Param end_k: The maximum k value to test
        Param step: The step between k values tested. 
        Param measurement: The measurement type to return 
        Returns: A list of tuples (measurement_value, k)
    '''
    folds = utils.get_stratified_folds(table)
    
    
    results = []
    for k in range(start_k, end_k, step):
        print("testing at k=%d" % k)
        predictions, actuals = [], [] 
        for i, fold in enumerate(folds):
            train = [instance for fold in folds[:i] for instance in fold] + [instance for fold in folds[i+1:] for instance in fold]
            test, train = utils.normalize_attributes(fold, train)
            utils.remove_column(train, -1) # remove the class column before prediction
            for test_instance in test:
                predictions.append(utils.make_kNN_prediction(test_instance[:-1], train, k))
                actuals.append(test_instance[-1])
        
        if measurement == 'a':
            correct = [predictions[i] == actuals[i] for i in range(len(predictions))]
            results.append((correct.count(True) / len(correct), k))
        else:
            true_positives = [predictions[i]==1 and actuals[i]==1 for i in range(len(predictions))]
            if measurement == 'r':
                predicted_positives = predictions.count(1)
                results.append((true_positives.count(True)/predicted_positives,k))
            elif measurement == 'p':
                actual_positives = actuals.count(1)
                results.append((true_positives.count(True)/actual_positives,k))
            elif measurement == 'f':
                recall = true_positives.count(True) / predictions.count(1)
                precision = true_positives.count(True) / actuals.count(1)
                results.append((2*precision*recall/(precision+recall),k))
            else:
                print("error - invalid measurement", measurement)
                break
    return results

accuracies = create_kNN_classifier_vary_k(table, start_k=50, end_k=110)
# print("Accuracies for variable k\n", accuracies)
accuracies.sort(reverse=True)
print("sorted", accuracies)

recalls = create_kNN_classifier_vary_k(table, 50, 110, measurement='r')
# print("Recall values for variable k\n", recalls)
recalls.sort(reverse=True)
print("sorted", recalls)

precisions = create_kNN_classifier_vary_k(table, 50, 110, measurement='p')
# print("Precision values for variable k\n", precisions)
precisions.sort(reverse=True)
print("sorted", precisions)

f_measures = create_kNN_classifier_vary_k(table, 50, 110, measurement='f')
# print("F-measure values for variable k\n", f_measures)
f_measures.sort(reverse=True)
print("sorted", f_measures)

testing at k=50


AttributeError: module 'utils' has no attribute 'remove_column'

In [10]:
def create_kNN_classifier_vary_attributes(table, header, k, iterations=20, F=10, measurement='a'):
    '''
        k: nearest neighbors
        iterations: number of random subsets of attributes tested
        F: number of attributes per subset
    '''
    
    results = []
    for i in range(iterations):
        print("testing random attribute set", i+1, "of", iterations)
        current_table, current_attribs = get_random_attribute_subset(table, header, F)
        folds = utils.get_stratified_folds(current_table)
        predictions, actuals = [], []
        for i, fold in enumerate(folds):
            train = [instance for fold in folds[:i] for instance in fold] + [instance for fold in folds[i+1:] for instance in fold]
            test, train = utils.normalize_attributes(fold, train)
            utils.remove_column(train, -1) # remove the class column before prediction
            for test_instance in test:
                predictions.append(utils.make_kNN_prediction(test_instance[:-1], train, k))
                actuals.append(test_instance[-1])
        if measurement == 'a':
            correct = [predictions[i] == actuals[i] for i in range(len(predictions))]
            results.append((correct.count(True) / len(correct), current_attribs))
        else:
            true_positives = [predictions[i]==1 and actuals[i]==1 for i in range(len(predictions))]
            if measurement == 'r':
                predicted_positives = predictions.count(1)
                results.append((true_positives.count(True)/predicted_positives, current_attribs))
            elif measurement == 'p':
                actual_positives = actuals.count(1)
                results.append((true_positives.count(True)/actual_positives, current_attribs))
            elif measurement == 'f':
                recall = true_positives.count(True) / predictions.count(1)
                precision = true_positives.count(True) / actuals.count(1)
                results.append((2*precision*recall/(precision+recall), current_attribs))
    
    return accuracies

table = utils.remove_column(table, 0)

accuracies = create_kNN_classifier_vary_attributes(table, header, accuracies[0][1], 30)
accuracies.sort(reverse=True)
print("\nBest 5 accuracies for variable attribute subset\n", accuracies[:5])
best_feature_set_indices = [header.index(x) for x in accuracies[0][1]]
print(best_feature_set_indices)




testing random attribute set 1 of 20
testing random attribute set 2 of 20
testing random attribute set 3 of 20
testing random attribute set 4 of 20
testing random attribute set 5 of 20
testing random attribute set 6 of 20
testing random attribute set 7 of 20
testing random attribute set 8 of 20
testing random attribute set 9 of 20
testing random attribute set 10 of 20
testing random attribute set 11 of 20
testing random attribute set 12 of 20
testing random attribute set 13 of 20
testing random attribute set 14 of 20
testing random attribute set 15 of 20
testing random attribute set 16 of 20
testing random attribute set 17 of 20
testing random attribute set 18 of 20
testing random attribute set 19 of 20
testing random attribute set 20 of 20

Best 5 accuracies for variable attribute subset
 [(0.9, ['ID', 'SEX', 'PAY_4', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT2', 'PAY_AMT4', 'default payment next month']), (0.884, ['LIMIT_BAL', 'SEX', 'AGE', 'PAY_0', 'PAY_3', 'PAY_4'

In [None]:
def create_kNN_classifier_vary_weights(table, attributes, k): pass