#### Alexa Andrews  
#### Jeffrey Mulderink  
#### Group name: aa_jm_knn   
#### Project title: Improved kNN classifier  

Our data set is used to predict whether a person will default on their next credit card payment, based on attributes such as their credit limit, demographics, and past billing and payment history.

In [2]:
import utils
import numpy as np
import math
import copy
import random

# Seed both random number generators used in this notebook (numpy for the
# shuffle below, `random` for random.sample in get_random_attribute_subset)
# so that Restart & Run All picks the same rows and subsets every time.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

# Number of shuffled rows kept for the experiments below.
SAMPLE_SIZE = 250

header, table = utils.open_csv_with_header("default_of_credit_card_clients.csv")

# Shuffle in place, then keep only the first SAMPLE_SIZE rows as a random sample.
np.random.shuffle(table)
table = table[:SAMPLE_SIZE]

In [7]:
def get_random_attribute_subset(table, header, num_values):
    '''
        Returns a copy of table that keeps only a random subset of its columns.

        The last column is never a candidate for removal, so it is always one
        of the num_values columns that are kept.

        table: sequence of rows; every row is expected to have as many
               columns as the first row
        header: column names, indexed the same way as the columns of table
        num_values: number of columns to keep, counting the last column

        Returns (smaller_table, attributes_kept): the reduced rows (new lists;
        table itself is not modified, cell values are shared with it) and the
        names of the kept columns in their original order.

        Raises ValueError if num_values is not between 1 and the number of
        columns.
    '''
    # The column count comes from the first row; fall back to the header so
    # that an empty table yields an empty result instead of an IndexError.
    num_attributes = len(table[0]) if len(table) > 0 else len(header)
    if not 1 <= num_values <= num_attributes:
        raise ValueError(
            "num_values must be between 1 and %d, got %r" % (num_attributes, num_values))

    # range() stops before the last index, which protects the final column.
    indices_to_remove = set(
        random.sample(range(0, num_attributes-1), num_attributes-num_values))
    indices_to_keep = [i for i in range(num_attributes) if i not in indices_to_remove]

    # Build the reduced rows directly instead of deep-copying the table and
    # deleting the unwanted cells one by one.
    smaller_table = [[row[i] for i in indices_to_keep] for row in table]
    attributes_kept = [header[i] for i in indices_to_keep]

    return smaller_table, attributes_kept

In [5]:
def create_kNN_classifier_vary_k(table, k_values=None):
    '''
        Measures kNN accuracy for several values of k.

        The folds come from utils.get_stratified_folds(table) and are computed
        once. For every k, each fold is used in turn as the test set while the
        remaining folds form the training set.

        table: rows of attribute values; the last element of each test
               instance is used as its actual class label
        k_values: iterable of k values to try; defaults to range(27, 89, 6),
                  the sweep that used to be hard-coded here

        Returns a list of (accuracy, k) tuples in the order the k values were
        tested.
    '''
    if k_values is None:
        k_values = range(27, 89, 6)

    folds = utils.get_stratified_folds(table)

    accuracies = []
    for k in k_values:
        print("testing at k=%d" % k)
        predictions, actuals = [], []
        for held_out, fold in enumerate(folds):
            # Train on every fold except the one currently held out for testing.
            train = [instance
                     for j, other in enumerate(folds) if j != held_out
                     for instance in other]
            test, train = utils.normalize_attributes(fold, train)
            for test_instance in test:
                predictions.append(utils.make_kNN_prediction(test_instance, train, k))
                actuals.append(test_instance[-1])
        # Accuracy is the fraction of pooled predictions that match the labels.
        correct = [p == a for p, a in zip(predictions, actuals)]
        accuracies.append((correct.count(True) / len(correct), k))

    return accuracies

# Sweep k over the sample and show the raw (accuracy, k) results first.
accuracies = create_kNN_classifier_vary_k(table)
print("Accuracies for variable k\n", accuracies)

# Then show them best-first; `accuracies` stays bound to the sorted list.
accuracies = sorted(accuracies, reverse=True)
print("sorted", accuracies)

testing at k=27
testing at k=33
testing at k=39
testing at k=45
testing at k=51
testing at k=57
testing at k=63
testing at k=69
testing at k=75
testing at k=81
testing at k=87
Accuracies for variable k
 [(0.728, 27), (0.724, 33), (0.724, 39), (0.796, 45), (0.808, 51), (0.796, 57), (0.796, 63), (0.796, 69), (0.796, 75), (0.792, 81), (0.792, 87)]
sorted [(0.808, 51), (0.796, 75), (0.796, 69), (0.796, 63), (0.796, 57), (0.796, 45), (0.792, 87), (0.792, 81), (0.728, 27), (0.724, 39), (0.724, 33)]


In [10]:
def create_kNN_classifier_vary_attributes(table, header, k, iterations=20, F=10):
    '''
        Scores kNN on several random attribute subsets of table.

        table: rows of attribute values
        header: attribute names, passed to get_random_attribute_subset
        k: nearest neighbors
        iterations: number of random subsets of attributes tested
        F: number of attributes per subset

        Returns a list of (accuracy, attribute_names) tuples, one per subset,
        in the order the subsets were tested.
    '''
    results = []
    for trial in range(iterations):
        print("testing random attribute set", trial+1, "of", iterations)
        subset_table, subset_attribs = get_random_attribute_subset(table, header, F)
        folds = utils.get_stratified_folds(subset_table)

        predicted, expected = [], []
        for held_out, test_fold in enumerate(folds):
            # Every fold except the held-out one goes into the training set.
            training = [row
                        for position, other in enumerate(folds) if position != held_out
                        for row in other]
            test_rows, training = utils.normalize_attributes(test_fold, training)
            for row in test_rows:
                predicted.append(utils.make_kNN_prediction(row, training, k))
                expected.append(row[-1])

        # Fraction of pooled predictions that agree with the actual labels.
        hits = [guess == truth for guess, truth in zip(predicted, expected)]
        results.append((hits.count(True) / len(hits), subset_attribs))

    return results

# `accuracies` holds the k sweep results sorted best-first, so its first entry
# carries the best k. The attribute results get their own name: rebinding
# `accuracies` here made this cell pass a list of attribute names as k
# whenever it was run a second time.
best_k = accuracies[0][1]
attribute_accuracies = create_kNN_classifier_vary_attributes(table, header, best_k, 20)

attribute_accuracies.sort(reverse=True)
print("\nBest 5 accuracies for variable attribute subset\n", attribute_accuracies[:5])

# Column positions (in the full header) of the best-scoring attribute subset.
best_feature_set_indices = [header.index(x) for x in attribute_accuracies[0][1]]
print(best_feature_set_indices)

testing random attribute set 1 of 20
testing random attribute set 2 of 20
testing random attribute set 3 of 20
testing random attribute set 4 of 20
testing random attribute set 5 of 20
testing random attribute set 6 of 20
testing random attribute set 7 of 20
testing random attribute set 8 of 20
testing random attribute set 9 of 20
testing random attribute set 10 of 20
testing random attribute set 11 of 20
testing random attribute set 12 of 20
testing random attribute set 13 of 20
testing random attribute set 14 of 20
testing random attribute set 15 of 20
testing random attribute set 16 of 20
testing random attribute set 17 of 20
testing random attribute set 18 of 20
testing random attribute set 19 of 20
testing random attribute set 20 of 20

Best 5 accuracies for variable attribute subset
 [(0.9, ['ID', 'SEX', 'PAY_4', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT2', 'PAY_AMT4', 'default payment next month']), (0.884, ['LIMIT_BAL', 'SEX', 'AGE', 'PAY_0', 'PAY_3', 'PAY_4'

In [None]:
def create_kNN_classifier_vary_weights(table, attributes, k):
    '''
        Placeholder: not implemented yet.

        Accepts table, attributes and k but ignores them and returns None.
    '''
    return None