In [85]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import numpy
import time 
import matplotlib.pyplot as plt

from deap import algorithms
from deap import base
from deap import creator
from deap import tools

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


In [86]:
# Load the "clean1" data set (the file has no header row).
clean_data = pd.read_csv('clean1.data', header=None)
# The last column is kept separately as the labels.
clean_labels = clean_data[clean_data.columns[-1]]
# Remove the label column and the first two columns from the feature frame
# (presumably the first two are identifier columns -- TODO confirm against the data set).
clean_data = clean_data.drop(columns=clean_data.columns[[0, 1, -1]])
clean_data.head(10)

In [87]:
import pandas as pd

#Load .Dat into a list
# Each non-blank line becomes one row of whitespace-separated tokens.
# split() without an argument collapses runs of whitespace, so doubled or
# trailing separators do not create empty cells, and blank lines are skipped
# instead of producing a bogus row.
items = []
with open('vehicle.dat','r') as infile:  # closed automatically, even if reading fails
    for line in infile:
        tokens = line.split()
        if tokens:
            items.append(tokens)

#Transform list into dataframe
vehicle_df = pd.DataFrame(items)
# The last token of every row is the label; it is kept as text.
vehicle_labels = vehicle_df.iloc[:, -1]
# All remaining columns are features. Convert them to numbers once here so the
# classifier does not have to coerce strings on every fitness evaluation; a
# non-numeric token raises ValueError at load time rather than later.
vehicle_data = vehicle_df.iloc[:, :-1].apply(pd.to_numeric)

vehicle_data.head()

In [88]:
# Global parameter: size of the full feature set. It is the denominator of the
# feature-ratio objective and the length of the all-ones baseline mask.
# (assumed to match the number of feature columns in vehicle_data -- TODO confirm)
total_features = 18



In [89]:
#Deap Params

# Two-objective fitness; both weights are negative, i.e. both objectives
# returned by the evaluation function are to be minimised.
creator.create("FitnessMin", base.Fitness, weights=(-1.0,-1.0))
# An individual is a plain list of genes carrying the fitness above.
creator.create("Individual", list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
# Attribute generator: one random bit (0 or 1) per gene
toolbox.register("attr_bool", random.randint, 0, 1)
# Structure initializers
# The individual length follows total_features instead of a second hard-coded
# 18, so the bit mask and the feature-ratio denominator cannot drift apart.
toolbox.register("individual", tools.initRepeat, creator.Individual,
    toolbox.attr_bool, total_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

In [90]:
#Evaluation Function
def wrappereval(individual):
    """Score a feature mask on two objectives: error rate and feature ratio.

    Parameters
    ----------
    individual : sequence
        One entry per feature; an entry equal to 1 selects the column at that
        position of ``vehicle_data``.

    Returns
    -------
    tuple
        ``(1 - accuracy, selected / total_features)``. When no entry equals 1
        the function returns ``(1, 1)`` without training a classifier.
    """
    # Positions of the genes that are switched on.
    selected = [pos for pos, gene in enumerate(individual) if gene == 1]

    # Nothing selected: report the value 1 for both objectives.
    if not selected:
        return 1,1

    subset = vehicle_data.iloc[:, selected]
    classifier = KNeighborsClassifier(n_neighbors=3)
    classifier.fit(subset, vehicle_labels)
    # NOTE(review): predictions are made on the same rows the model was fitted
    # on, so this is a training-set accuracy -- confirm that is intended.
    error_rate = 1 - accuracy_score(vehicle_labels, classifier.predict(subset))

    # Share of the full feature set used by this mask.
    return error_rate, len(selected) / total_features

In [91]:
# Wire the fitness function and the genetic operators into the DEAP toolbox.
# Variation: one-point crossover and bit-flip mutation (indpb = 0.05).
toolbox.register("mate", tools.cxOnePoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
# Selection (NSGA-II) and evaluation (the wrapper fitness defined above).
toolbox.register("select", tools.selNSGA2)
toolbox.register("evaluate", wrappereval)

In [92]:
def mainwrapper(seedno):
    """Run one seeded (mu + lambda) evolutionary search and time it.

    Parameters
    ----------
    seedno
        Value passed to ``random.seed`` before the initial population is drawn.

    Returns
    -------
    tuple
        ``(pop, stats, hof, comptime)``:
        ``pop`` is the population list handed to ``algorithms.eaMuPlusLambda``
        (presumably updated in place by DEAP -- confirm for the version in use),
        ``stats`` is the ``tools.Statistics`` configuration object,
        ``hof`` is the ``tools.HallOfFame`` holding at most 10 individuals, and
        ``comptime`` is the elapsed time of the search in seconds.
    """
    random.seed(seedno)
    NGEN = 20      # generations
    MU = 100       # population size / individuals kept per generation
    LAMBDA = 98    # offspring produced per generation
    CXPB = 0.8     # crossover probability
    MUTPB = 0.2    # mutation probability

    pop = toolbox.population(n=MU)
    # NOTE(review): fitness has two objectives; check whether HallOfFame's
    # ordering is what is wanted here or whether a Pareto archive is meant.
    hof = tools.HallOfFame(maxsize = 10)
    # Per-generation statistics, computed column-wise over the two objectives.
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", numpy.mean, axis=0)
    stats.register("std", numpy.std, axis=0)
    stats.register("min", numpy.min, axis=0)
    stats.register("max", numpy.max, axis=0)

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # high-resolution replacement for measuring an elapsed interval.
    tic = time.perf_counter()
    algorithms.eaMuPlusLambda(pop, toolbox, MU, LAMBDA, CXPB, MUTPB, NGEN, stats,
                              halloffame=hof)
    toc = time.perf_counter()
    comptime = toc - tic

    return pop, stats, hof, comptime

In [93]:
# Repeat the search with seeds 0, 1 and 2 and keep each run's hall of fame.
# pop, stats and comptime are overwritten on every pass, so only the values of
# the last run remain available afterwards.
hofs = []
for seed in (0, 1, 2):
    pop, stats, hof, comptime = mainwrapper(seed)
    hofs += [hof]

    

In [94]:
#Graphs
# Re-score every hall-of-fame solution of each run. wrappereval returns
# (error rate, feature ratio), so the "acc" lists below hold error rates
# (1 - accuracy); the variable names are kept for later cells.
all_accs, all_ratios = [],[]
for hof in hofs:
    accuracies, ratios = [], []
    for sol in hof:
        acc, f_r = wrappereval(sol)
        accuracies.append(acc)
        ratios.append(f_r)
    all_accs.append(accuracies)
    all_ratios.append(ratios)

# One panel per run, driven by len(hofs) rather than a hard-coded 3.
# plt.subplots returns a bare Axes when only one panel is requested;
# np.atleast_1d makes the loop below work in that case too.
fig, axs = plt.subplots(len(hofs), figsize=(8, 20))
fig.suptitle('Hall-of-fame solutions per run: error rate vs. feature ratio')
for run, ax in enumerate(np.atleast_1d(axs)):
    ax.scatter(all_accs[run], all_ratios[run])
    ax.set_title('Run {}'.format(run))
    ax.set_xlabel('Error rate (1 - accuracy)')
    ax.set_ylabel('Feature ratio (selected / total)')


In [95]:
# Baseline for comparison: score the mask that selects every feature.
# The printed number is the first objective of wrappereval, i.e. the error
# rate (1 - accuracy), despite the variable being called acc.
listofones = [1 for _ in range(total_features)]
acc, f_r = wrappereval(listofones)
print(acc)