-
Notifications
You must be signed in to change notification settings - Fork 6
/
genetic_algorithm.py
277 lines (231 loc) · 10.6 KB
/
genetic_algorithm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
"""
Optimize architecture of a neural network via teh Genetic Algorithm.
Find the "best" vector representing layers and number of nodes per layer.
"""
import random
import operator
from neural_network import NeuralNetwork
import homework
class Individual(object):
"""An individual in a population."""
def __init__(self, vector, fitness_function):
"""Initialize with an encoded vector and fitness function."""
self.phenotype = self.validate_vector(vector)
self.fitness_function = fitness_function
def validate_vector(self, vector, max_nodes=100):
"""Make sure no vectors w/ elements > max_nodes per layer."""
validated_vector = []
for node_count in vector:
if node_count > max_nodes:
new_node_count = max_nodes
else:
new_node_count = node_count
validated_vector.append(new_node_count)
return validated_vector
def fitness(self):
"""Get the individual's fitness. EXPENSIVE."""
return self.fitness_function(self.phenotype)
class GANN(object):
"""Perform global search for neural network architecture."""
def __init__(self, population_size=10, crossover_points=[.3, .6],
mutation_rate=.05, fitness_function=None, swap_rate=.2,
num_generations=5, encoding='binary', step_size=2,
max_layers=10, max_nodes=100):
"""Initialize with evolution parameters."""
if fitness_function is None:
raise Exception('Must pass a fitness function.')
self.population_size = population_size
self.crossover_points = crossover_points
self.mutation_rate = mutation_rate
self.fitness_function = fitness_function
self.swap_rate = swap_rate
self.num_generations = num_generations
self.encoding = encoding
self.step_size = step_size
self.max_layers = max_layers
# max nodes PER LAYER
self.max_nodes = max_nodes
# this is the key object used for performing selection
self.population_fitness = []
# Initialize the population at t=0 - build list of Individual objects
initial_population = []
for individual in range(0, population_size):
individual_arr = []
num_layers = random.randint(1, max_layers)
for layer in range(0, num_layers):
num_nodes = random.randint(1, max_nodes)
individual_arr.append(num_nodes)
if num_layers < max_layers:
for iteration in range(0, max_layers - num_layers):
individual_arr.append(0)
# else it has already been filled
new_individual = Individual(self.slice_vector(individual_arr),
self.fitness_function)
initial_population.append(new_individual)
# t = 0 find fitness for each autogenerated
self.population_fitness = self.calc_population_fitness(
initial_population)
def genotype(self, indiv):
"""Encode phenotype according to specified encoding scheme."""
hidden_layers = indiv.phenotype[1:-1]
if self.encoding == 'binary':
genotype = str()
for layer in hidden_layers:
genotype += format(layer, '07b')
return genotype
else:
return hidden_layers
def slice_vector(self, vector):
"""Remove extra zeroes from vector."""
try:
return vector[:vector.index(0)]
except ValueError:
# if no zero
return vector
def parse_bitstring(self, genotype):
"""Parse a genotype."""
num_chars = len(bin(self.max_nodes)) - 2
return [genotype[i:i + num_chars] for i in
range(0, len(genotype), num_chars)]
def vector_from_bitstring(self, genotype):
"""Get a vector from a bitstring."""
vector = [int(el, 2) for el in self.parse_bitstring(genotype)]
return self.slice_vector(vector)
def calc_population_fitness(self, individuals):
"""Get fitness score of each individual."""
fitness_scores = [indiv.fitness() for indiv in individuals]
list_of_genotypes = [self.genotype(indiv) for indiv in individuals]
population_fitness = zip(list_of_genotypes, fitness_scores)
return population_fitness
# note: scores are NOT normalized
def select_for_crossover(self):
"""Select individuals for crossover based on fitness."""
# sort fitness tuples
self.population_fitness = sorted(self.population_fitness,
key=operator.itemgetter(1),
reverse=True)
# will select POPULATION SIZE - NUMBER TO REPLACE
slice_to = (self.population_size -
int(self.swap_rate * self.population_size))
return self.population_fitness[:slice_to]
def apply_crossover(self, genotype_1, genotype_2):
"""Reproduction between two parent genotypes."""
crossover_points = [int(el * self.population_size)
for el in self.crossover_points]
new_genotype = (
genotype_1[:crossover_points[0]] +
genotype_2[crossover_points[0]:crossover_points[1]] +
genotype_1[crossover_points[1]:]
)
return new_genotype
def apply_mutation(self, child_genotype):
"""Apply mutuation on one individual offspring."""
if self.encoding == 'binary':
parsed_genotype = self.parse_bitstring(child_genotype)
# only non-zero genes except the first one will be considered
# remove trailing zeroes, except the first zero
try:
first_zero = parsed_genotype.index('0000000')
genes_to_consider = parsed_genotype[:first_zero + 1]
except ValueError:
# if no zeroes
genes_to_consider = parsed_genotype
new_child = str()
for gene in genes_to_consider:
for c in gene:
if random.random() < self.mutation_rate:
if c == '0':
c = '1'
elif c == '1':
c = '0'
new_child += c
# Binary string length follows from # of nodes and layers
binary_string_length = ((len(bin(self.max_nodes)) - 2) *
self.max_layers)
return new_child.ljust(binary_string_length, '0')
# integer real-valued version
else:
try:
first_zero = child_genotype.index(0)
genes_to_consider = child_genotype[:first_zero + 1]
except ValueError:
genes_to_consider = child_genotype
new_child = []
for gene in genes_to_consider:
if random.random() < self.options['mutation_rate']:
gene += self.options['step_size']
new_child.append(gene)
return (new_child + [0] * self.max_layers)[:self.max_layers]
def generate_offspring(self, selected):
"""Generate offspring of selected individual."""
num_to_generate = int(self.swap_rate * self.population_size)
offspring = []
for iteration in range(0, num_to_generate):
# select 2 parents randomly
parent_1 = random.choice(selected)
if self.encoding == 'binary':
parent_2 = random.choice(list(set(selected) - set((parent_1))))
else:
parent_2 = random.choice(selected)
# breed and mutate
child = self.apply_crossover(parent_1[0], parent_2[0])
offspring.append(self.apply_mutation(child))
# convert offspring to Individual objects
if self.encoding == 'binary':
offspring = [Individual(self.vector_from_bitstring(c),
self.fitness_function) for c in offspring]
else:
offspring = [Individual(c, self.fitness_function)
for c in offspring]
return offspring
def evolve(self):
"""Replace part of the population through 1 generation."""
# select fitter individuals for mating
selected = self.select_for_crossover()
# generate offspring - these will replace the weaker individuals
offspring = self.generate_offspring(selected)
# get fitness tuples on offspring
offspring_fitness = self.calc_population_fitness(offspring)
# replace weaker individuals, fitness tuples already calculated above
num_to_replace = len(offspring)
self.population_fitness = (self.population_fitness[
:(self.population_size - num_to_replace)] +
offspring_fitness)
def final_genotype(self):
"""Get the best fitness tuple."""
self.population_fitness = sorted(self.population_fitness,
key=operator.itemgetter(1),
reverse=True)
genotypes = [el[0] for el in self.population_fitness]
best = genotypes[0]
if self.encoding == 'binary':
return self.vector_from_bitstring(best)
return best[:best.index(0)]
def run(self):
"""Evolve for the specified number of generations."""
for generation in range(0, self.num_generations):
self.evolve()
fitness_scores = [el[1] for el in self.population_fitness]
print 'Final accuracy score: %s' % max(fitness_scores)
best_vector = self.final_genotype()
print 'Best vector: %s' % best_vector
return best_vector
if __name__ == '__main__':
training_data = [line for line in open('data/train.csv')]
preprocessed_data_dict = homework.preprocess_training_data(training_data)
split_data_dict = homework.split_data(preprocessed_data_dict['X'],
preprocessed_data_dict['y'],
0.2)
def get_accuracy(nn_hidden_structure=[10]):
"""Get accuracy from a neural network."""
nn = NeuralNetwork(nn_hidden_structure)
nn.fit(split_data_dict['X_train'],
split_data_dict['y_train'],
learning_rate=.1,
num_epochs=100000,
momentum=0.0,
lmbda=0.1)
return nn.get_accuracy(split_data_dict['X_test'],
split_data_dict['y_test'])
search = GANN(fitness_function=get_accuracy)
search.run()