-
Notifications
You must be signed in to change notification settings - Fork 0
/
Genetic_Algorithim_Component.py
331 lines (279 loc) · 14.4 KB
/
Genetic_Algorithim_Component.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
import pandas as pd
import random
import Neural_Network #imports the neural net file
from Neural_Network import classifier
#import Plots_Generated_from_collected_data #imports file
import compare_results_to_actual_data
#classifier = '.../PycharmProjects/Machine_Learning_Baseball_Algorithim/Neural_Network_saved'
'''
since the goal is to find the stats that make the most difference while still making the playoffs,
points will be rewarded for any stat below average, since that indicates the stat has no significant impact upon wins
and shouldn't be what teams focus on
conversely, any stat above average will be penalized with a lower contribution to the fitness score
The base rates are far apart enough that there shouldn't be a chance for a bad team to receive a high enough score to be a playoff team
ideal is 1
for each stat if the stat is > 1
take difference squared between stat and ideal and subtract from base score
for each stat <= 1
take difference squared from stat and ideal and add it to the base score
Some stats benefit from being lower, and their evaluation will be the inverse of the above function
starting breeding population will be 10 randomly chosen from existing data
top 2 will be parents
fitness equation base scores to encourage playoff teams based on winning percentage
likely playoff team = 200
likely good team = 150
likely average team = 100
likely bad team = 50
print statements have been left in from debugging but are commented out
'''
# Number of iterations of the main generation loop at the bottom of this file.
GENERATIONS = 10 #sets the main loop
#dataset compiled from previous operations in Data_extraction_and_sanitization.py
# NOTE(review): the path begins with '...', which looks like a placeholder rather
# than a real directory -- confirm the actual location before running.
DATASET = pd.read_csv('.../PycharmProjects/Machine_Learning_Baseball_Algorithim/team_stats_ave_no_excess_stats.csv', index_col=None)
# Current population of Team_chromosomes; re-bound every generation by the main loop.
POPULATION_TEAMS = []
# Number of chromosomes built per generation and number kept after sorting;
# also the number of DATASET rows sampled by get_number_of_stats_tracked().
IDEAL_POPULATION_SIZE = 10
# Per-gene probability, during cross_over, of replacing the gene with a new random value.
MUTATION_RATE = 0.05
# Per-gene probability, during cross_over, of resetting the gene to the average value 1.0.
MAKE_AVE_RATE = 0.2
def get_number_of_stats_tracked():
    """Return a random sample of DATASET rows with the label column removed.

    Despite its name, this function does not return a count. It returns a
    DataFrame of IDEAL_POPULATION_SIZE randomly sampled rows from DATASET
    without the 'Classification' column (that label is left for the neural
    network to predict). Callers derive the number of tracked stats from
    ``len(result.columns)``.

    Returns:
        pandas.DataFrame: the sampled rows, minus the 'Classification' column.
    """
    random_subset = DATASET.sample(n=IDEAL_POPULATION_SIZE)
    # Drop the label column; classification is determined by the neural net.
    return random_subset.drop(columns=['Classification'])
# Random sample of DATASET rows without the 'Classification' column. In the
# code visible here it is only used to count the stat columns below.
BASE_POPULATION = get_number_of_stats_tracked()
#print("base population length: " + str(len(BASE_POPULATION)))
# Number of stat columns; passed to Team_chromosomes as the chromosome length.
NUMBER_OF_STATS_TRACKED = len(BASE_POPULATION.columns)
#print("number of stats tracked" + str(NUMBER_OF_STATS_TRACKED))
class Team_chromosomes:
    """Stat values for a single team in the population.

    Each stat is treated as a gene in the chromosome. A value of 1
    represents the average for that stat. On construction the genes are
    filled with random values, the neural network is asked to classify the
    resulting team, and a fitness score is derived from that classification
    and the gene values.

    Attributes:
        genes_made_of_stats: list of float gene values, one per tracked stat.
        fittness_score: fitness computed by set_fitness_score for the genes
            generated in the constructor.
        current_length: number of genes generated in the constructor.
    """

    # Stat names in gene order; these are the keys handed to the neural net.
    _STAT_KEYS = ['R','AB','H','2B','3B','HR','BB','SO','SB','CS','HBP','SF','RA','ER','ERA','CG','SHO','SV','IPouts','HA','HRA','BBA','SOA','E' ,'DP','FP']

    # Base fitness awarded per neural-net classification label.
    _BASE_SCORES = {
        "Bad Team": 50,
        "Average Team": 100,
        "Good Team": 150,
        "Playoff Team": 200,
    }

    # Gene index of 'AB' (at bats); it gets a small random contribution
    # instead of a value-based one.
    _AT_BATS_INDEX = 1

    # Gene indexes scored with the inverted rule:
    # SO, CS, RA, ER, ERA, HA, HRA, BBA, SOA and E.
    # NOTE(review): the original comment listed every stat here except SOA
    # (index 22) -- confirm that SOA is meant to use the inverted rule.
    _LOWER_IS_BETTER_INDEXES = frozenset({7, 9, 12, 13, 14, 19, 20, 21, 22, 23})

    def make_new_stat_value(self):
        """Return a new random stat value in the range 0.4 to 1.6.

        1 represents the average value for a team; a random offset is added
        so the stat varies around that average.
        """
        new_value = 1 + random.randint(-60000000,60000000)/100000000
        return new_value

    def make_ave_stat_value(self):
        """Return the average stat value, 1.0."""
        return 1.0

    def format_chromosome_as_dict(self, chromosome):
        """Return the stat genes as a dictionary for the neural net.

        Args:
            chromosome: sequence of gene values in stat order.

        Returns:
            dict mapping every stat name to a one-element list ``[gene]``
            for positions present in ``chromosome``; stats without a gene
            keep the scalar default 1.

        Raises:
            IndexError: if ``chromosome`` has more genes than stat names.
        """
        keys = self._STAT_KEYS
        dictionary = dict.fromkeys(keys, 1)
        for index, gene in enumerate(chromosome):
            dictionary[keys[index]] = [gene]
        return dictionary

    def get_nn_classification(self, dict):
        """Pass the stat dictionary to the neural network and return its prediction.

        Args:
            dict: dictionary produced by format_chromosome_as_dict.

        Returns:
            Whatever Neural_Network.get_prediction returns for the imported
            classifier; the prediction is also printed.
        """
        pred = Neural_Network.get_prediction(classifier, dict)
        print("prediction:" + str(pred))
        return pred

    @staticmethod
    def lower_than_average_benefits_fittness_score(gene, score):
        """Apply the inverted scoring rule used for 'lower is better' stats.

        In most cases a stat above average benefits the team, but for some
        baseball stats a lower value is better, so their contribution to
        the fitness score is the inverse of the regular rule.

        Declared as a static method so that both ``self.method(gene, score)``
        (as used by set_fitness_score) and ``Team_chromosomes.method(gene,
        score)`` work; without it the instance call passed three arguments
        to a two-parameter function and raised TypeError.

        Args:
            gene: the stat value.
            score: the running fitness score.

        Returns:
            ``score - gene**2`` when ``gene < 1``, otherwise ``score + gene**2``.
        """
        if gene < 1:
            return score - gene**2
        return score + gene**2

    def set_fitness_score(self, chromosome, NN_prediction):
        """Compute the fitness score from the prediction and the gene values.

        The base value is set by the predicted class:
        Playoff Team (200) > Good Team (150) > Average Team (100) >
        Bad Team (50); any other prediction starts from 0. For regular
        stats a gene above average (> 1) subtracts ``gene**2`` and any other
        value adds ``gene**2``; this balancing prevents the algorithm from
        just generating an ever increasing value for each stat. Stats listed
        in _LOWER_IS_BETTER_INDEXES use the inverted rule.

        Args:
            chromosome: sequence of gene values in stat order.
            NN_prediction: neural-net prediction; compared via ``str()``.

        Returns:
            The computed fitness score. The caller is responsible for
            storing it (for example in ``fittness_score``).
        """
        score = self._BASE_SCORES.get(str(NN_prediction), 0)
        for index, gene in enumerate(chromosome):
            if index == self._AT_BATS_INDEX:
                # Results were skewing towards a significantly lower number
                # of at bats, which is not consistent with real baseball;
                # a small random contribution replaces the value-based one.
                score = score + 1 + random.randint(-100,100)/1000
            elif index in self._LOWER_IS_BETTER_INDEXES:
                score = self.lower_than_average_benefits_fittness_score(gene, score)
            elif gene > 1:
                score = score - gene**2
            else:
                score = score + gene**2
        return score

    def __init__(self, length):
        """Create a chromosome with ``length`` random genes and score it.

        Args:
            length: number of genes (stats) to generate.
        """
        self.genes_made_of_stats = []
        self.fittness_score = 0
        self.current_length = 0
        while self.current_length < length:
            self.genes_made_of_stats.append(self.make_new_stat_value())
            self.current_length += 1
        prediction = self.get_nn_classification(self.format_chromosome_as_dict(self.genes_made_of_stats))
        self.fittness_score = self.set_fitness_score(self.genes_made_of_stats, prediction)
#end chromosome class
#copy of a previous method for accessing outside of the chromosome class
def format_chromosome_as_dict(chromosome):
    """Return a chromosome object's genes as a stat-name dictionary.

    Module-level counterpart of the Team_chromosomes method of the same
    name; it takes the chromosome object and reads its
    ``genes_made_of_stats`` list.

    Args:
        chromosome: object exposing ``genes_made_of_stats``.

    Returns:
        dict mapping each stat name to ``[gene]`` for every gene present;
        stat names without a gene keep the scalar default 1.
    """
    stat_names = ['R','AB','H','2B','3B','HR','BB','SO','SB','CS','HBP','SF','RA','ER','ERA','CG','SHO','SV','IPouts','HA','HRA','BBA','SOA','E' ,'DP','FP']
    stats_by_name = {stat_name: 1 for stat_name in stat_names}
    for position, stat_value in enumerate(chromosome.genes_made_of_stats):
        stats_by_name[stat_names[position]] = [stat_value]
    return stats_by_name
#one-point crossover in the middle from 2 parents
#implements mutation as well as binary selection of genes
def cross_over(parent1, parent2):
    """Create a child chromosome from two parents.

    Gene positions up to and including NUMBER_OF_STATS_TRACKED/2 are copied
    from ``parent2``; later positions are copied from ``parent1``. Each gene
    is then, independently, replaced by a fresh random value with
    probability MUTATION_RATE and afterwards reset to the average value 1.0
    with probability MAKE_AVE_RATE. Since 1 does not contribute positively
    or negatively in this model, the reset simulates a GA fed by a binary
    selection of genes.

    The child's ``fittness_score`` is recomputed from its final genes.
    Previously the result of set_fitness_score was discarded, leaving the
    child with the score of the random genes generated by its constructor.

    Args:
        parent1: chromosome supplying the second half of the genes.
        parent2: chromosome supplying the first half of the genes.

    Returns:
        The new Team_chromosomes child.
    """
    # Constructing the chromosome fills it with placeholder random genes
    # (and scores them); every gene is overwritten below.
    new_team_chromosome = Team_chromosomes(NUMBER_OF_STATS_TRACKED)
    genes = new_team_chromosome.genes_made_of_stats
    for count in range(len(genes)):
        if count > NUMBER_OF_STATS_TRACKED/2:
            genes[count] = parent1.genes_made_of_stats[count]
        else:
            genes[count] = parent2.genes_made_of_stats[count]
        # Mutation: replace the gene with a new random value.
        if random.random() < MUTATION_RATE:
            genes[count] = new_team_chromosome.make_new_stat_value()
        # Binary selection: neutralise the gene by making it average.
        if random.random() < MAKE_AVE_RATE:
            genes[count] = new_team_chromosome.make_ave_stat_value()
    # Classify the finished child and store its real fitness score.
    prediction = new_team_chromosome.get_nn_classification(new_team_chromosome.format_chromosome_as_dict(genes))
    new_team_chromosome.fittness_score = new_team_chromosome.set_fitness_score(genes, prediction)
    return new_team_chromosome
#builds the population that the Genetic Algorithm will be implemented on
def build_POPULATION_TEAMS(population):
    """Build the population for the next generation.

    IDEAL_POPULATION_SIZE fresh random chromosomes are created and sorted by
    fitness, highest first. Then one child is appended for every member of
    the previous population, each produced by crossing over the two
    fittest fresh chromosomes. With an empty previous population (the first
    generation) no children are produced.

    Args:
        population: the previous generation; only its length is used. The
            original implementation ignored this argument and iterated the
            module-level POPULATION_TEAMS, so ``None`` falls back to that
            global to stay compatible.

    Returns:
        list of Team_chromosomes: the sorted fresh chromosomes followed by
        the children (the children are appended unsorted).
    """
    previous_population = POPULATION_TEAMS if population is None else population

    NEW_POPULATION_TEAMS = []
    while len(NEW_POPULATION_TEAMS) < IDEAL_POPULATION_SIZE:
        NEW_POPULATION_TEAMS.append(Team_chromosomes(NUMBER_OF_STATS_TRACKED))
    # Largest fitness score first, so the parents are at indexes 0 and 1.
    NEW_POPULATION_TEAMS.sort(key=lambda x: x.fittness_score, reverse=True)

    # Children are appended at the end, so indexes 0 and 1 keep pointing at
    # the same two parents for the whole loop.
    # (An earlier variant, kept only as disabled code in the original,
    # picked two distinct random parents from the top of the list.)
    for _ in previous_population:
        NEW_POPULATION_TEAMS.append(cross_over(NEW_POPULATION_TEAMS[0], NEW_POPULATION_TEAMS[1]))
    return NEW_POPULATION_TEAMS
# Accumulates one row of stats per generation's top team; the main loop
# re-binds this name with the value returned by take_dicts_form_dataframe.
dataFrame_with_results = pd.DataFrame(columns= ['R','AB','H','2B','3B','HR','BB','SO','SB','CS','HBP','SF','RA','ER','ERA','CG','SHO','SV','IPouts','HA','HRA','BBA','SOA','E' ,'DP','FP'])


def take_dicts_form_dataframe(dict):
    """Return the results frame with the rows from ``dict`` appended.

    The module-level ``dataFrame_with_results`` is read but not modified;
    the caller must re-bind it to the returned frame to accumulate rows.

    Uses ``pd.concat`` because ``DataFrame.append`` was removed in
    pandas 2.0. As with the original ``append`` call, row indexes are kept
    as they are (not renumbered).

    Args:
        dict: mapping of stat name to a list of values, as produced by
            format_chromosome_as_dict.

    Returns:
        pandas.DataFrame: a new frame holding the existing rows followed by
        the rows built from ``dict``.
    """
    new_rows = pd.DataFrame.from_dict(dict)
    return pd.concat([dataFrame_with_results, new_rows])
'''
main loop that goes until the number of generations has been reached
'''
# NOTE(review): the indentation of this section was reconstructed from the
# statement order; confirm the nesting (in particular which statements sit
# inside the `if print_count == 0` branch) against the original file.
count = 0 #tracks generation loops
TOP_SCORES = [] #stores each generation's top score
TOP_TEAMS = [] #stores each generation's top chromosome
BEST_SCORE = 0 #stores the best score seen in any generation
while count < GENERATIONS:
    #builds a new population each loop
    POPULATION_TEAMS = build_POPULATION_TEAMS(POPULATION_TEAMS)
    count += 1
    print("generations:" + str(count))
    #sorts population so largest fitness score is first
    POPULATION_TEAMS.sort(key=lambda x: x.fittness_score, reverse=True)
    #removes any additional population that may have been appended
    POPULATION_TEAMS = POPULATION_TEAMS[0:IDEAL_POPULATION_SIZE]
    #print(str(POPULATION_TEAMS))
    # print_count is 0 only for the first (fittest) chromosome of the sorted list.
    print_count = 0
    for chrom in POPULATION_TEAMS:
        if print_count == 0:
            TOP_SCORES.append(chrom.fittness_score)
            #print("top score updated")
            TOP_TEAMS.append(chrom)
        if BEST_SCORE < chrom.fittness_score:
            BEST_SCORE = chrom.fittness_score
            #print("best score updated:" + str(chrom.fittness_score))
        print(str(chrom.fittness_score))
        print_count += 1
# Report the top score recorded for every generation, numbered from 1.
gen_count_for_print = 1
for score in TOP_SCORES:
    print('Top Score for Generation '+str(gen_count_for_print) +': ' + str(score) )
    gen_count_for_print += 1
# NOTE(review): top_team_gen_count is not read anywhere in the visible code.
top_team_gen_count = 1
'''
I tried to put this loop and write command in a method but for some reason it just won't work
do not touch it!
'''
# take_dicts_form_dataframe reads the module-level dataFrame_with_results and
# returns a new frame, so the name must be re-bound here on every iteration.
# NOTE(review): inside a function this assignment would create a local
# variable unless declared `global` -- presumably why the method version failed.
for chrom in TOP_TEAMS:
    dataFrame_with_results=take_dicts_form_dataframe(format_chromosome_as_dict(chrom))
#compares the fittnes scores to the average to normalize them
def get_normalized_fitness_scores(list):
    """Return each score divided by the mean of all scores.

    Args:
        list: sequence of numeric fitness scores (the parameter name
            shadows the builtin and is kept for compatibility).

    Returns:
        A new list with ``score / mean`` for every score, in order. An
        empty input yields an empty list instead of raising
        ZeroDivisionError.

    Raises:
        ZeroDivisionError: if the scores are non-empty but their mean is 0.
    """
    if len(list) == 0:
        return []
    mean = sum(list)/len(list)
    return [score/mean for score in list]
#adds the normalized scores to dataFrame_with_results as a 'score' column
#and appends the frame (header included) to the output.csv file
dataFrame_with_results['score']= get_normalized_fitness_scores(TOP_SCORES)
# newline='' is what pandas asks for when to_csv is given an open file
# object; it prevents doubled line endings on some platforms.
with open("/Users/aarondavidselya@gmail/PycharmProjects/Machine_Learning_Baseball_Algorithim/output.csv", 'a', newline='') as f:
    dataFrame_with_results.to_csv(f, index=None)
#Calls the build_Plot method from the imported file
#Plots_Generated_from_collected_data.build_Plots()
# NOTE(review): TOP_SCORES[0] is the top score of the first generation, not
# necessarily the best overall (BEST_SCORE is tracked by the main loop but
# never used) -- confirm which one is intended here.
print("The algorithim generated a team with the top score of " + "{0:.4f}".format(round(TOP_SCORES[0],4))+ ".\nA similar score was achieved by the following historical teams:\n")
compare_results_to_actual_data.set_score_and_get_simmilar_teams(TOP_SCORES[0])