-
Notifications
You must be signed in to change notification settings - Fork 2
/
run.py
350 lines (304 loc) · 15.9 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
#!/usr/bin/env python
#############################
# ChaLearn AutoML challenge #
#############################
# Usage: python run.py input_dir output_dir
# This sample code can be used either
# - to submit RESULTS deposited in the res/ subdirectory or
# - as a template for CODE submission.
#
# The input directory input_dir contains 5 subdirectories named by dataset,
# including:
# dataname/dataname_feat.type -- the feature type "Numerical", "Binary", or "Categorical" (Note: if this file is absent, get the feature type from the dataname.info file)
# dataname/dataname_public.info -- parameters of the data and task, including metric and time_budget
# dataname/dataname_test.data -- training, validation and test data (solutions/target values are given for training data only)
# dataname/dataname_train.data
# dataname/dataname_train.solution
# dataname/dataname_valid.data
#
# The output directory will receive the predicted values (no subdirectories):
# dataname_test_000.predict -- Provide predictions at regular intervals to make sure you get some results even if the program crashes
# dataname_test_001.predict
# dataname_test_002.predict
# ...
# dataname_valid_000.predict
# dataname_valid_001.predict
# dataname_valid_002.predict
# ...
#
# Result submission:
# =================
# Search for @RESULT to locate that part of the code.
# ** Always keep this code. **
# If the subdirectory res/ contains result files (predicted values)
# the code just copies them to the output and does not train/test models.
# If no results are found, a model is trained and tested (see code submission).
#
# Code submission:
# ===============
# Search for @CODE to locate that part of the code.
# ** You may keep or modify this template or substitute your own code. **
# The program saves predictions regularly. This way the program produces
# at least some results if it dies (or is terminated) prematurely.
# This also allows us to plot learning curves. The last result is used by the
# scoring program.
# We implemented 2 classes:
# 1) DATA LOADING:
# ------------
# Use/modify
# D = DataManager(basename, input_dir, ...)
# to load and preprocess data.
# Missing values --
# Our default method for replacing missing values is trivial: they are replaced by 0.
# We also add extra indicator features where missing values occurred. This doubles the number of features.
# Categorical variables --
# The location of potential Categorical variable is indicated in D.feat_type.
# NOTHING special is done about them in this sample code.
# Feature selection --
# We only implemented an ad hoc feature selection filter efficient for the
# dorothea dataset to show that performance improves significantly
# with that filter. It takes effect only for binary classification problems with sparse
# matrices as input and unbalanced classes.
# 2) LEARNING MACHINE:
# ----------------
# Use/modify
# M = MyAutoML(D.info, ...)
# to create a model.
# Number of base estimators --
# Our models are ensembles. Adding more estimators may improve their accuracy.
# Use M.model.n_estimators = num
# Training --
# M.fit(D.data['X_train'], D.data['Y_train'])
# Fit the parameters and hyper-parameters (all inclusive!)
# What we implemented hard-codes hyper-parameters, you probably want to
# optimize them. Also, we made a somewhat arbitrary choice of models in
# for the various types of data, just to give some baseline results.
# You probably want to do better model selection and/or add your own models.
# Testing --
# Y_valid = M.predict(D.data['X_valid'])
# Y_test = M.predict(D.data['X_test'])
#
# ALL INFORMATION, SOFTWARE, DOCUMENTATION, AND DATA ARE PROVIDED "AS-IS".
# ISABELLE GUYON, CHALEARN, AND/OR OTHER ORGANIZERS OR CODE AUTHORS DISCLAIM
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ANY PARTICULAR PURPOSE, AND THE
# WARRANTY OF NON-INFRINGEMENT OF ANY THIRD PARTY'S INTELLECTUAL PROPERTY RIGHTS.
# IN NO EVENT SHALL ISABELLE GUYON AND/OR OTHER ORGANIZERS BE LIABLE FOR ANY SPECIAL,
# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF SOFTWARE, DOCUMENTS, MATERIALS,
# PUBLICATIONS, OR INFORMATION MADE AVAILABLE FOR THE CHALLENGE.
#
# Main contributors: Isabelle Guyon and Arthur Pesah, March-October 2014
# Lukasz Romaszko April 2015
# Originally inspired by code by Ben Hamner, Kaggle, March 2013
# Modified by Ivan Judson and Christophe Poulain, Microsoft, December 2013
# =========================== BEGIN USER OPTIONS ==============================
# Verbose mode:
##############
# Recommended to keep verbose = True: shows various progression messages
from sklearn.metrics import classification_report  # NOTE(review): appears unused in this file -- confirm before removing
from lib.models import OurAutoML
# verbose = True
verbose = False  # outputs messages to stdout and stderr for debug purposes
# (toggle the commented line above to re-enable verbose output; the original
#  assigned True then immediately overwrote it with False -- a dead store)
# Debug level:
##############
# 0: run the code normally, using the time budget of the tasks
# 1: run the code normally, but limits the time to max_time
# 2: run everything, but do not train, generate random outputs in max_time
# 3: stop before the loop on datasets
# 4: just list the directories and program version
debug_mode = 0
# Time budget
#############
# Maximum time of training in seconds PER DATASET (there are 5 datasets).
# The code should keep track of time spent and NOT exceed the time limit
# in the dataset "info" file, stored in D.info['time_budget'], see code below.
# If debug >=1, you can decrease the maximum time (in sec) with this variable:
max_time = 90
# Maximum number of cycles
##########################
# Your training algorithm may be fast, so you may want to limit anyways the
# number of points on your learning curve (this is on a log scale, so each
# point uses twice as much time as the previous one.)
max_cycle = 1  # 100
# ZIP your code
###############
# You can create a code submission archive, ready to submit, with zipme = True.
# This is meant to be used on your LOCAL server.
import datetime
zipme = False  # use this flag to enable zipping of your code submission
the_date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M")
submission_filename = '../automl_sample_submission_' + the_date
# I/O defaults
##############
# Use default location for the input and output data:
# If no arguments to run.py are provided, this is where the data will be found
# and the results written to. Change the root_dir to your local directory.
default_input_dir = "../ML/datadir/"
default_output_dir = "res_/"
# default_output_dir = "res_score_45/"
# _MODE = "TEST_SCORE"   # fit on a split and report an average score instead of predicting
_MODE = "FIT"            # normal mode: fit on all training data, write predictions
# =========================== END USER OPTIONS ================================
# Version of the sample code
# Change in 1.1: time is measured by time.time(), not time.clock(): we keep track of wall time
# Changes in version 2: examples of models from Lukasz; GPU code; fixes
# 2.6 fixes again, should use 25% of the total time budget
# 2.7 zipping fix, disabled GPUs
version = 2.7
# General purpose functions
import os
from sys import argv, path
import numpy as np
import time
overall_start = time.time()  # wall-clock reference for the overall-time report at the end
# Our directories
# Note: On codalab, there is an extra sub-directory called "program"
running_on_codalab = False
run_dir = os.path.abspath(".")
codalab_run_dir = os.path.join(run_dir, "program")
if os.path.isdir(codalab_run_dir):
    run_dir = codalab_run_dir
    running_on_codalab = True
    # Parenthesized call form works identically under Python 2 and 3;
    # the original used the Python-2-only statement form `print "..."`.
    print("Running on Codalab!")
lib_dir = os.path.join(run_dir, "lib")
res_dir = os.path.join(run_dir, "res")
# Our libraries: make run_dir and lib_dir importable before loading them
path.append(run_dir)
path.append(lib_dir)
import lib.data_io as data_io           # general purpose input/output functions
from lib.data_io import vprint          # print only in verbose mode
from lib.data_manager import DataManager  # load/save data and get info about them
if debug_mode >= 4 or running_on_codalab:  # Show library version and directory structure
    data_io.show_version()
    data_io.show_dir(run_dir)
# =========================== BEGIN PROGRAM ================================
if __name__=="__main__" and debug_mode<4:
    #### Check whether everything went well (no time exceeded)
    execution_success = True
    #### INPUT/OUTPUT: Get input and output directory names
    if len(argv)==1: # Use the default input and output directories if no arguments are provided
        input_dir = default_input_dir
        output_dir = default_output_dir
    else:
        input_dir = argv[1]
        output_dir = os.path.abspath(argv[2]);
    # Move old results aside (timestamped) and create a fresh output directory
    # NOTE(review): indentation reconstructed from a whitespace-stripped source;
    # mkdir is placed at the outer level so the output dir also exists on Codalab -- confirm.
    if not(running_on_codalab):
        data_io.mvdir(output_dir, '../'+output_dir+'_'+the_date)
    data_io.mkdir(output_dir)
    #### INVENTORY DATA (and sort dataset names alphabetically)
    datanames = data_io.inventory_data(input_dir)
    #### DEBUG MODE: Show dataset list and STOP
    if debug_mode>=3:
        data_io.show_io(input_dir, output_dir)
        print('\n****** Sample code version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n')
        data_io.write_list(datanames)
        datanames = [] # Do not proceed with learning and testing
    # ==================== @RESULT SUBMISSION (KEEP THIS) =====================
    # Always keep this code to enable result submission of pre-calculated results
    # deposited in the res/ subdirectory.
    if len(datanames)>0:
        vprint( verbose, "************************************************************************")
        vprint( verbose, "****** Attempting to copy files (from res/) for RESULT submission ******")
        vprint( verbose, "************************************************************************")
        OK = data_io.copy_results(datanames, res_dir, output_dir, verbose) # DO NOT REMOVE!
        if OK:
            vprint( verbose, "[+] Success")
            datanames = [] # Do not proceed with learning and testing
        else:
            vprint( verbose, "======== Some missing results on current datasets!")
            vprint( verbose, "======== Proceeding to train/test:\n")
    # =================== End @RESULT SUBMISSION (KEEP THIS) ==================
    # ================ @CODE SUBMISSION (SUBSTITUTE YOUR CODE) =================
    overall_time_budget = 0
    print("output_dir", output_dir)
    for basename in datanames: # Loop over datasets
        vprint( verbose, "************************************************")
        vprint( verbose, "******** Processing dataset " + basename.capitalize() + " ********")
        vprint( verbose, "************************************************")
        # ======== Learning on a time budget:
        # Keep track of time not to exceed your time budget. Time spent to inventory data neglected.
        start = time.time()
        # ======== Creating a data object with data, informations about it
        vprint( verbose, "======== Reading and converting data ==========")
        D = DataManager(basename, input_dir, replace_missing=True, filter_features=True, verbose=verbose)
        vprint(verbose, D)
        # ======== Keeping track of time
        if debug_mode<1:
            time_budget = D.info['time_budget'] # <== HERE IS THE TIME BUDGET!
        else:
            time_budget = max_time
        ########## PETERSBURG HACKATHON: 2 min per dataset
        # NOTE(review): this unconditionally overrides both branches above with a
        # fixed 120 s budget -- intentional for the hackathon, verify before reuse.
        time_budget = 2 * 60
        overall_time_budget = overall_time_budget + time_budget
        time_spent = time.time() - start
        vprint( verbose, "[+] Remaining time after reading data %5.2f sec" % (time_budget-time_spent))
        if time_spent >= time_budget:
            # Data loading alone exhausted the budget: skip this dataset entirely
            vprint( verbose, "[-] Sorry, time budget exceeded, skipping this task")
            execution_success = False
            continue
        # ========= Creating a model, knowing its assigned task from D.info['task'].
        # The model can also select its hyper-parameters based on other elements of info.
        # vprint( verbose, "======== Creating model ==========")
        # M = MyAutoML(D.info, verbose, debug_mode)
        # print M
        # ========= Iterating over learning cycles and keeping track of time
        time_spent = time.time() - start
        time_budget = time_budget - time_spent # Remove time spent so far
        start = time.time() # Reset the counter
        time_spent = 0 # Initialize time spent learning
        time_spent_last = 0 # Initialize time spent learning
        time_stock = 5 # NOTE(review): never read in this loop -- presumably a leftover safety margin
        cycle = 0
        print("{:~^{n}}".format(basename.capitalize(), n=50)) # banner: dataset name centered in '~'
        autoML = OurAutoML(D.info)
        X_train, Y_train = D.data['X_train'], D.data['Y_train']
        n_estimators = 500
        # Preprocessing pass uses half the estimator budget; semantics live in lib.models
        autoML.preprocess_bin_cl(X_train, Y_train, n_estimators=int(n_estimators/2))
        cycle=1
        # Single-cycle loop: the learning-curve machinery (max_cycle) is disabled here
        while cycle <= 1: # max_cycle:
            begin = time.time()
            vprint( verbose, "=========== " + basename.capitalize() +" Training cycle " + str(cycle) +" ================")
            print("{} estimators".format(n_estimators))
            prev_n_estimators = n_estimators # NOTE(review): unused afterwards -- leftover from the multi-cycle version
            if _MODE == "TEST_SCORE":
                # Evaluation mode: fit on a 60/40 split and report an average score
                autoML.fit_and_count_av_score(X_train, Y_train,
                                              n_estimators=n_estimators,
                                              test_size=0.4)
            else:
                # Normal mode: fit on all training data
                autoML.fit(X_train, Y_train, n_estimators=n_estimators)
            vprint( verbose, "[+] Fitting success, time spent so far %5.2f sec" % (time.time() - start))
            # Make predictions on the validation and test splits
            Y_valid = autoML.predict(D.data['X_valid'])
            Y_test = autoML.predict(D.data['X_test'])
            print("({} s)".format( "%5.2f"%(time.time() - start)))
            vprint( verbose, "[+] Prediction success, time spent so far %5.2f sec" % (time.time() - start))
            if cycle:
                # Write results (numbered so intermediate cycles survive a crash;
                # the scoring program uses the last one)
                filename_valid = basename + '_valid_' + str(cycle).zfill(3) + '.predict'
                data_io.write(os.path.join(output_dir,filename_valid), Y_valid)
                filename_test = basename + '_test_' + str(cycle).zfill(3) + '.predict'
                data_io.write(os.path.join(output_dir,filename_test), Y_test)
                vprint( verbose, "[+] Results saved, time spent so far %5.2f sec" % (time.time() - start))
            time_spent = time.time() - start
            vprint( verbose, "[+] End cycle, remaining time %5.2f sec" % (time_budget-time_spent))
            cycle += 1
            time_spent_last = time.time() - begin
            time_budget = time_budget - time_spent_last # Remove time spent so far
    if zipme and not(running_on_codalab):
        vprint( verbose, "========= Zipping this directory to prepare for submit ==============")
        data_io.zipdir(submission_filename + '.zip', ".")
    overall_time_spent = time.time() - overall_start
    # NOTE(review): this prints spent + budget, not just spent -- confirm that is intended
    print("\nOverall time: {} s".format(overall_time_spent + overall_time_budget))
    if execution_success:
        vprint( verbose, "[+] Done")
        vprint( verbose, "[+] Overall time spent %5.2f sec " % overall_time_spent + ":: Overall time budget %5.2f sec" % overall_time_budget)
    else:
        vprint( verbose, "[-] Done, but some tasks aborted because time limit exceeded")
        vprint( verbose, "[-] Overall time spent %5.2f sec " % overall_time_spent + " > Overall time budget %5.2f sec" % overall_time_budget)
    # On Codalab the exit status signals success/failure to the platform
    if running_on_codalab:
        if execution_success:
            exit(0)
        else:
            exit(1)