machine-learning-tools

Create conda environment with tensorflow and hwtmode

conda env create -f environment_from_history.yaml

git clone https://github.com/NCAR/HWT_mode.git

cd HWT_mode

git checkout -b ahijevyc remotes/origin/ahijevyc

pip install .
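
To sanity-check the install, activate the new environment and try importing both packages (assuming the HWT_mode package is importable as hwtmode, as the heading above suggests):

python -c "import tensorflow, hwtmode"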

Train Dense Neural Network

train_stormrpts_dnn.py

usage: train_stormrpts_dnn.py [-h] [--batchnorm] [--batchsize BATCHSIZE] [--clobber] [-d]
                              [--dropout DROPOUT] [--epochs EPOCHS] [--fhr FHR [FHR ...]]
                              [--fits FITS [FITS ...]] [--flash FLASH]
                              [--folds FOLDS [FOLDS ...]] [--glm] [--kfold KFOLD]
                              [--ifile IFILE]
                              [--learning_rate LEARNING_RATE]
                              [--model {HRRR,NSC1km,NSC3km-12sec,NSC15km}]
                              [--neurons NEURONS [NEURONS ...]] [--nfits NFITS]
                              [--nprocs NPROCS] [--optimizer {adam,sgd}]
                              [--reg_penalty REG_PENALTY] [--rptdist RPTDIST]
                              [--savedmodel SAVEDMODEL] [--seed SEED] [--trainend TRAINEND]
                              [--trainstart TRAINSTART] [--testend TESTEND]
                              [--teststart TESTSTART] [--suite SUITE] [--twin TWIN]

train/test dense neural network

options:
  -h, --help            show this help message and exit
  --batchnorm           use batch normalization (default: False)
  --batchsize BATCHSIZE
                        nn training batch size (default: 1024)
  --clobber             overwrite any old outfile, if it exists (default: False)
  -d, --debug
  --dropout DROPOUT     fraction of neurons to drop in each hidden layer (0-1) (default: 0.0)
  --epochs EPOCHS       number of training epochs (default: 30)
  --fhr FHR [FHR ...]   train with these forecast hours. Testing scripts only use this list to
                        verify correct model for testing; no filter applied to testing data.
                        In other words you test on all forecast hours in the testing data,
                        regardless of whether the model was trained with the same forecast
                        hours. (default: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
                        33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48])
  --fits FITS [FITS ...]
                        work on specific fit(s) so you can run many in parallel (default:
                        None)
  --flash FLASH         GLM flash count threshold (default: 10)
  --folds FOLDS [FOLDS ...]
                        work on specific fold(s) so you can run many in parallel (default:
                        None)
  --glm                 Use GLM (default: False)
  --kfold KFOLD         apply kfold cross validation to training set (default: 5)
  --ifile IFILE         Read this parquet input file. Otherwise guess which one to read.
                        (default: None)
  --learning_rate LEARNING_RATE
                        learning rate (default: 0.001)
  --model {HRRR,NSC1km,NSC3km-12sec,NSC15km}
                        prediction model (default: HRRR)
  --neurons NEURONS [NEURONS ...]
                        number of neurons in each nn layer (default: [16,16])
  --nfits NFITS         number of times to fit (train) model (default: 5)
  --nprocs NPROCS       verify this many forecast hours in parallel (default: 0)
  --optimizer {adam,sgd}
                        optimizer (default: adam)
  --reg_penalty REG_PENALTY
                        L2 regularization factor (default: 0.01)
  --rptdist RPTDIST     severe weather report max distance (default: 40)
  --savedmodel SAVEDMODEL
                        filename of machine learning model (default: None)
  --seed SEED           random number seed for reproducibility (default: None)
  --trainend TRAINEND   training set end (default: None)
  --trainstart TRAINSTART
                        training set start (default: None)
  --testend TESTEND     testing set end (default: 20220101T00)
  --teststart TESTSTART
                        testing set start (default: 20201202T12)
  --suite SUITE         name for suite of training features (default: default)
  --twin TWIN           time window in hours (default: 2)
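
A hypothetical example invocation, using only flags documented above (the date values are illustrative):

python train_stormrpts_dnn.py --model HRRR --neurons 16 16 --epochs 30 --dropout 0.1 --batchnorm --trainstart 20190101T00 --trainend 20201201T00 --seed 42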

Test Dense Neural Network

test_stormrpts_dnn.py

usage: test_stormrpts_dnn.py [-h] [--batchnorm] [--batchsize BATCHSIZE] [--clobber] [-d]
                             [--dropout DROPOUT] [--epochs EPOCHS] [--fhr FHR [FHR ...]]
                             [--fits FITS [FITS ...]] [--flash FLASH]
                             [--folds FOLDS [FOLDS ...]] [--glm] [--kfold KFOLD]
                             [--ifile IFILE] [--learning_rate LEARNING_RATE]
                             [--model {HRRR,NSC1km,NSC3km-12sec,NSC15km}]
                             [--neurons NEURONS [NEURONS ...]] [--nfits NFITS]
                             [--nprocs NPROCS] [--optimizer {adam,sgd}]
                             [--reg_penalty REG_PENALTY] [--rptdist RPTDIST]
                             [--savedmodel SAVEDMODEL] [--seed SEED] [--trainend TRAINEND]
                             [--trainstart TRAINSTART] [--testend TESTEND]
                             [--teststart TESTSTART] [--suite SUITE] [--twin TWIN]

train/test dense neural network

options:
  -h, --help            show this help message and exit
  --batchnorm           use batch normalization (default: False)
  --batchsize BATCHSIZE
                        nn training batch size (default: 1024)
  --clobber             overwrite any old outfile, if it exists (default: False)
  -d, --debug
  --dropout DROPOUT     fraction of neurons to drop in each hidden layer (0-1) (default: 0.0)
  --epochs EPOCHS       number of training epochs (default: 30)
  --fhr FHR [FHR ...]   train with these forecast hours. Testing scripts only use this list to
                        verify correct model for testing; no filter applied to testing data.
                        In other words you test on all forecast hours in the testing data,
                        regardless of whether the model was trained with the same forecast
                        hours. (default: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
                        33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48])
  --fits FITS [FITS ...]
                        work on specific fit(s) so you can run many in parallel (default:
                        None)
  --flash FLASH         GLM flash count threshold (default: 10)
  --folds FOLDS [FOLDS ...]
                        work on specific fold(s) so you can run many in parallel (default:
                        None)
  --glm                 Use GLM (default: False)
  --kfold KFOLD         apply kfold cross validation to training set (default: 5)
  --ifile IFILE         Read this parquet input file. Otherwise guess which one to read.
                        (default: None)
  --learning_rate LEARNING_RATE
                        learning rate (default: 0.001)
  --model {HRRR,NSC1km,NSC3km-12sec,NSC15km}
                        prediction model (default: HRRR)
  --neurons NEURONS [NEURONS ...]
                        number of neurons in each nn layer (default: [16,16])
  --nfits NFITS         number of times to fit (train) model (default: 5)
  --nprocs NPROCS       verify this many forecast hours in parallel (default: 0)
  --optimizer {adam,sgd}
                        optimizer (default: adam)
  --reg_penalty REG_PENALTY
                        L2 regularization factor (default: 0.01)
  --rptdist RPTDIST     severe weather report max distance (default: 40)
  --savedmodel SAVEDMODEL
                        filename of machine learning model (default: None)
  --seed SEED           random number seed for reproducibility (default: None)
  --trainend TRAINEND   training set end (default: None)
  --trainstart TRAINSTART
                        training set start (default: None)
  --testend TESTEND     testing set end (default: 20220101T00)
  --teststart TESTSTART
                        testing set start (default: 20201202T12)
  --suite SUITE         name for suite of training features (default: default)
  --twin TWIN           time window in hours (default: 2)
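
The testing script accepts the same arguments as the training script and uses them to verify that the matching trained model is tested. A hypothetical invocation mirroring the training example above:

python test_stormrpts_dnn.py --model HRRR --neurons 16 16 --epochs 30 --dropout 0.1 --batchnorm --trainstart 20190101T00 --trainend 20201201T00 --seed 42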

history notes

Jan 2023

Eliminate layers argument

The layers argument is eliminated; the number of layers is now inferred from the length of the neurons list, which also allows each layer to have a different number of neurons. In the filenames, remove the ".xlayer" substring and repeat the neuron count once per layer: with a 3-layer model, for example, ".16n." becomes ".16n16n16n."; 1-layer model names are unchanged.
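
A minimal sketch of the new convention (the helper name neurons_substring is hypothetical; the scripts may assemble filenames differently):

def neurons_substring(neurons):
    # Build the filename substring from per-layer neuron counts,
    # e.g. [16, 16, 16] -> "16n16n16n" and [1024] -> "1024n".
    return "".join(f"{n}n" for n in neurons)

neurons = [16, 16, 16]  # one entry per hidden layer
nlayers = len(neurons)  # layer count inferred from the list length
print(nlayers, neurons_substring(neurons))  # 3 16n16n16n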

clean up

Cleaned up nn/ directory by moving 250+ hyperparameter search models to nn/hyperparam_search.HRRR/.

Models trained in particular regions masked by convective mode probability were moved to nn/modemask.NSC/.

Removed nn_ prefix from saved model names.

orphan scores.txt files

scores.txt files with no corresponding ML model were tucked away in nn/orphan_score_files/. Unfortunately, these two scores.txt files, which showed improvement with storm mode for tornado forecasts, have no corresponding model:

  • NSC3km-12sec.default.rpt_40km_2hr.1024n.ep10.f01-f48.bs1024.SGD.L20.01.lr0.01.0.0.1fold.scores.txt
  • nn_NSC3km-12sec.with_CNN_DNN_storm_mode_nprob.rpt_40km_2hr.1024n.ep10.f01-f48.bs1024.SGD.L20.01.lr0.01.0.0.1fold.scores.txt

New models trained with the same hyperparameters did not show the same improvement with storm mode. The earlier results may have been due to a code bug (e.g., inconsistent training and testing time periods, forecast hour ranges, or scaling factors), a small sample size (noise), buggy variables in the 3-km training set (W_DN_MAX, W_DN_MIN, and the yearly and daily time sin/cos components), fewer training variables (LR75, MLCINH, REFL_COM, UP_HELI_MIN), or a longer training and testing period.

NSC training period changed

An older iteration of 3-km NSC data went from 20101024 to 20191020.

Now, with 1-km and 15-km NSC data available for comparison, and because the 1-km data are so expensive, the time range ends at 20170330. Since the older iteration had more 2017, 2018, and 2019 data, it made sense to partition the training and testing data as late as 20160701; that partition still allowed a full season in the testing set. Now that the data stop at 20170330, an earlier partition, 20160101, ensures a full season in the testing set. Old models trained through 20160701 and tested through 20191020 were moved to the subdirectory nn/trainend20160701.NSC/.
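
A minimal sketch of the new partition, assuming cases are labeled by initialization time (the column name initialization_time and the sample values are hypothetical):

import pandas as pd

# Hypothetical cases labeled by initialization time (YYYYMMDD).
df = pd.DataFrame({"initialization_time": ["20101024", "20151231", "20160315", "20170330"]})
itime = pd.to_datetime(df["initialization_time"])

train = df[itime < "20160101"]                            # training set
test = df[(itime >= "20160101") & (itime <= "20170330")]  # full season in the testing set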

correct forecast hour range

Corrected the fhr list going forward, both in config.yaml and in output filenames. It had been hard-coded to f01-f48 for a long time, which made sense for HRRR, but NSC only goes to fhr=36. Moreover, if you want to train with storm mode, the range is f12-f35.

Trim the training set by eliminating forecast hours not in the requested list (args.fhr), as sketched below. Note: testing scripts only check the requested fhr list to ensure the correct model is used for testing; testing data are not trimmed by forecast hour. In other words, you may test on data from forecast hours 1-11 even if the model was trained only with 12-35.
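
A minimal sketch of this trimming, assuming the training data sit in a pandas DataFrame with a forecast-hour column (the column name fhr and the sample values are hypothetical):

import pandas as pd

# Hypothetical training frame with a forecast-hour column.
df_train = pd.DataFrame({"fhr": [1, 12, 20, 40], "x": [0.1, 0.2, 0.3, 0.4]})
requested_fhr = list(range(12, 36))  # e.g. f12-f35 when training with storm mode

# Training set: drop rows whose forecast hour is not in the requested list.
df_train = df_train[df_train["fhr"].isin(requested_fhr)]

# Testing set: no such filter; the requested list is only compared against
# the saved model to verify that the correct model is being tested.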

Dec 2021

Accidentally deleted all important .py scripts (except HWT_mode_train.py) by adding them to git and then removing the .git directory while trying to change the branch from master to main.

Removed files matching: catalog.py, check*py, com*py, ens*py, get*py, HWT*py, loop_through_dates.py, make_scaler.py, ncar_ensemble_num_fields.py, neural_network_train_gridded.py, random_forest_preprocess_gridded.py, read_pred.py, run_HWT_mode_train.py, saveNewMap.py, scalar2vector.py, show_importances.py, showtop2021HWT.py, verify_forecasts_bss_spatial.py
