This repository has been archived by the owner. It is now read-only.
Merge pull request #9 from radibnia77/main
Improve accuracy
xun-hu-at-futurewei-com committed Jul 15, 2021
2 parents 26b7e73 + d531f51 commit 41c31687dccd13bcf8b098ae3e2cb0fae5d044b5
Showing 9 changed files with 1,229 additions and 159 deletions.
@@ -0,0 +1,188 @@
product_tag: 'dlpm'
pipeline_tag: '05222021'
factdata_table_name: 'factdata_hq_09222020'

log:
level: 'WARN' # log level for Spark and the application

pipeline:
filter: # data filtering on si and region
region_mapping_table: 'region_mapping'
output_table_name: '{product_tag}_{pipeline_tag}_tmp_area_map'
init_start_bucket: 0
bucket_size: 1000
bucket_step: 100
new_bucket_size: 10
condition: ''
new_si_list: [
'06',
'11',
'05',
'04',
'03',
'02',
'01',
'l03493p0r3',
'x0ej5xhk60kjwq',
'g7m2zuits8',
'w3wx3nv9ow5i97',
'a1nvkhk62q',
'g9iv6p4sjy',
'c4n08ku47t',
'b6le0s4qo8',
'd9jucwkpr3',
'p7gsrebd4m',
'a8syykhszz',
'l2d4ec6csv',
'j1430itab9wj3b',
's4z85pd1h8',
'z041bf6g4s',
'71bcd2720e5011e79bc8fa163e05184e',
'a47eavw7ex',
'68bcd2720e5011e79bc8fa163e05184e',
'66bcd2720e5011e79bc8fa163e05184e',
'72bcd2720e5011e79bc8fa163e05184e',
'f1iprgyl13',
'q4jtehrqn2',
'm1040xexan',
'd971z9825e',
'a290af82884e11e5bdec00163e291137',
'w9fmyd5r0i',
'x2fpfbm8rt',
'e351de37263311e6af7500163e291137',
'k4werqx13k',
'5cd1c663263511e6af7500163e291137',
'17dd6d8098bf11e5bdec00163e291137',
'd4d7362e879511e5bdec00163e291137',
'15e9ddce941b11e5bdec00163e291137' ]
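The bucket parameters above (init_start_bucket, bucket_size, bucket_step) drive batched processing of the factdata table. A minimal sketch of the iteration they imply, with illustrative names rather than the pipeline's actual code:

```python
# Illustrative only: walk the bucket space in bucket_step-sized batches,
# assuming rows carry a bucket id in [0, bucket_size).
def bucket_batches(init_start_bucket, bucket_size, bucket_step):
    start = init_start_bucket
    while start < bucket_size:
        end = min(start + bucket_step, bucket_size)
        yield start, end  # e.g. (0, 100), (100, 200), ...
        start = end

for lo, hi in bucket_batches(0, 1000, 100):
    pass  # e.g. process rows WHERE bucket_id >= lo AND bucket_id < hi
```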
time_series: # This is done on the whole bucketized data
input_table_name: '{product_tag}_{pipeline_tag}_tmp_area_map'
conditions: []
yesterday: "2020-05-31" # data from -<prepare_past_days> to -1 (yesterday) is used for training
prepare_past_days: 90
bucket_size: 10 # maximum number of buckets to process starting from 0
bucket_step: 1 # size of bucket batch that is processed in one iteration
output_table_name: '{product_tag}_{pipeline_tag}_tmp_ts' # Hive table that holds the cleansed and normalized data before it is written into tfrecords; overwrites the existing table
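For reference, the training window implied by yesterday and prepare_past_days can be computed as below; this is a sketch that assumes both endpoints are inclusive, which the pipeline's own date handling may or may not match:

```python
from datetime import datetime, timedelta

yesterday = datetime.strptime("2020-05-31", "%Y-%m-%d")
prepare_past_days = 90

# Assuming an inclusive 90-day window ending at `yesterday`.
start_date = yesterday - timedelta(days=prepare_past_days - 1)
print(start_date.date(), "->", yesterday.date())  # 2020-03-03 -> 2020-05-31
```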
uckey_clustring: # This is done on the whole data, not on bucket slices
pre_cluster_table_name: '{product_tag}_{pipeline_tag}_tmp_pre_cluster'
create_pre_cluster_table: True
output_table_name: '{product_tag}_{pipeline_tag}_tmp_cluster'
cluster_size:
number_of_virtual_clusters: 1000
datapoints_min_th: 0.05 # was [0.15]
datapoints_th_uckeys: 0.5
datapoints_th_clusters: 0.5
popularity_norm: 0.01
popularity_th: 5
median_popularity_of_dense: 1856.2833251953125 # median impressions of dense (sparse=False) uckeys; calculated once
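The thresholds above decide which uckeys are treated as dense. The gating below is a hypothetical reconstruction from the parameter names (is_dense, the nonzero-ratio test, and the mean-based popularity are all assumptions, not the pipeline's code):

```python
# Hypothetical dense/sparse gate implied by the threshold names above.
def is_dense(ts, datapoints_min_th=0.05, popularity_th=5):
    nonzero_ratio = sum(1 for v in ts if v > 0) / len(ts)
    popularity = sum(ts) / len(ts)  # assumed: mean daily impressions
    return nonzero_ratio >= datapoints_min_th and popularity >= popularity_th

print(is_dense([0, 0, 3, 7, 12, 0, 9, 11, 6, 4]))  # True
```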
normalization: # This is done on the whole data, not on bucket slices
output_table_name: '{product_tag}_{pipeline_tag}_trainready'
columns: {
'price_cat':['1','2','3'],
'a': ['','1','2','3','4','5','6'],
'g':['','g_f','g_m','g_x'],
't':['UNKNOWN','3G','4G','WIFI','2G'],
'r':['', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82'],
'ipl':['', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82'],
'si':[
'06',
'11',
'05',
'04',
'03',
'02',
'01',
'l03493p0r3',
'x0ej5xhk60kjwq',
'g7m2zuits8',
'w3wx3nv9ow5i97',
'a1nvkhk62q',
'g9iv6p4sjy',
'c4n08ku47t',
'b6le0s4qo8',
'd9jucwkpr3',
'p7gsrebd4m',
'a8syykhszz',
'l2d4ec6csv',
'j1430itab9wj3b',
's4z85pd1h8',
'z041bf6g4s',
'71bcd2720e5011e79bc8fa163e05184e',
'a47eavw7ex',
'68bcd2720e5011e79bc8fa163e05184e',
'66bcd2720e5011e79bc8fa163e05184e',
'72bcd2720e5011e79bc8fa163e05184e',
'f1iprgyl13',
'q4jtehrqn2',
'm1040xexan',
'd971z9825e',
'a290af82884e11e5bdec00163e291137',
'w9fmyd5r0i',
'x2fpfbm8rt',
'e351de37263311e6af7500163e291137',
'k4werqx13k',
'5cd1c663263511e6af7500163e291137',
'17dd6d8098bf11e5bdec00163e291137',
'd4d7362e879511e5bdec00163e291137',
'15e9ddce941b11e5bdec00163e291137' ]
}
holidays: ['2019-11-09', '2019-11-10', '2019-11-11', '2019-11-25', '2019-11-26', '2019-11-27','2019-11-28', '2019-12-24','2019-12-25', '2019-12-26','2019-12-31', '2020-01-01', '2020-01-02', '2020-01-19','2020-01-20', '2020-01-21', '2020-01-22', '2020-01-23', '2020-01-24', '2020-01-25', '2020-02-08']
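The columns map fixes the categorical vocabulary per attribute; normalization then amounts to encoding each value against its vocabulary. A minimal sketch of that idea (one_hot is illustrative, not the pipeline's encoder):

```python
# Illustrative one-hot encoding against a fixed vocabulary,
# as implied by the 'columns' map above.
def one_hot(value, vocabulary):
    vec = [0] * len(vocabulary)
    vec[vocabulary.index(value)] = 1
    return vec

print(one_hot('g_m', ['', 'g_f', 'g_m', 'g_x']))  # [0, 0, 1, 0]
```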
tfrecords:
tfrecords_hdfs_path: 'factdata.tfrecord.{pipeline_tag}' # HDFS location of the tfrecords; overwrites the existing files
tf_statistics_path: './tf_statistics_{pipeline_tag}.pkl'
distribution:
output_table_name: '{product_tag}_{pipeline_tag}_tmp_distribution'
output_detail_table_name: '{product_tag}_{pipeline_tag}_tmp_distribution_detail'

tfrecorder_reader:
tfrecords_local_path: './factdata.tfrecord.{pipeline_tag}' # local path of the tfrecords; overwrites the existing files
data_dir: 'data/vars'
valid_threshold: 0.0 # default=0.0, type=float, help="Series minimal length threshold (pct of data length)"
add_days: 0 # default=64, type=int, help="Add N days in the future for prediction"
start: '' # help="Effective start date. Data before the start is dropped"
end: '' # help="Effective end date. Data past the end is dropped"
corr_backoffset: 0 # default=0, type=int, help="Offset for correlation calculation"
batch_size: 11880 # batch size of examples in the tfrecord
duration: 90 # time series length; must be less than or equal to prepare_past_days
tf_statistics_path: './tf_statistics_{pipeline_tag}.pkl'
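These settings feed the tfrecord reader. A bare-bones sketch of streaming such a file with tf.data (TF 2.x eager iteration assumed; the feature spec and parsing live elsewhere in the pipeline, so they are omitted):

```python
import tensorflow as tf

# Sketch: read serialized tf.Example records in config-sized batches.
dataset = tf.data.TFRecordDataset(['./factdata.tfrecord.05222021'])
dataset = dataset.batch(11880)  # batch_size from the config above
for batch in dataset.take(1):
    print(batch.shape)  # up to 11880 serialized example strings
```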

trainer:
name: 's32' # default='s32', help='Model name to identify different logs/checkpoints'
hparam_set: 's32' # default='s32', help="Hyperparameters set to use (see hparams.py for available sets)"
n_models: 1 # default=1, type=int, help="Jointly train n models with different seeds"
multi_gpu: false # default=False, action='store_true', help="Use multiple GPUs for multi-model training, one GPU per model"
seed: 5 # default=5, type=int, help="Random seed"
logdir: 'data/logs' # default='data/logs', help="Directory for summary logs"
max_epoch: 250 # type=int, default=100, help="Max number of epochs"
patience: 2 # type=int, default=2, help="Early stopping: stop after N epochs without improvement. Requires do_eval=True"
train_sampling: 1.0 # type=float, default=1.0, help="Sample this percent of data for training"
eval_sampling: 1.0 # type=float, default=1.0, help="Sample this percent of data for evaluation"
eval_memsize: 15 # type=int, default=5, help="Approximate amount of available memory on GPU, used for calculation of optimal evaluation batch size"
gpu: 0 # default=0, type=int, help='GPU instance to use'
gpu_allow_growth: false # default=False, action='store_true', help='Allow to gradually increase GPU memory usage instead of grabbing all available memory at start'
save_best_model: false # default=False, action='store_true', help='Save best model during training. Requires do_eval=True'
forward_split: false # default=True, dest='forward_split', action='store_false', help='Use walk-forward split for model evaluation. Requires do_eval=True'
side_split: false # default=False, action='store_true', help='Use side split for model evaluation. Requires do_eval=True'
do_eval: false # default=True, dest='do_eval', action='store_false', help="Don't evaluate model quality during training"
write_summaries: true # default=True, dest='write_summaries', action='store_false', help="Don't write TensorFlow summaries"
verbose: false # default=False, action='store_true', help='Print additional information during graph construction'
asgd_decay: 0.99 # type=float, help="EMA decay for averaged SGD. ASGD is not used if this is not set"
tqdm: true # default=True, dest='tqdm', action='store_false', help="Don't use tqdm for status display during training"
max_steps: 20000 # type=int, help="Stop training after max steps"
save_from_step: 100 # type=int, help="Save model on each evaluation (10 evals per epoch), starting from this step"
predict_window: 10 # default=3, type=int, help="Number of days to predict"
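The inline comments mirror the trainer's argparse definitions. One plausible way a YAML section like this can override those defaults, sketched under the assumption that the trainer builds a standard argparse parser:

```python
import argparse
import yaml

parser = argparse.ArgumentParser()
parser.add_argument('--max_epoch', type=int, default=100)
parser.add_argument('--predict_window', type=int, default=3)

with open('config.yml') as f:  # file name is illustrative
    trainer_cfg = yaml.safe_load(f)['trainer']

# YAML values win over the argparse defaults for the known keys.
parser.set_defaults(**{k: v for k, v in trainer_cfg.items()
                       if k in ('max_epoch', 'predict_window')})
args = parser.parse_args([])
print(args.max_epoch, args.predict_window)  # 250 10
```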

save_model:
table: '{product_tag}_{pipeline_tag}_model_stat'
data_dir: data/vars
ckpt_dir: data/cpt/s32
saved_dir: data/vars
model_version: 'version_{pipeline_tag}'
model_name: 'model_{product_tag}_{pipeline_tag}'
train_window: 60 # should be the same as the one in hparams

elastic_search:
es_host: "10.213.37.41"
es_port: 9200
es_index: 'model_stats'
es_type: 'stat'
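Table and path names throughout this file embed {product_tag} and {pipeline_tag} placeholders. A hedged sketch of resolving them after loading (resolve and the config file name are assumptions; the pipeline's own resolver may differ):

```python
import yaml

with open('config.yml') as f:  # file name is illustrative
    cfg = yaml.safe_load(f)

tags = {'product_tag': cfg['product_tag'], 'pipeline_tag': cfg['pipeline_tag']}

def resolve(node):
    # Recursively expand the two placeholders in every string value.
    if isinstance(node, str):
        return node.format(**tags)
    if isinstance(node, dict):
        return {k: resolve(v) for k, v in node.items()}
    if isinstance(node, list):
        return [resolve(v) for v in node]
    return node

cfg = resolve(cfg)
print(cfg['pipeline']['filter']['output_table_name'])  # dlpm_05222021_tmp_area_map
```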
@@ -0,0 +1,162 @@
product_tag: 'dlpm'
pipeline_tag: '05142021_1500'
factdata_table_name: 'factdata_hq_09222020'

log:
level: 'WARN' # log level for Spark and the application

pipeline:
filter: # data filtering on si and region
region_mapping_table: 'region_mapping'
output_table_name: '{product_tag}_{pipeline_tag}_tmp_area_map'
init_start_bucket: 0
bucket_size: 1000
bucket_step: 100
new_bucket_size: 10
condition: ''
new_si_list: [
'15e9ddce941b11e5bdec00163e291137',
'66bcd2720e5011e79bc8fa163e05184e',
'7b0d7b55ab0c11e68b7900163e3e481d',
'a8syykhszz',
'w3wx3nv9ow5i97',
'x2fpfbm8rt',
'17dd6d8098bf11e5bdec00163e291137',
'5cd1c663263511e6af7500163e291137',
'68bcd2720e5011e79bc8fa163e05184e',
'71bcd2720e5011e79bc8fa163e05184e',
'a290af82884e11e5bdec00163e291137',
'a47eavw7ex',
'b6le0s4qo8',
'd4d7362e879511e5bdec00163e291137',
'd971z9825e',
'd9jucwkpr3',
'e351de37263311e6af7500163e291137',
'f1iprgyl13',
'j1430itab9wj3b',
'k4werqx13k',
'l03493p0r3',
'l2d4ec6csv',
'p7gsrebd4m',
's4z85pd1h8',
'w9fmyd5r0i',
'x0ej5xhk60kjwq',
'z041bf6g4s']
time_series: # This is done on the whole bucketized data
input_table_name: '{product_tag}_{pipeline_tag}_tmp_area_map'
conditions: []
yesterday: "2020-05-31" # data from -<prepare_past_days> to -1 (yesterday) is used for training
prepare_past_days: 90
bucket_size: 10 # maximum number of buckets to process starting from 0
bucket_step: 1 # size of bucket batch that is processed in one iteration
output_table_name: '{product_tag}_{pipeline_tag}_tmp_ts' # Hive table that holds the cleansed and normalized data before it is written into tfrecords; overwrites the existing table
uckey_clustring: # This is done on the whole data, not on bucket slices
pre_cluster_table_name: '{product_tag}_{pipeline_tag}_tmp_pre_cluster'
create_pre_cluster_table: True
output_table_name: '{product_tag}_{pipeline_tag}_tmp_cluster'
cluster_size:
number_of_virtual_clusters: 50 # was [1000]
datapoints_min_th: 0.05 # was [0.15]
datapoints_th_uckeys: 0 # was [0.5]
datapoints_th_clusters: 0.5
popularity_norm: 0.01
popularity_th: 5
median_popularity_of_dense: 1856.2833251953125 # median impressions of dense (sparse=False) uckeys; calculated once
normalization: # This is done on the whole data, not on bucket slices
output_table_name: '{product_tag}_{pipeline_tag}_trainready'
columns: {
'price_cat':['1','2','3'],
'a': ['','1','2','3','4','5','6'],
'g':['','g_f','g_m','g_x'],
't':['UNKNOWN','3G','4G','WIFI','2G'],
'r':['', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82'],
'ipl':['', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82'],
'si':[
'15e9ddce941b11e5bdec00163e291137',
'66bcd2720e5011e79bc8fa163e05184e',
'7b0d7b55ab0c11e68b7900163e3e481d',
'a8syykhszz',
'w3wx3nv9ow5i97',
'x2fpfbm8rt',
'17dd6d8098bf11e5bdec00163e291137',
'5cd1c663263511e6af7500163e291137',
'68bcd2720e5011e79bc8fa163e05184e',
'71bcd2720e5011e79bc8fa163e05184e',
'a290af82884e11e5bdec00163e291137',
'a47eavw7ex',
'b6le0s4qo8',
'd4d7362e879511e5bdec00163e291137',
'd971z9825e',
'd9jucwkpr3',
'e351de37263311e6af7500163e291137',
'f1iprgyl13',
'j1430itab9wj3b',
'k4werqx13k',
'l03493p0r3',
'l2d4ec6csv',
'p7gsrebd4m',
's4z85pd1h8',
'w9fmyd5r0i',
'x0ej5xhk60kjwq',
'z041bf6g4s']
}
holidays: ['2019-11-09', '2019-11-10', '2019-11-11', '2019-11-25', '2019-11-26', '2019-11-27','2019-11-28', '2019-12-24','2019-12-25', '2019-12-26','2019-12-31', '2020-01-01', '2020-01-02', '2020-01-19','2020-01-20', '2020-01-21', '2020-01-22', '2020-01-23', '2020-01-24', '2020-01-25', '2020-02-08']
tfrecords:
tfrecords_hdfs_path: 'factdata.tfrecord.{pipeline_tag}' # HDFS location of the tfrecords; overwrites the existing files
tf_statistics_path: './tf_statistics_{pipeline_tag}.pkl'
distribution:
output_table_name: '{product_tag}_{pipeline_tag}_tmp_distribution'
output_detail_table_name: '{product_tag}_{pipeline_tag}_tmp_distribution_detail'

tfrecorder_reader:
tfrecords_local_path: './factdata.tfrecord.{pipeline_tag}' # local path of the tfrecords; overwrites the existing files
data_dir: 'data/vars'
valid_threshold: 0.0 # default=0.0, type=float, help="Series minimal length threshold (pct of data length)"
add_days: 0 # default=64, type=int, help="Add N days in the future for prediction"
start: '' # help="Effective start date. Data before the start is dropped"
end: '' # help="Effective end date. Data past the end is dropped"
corr_backoffset: 0 # default=0, type=int, help="Offset for correlation calculation"
batch_size: 11880 # batch size of examples in the tfrecord
duration: 90 # time series length; must be less than or equal to prepare_past_days
tf_statistics_path: './tf_statistics_{pipeline_tag}.pkl'

trainer:
name: 's32' # default='s32', help='Model name to identify different logs/checkpoints'
hparam_set: 's32' # default='s32', help="Hyperparameters set to use (see hparams.py for available sets)"
n_models: 1 # default=1, type=int, help="Jointly train n models with different seeds"
multi_gpu: false # default=False, action='store_true', help="Use multiple GPUs for multi-model training, one GPU per model"
seed: 5 # default=5, type=int, help="Random seed"
logdir: 'data/logs' # default='data/logs', help="Directory for summary logs"
max_epoch: 250 # type=int, default=100, help="Max number of epochs"
patience: 2 # type=int, default=2, help="Early stopping: stop after N epochs without improvement. Requires do_eval=True"
train_sampling: 1.0 # type=float, default=1.0, help="Sample this percent of data for training"
eval_sampling: 1.0 # type=float, default=1.0, help="Sample this percent of data for evaluation"
eval_memsize: 15 # type=int, default=5, help="Approximate amount of available memory on GPU, used for calculation of optimal evaluation batch size"
gpu: 0 # default=0, type=int, help='GPU instance to use'
gpu_allow_growth: false # default=False, action='store_true', help='Allow to gradually increase GPU memory usage instead of grabbing all available memory at start'
save_best_model: false # default=False, action='store_true', help='Save best model during training. Requires do_eval=True'
forward_split: false # default=True, dest='forward_split', action='store_false', help='Use walk-forward split for model evaluation. Requires do_eval=True'
side_split: false # default=False, action='store_true', help='Use side split for model evaluation. Requires do_eval=True'
do_eval: false # default=True, dest='do_eval', action='store_false', help="Don't evaluate model quality during training"
write_summaries: true # default=True, dest='write_summaries', action='store_false', help="Don't write TensorFlow summaries"
verbose: false # default=False, action='store_true', help='Print additional information during graph construction'
asgd_decay: 0.99 # type=float, help="EMA decay for averaged SGD. ASGD is not used if this is not set"
tqdm: true # default=True, dest='tqdm', action='store_false', help="Don't use tqdm for status display during training"
max_steps: 20000 # type=int, help="Stop training after max steps"
save_from_step: 100 # type=int, help="Save model on each evaluation (10 evals per epoch), starting from this step"
predict_window: 10 # default=3, type=int, help="Number of days to predict"

save_model:
table: '{product_tag}_{pipeline_tag}_model_stat'
data_dir: data/vars
ckpt_dir: data/cpt/s32
saved_dir: data/vars
model_version: 'version_{pipeline_tag}'
model_name: 'model_{product_tag}_{pipeline_tag}'
train_window: 60 # should be the same as the one in hparams

elastic_search:
es_host: "10.213.37.41"
es_port: 9200
es_index: 'model_stats'
es_type: 'stat'
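The elastic_search block tells the pipeline where to publish model statistics. A minimal sketch of indexing a document with those settings, assuming a 6.x-era elasticsearch Python client (the config still carries a mapping type, es_type) and an illustrative document body:

```python
from elasticsearch import Elasticsearch  # assumes elasticsearch-py 6.x

es = Elasticsearch([{'host': '10.213.37.41', 'port': 9200}])
doc = {  # illustrative payload; the real stat document is built by the pipeline
    'model_name': 'model_dlpm_05142021_1500',
    'model_version': 'version_05142021_1500',
}
es.index(index='model_stats', doc_type='stat', body=doc)
```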
