Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
Browse files
Merge pull request #8 from radibnia77/main
Improve accuracy
  • Loading branch information
xun-hu-at-futurewei-com committed Jun 25, 2021
2 parents 119175f + 78db902 commit 26b7e738022c14a1075198f52b65fa36ef0c0f4e
Showing 4 changed files with 20 additions and 15 deletions.
@@ -1,5 +1,5 @@
product_tag: 'dlpm'
pipeline_tag: '05182021_1500'
pipeline_tag: '05182021_1500' # IMPORTANT: The pipeline tag has to be changed before each run to prevent record duplication.
factdata_table_name: 'factdata_hq_09222020'

log:
@@ -61,7 +61,7 @@ pipeline:
cluster_dense_num_ratio_cap: 0.01
datapoints_min_th: 0.12 #was [0.15]
datapoints_th_uckeys: 0.12
datapoints_th_clusters: 0.2
datapoints_th_clusters: 0.5
popularity_norm: 0.01
popularity_th: 4
median_popularity_of_dense: 1856.2833251953125 # median imp of sparse=False, calculate once
@@ -141,11 +141,12 @@ def remove_weak_uckeys(df, popularity_th, datapoints_min_th):
return df


# denoise: adds a 'nonzero_p' column to df, rewrites the 'ts' column with
# small entries replaced by 0, and returns the resulting DataFrame.
def denoise(df):
def denoise(df, percentile):
# nonzero_p = sum(ts) / (number of non-zero entries in ts),
# or 0.0 when ts contains no non-zero entry (avoids division by zero).
df = df.withColumn('nonzero_p', udf(
lambda ts: 1.0 * sum(ts) / len([_ for _ in ts if _ != 0]) if len(
[_ for _ in ts if _ != 0]) != 0 else 0.0, FloatType())(df.ts))
df = df.withColumn('ts', udf(lambda ts, nonzero_p: [i if i and i > (nonzero_p / 10.0) else 0 for i in ts],

# Keep an entry only if it is truthy and strictly greater than
# nonzero_p / percentile; every other entry is stored as 0.
df = df.withColumn('ts', udf(lambda ts, nonzero_p: [i if i and i > (nonzero_p / percentile) else 0 for i in ts],
ArrayType(IntegerType()))(df.ts, df.nonzero_p))
return df

@@ -182,12 +183,9 @@ def run(hive_context, cluster_size_cfg, input_table_name,
# remove weak uckeys
df = remove_weak_uckeys(df, popularity_th, datapoints_min_th)

# replace nan and zero with median
# replace nan with zero
df = transform.replace_nan_with_zero(df)

# denoising uckeys: remove some datapoints of the uckey
df = denoise(df)

# add normalized popularity = mean_n
# df, _ = transform.normalize_ohe_feature(df, ohe_feature='p')

@@ -250,13 +248,10 @@ def run(hive_context, cluster_size_cfg, input_table_name,
# add normalized popularity = mean_n
df, _ = transform.normalize_ohe_feature(df, ohe_feature='p')

df = df.filter(udf(lambda p_n, ts: not is_spare(datapoints_th_clusters, -
sys.maxsize-1)(p_n, ts), BooleanType())(df.p_n, df.ts))
df = df.filter(udf(lambda p_n, ts: not is_spare(datapoints_th_clusters, -sys.maxsize - 1)(p_n, ts), BooleanType())(df.p_n, df.ts))

# mean/10 for now, mean = mean of (non zero ts)
df = df.withColumn('nonzero_p', udf(lambda ts: 1.0 * sum([_ for _ in ts if _ != 0])/(len([_ for _ in ts if _ != 0])), FloatType())(df.ts))

df = df.withColumn('ts', udf(lambda ts, nonzero_p: [_ if _ > (nonzero_p/percentile) else 0 for _ in ts], ArrayType(IntegerType()))(df.ts, df.nonzero_p))
# denoising uckeys: remove some datapoints of the uckey
df = denoise(df, percentile)

__save_as_table(df, output_table_name, hive_context, True)

@@ -182,7 +182,7 @@ def run(cfg):
quarter_autocorr = numpy.ones((x[0].size,), dtype=float)
page_indx = list(x[0])

fill_isolated_zeros(x[21])
# fill_isolated_zeros(x[21])
tensors = dict(
hits=pd.DataFrame(x[21], index=page_indx, columns=date_list),
lagged_ix=lagged_ix,
@@ -61,6 +61,14 @@ def normalize_ts(ts):

def predict_daily_uckey(days, serving_url, forecaster, model_stats, columns):

def _denoise(ts, percentile=10.0):
    """Zero out the entries of ``ts`` that are small relative to its non-zero mean.

    The threshold is ``(sum(ts) / number of non-zero entries) / percentile``.
    Every entry that is not strictly greater than the threshold is replaced
    with 0. When ``ts`` has no non-zero entry the threshold is 0.0.

    Args:
        ts: sequence of numeric values.
        percentile: positive divisor applied to the non-zero mean to obtain
            the threshold. Defaults to 10.0, the value that used to be
            hard-coded here, so existing single-argument calls are unchanged.

    Returns:
        A new list with the same length as ``ts``; ``ts`` is not modified.

    Raises:
        ValueError: if ``percentile`` is not strictly positive.
    """
    if percentile <= 0:
        raise ValueError('percentile must be positive, got %r' % (percentile,))

    non_zero_ts = [value for value in ts if value != 0]

    # Mean over the non-zero entries only; stays 0.0 for an all-zero or
    # empty series so that no division by zero can happen.
    nonzero_p = 0.0
    if non_zero_ts:
        nonzero_p = 1.0 * sum(ts) / len(non_zero_ts)

    threshold = nonzero_p / percentile
    return [value if value > threshold else 0 for value in ts]

def _helper(cols):
day_list = days[:]
ucdoc_attribute_map = {}
@@ -88,6 +96,8 @@ def _helper(cols):
# remove science 06/21/2021
# model_input_ts = replace_with_median(model_input_ts)

model_input_ts = _denoise(model_input_ts)

ts_n = normalize_ts(model_input_ts)
ucdoc_attribute_map['ts_n'] = ts_n

0 comments on commit 26b7e73

Please sign in to comment.