Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correctly calculate new_high_sigma #714

Open
wants to merge 3 commits into
base: dev
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 35 additions & 53 deletions vast_pipeline/pipeline/new_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,14 +185,14 @@ def get_image_rms_measurements(
return group


def parallel_get_rms_measurements(
def parallel_get_new_high_sigma(
df: pd.DataFrame, edge_buffer: float = 1.0
) -> pd.DataFrame:
"""
Wrapper function to use 'get_image_rms_measurements'
in parallel with Dask. nbeam is not an option here as that parameter
is fixed in forced extraction and so is made sure to be fixed here to. This
may change in the future.
Wrapper function to use 'get_image_rms_measurements' in parallel with Dask
and calculate the new high sigma. nbeam is not an option here as that
parameter is fixed in forced extraction and so is made sure to be fixed
here too. This may change in the future.

Args:
df:
Expand Down Expand Up @@ -233,30 +233,48 @@ def parallel_get_rms_measurements(
).compute(num_workers=n_cpu, scheduler='processes')
)

# We don't need all of the RMS measurements, just the lowest. Keeping all
# of them results in huge memory usage when merging. However, there is an
# existing bug: https://github.com/askap-vast/vast-pipeline/issues/713
# that means that we actually want the _highest_ in order to reproduce the
# current behaviour. Fixing the bug is beyond the scope of this PR because
# it means rewriting tests and test data.

df_to_merge = (df.drop_duplicates('source')
.drop(['img_diff_rms_path'], axis=1)
)

out_to_merge = (out.sort_values(
by=['source', 'img_diff_true_rms'], ascending=False
by=['source', 'img_diff_true_rms'], ascending=True
)
.drop_duplicates('source')
)

df = df_to_merge.merge(
new_sources_df = df_to_merge.merge(
out_to_merge[['source', 'img_diff_true_rms']],
left_on='source', right_on='source',
how='left'
)

# this removes those that are out of range
new_sources_df['img_diff_true_rms'] = (
new_sources_df['img_diff_true_rms'].fillna(0.)
)
new_sources_df = new_sources_df[
new_sources_df['img_diff_true_rms'] != 0
]

return df
# calculate the true sigma
new_sources_df['true_sigma'] = (
new_sources_df['flux_peak'].values
/ new_sources_df['img_diff_true_rms'].values
)

    # keep only the highest for each source, rename for the database
new_sources_df = (
new_sources_df
.set_index('source')
.rename(columns={'true_sigma': 'new_high_sigma'})
)

    # moving forward only the new_high_sigma column is needed, drop all
# others.
new_sources_df = new_sources_df[['new_high_sigma']]

return new_sources_df


def new_sources(
Expand Down Expand Up @@ -429,48 +447,12 @@ def new_sources(
# to measure the true rms at the source location.

# measure the actual rms in the previous images at
# the source location.

# PR#713: This part of the code should be rewritten to reflect the new
# behaviour of parallel_get_rms_measurements. That function should be
# renamed to something like parallel_get_new_high_sigma and all of the
# subsequent code in this function moved into it.
# the source location and calculate the corresponding S/N

new_sources_df = parallel_get_rms_measurements(
new_sources_df = parallel_get_new_high_sigma(
new_sources_df, edge_buffer=edge_buffer
)

# this removes those that are out of range
new_sources_df['img_diff_true_rms'] = (
new_sources_df['img_diff_true_rms'].fillna(0.)
)
new_sources_df = new_sources_df[
new_sources_df['img_diff_true_rms'] != 0
]

# calculate the true sigma
new_sources_df['true_sigma'] = (
new_sources_df['flux_peak'].values
/ new_sources_df['img_diff_true_rms'].values
)

# We only care about the highest true sigma
# new_sources_df = new_sources_df.sort_values(
# by=['source', 'true_sigma']
# )

# keep only the highest for each source, rename for the daatabase
new_sources_df = (
new_sources_df
# .drop_duplicates('source')
.set_index('source')
.rename(columns={'true_sigma': 'new_high_sigma'})
)

# moving forward only the new_high_sigma columns is needed, drop all
# others.
new_sources_df = new_sources_df[['new_high_sigma']]

logger.info(
'Total new source analysis time: %.2f seconds', timer.reset_init()
)
Expand Down
Loading