In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import seaborn as sns

# Select seaborn's "whitegrid" style for the figures drawn later in this notebook.
sns.set_theme(style="whitegrid")


FileNotFoundError: [Errno 2] No such file or directory: 'path/to/your/peaks_summary.csv'

In [None]:
# Read every cleaned peak CSV and stack them into one combined DataFrame.
# NOTE(review): DATA_DIR is an absolute, machine-specific path; adjust it when
# running this notebook elsewhere.
DATA_DIR = Path('/home/calder/Documents/asassn-research/calder/peak_results/cleaned')
csv_paths = sorted(DATA_DIR.glob('peaks_*_clean.csv'))
if len(csv_paths) == 0:
    raise FileNotFoundError(f'No cleaned peak CSVs found in {DATA_DIR}')

dfs = {}      # per-file frames, keyed by file stem
frames = []   # same frames in load order, for the final concat
for path in csv_paths:
    file_key = path.stem
    # Fallback magnitude-bin label taken from the file name
    # (stem with the 'peaks_' and '_clean' pieces removed).
    bin_from_name = file_key.replace('peaks_', '').replace('_clean', '')

    frame = pd.read_csv(path)

    # Provenance: keep values already in the file, fill gaps with the file name.
    frame['source_file'] = (
        frame['source_file'].fillna(path.name)
        if 'source_file' in frame.columns
        else path.name
    )
    frame['file_key'] = file_key
    frame['mag_bin'] = (
        frame['mag_bin'].fillna(bin_from_name)
        if 'mag_bin' in frame.columns
        else bin_from_name
    )

    dfs[file_key] = frame
    frames.append(frame)

df = pd.concat(frames, ignore_index=True)
print(f'Loaded {len(csv_paths)} files with {len(df):,} total rows.')


In [None]:
# Scrollable display of the combined dataset
from IPython.display import display
max_rows = 500  # adjust if you want a different cap
# Put the height cap and scrolling directly on the <table> element.
# Styler prefixes every set_table_styles selector with the table's own id, so a
# 'table' selector would only target tables nested inside it and the cap would
# never apply. Inline attributes are also what the ID-matching cell uses.
display(
    df.head(max_rows).style.set_table_attributes(
        'style="display:block; max-height:400px; overflow:auto"'
    )
)


In [None]:

# First look at the combined table: a few leading rows, then the
# per-column dtype / non-null report printed by DataFrame.info().
first_rows = df.head()
display(first_rows)
df.info()


In [None]:

# Columns whose cells hold Python list literals stored as text (e.g. "[106, 176, ...]");
# these are converted to real lists further down.
# Order: g_peaks_idx, g_peaks_jd, v_peaks_idx, v_peaks_jd.
list_like_cols = [
    f"{band}_peaks_{kind}"
    for band in ("g", "v")
    for kind in ("idx", "jd")
]

def parse_literal(value):
    """Turn one cell holding a stringified Python literal into the real object.

    Parameters
    ----------
    value : object
        Usually a string such as ``"[106, 176]"``. Missing values, empty
        strings, and values that were already parsed are also accepted.

    Returns
    -------
    object
        The evaluated literal. Returns ``value`` unchanged when it is already
        a list or tuple, and ``[]`` when it is missing, empty, or cannot be
        parsed.
    """
    # Already parsed (e.g. the cell was re-run): pass through unchanged.
    # Without this guard pd.isna() returns an array for list input, which
    # either raises on truth-testing or ends up replacing the list with [].
    if isinstance(value, (list, tuple)):
        return value
    if pd.isna(value) or value == "":
        return []
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval raises TypeError for some malformed literals
        # (for instance an unhashable dict key); treat it like any other
        # unparseable cell.
        return []

# Convert the stringified list columns that are present into real lists.
for col in list_like_cols:
    if col in df.columns:
        df[col] = df[col].apply(parse_literal)

# Optional sanity check: store the parsed list length next to the reported
# count (g_n_peaks / v_n_peaks) for each band whose columns are both present.
for band in ("g", "v"):
    if {f"{band}_n_peaks", f"{band}_peaks_idx"}.issubset(df.columns):
        df[f"{band}_peaks_count_check"] = df[f"{band}_peaks_idx"].apply(len)

# Show only the columns that actually exist: the *_count_check columns are
# created conditionally above, so selecting all four unconditionally raises
# KeyError when a band is missing.
count_check_cols = [
    c
    for c in ["g_n_peaks", "g_peaks_count_check", "v_n_peaks", "v_peaks_count_check"]
    if c in df.columns
]
df[count_check_cols].head()


In [None]:

# Summary statistics for whichever dip/jump fraction columns exist,
# including the upper-tail percentiles (90th, 95th).
fraction_candidates = ("g_dip_fraction", "g_jump_fraction", "v_dip_fraction", "v_jump_fraction")
fraction_cols = [name for name in fraction_candidates if name in df.columns]
summary_percentiles = [0.25, 0.5, 0.75, 0.9, 0.95]
df[fraction_cols].describe(percentiles=summary_percentiles)


In [None]:

# Top candidates by dip fraction: one table per band, g first, then v.
top_n = 10
cols_for_display = [c for c in ["asas_sn_id", "mag_bin", "g_dip_fraction", "g_n_peaks", "v_dip_fraction", "v_n_peaks", "raw_median_min_camera", "raw_median_max_camera"] if c in df.columns]

# Guard both sort keys the same way the display columns are guarded, so a
# table missing one band is skipped instead of raising KeyError.
for sort_col in ("g_dip_fraction", "v_dip_fraction"):
    if sort_col in df.columns:
        display(df.sort_values(sort_col, ascending=False)[cols_for_display].head(top_n))


In [None]:

# Top candidates by dip fraction: one table per band, g first, then v.
# NOTE(review): this cell repeats the preceding "top candidates" cell; one of
# the two can likely be removed.
top_n = 10
cols_for_display = [c for c in ["asas_sn_id", "mag_bin", "g_dip_fraction", "g_n_peaks", "v_dip_fraction", "v_n_peaks", "raw_median_min_camera", "raw_median_max_camera"] if c in df.columns]

# Skip a band whose sort column is absent rather than raising KeyError;
# the display columns above are already filtered the same way.
for sort_col in ("g_dip_fraction", "v_dip_fraction"):
    if sort_col not in df.columns:
        continue
    ranked = df.sort_values(sort_col, ascending=False)
    display(ranked[cols_for_display].head(top_n))


In [None]:

# 2x2 grid of histograms: dip fractions on the first two panels,
# peak counts on the last two.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

# Dip-fraction panels: 30 bins each.
for ax, column, colour, title in (
    (axes[0], "g_dip_fraction", "#1f77b4", "Distribution of g_dip_fraction"),
    (axes[1], "v_dip_fraction", "#ff7f0e", "Distribution of v_dip_fraction"),
):
    sns.histplot(data=df, x=column, bins=30, ax=ax, color=colour)
    ax.set_title(title)

# Peak-count panels: half-integer bin edges running from -0.5 to max + 0.5.
for ax, column, colour, title in (
    (axes[2], "g_n_peaks", "#2ca02c", "g_n_peaks count distribution"),
    (axes[3], "v_n_peaks", "#d62728", "v_n_peaks count distribution"),
):
    half_integer_edges = np.arange(df[column].max() + 2) - 0.5
    sns.histplot(data=df, x=column, bins=half_integer_edges, ax=ax, color=colour, discrete=True)
    ax.set_title(title)

plt.tight_layout()
plt.show()


In [None]:

# Count rows flagged dip-dominated in g, in v, in both bands, and in at least one.
g_dominated = df["g_is_dip_dominated"]
v_dominated = df["v_is_dip_dominated"]

dip_dom_summary = {
    "g_is_dip_dominated_true": g_dominated.sum(),
    "v_is_dip_dominated_true": v_dominated.sum(),
    "both_true": (g_dominated & v_dominated).sum(),
    "either_true": (g_dominated | v_dominated).sum(),
}
pd.Series(dip_dom_summary)


In [None]:

# For each min/max camera column that exists, record the number of distinct
# cameras and the full frequency table.
camera_candidates = ("raw_median_min_camera", "raw_median_max_camera")
camera_cols = [name for name in camera_candidates if name in df.columns]

camera_counts = {}
for col in camera_cols:
    camera_series = df[col]
    camera_counts.update({
        f"unique_{col}": camera_series.nunique(),
        f"{col}_value_counts": camera_series.value_counts(),
    })
camera_counts


In [None]:

# Optional: g-band dip fraction against peak count, coloured by the
# dip-dominated flag (red = True, blue = False).
plt.figure(figsize=(8, 6))
dominance_palette = {True: "#d62728", False: "#1f77b4"}
scatter_ax = sns.scatterplot(
    data=df,
    x="g_n_peaks",
    y="g_dip_fraction",
    hue="g_is_dip_dominated",
    palette=dominance_palette,
)
# seaborn returns the Axes it drew on; label it directly.
scatter_ax.set_title("g_dip_fraction vs g_n_peaks")
scatter_ax.set_xlabel("g_n_peaks")
scatter_ax.set_ylabel("g_dip_fraction")
scatter_ax.legend(title="g_is_dip_dominated")
plt.tight_layout()
plt.show()


In [None]:

# Optional: g-band dip fraction against peak count, coloured by the
# dip-dominated flag.
# NOTE(review): this cell draws the same figure as the preceding scatter cell.
scatter_kwargs = {
    "data": df,
    "x": "g_n_peaks",
    "y": "g_dip_fraction",
    "hue": "g_is_dip_dominated",
    "palette": {True: "#d62728", False: "#1f77b4"},
}
plt.figure(figsize=(8, 6))
sns.scatterplot(**scatter_kwargs)
plt.title("g_dip_fraction vs g_n_peaks")
plt.xlabel("g_n_peaks")
plt.ylabel("g_dip_fraction")
plt.legend(title="g_is_dip_dominated")
plt.tight_layout()
plt.show()


In [None]:

# Compact reporting view: ID, per-band peak/dip columns, and the min/max
# camera columns, restricted to those present in df.
report_candidates = (
    "asas_sn_id",
    "g_n_peaks", "g_dip_fraction", "g_is_dip_dominated",
    "v_n_peaks", "v_dip_fraction", "v_is_dip_dominated",
    "raw_median_min_camera", "raw_median_max_camera",
)
report_columns = [name for name in report_candidates if name in df.columns]
# Independent copy so later edits to the report do not touch df.
report_df = df.loc[:, report_columns].copy()
report_df.head(20)


In [43]:
# Match the known ASAS-SN IDs against the combined dataframe
from IPython.display import display, Markdown
# ASAS-SN IDs to look up in the combined dataframe (original order kept).
target_ids = [
    335007754417, 231929175915, 60130040391, 377958261591,
    438086977939, 360777377116, 635655234580, 412317159120,
    317827964025, 438086901547, 515396514761, 463856535113,
    120259184943, 25770019815, 68720274411, 644245387906,
    94489418658, 515397118400, 661425129485, 326417831663,
    266288137752, 532576686103, 352187470767, 609886184506,
    455267102087, 472447294641, 377957522430, 601296043597,
    223339338105, 42950993887, 549756680252, 77310927636,
    83014, 56800, 60130353420, 266288912762,
    77310917396, 128849502096,
]

# Everything below keys on this column, so fail early with a clear message.
if 'asas_sn_id' not in df.columns:
    raise KeyError("Combined dataframe is missing the 'asas_sn_id' column")

# Compare the string form of the IDs on both sides of the membership test.
id_strings = {str(t) for t in target_ids}
mask = df['asas_sn_id'].astype(str).isin(id_strings)
matched = df.loc[mask].copy()

# Build the summary unconditionally so the name exists (as an empty frame)
# even when nothing matched; it used to be bound only in the `else` branch,
# which left later cells that read it with a NameError on an empty match.
summary = (
    matched[['asas_sn_id', 'mag_bin', 'source_file']]
    .drop_duplicates()
    .sort_values('asas_sn_id')
)

if matched.empty:
    display(Markdown('**No rows matched the requested ASAS-SN IDs.**'))
else:
    n_rows = len(matched)
    n_files = summary['source_file'].nunique()
    n_ids = summary['asas_sn_id'].nunique()
    display(Markdown(f'**Matched {n_rows} rows across {n_files} files for {n_ids} unique ASAS-SN IDs.**'))
    styled = summary.style.set_table_attributes("style='display:block; max-height:400px; overflow:auto'")
    try:
        # Best effort: keep the header visible while scrolling.
        styled = styled.set_sticky(axis=0)
    except (AttributeError, ValueError):
        # The installed Styler lacks or rejects set_sticky; show the table without it.
        pass
    display(styled)

matched


**Matched 26 rows across 6 files for 26 unique ASAS-SN IDs.**

Unnamed: 0,asas_sn_id,mag_bin,source_file
61096,56800,13.5_14,peaks_13_5_14_20251018_155817-0400.csv
61304,25770019815,13.5_14,peaks_13_5_14_20251018_155817-0400.csv
35231,42950993887,13_13.5,peaks_13_13_5_20251018_155817-0400.csv
10150,60130040391,12.5_13,peaks_12_5_13_20251018_155817-0400.csv
35262,68720274411,13_13.5,peaks_13_13_5_20251018_155817-0400.csv
52790,77310927636,13.5_14,peaks_13_5_14_20251018_155817-0400.csv
96617,94489418658,13.5_14,peaks_13_5_14_20251018_155817-0400.csv
42029,120259184943,13_13.5,peaks_13_13_5_20251018_155817-0400.csv
28220,128849502096,13_13.5,peaks_13_13_5_20251018_155817-0400.csv
34518,223339338105,13_13.5,peaks_13_13_5_20251018_155817-0400.csv


Unnamed: 0,mag_bin,asas_sn_id,index_num,index_csv,lc_dir,dat_path,raw_path,g_n_peaks,g_mean_mag,g_peaks_idx,...,dec_deg,pm_ra,pm_ra_d,pm_dec,pm_dec_d,vsx_match_sep_arcsec,vsx_class,file_key,g_peaks_count_check,v_peaks_count_check
1547,12_12.5,644245387906,11,/data/poohbah/1/assassin/lenhart/code/calder/l...,/data/poohbah/1/assassin/rowan.90/lcsv2/12_12....,/data/poohbah/1/assassin/rowan.90/lcsv2/12_12....,/data/poohbah/1/assassin/rowan.90/lcsv2/12_12....,1,12.453439,[528],...,-58.130373,-3.31,0.14,-1.77,0.11,226.653605,,peaks_12_12_5_clean,1,0
10150,12.5_13,60130040391,10,/data/poohbah/1/assassin/lenhart/code/calder/l...,/data/poohbah/1/assassin/rowan.90/lcsv2/12.5_1...,/data/poohbah/1/assassin/rowan.90/lcsv2/12.5_1...,/data/poohbah/1/assassin/rowan.90/lcsv2/12.5_1...,2,12.740053,"[4, 51]",...,56.931252,6.69,0.04,-2.11,0.04,466.417296,,peaks_12_5_13_clean,2,0
24265,13_13.5,635655234580,10,/data/poohbah/1/assassin/lenhart/code/calder/l...,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,1,13.065357,[328],...,-54.999639,-5.43,0.04,3.28,0.04,68.529369,,peaks_13_13_5_clean,1,0
26499,13_13.5,472447294641,13,/data/poohbah/1/assassin/lenhart/code/calder/l...,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,1,13.228011,[666],...,-47.547294,-0.77,0.06,-2.8,0.06,1295.365891,,peaks_13_13_5_clean,1,0
28220,13_13.5,128849502096,15,/data/poohbah/1/assassin/lenhart/code/calder/l...,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,4,13.284065,"[201, 245, 331, 408]",...,30.724819,-3.4,0.05,-11.58,0.04,4859.17426,,peaks_13_13_5_clean,4,4
30926,13_13.5,377957522430,18,/data/poohbah/1/assassin/lenhart/code/calder/l...,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,7,13.345646,"[61, 259, 383, 528, 620, 733, 769]",...,15.424939,3.29,0.08,-3.14,0.07,954.815746,,peaks_13_13_5_clean,7,2
34518,13_13.5,223339338105,21,/data/poohbah/1/assassin/lenhart/code/calder/l...,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,1,13.369203,[473],...,34.554054,1.06,0.02,-2.97,0.03,2040.602646,,peaks_13_13_5_clean,1,0
35231,13_13.5,42950993887,22,/data/poohbah/1/assassin/lenhart/code/calder/l...,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,1,13.114329,[660],...,38.660551,-3.04,0.04,-7.39,0.04,364.068832,,peaks_13_13_5_clean,1,0
35262,13_13.5,68720274411,22,/data/poohbah/1/assassin/lenhart/code/calder/l...,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,3,13.451504,"[120, 440, 750]",...,64.729811,-2.83,0.03,0.03,0.03,387.235256,,peaks_13_13_5_clean,3,0
36067,13_13.5,463856535113,23,/data/poohbah/1/assassin/lenhart/code/calder/l...,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,/data/poohbah/1/assassin/rowan.90/lcsv2/13_13....,1,13.410329,[21],...,-71.587087,5.65,0.02,-4.29,0.03,814.43558,,peaks_13_13_5_clean,1,0


In [44]:
# Requested IDs with no row in the combined dataframe.
# Read from `matched`, which the matching cell always binds; `summary` was only
# bound there when something matched. Compare on the string form of each ID,
# the same way the match mask is built, so the two cells cannot disagree.
found_id_strings = set(matched["asas_sn_id"].astype(str))
found_ids = {t for t in target_ids if str(t) in found_id_strings}
missing_ids = sorted(set(target_ids) - found_ids)
missing_ids


[83014,
 60130353420,
 77310917396,
 266288137752,
 317827964025,
 352187470767,
 455267102087,
 532576686103,
 549756680252,
 601296043597,
 609886184506,
 661425129485]