In [1]:
import pandas as pd
import os
import sys
import glob
import numpy as np

sys.path.append(os.path.abspath(".."))

%load_ext autoreload
%autoreload 2

Test 1 CSV

In [2]:
df = pd.read_csv("data/raw/ogle4/2025/photometry/blg-0001_phot.dat",
                 sep='\\s+', names=["HJD", "mag", "err", 'unk1', 'unk2'])
df.head()

Unnamed: 0,HJD,mag,err,unk1,unk2
0,2457822.0,20.079,0.223,4.66,83.0
1,2457823.0,20.084,0.261,4.73,120.0
2,2457827.0,19.48,0.175,5.08,144.0
3,2457828.0,19.932,0.235,5.55,103.0
4,2457829.0,19.921,0.182,4.17,104.0


In [3]:
records = []
for path in glob.glob("data/raw/ogle4/2025/photometry/*.dat"):
    event = os.path.basename(path).split("_")[0]
    df = pd.read_csv(path, sep='\\s+', names=["HJD", "mag", "err", "unk1", "unk2"])
    df["event"] = event
    records.append(df)

all_df = pd.concat(records, ignore_index=True)
all_df.to_parquet("data/processed/ogle2025_lightcurves.parquet", compression="snappy", engine="fastparquet")
print("Saved", len(all_df), "rows.")

Saved 2536059 rows.


In [4]:
def preprocess_lc(df):
    df = df.sort_values("HJD").dropna()
    baseline = np.median(df["mag"].nlargest(int(len(df)*0.3)))
    df["flux"] = 10**(-0.4 * (df["mag"] - baseline))
    df["t_rel"] = df["HJD"] - df["HJD"].min()
    return df

In [20]:
from src.feature_extraction import extract_simple_features
from tqdm import tqdm

features = []
for event in tqdm(all_df["event"].unique(), desc="Extracting features"):
    df_event = all_df[all_df["event"] == event]
    df_event = preprocess_lc(df_event)
    feats = extract_simple_features(df_event["t_rel"].values, df_event["flux"].values, df_event["err"].values)
    feats["event"] = event
    features.append(feats)

features_df = pd.DataFrame(features)
features_df.to_csv("data/features/features_ogle2025.csv", index=False)

Extracting features: 100%|██████████| 1445/1445 [01:20<00:00, 17.85it/s]


In [21]:
pd.read_csv("data/features/features_ogle2025.csv")

Unnamed: 0,n_points,duration,cadence_med,flux_mean,flux_std,flux_amp,flux_skew,flux_kurt,amp_norm,std_norm,flux_asym,rise_fall_ratio,fwhm_time,outlier_frac,event
0,4569,3178.62416,0.048045,1.077983,0.085507,1.069432,0.811660,6.200264,0.992067,0.079321,0.018457,15.843277,2356.39568,0.004377,blg-0491
1,2466,3174.66809,0.060230,1.170225,0.187046,1.831861,1.002385,4.750533,1.565392,0.159838,-0.166192,20.614887,3122.73629,0.009327,blg-0801
2,4315,3177.66444,0.048150,1.275117,0.311281,2.551203,1.639576,5.564362,2.000760,0.244120,-0.285650,15.931352,2710.69499,0.019467,blg-0490
3,2387,3174.66809,0.061825,1.117995,0.336294,4.984287,10.040832,115.719071,4.458237,0.300801,-0.597205,27.336957,18.99726,0.011730,blg-0800
4,318,3160.66304,1.956380,1.133161,0.536728,8.452286,12.119374,168.126551,7.459036,0.473656,-0.096590,12.757479,0.00000,0.009434,blg-0239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1440,4250,3178.62717,0.048990,1.173951,0.270854,5.082222,7.169236,83.335246,4.329160,0.230720,0.043050,18.296900,3.02853,0.010588,blg-0649
1441,3984,3178.62717,0.049980,1.316572,0.451132,7.910968,6.454226,75.847788,6.008761,0.342657,-0.780413,42.104577,1.91649,0.008534,blg-1228
1442,4547,3177.66442,0.047925,1.188923,0.301751,4.424064,5.640739,50.030392,3.721067,0.253802,-0.238922,14.368358,18.11976,0.014295,blg-0286
1443,4544,3178.62735,0.047980,1.069662,0.077585,1.112597,0.878704,9.263032,1.040138,0.072532,-0.013918,36.844888,3090.70841,0.007482,blg-1229
