In [None]:
import astropy.units as u
import h5py
import lightkurve as lk
import matplotlib.pyplot as plt
import numpy as np
import os
from os.path import exists
import pandas as pd
import seaborn as sns
import sys
# from tqdm import tqdm
from tqdm.notebook import tqdm

sys.path.append('..')
from file_editing import *
from period_finding import *

In [None]:
# Global variables
# Light-curve cadence in seconds: it is multiplied by u.second when the
# periodogram's minimum period is computed, and it is also handed to
# append_lightcurves together with the search result's exposure times.
cadence = 120

In [None]:
# Generate 'cnn_data.csv' from 'raw_cnn_data.csv' with commaize, but only when
# the output is not on disk yet, so re-running the notebook reuses the file.
csv_filename = 'cnn_data.csv'
if not os.path.exists(csv_filename):
    commaize('raw_cnn_data.csv', csv_filename)

In [None]:
# Load the catalogue and keep only the columns used to build the training set
df = pd.read_csv(csv_filename)[['iau_name', 'i', 'porb', 'porbe']]

In [None]:
def save_data_hdf5(file_name, lightcurve, periodogram, best_period, literature_period, star_name, star_imag):
    '''
    Store one star's light-curve products as a group in an HDF5 file.

    The group is named `star_name` and holds the periodogram, the raw light
    curve, the light curve folded on `best_period` and binned, a sine-wave fit
    to the raw flux, and the residuals of that fit. Scalar metadata is stored
    as attributes on the group.

    Parameters
    ----------
    file_name : str
        Path of the HDF5 file. It is opened in append mode, so it is created
        when missing and extended when present.
    lightcurve : object
        Must provide `.fold(period=...)`, `.time.value` and `.flux.value`
        (presumably a lightkurve LightCurve -- confirm against the caller).
    periodogram : object
        Must provide `.period.value` and `.power.value`.
    best_period : float
        Period used to fold the light curve and to seed the sine fit
        (initial frequency is 1/best_period).
    literature_period : float
        Stored unchanged in the `literature_period` attribute.
    star_name : str
        Name of the HDF5 group that receives the data.
    star_imag : float
        Stored unchanged in the `star_imag` attribute.

    Notes
    -----
    If `file_name` already contains a group called `star_name`, that group is
    replaced. Without this, h5py's `create_group` raises ValueError on the
    duplicate name and a single repeated star would abort the whole run.

    NOTE(review): `is_real_period`, `find_bin_value`, `sine_wave`, `interp1d`
    and `lmfit` are not imported explicitly in this notebook; they are assumed
    to arrive through the star imports at the top -- confirm.
    '''
    # Determine if the period is probable (the second return value is not needed here)
    is_real, _ = is_real_period(periodogram, best_period)

    # Define folded and binned lightcurve
    phase_lightcurve = lightcurve.fold(period=best_period)
    bin_value = find_bin_value(phase_lightcurve, 50)
    binned_lightcurve = phase_lightcurve.bin(bin_value*u.min)

    # Lightcurve data
    time = lightcurve.time.value
    flux = lightcurve.flux.value

    # Get the power at the best period by linear interpolation of the
    # periodogram; periods outside its range yield NaN instead of raising.
    interp_func = interp1d(periodogram.period.value, periodogram.power.value, kind='linear', bounds_error=False, fill_value=np.nan)
    power_at_best_period = interp_func(best_period)

    # Make an lmfit object and fit it, seeding the amplitude with the
    # periodogram power and the frequency with 1/best_period
    model = lmfit.Model(sine_wave)
    params = model.make_params(amplitude=power_at_best_period, frequency=1/best_period, phase=0.0)
    result = model.fit(flux, params, x=time)

    # Save data to HDF5
    with h5py.File(file_name, 'a') as f:
        # Replace an existing entry for this star rather than failing on it
        if star_name in f:
            del f[star_name]
        grp = f.create_group(star_name)
        grp.create_dataset('periodogram_period', data = periodogram.period.value)
        grp.create_dataset('periodogram_power', data = periodogram.power.value)
        grp.create_dataset('time', data = time)
        grp.create_dataset('flux', data = flux)
        grp.create_dataset('binned_phase', data = binned_lightcurve.phase.value)
        grp.create_dataset('binned_flux', data = binned_lightcurve.flux.value)
        grp.create_dataset('fitted_sine_wave', data = result.best_fit)
        grp.create_dataset('residuals', data = flux - result.best_fit)
        grp.attrs['best_period'] = best_period
        grp.attrs['literature_period'] = literature_period
        grp.attrs['star_imag'] = star_imag
        grp.attrs['real_label'] = is_real

In [None]:
# Build the training file once; skip all the work when it is already on disk
output_path = 'training_data.h5'
if exists(output_path):
    print('File training_data.h5 already exists.')
else:
    # Fill a scratch file and rename it only after the loop has finished, so an
    # interrupted run never leaves a half-filled 'training_data.h5' that the
    # check above would mistake for a complete dataset on the next run.
    partial_path = output_path + '.partial'
    if exists(partial_path):
        os.remove(partial_path)  # leftover from an earlier interrupted run

    # Period search window in days. The lower bound is two cadences; it does
    # not depend on the star, so it is computed once outside the loop.
    minimum_period = (2*cadence*u.second).to(u.day).value
    maximum_period = 14

    for _, row in tqdm(df.iterrows(), desc="Processing lightcurves", total = len(df)):
        # Pull data for that star
        try:
            result = lk.search_lightcurve(row['iau_name'], mission = 'TESS')
            result_exposures = result.exptime
        except Exception as e:
            print(f"Error for {row['iau_name']}: {e} \n")
            continue

        lightcurve = append_lightcurves(result, result_exposures, cadence)
        # check if there was a result with the cadence needed
        if not lightcurve:
            continue

        # Star data
        star_name = 'TIC ' + str(lightcurve.meta['TICID'])
        star_imag = row['i']
        # 'porb' is treated as hours here and converted to days
        literature_period = (row['porb']*u.hour).to(u.day).value

        # Get periodogram
        periodogram = lightcurve.to_periodogram(oversample_factor = 10,
                                                minimum_period = minimum_period,
                                                maximum_period = maximum_period)

        # Take the period with the highest periodogram power as the best period
        best_period = periodogram.period_at_max_power.value

        # Save the data
        save_data_hdf5(partial_path, lightcurve, periodogram, best_period, literature_period, star_name, star_imag)

    # Publish the finished dataset under its final name. Nothing is created
    # when no star produced data, as before.
    if exists(partial_path):
        os.replace(partial_path, output_path)