In [5]:
import lightkurve as lk
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
import plotly.express as px
import plotly.graph_objects as go




## 🧩 `build_segmented_dataset()` — Function Summary

The `build_segmented_dataset()` function takes a **light curve** (from the [Lightkurve](https://docs.lightkurve.org/) library) and converts it into a **segmented, event-labeled dataset** suitable for machine learning.

### **Purpose**

To break a continuous light curve into smaller overlapping segments, compute statistical features for each segment, and label segments containing significant flux dips as potential transit events.

### **How it works**

1. **Input:** a `lightkurve.LightCurve` object
2. **Segmentation:** divides the flux time series into equal-length windows (`segment_length`)
3. **Feature extraction:** computes basic statistical features per segment:

   * mean, std, min, max flux
   * skewness, kurtosis
   * transit depth (median − min)
4. **Event detection:** flags segments that contain points below
   `median_flux - sigma * std_flux`
   as **events** (`label = 1`), otherwise **non-events** (`label = 0`)
5. **Output:** returns a `pandas.DataFrame` where each row represents one segment.

### **Output structure**

| segment_start | segment_end | mean_flux | std_flux | min_flux | max_flux | skew | kurtosis | transit_depth | label |
| ------------- | ----------- | --------- | -------- | -------- | -------- | ---- | -------- | ------------- | ----- |

---



In [None]:
def build_segmented_dataset(
    lc,
    segment_length=200,
    overlap=0,
    sigma=3,
):
    

    flux = lc.flux.value
    time = lc.time.value
    n = len(flux)
    step = segment_length - overlap

    median_flux = np.median(flux)
    std_flux = np.std(flux)

    event_mask = flux < median_flux - sigma * std_flux
    event_indices = np.where(event_mask)[0]

    segments = []

    for start in range(0, n - segment_length, step):
        end = start + segment_length
        seg_flux = flux[start:end]
        seg_time = time[start:end]

        features = {
            "segment_start": seg_time[0],
            "segment_end": seg_time[-1],
            "mean_flux": np.mean(seg_flux),
            "std_flux": np.std(seg_flux),
            "min_flux": np.min(seg_flux),
            "max_flux": np.max(seg_flux),
            "skew": skew(seg_flux),
            "kurtosis": kurtosis(seg_flux),
            "transit_depth": np.median(seg_flux) - np.min(seg_flux),
        }

       
        if np.any((event_indices >= start) & (event_indices < end)):
            features["label"] = 1
        else:
            features["label"] = 0

        segments.append(features)

    df = pd.DataFrame(segments)
    return df

In [12]:
df_segments = build_segmented_dataset(lc, segment_length=200, sigma=3)

df_segments["object_id"] = "Kepler-10"
print(df_segments.head())

   segment_start  segment_end           mean_flux                std_flux  \
0     200.324085   200.459628  0.9999955480072467  0.00020480690782619156   
1     200.460310   200.595853  1.0000114228342825  0.00022028633787149135   
2     200.596534   200.732078  0.9999900579346955  0.00021593210217208254   
3     200.732759   200.868302  1.0000173306553606  0.00022584174972427093   
4     200.868983   201.004527  0.9999970499217118  0.00024485229373302504   

             min_flux            max_flux      skew  kurtosis  \
0  0.9994147368777393  1.0005761126734776 -0.121301  0.225332   
1  0.9993707576952258  1.0006799623168925 -0.083657  0.059639   
2  0.9993596873381183  1.0006687408361072  0.165235  0.024089   
3  0.9994191459645783  1.0008826107314164  0.273535  0.892788   
4  0.9993573279598602  1.0007581800268617  0.317908  0.112242   

           transit_depth  label  object_id  
0  0.0005810315734758875      0  Kepler-10  
1  0.0006421155866287442      0  Kepler-10  
2  0.000626

In [4]:
all_segments = []

for target in ["Kepler-10", "Kepler-11", "Kepler-12"]:
    search = lk.search_lightcurve(target, mission="Kepler", quarter=10)
    lc = search.download().remove_nans().normalize().flatten(window_length=401)
    
    df = build_segmented_dataset(lc, segment_length=200, sigma=5)
    df["target_id"] = target
    all_segments.append(df)

# Merge all targets into one DataFrame
df_all = pd.concat(all_segments, ignore_index=True)
df_all.to_csv("transit_segments_all.csv", index=False)

print("✅ Saved combined dataset for all targets.")



✅ Saved combined dataset for all targets.


In [6]:

df = pd.read_csv("transit_segments_all.csv")


target = "Kepler-10"
df_target = df[df["target_id"] == target]


fig = go.Figure()


fig.add_trace(go.Scatter(
    x=df_target.index,
    y=df_target["mean_flux"],
    mode='lines+markers',
    name='Mean Flux',
    line=dict(color='blue'),
    marker=dict(size=6)
))


transit_segments = df_target[df_target["label"] == 1]
fig.add_trace(go.Scatter(
    x=transit_segments.index,
    y=transit_segments["mean_flux"],
    mode='markers',
    name='Transit',
    marker=dict(color='red', size=8, symbol='circle-open')
))

fig.update_layout(
    title=f"Segmented Lightcurve for {target}",
    xaxis_title="Segment Index",
    yaxis_title="Normalized Flux",
    template="plotly_white"
)

fig.show()