In [2]:
import lightkurve as lk
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
import plotly.express as px
import plotly.graph_objects as go






## 🧩 `build_segmented_dataset()` — Function Summary

The `build_segmented_dataset()` function takes a **light curve** (from the [Lightkurve](https://docs.lightkurve.org/) library) and converts it into a **segmented, event-labeled dataset** suitable for machine learning.

### **Purpose**

To break a continuous light curve into smaller fixed-length segments (optionally overlapping), compute statistical features for each segment, and label segments containing significant flux dips as potential transit events.

### **How it works**

1. **Input:** a `lightkurve.LightCurve` object
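2. **Segmentation:** slides a fixed-length window of `segment_length` points over the flux time series, advancing by `segment_length - overlap` points each step (windows only overlap when `overlap > 0`)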
3. **Feature extraction:** computes basic statistical features per segment:

   * mean, std, min, max flux
   * skewness, kurtosis
   * transit depth (median − min)
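4. **Event detection:** flags segments that contain at least one point below
   `median_flux - sigma * std_flux` (where the median and standard deviation are computed once over the whole light curve, not per segment)
   as **events** (`label = 1`), otherwise **non-events** (`label = 0`)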
5. **Output:** returns a `pandas.DataFrame` where each row represents one segment.

### **Output structure**

| segment_start | segment_end | mean_flux | std_flux | min_flux | max_flux | skew | kurtosis | transit_depth | label |
| ------------- | ----------- | --------- | -------- | -------- | -------- | ---- | -------- | ------------- | ----- |

---



In [3]:
def build_segmented_dataset(
    lc,
    segment_length=200,
    overlap=0,
    sigma=3,
):
    """Cut a light curve into fixed-length windows and label the ones holding a dip.

    Parameters
    ----------
    lc : object
        Light curve exposing ``lc.flux.value`` and ``lc.time.value`` as
        equally long 1-D arrays.
    segment_length : int, default 200
        Number of samples in every window. Must be at least 1.
    overlap : int, default 0
        Number of samples shared by consecutive windows; the window start
        advances by ``segment_length - overlap`` samples each step.
    sigma : float, default 3
        A sample counts as an event when it lies below
        ``median(flux) - sigma * std(flux)``, with both statistics taken
        over the whole light curve.

    Returns
    -------
    pandas.DataFrame
        One row per window with the columns ``segment_start``,
        ``segment_end``, ``mean_flux``, ``std_flux``, ``min_flux``,
        ``max_flux``, ``skew``, ``kurtosis``, ``transit_depth`` and
        ``label`` (1 if the window contains an event sample, else 0).
        When the light curve is shorter than ``segment_length`` the frame
        is empty but still carries these columns.

    Raises
    ------
    ValueError
        If ``segment_length`` is smaller than 1, or if ``overlap`` is so
        large that the window would not advance.
    """
    columns = [
        "segment_start",
        "segment_end",
        "mean_flux",
        "std_flux",
        "min_flux",
        "max_flux",
        "skew",
        "kurtosis",
        "transit_depth",
        "label",
    ]

    if segment_length < 1:
        raise ValueError("segment_length must be at least 1")

    step = segment_length - overlap
    if step < 1:
        # A zero or negative stride would never move the window forward.
        raise ValueError("overlap must be smaller than segment_length")

    flux = lc.flux.value
    time = lc.time.value
    n = len(flux)

    # The dip threshold is global: it is derived from the full light curve,
    # not from the individual window being labelled.
    median_flux = np.median(flux)
    std_flux = np.std(flux)

    event_mask = flux < median_flux - sigma * std_flux
    event_indices = np.where(event_mask)[0]

    segments = []

    # "+ 1" keeps the last window that still fits entirely inside the data;
    # without it a light curve of exactly `segment_length` samples yields nothing.
    for start in range(0, n - segment_length + 1, step):
        end = start + segment_length
        seg_flux = flux[start:end]
        seg_time = time[start:end]

        has_event = np.any((event_indices >= start) & (event_indices < end))

        segments.append(
            {
                "segment_start": seg_time[0],
                "segment_end": seg_time[-1],
                "mean_flux": np.mean(seg_flux),
                "std_flux": np.std(seg_flux),
                "min_flux": np.min(seg_flux),
                "max_flux": np.max(seg_flux),
                "skew": skew(seg_flux),
                "kurtosis": kurtosis(seg_flux),
                "transit_depth": np.median(seg_flux) - np.min(seg_flux),
                "label": 1 if has_event else 0,
            }
        )

    # Passing the column list keeps the schema stable even with zero rows.
    return pd.DataFrame(segments, columns=columns)

In [9]:
# Download each target's Kepler quarter-10 light curve, clean it, cut it into
# labelled segments, and collect the per-target tables for a single export.
all_segments = []

for target in ("Kepler-10",):
    search = lk.search_lightcurve(target, mission="Kepler", quarter=10)

    # Same cleaning chain, one step per line: drop NaNs, normalise, detrend.
    lc = search.download()
    lc = lc.remove_nans()
    lc = lc.normalize()
    lc = lc.flatten(window_length=401)

    df = build_segmented_dataset(lc, sigma=5, segment_length=200)
    df["target_id"] = target  # tag rows so targets stay distinguishable after the merge
    all_segments.append(df)

# Stack every target's segments into one table and persist it as CSV.
df_all = pd.concat(all_segments, ignore_index=True)
df_all.to_csv("transit_segments_all.csv", index=False)

print("✅ Saved combined dataset for all targets.")



✅ Saved combined dataset for all targets.


In [4]:

# Read the segment table back from disk and keep only the rows of one target.
df = pd.read_csv("transit_segments_all.csv")

target = "Kepler-10"
df_target = df.loc[df["target_id"] == target]

# Rows flagged as events are drawn a second time as an overlay.
transit_segments = df_target.loc[df_target["label"] == 1]

fig = go.Figure()

# Per-segment mean flux, plotted against the segment's row index.
fig.add_trace(
    go.Scatter(
        x=df_target.index,
        y=df_target["mean_flux"],
        mode="lines+markers",
        name="Mean Flux",
        line={"color": "blue"},
        marker={"size": 6},
    )
)

# Open red circles mark the segments labelled 1.
fig.add_trace(
    go.Scatter(
        x=transit_segments.index,
        y=transit_segments["mean_flux"],
        mode="markers",
        name="Transit",
        marker={"color": "red", "size": 8, "symbol": "circle-open"},
    )
)

fig.update_layout(
    title=f"Segmented Lightcurve for {target}",
    xaxis_title="Segment Index",
    yaxis_title="Normalized Flux",
    template="plotly_white",
)

fig.show()

In [2]:
from grouped_timeserie_cv import GroupedTimeSerieCV


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# One pipeline per candidate classifier. Every pipeline gets its own fresh
# scaler and selector instance, followed by a default-constructed model.
# The order (GaussianNB, DecisionTree, LogisticRegression) is what pairs each
# pipeline with its parameter grid by position.
pipelines = [
    Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('selector', SelectKBest()),
        ('model', model_cls()),
    ])
    for model_cls in (GaussianNB, DecisionTreeClassifier, LogisticRegression)
]

In [3]:
from sklearn.feature_selection import mutual_info_classif
# Hyper-parameter grids, listed in the same order as the pipelines they tune.
# Keys follow the "<step name>__<parameter>" convention of the pipeline steps.
param_grids = [
    # GaussianNB
    dict(
        selector__k=[3, 5, 'all'],
        selector__score_func=[mutual_info_classif],
        model__var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6],
    ),

    # DecisionTreeClassifier
    dict(
        selector__k=[3, 5, 'all'],
        selector__score_func=[mutual_info_classif],
        model__criterion=['gini', 'entropy'],
        model__splitter=['best', 'random'],
        model__max_depth=[None, 10, 20, 30],
        model__min_samples_split=[2, 5, 10],
        model__min_samples_leaf=[1, 2, 4],
        model__random_state=[0, 12, 22, 42],
    ),

    # LogisticRegression
    # NOTE(review): this grid is a full cross product, so it includes
    # penalty/solver pairs that scikit-learn's LogisticRegression does not
    # accept (for example 'l1' with 'lbfgs', or 'elasticnet' with a solver
    # other than 'saga') — confirm how the search treats failed fits.
    dict(
        selector__k=[3, 5, 'all'],
        selector__score_func=[mutual_info_classif],
        model__penalty=['l1', 'l2', 'elasticnet', 'none'],
        model__C=[0.1, 1.0, 10.0],
        model__solver=['lbfgs', 'liblinear', 'saga'],
        model__max_iter=[100, 200, 500],
        model__random_state=[0, 12, 22, 42],
    ),
]

In [None]:
# The segment table written by this notebook has no 'DateTime' or 'Label'
# column: its time column is 'segment_start' and its target column is 'label'.
# Build the datetime column from 'segment_start' and pass the real target name.
#
# Kepler light curves obtained through lightkurve report time as BKJD
# (BJD - 2454833); adding the offset back gives a Julian date pandas can parse.
# NOTE(review): assumes `df` is the frame read from transit_segments_all.csv
# and that its times are BKJD — confirm before relying on absolute dates.
KEPLER_BKJD_OFFSET = 2454833.0

# A new frame is built instead of mutating `df`, so re-running the cell is safe.
# NOTE(review): presumably classify() treats every column other than the date
# and target columns as a feature — verify against the library. The string
# column 'target_id' is dropped so the scaler only receives numeric input.
df_model = df.drop(columns=['target_id'], errors='ignore').assign(
    DateTime=lambda frame: pd.to_datetime(
        frame['segment_start'] + KEPLER_BKJD_OFFSET, unit='D', origin='julian'
    )
)

grouped_cv = GroupedTimeSerieCV()
result = grouped_cv.classify(df_model, pipelines, param_grids, 'D', 'DateTime', 'label', 'accuracy')