In [13]:
import lightkurve as lk
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
import plotly.express as px
import plotly.graph_objects as go






## 🧩 `build_segmented_dataset()` — Function Summary

The `build_segmented_dataset()` function takes a **light curve** (from the [Lightkurve](https://docs.lightkurve.org/) library) and converts it into a **segmented, event-labeled dataset** suitable for machine learning.

### **Purpose**

To break a continuous light curve into smaller overlapping segments, compute statistical features for each segment, and label segments containing significant flux dips as potential transit events.

### **How it works**

1. **Input:** a `lightkurve.LightCurve` object
2. **Segmentation:** divides the flux time series into equal-length windows (`segment_length`)
3. **Feature extraction:** computes basic statistical features per segment:

   * mean, std, min, max flux
   * skewness, kurtosis
   * transit depth (median − min)
4. **Event detection:** flags segments that contain points below
   `median_flux - sigma * std_flux`
   as **events** (`label = 1`), otherwise **non-events** (`label = 0`)
5. **Output:** returns a `pandas.DataFrame` where each row represents one segment.

### **Output structure**

| segment_start | segment_end | mean_flux | std_flux | min_flux | max_flux | skew | kurtosis | transit_depth | label |
| ------------- | ----------- | --------- | -------- | -------- | -------- | ---- | -------- | ------------- | ----- |

---



In [15]:
def build_segmented_dataset_datetime(
    lc,
    segment_length=200,
    overlap=0,
    sigma=3,
    time_origin=pd.Timestamp("2000-01-01")
):
    """
    Build segmented dataset from lightcurve `lc` for ML and grouped CV.

    Returns DataFrame with:
        DateTime  (segment midpoint)
        Label     (1 if event in segment else 0)
        feature columns...
    """
    flux = lc.flux.value
    time = lc.time.value
    n = len(flux)
    step = segment_length - overlap

    median_flux = np.median(flux)
    std_flux = np.std(flux)

    # simple threshold-based event detection
    event_mask = flux < median_flux - sigma * std_flux
    event_indices = np.where(event_mask)[0]

    segments = []

    for start in range(0, n - segment_length + 1, step):
        end = start + segment_length
        seg_flux = flux[start:end]
        seg_time = time[start:end]

        # --- compute features ---
        mean_flux = np.mean(seg_flux)
        std_flux = np.std(seg_flux)
        min_flux = np.min(seg_flux)
        max_flux = np.max(seg_flux)
        skewness = skew(seg_flux)
        kurt = kurtosis(seg_flux)
        depth = np.median(seg_flux) - np.min(seg_flux)

        # --- label: event within segment? ---
        has_event = int(np.any((event_indices >= start) & (event_indices < end)))

        # --- datetime midpoint for grouped CV ---
        midpoint = seg_time[len(seg_time)//2]
        dt_mid = time_origin + pd.to_timedelta(midpoint, unit="D")

        segments.append({
            "DateTime": dt_mid,
            "segment_start": seg_time[0],
            "segment_end": seg_time[-1],
            "mean_flux": mean_flux,
            "std_flux": std_flux,
            "min_flux": min_flux,
            "max_flux": max_flux,
            "skew": skewness,
            "kurtosis": kurt,
            "transit_depth": depth,
            "Label": has_event
        })

    df = pd.DataFrame(segments)

    # ensure datetime and label have correct types
    df["DateTime"] = pd.to_datetime(df["DateTime"])
    df["Label"] = df["Label"].astype(int)

    return df

In [16]:
all_segments = []

for target in ["Kepler-10"]:
    search = lk.search_lightcurve(target, mission="Kepler", quarter=10)
    lc = search.download().remove_nans().normalize().flatten(window_length=401)
    
    df = build_segmented_dataset_datetime(lc, segment_length=200, sigma=5)
    df["target_id"] = target
    all_segments.append(df)

# Merge all targets into one DataFrame
df_all = pd.concat(all_segments, ignore_index=True)
df_all.to_csv("transit_segments_all.csv", index=False)

print("✅ Saved combined dataset for all targets.")



✅ Saved combined dataset for all targets.


In [17]:

df = pd.read_csv("transit_segments_all.csv")


target = "Kepler-10"
df_target = df[df["target_id"] == target]


fig = go.Figure()


fig.add_trace(go.Scatter(
    x=df_target.index,
    y=df_target["mean_flux"],
    mode='lines+markers',
    name='Mean Flux',
    line=dict(color='blue'),
    marker=dict(size=6)
))


transit_segments = df_target[df_target["Label"] == 1]
fig.add_trace(go.Scatter(
    x=transit_segments.index,
    y=transit_segments["mean_flux"],
    mode='markers',
    name='Transit',
    marker=dict(color='red', size=8, symbol='circle-open')
))

fig.update_layout(
    title=f"Segmented Lightcurve for {target}",
    xaxis_title="Segment Index",
    yaxis_title="Normalized Flux",
    template="plotly_white"
)

fig.show()

In [18]:



from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

pipelines = [
    Pipeline([
        ('scaler', StandardScaler()),
        ('selector', SelectKBest()),
        ('model', LogisticRegression())
    ])
]


In [19]:
from sklearn.feature_selection import mutual_info_classif
param_grids = [
   
    
    # LogisticRegression
    {
        'selector__k': [3, 5, 'all'],
        'selector__score_func': [mutual_info_classif],
        'model__penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'model__C': [0.1, 1.0, 10.0],
        'model__solver': ['lbfgs', 'liblinear', 'saga'],
        'model__max_iter': [100, 200, 500],
        'model__random_state': [0, 12, 22, 42]
    }
]

In [20]:
import sys
import os

# Lägg till mappen en nivå upp i Pythons sökväg
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import grouped_timeserie_cv

print(grouped_timeserie_cv.__file__)

grouped_cv =  grouped_timeserie_cv.GroupedTimeSerieCV()
result = grouped_cv.classify(df, pipelines, param_grids, 'h', 'DateTime', 'Label', 'accuracy')

c:\Projects\ExoplanetHunter\ExoplanetHunter-TransitML\grouped_timeserie_cv\__init__.py
Beginning classification with grouped time series cross-validation...
An error occurred during grid search: 
All the 283824 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
283824 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\goran.backlund\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\goran.backlund\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\goran.backlund\AppData

InvalidParameterError: The 'estimator' parameter of learning_curve must be an object implementing 'fit'. Got None instead.

In [None]:
# Plot confusion matrix
grouped_cv.plotter.plot_confusion_matrix(result.confusion_matrices, result.class_labels)

# Plot learning curve
grouped_cv.plotter.plot_learning_curve(result.train_sizes, result.train_mean, result.train_std, result.test_mean, result.test_std)
