In [3]:
# ==============================================================================
# Part 1: Setup and Installation
# ==============================================================================
print("Installing required libraries...")
# Lightkurve is not pre-installed in Colab, so we install it.
# The 'quiet' flag reduces the installation output.
# !pip install lightkurve -q

import lightkurve as lk
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import random
import warnings
warnings.filterwarnings('ignore')


print("Libraries imported successfully.")

# ==============================================================================
# Part 2: Load and Prepare Master Data from CSV
# ==============================================================================
print("\nLoading the master exoplanet CSV file...")
try:
    # Corrected file path
    df = pd.read_csv('data/exoplanets.csv')
    # Clean up the dataframe by dropping rows where essential data is missing.
    df = df.dropna(subset=['koi_disposition', 'kepid', 'koi_period'])
except FileNotFoundError:
    print("\nFATAL ERROR: 'exoplanets.csv' not found.")
    print("Please upload the dataset to your Colab session and restart the kernel.")
    exit()

# Create the binary target label: 1 for exoplanets, 0 for false positives.
df['is_exoplanet'] = df['koi_disposition'].apply(lambda x: 1 if x in ['CONFIRMED', 'CANDIDATE'] else 0)

print(f"Master file loaded. Found {len(df)} candidates.")

# --- Data Sub-sampling ---
# Downloading and processing all ~9000 light curves would take many hours.
# For a hackathon, we create a smaller, balanced subset to train the model efficiently.
N_SAMPLES_PER_CLASS = 200 # Increase this number for a more robust model, but expect longer run times.

confirmed_df = df[df['is_exoplanet'] == 1].sample(n=N_SAMPLES_PER_CLASS, random_state=42)
false_positive_df = df[df['is_exoplanet'] == 0].sample(n=N_SAMPLES_PER_CLASS, random_state=42)

# Combine the samples into a final working dataframe.
working_df = pd.concat([confirmed_df, false_positive_df]).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Created a balanced working dataset of {len(working_df)} light curves.")
print(f"Class distribution:\n{working_df['is_exoplanet'].value_counts()}")


# ==============================================================================
# Part 3: Light Curve Processing Function
# ==============================================================================

# Define the size of our 1D "image"
BINS = 256

def process_light_curve(kepid, period):
    """
    Downloads, cleans, folds, and bins a Kepler light curve for a given ID and period.
    Returns a 1D numpy array ready for the CNN.
    """
    try:
        # 1. Search for and download the light curve data using the Kepler ID.
        # We specify 'Kepler' as the mission to ensure we get the right data.
        search_result = lk.search_lightcurve(f'KIC {kepid}', mission='Kepler')
        # Download all available datasets and stitch them together into one light curve.
        lc_collection = search_result.download_all()
        lc = lc_collection.stitch().remove_nans()

        # 2. Flatten the light curve to remove stellar variability, keeping the transits.
        flat_lc = lc.flatten(window_length=401)

        # 3. Fold the light curve on the known orbital period of the candidate.
        folded_lc = flat_lc.fold(period=period)

        # 4. Bin the folded light curve to create a standardized 1D array ("image").
        binned_lc = folded_lc.bin(time_bin_size=period/BINS, n_bins=BINS)

        # 5. Normalize the flux so that the median is 0. This helps the CNN.
        normalized_flux = binned_lc.flux.value - np.median(binned_lc.flux.value)

        # 6. Handle cases where the process might fail and return None.
        if not np.all(np.isfinite(normalized_flux)):
             return None

        return normalized_flux

    except Exception as e:
        # If any step fails (e.g., data not found), we print an error and return None.
        # print(f"Could not process KEPID {kepid}: {e}")
        return None


Installing required libraries...
Libraries imported successfully.

Loading the master exoplanet CSV file...
Master file loaded. Found 9564 candidates.
Created a balanced working dataset of 400 light curves.
Class distribution:
is_exoplanet
0    200
1    200
Name: count, dtype: int64


In [4]:
#check gpu active or not
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print("TensorFlow version:", tf.__version__)

Num GPUs Available:  1
TensorFlow version: 2.16.2


In [5]:

# ==============================================================================
# Part 4: Build the Visual Dataset
# ==============================================================================
print("\nStarting to build the visual dataset. This will take several minutes...")

X_visual = []
y_visual = []



Starting to build the visual dataset. This will take several minutes...


In [None]:

from tqdm.notebook import tqdm
# Use tqdm for a progress bar, which is very helpful for long-running tasks.
for index, row in tqdm(working_df.iterrows(), total=len(working_df)):
    kepid = row['kepid']
    period = row['koi_period']
    label = row['is_exoplanet']

    # Process the light curve
    flux_data = process_light_curve(kepid, period)

    # Only add the data if the processing was successful
    if flux_data is not None:
        X_visual.append(flux_data)
        y_visual.append(label)

# Convert the lists to numpy arrays for TensorFlow.
X_visual = np.array(X_visual)
y_visual = np.array(y_visual)

# Reshape X for the CNN: [number_of_samples, number_of_bins, 1 (channel)]
X_visual = X_visual.reshape(X_visual.shape[0], BINS, 1)

print(f"\nVisual dataset created successfully. Shape of X: {X_visual.shape}")


  0%|          | 0/400 [00:00<?, ?it/s]

In [None]:
# ==============================================================================
# Part 5: Build and Train the 1D CNN
# ==============================================================================
print("\nBuilding the 1D Convolutional Neural Network...")

model = Sequential([
    # Input Layer - Conv1D learns features from the 1D "image"
    Conv1D(filters=8, kernel_size=5, activation='relu', padding='same', input_shape=(BINS, 1)),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),

    Conv1D(filters=16, kernel_size=5, activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),

    Conv1D(filters=32, kernel_size=5, activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling1D(pool_size=4),

    # Flatten the learned features to feed into the classification layers
    Flatten(),

    # Dense layers for classification
    Dense(32, activation='relu'),
    Dropout(0.3), # Dropout helps prevent overfitting
    Dense(1, activation='sigmoid') # Sigmoid output for binary classification
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

# --- Train the model ---
print("\nSplitting data and training the model...")
X_train, X_test, y_train, y_test = train_test_split(X_visual, y_visual, test_size=0.25, random_state=42, stratify=y_visual)

# Adding a callback to save the best model during training
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "best_cnn_model.h5",
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='max'
)

history = model.fit(X_train, y_train,
                    epochs=25,
                    batch_size=16,
                    validation_data=(X_test, y_test),
                    callbacks=[checkpoint])

# ==============================================================================
# Part 6: Evaluate the Model
# ==============================================================================
print("\nEvaluating the final model on the test set...")

# Load the best performing model saved by the callback
best_model = tf.keras.models.load_model("best_cnn_model.h5")

# Make predictions (as probabilities)
y_pred_proba = best_model.predict(X_test)
# Convert probabilities to binary class labels (0 or 1)
y_pred = (y_pred_proba > 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['False Positive', 'Exoplanet']))

# --- Plot Training History ---
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# Plot accuracy
ax1.plot(history.history['accuracy'], label='Train Accuracy')
ax1.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax1.set_title('Model Accuracy')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Accuracy')
ax1.legend()
# Plot loss
ax2.plot(history.history['loss'], label='Train Loss')
ax2.plot(history.history['val_loss'], label='Validation Loss')
ax2.set_title('Model Loss')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Loss')
ax2.legend()
plt.tight_layout()
plt.show()

print("\nCNN model training and evaluation complete.")
print("The best model has been saved as 'best_cnn_model.h5'")

Exception ignored in: <function tqdm.__del__ at 0x14f5c6820>
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/tf-macos/lib/python3.9/site-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/opt/homebrew/Caskroom/miniforge/base/envs/tf-macos/lib/python3.9/site-packages/tqdm/notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'



Building the 1D Convolutional Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-10-04 02:45:10.660886: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4
2025-10-04 02:45:10.660971: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-10-04 02:45:10.660985: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.92 GB
2025-10-04 02:45:10.661005: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-10-04 02:45:10.661024: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)



Splitting data and training the model...


ValueError: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.