<a href="https://colab.research.google.com/github/YashL-495107/font-size-37px-/blob/main/Exoplanet_csvs6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files

uploaded = files.upload()  # Choose all three files here


Saving xgb_kepler_model.pkl to xgb_kepler_model.pkl
Saving label_encoder.pkl to label_encoder.pkl
Saving cumulative_2025.10.02_22.38.50.csv to cumulative_2025.10.02_22.38.50.csv


In [2]:
import os
os.rename("cumulative_2025.10.02_22.38.50.csv", "cumulative.csv")


In [3]:
!pip install xgboost pandas scikit-learn




In [4]:
import joblib
import pandas as pd

# Load trained model and encoder
model = joblib.load("xgb_kepler_model.pkl")
encoder = joblib.load("label_encoder.pkl")

# Features used in training
FEATURES = [
    "koi_period",
    "koi_duration",
    "koi_depth",
    "koi_prad",
    "koi_teq",
    "koi_insol",
    "koi_model_snr",
    "koi_steff",
    "koi_slogg",
    "koi_srad",
    "koi_kepmag"
]


In [5]:
def confidence_to_label(p: float) -> str:
    if p >= 0.90:
        return "very high"
    if p >= 0.75:
        return "high"
    if p >= 0.50:
        return "medium"
    if p >= 0.25:
        return "low"
    return "very low"

def predict_exoplanet(features_dict):
    df = pd.DataFrame([features_dict])[FEATURES]
    pred_numeric = model.predict(df)
    probs = model.predict_proba(df)
    pred_idx = int(pred_numeric[0])
    pred_prob = float(probs[0][pred_idx])
    pred_label = encoder.inverse_transform(pred_numeric)[0]
    return {
        "prediction": pred_label,
        "confidence": pred_prob,
        "confidence_label": confidence_to_label(pred_prob)
    }

def predict_csv(file_path):
    df = pd.read_csv(file_path)
    df = df[FEATURES]
    preds_numeric = model.predict(df)
    preds_labels = encoder.inverse_transform(preds_numeric)
    probs = model.predict_proba(df)
    pred_indices = preds_numeric.astype(int)
    pred_probs = [float(probs[i][pred_indices[i]]) for i in range(len(df))]
    df["prediction"] = preds_labels
    df["confidence"] = pred_probs
    df["confidence_label"] = [confidence_to_label(p) for p in pred_probs]
    return df


In [6]:
csv_file = "cumulative.csv"

# Read CSV while skipping problematic lines
df = pd.read_csv(csv_file, sep=',', engine='python', on_bad_lines='skip')

print(df.shape)
df.head()


(52, 1)


Unnamed: 0,# This file was produced by the NASA Exoplanet Archive http://exoplanetarchive.ipac.caltech.edu
0,# Thu Oct 2 22:38:50 2025
1,#
2,# COLUMN kepid: KepID
3,# COLUMN kepoi_name: KOI Name
4,# COLUMN kepler_name: Kepler Name


In [7]:
import pandas as pd
import joblib

# Paths to your uploaded files in Colab
MODEL_PATH = "xgb_kepler_model.pkl"
ENCODER_PATH = "label_encoder.pkl"
CSV_FILE = "cumulative.csv"

# Load model and encoder
model = joblib.load(MODEL_PATH)
encoder = joblib.load(ENCODER_PATH)

# Features used in training
FEATURES = [
    "koi_period",
    "koi_duration",
    "koi_depth",
    "koi_prad",
    "koi_teq",
    "koi_insol",
    "koi_model_snr",
    "koi_steff",
    "koi_slogg",
    "koi_srad",
    "koi_kepmag",
]

# Read CSV while skipping comment lines
df = pd.read_csv(CSV_FILE, comment='#')

# Keep only the columns needed for prediction
df_features = df[FEATURES]

# Predict
preds_numeric = model.predict(df_features)
preds_labels = encoder.inverse_transform(preds_numeric)
probs = model.predict_proba(df_features)
pred_indices = preds_numeric.astype(int)
pred_probs = [float(probs[i][pred_indices[i]]) for i in range(len(df_features))]

# Add predictions to dataframe
df["prediction"] = preds_labels
df["confidence"] = pred_probs

# Show first 10 predictions
df.head(10)


Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,prediction,confidence
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,CONFIRMED,0.941075
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,CONFIRMED,0.976496
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.0,0,0,0,0,...,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,CANDIDATE,0.926103
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,FALSE POSITIVE,0.984494
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,CONFIRMED,0.87436
5,10872983,K00756.01,Kepler-228 d,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714,CONFIRMED,0.942152
6,10872983,K00756.02,Kepler-228 c,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714,CONFIRMED,0.93176
7,10872983,K00756.03,Kepler-228 b,CONFIRMED,CANDIDATE,0.992,0,0,0,0,...,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714,CONFIRMED,0.72887
8,6721123,K00114.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,1,0,...,0.182,-0.098,1.958,0.322,-0.483,298.86435,42.151569,12.66,FALSE POSITIVE,0.97166
9,10910878,K00757.01,Kepler-229 c,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,0.083,-0.028,0.848,0.033,-0.072,286.99948,48.37579,15.841,CONFIRMED,0.813459


In [8]:
def confidence_to_label(p):
    if p >= 0.90:
        return "very high"
    elif p >= 0.75:
        return "high"
    elif p >= 0.50:
        return "medium"
    elif p >= 0.25:
        return "low"
    else:
        return "very low"

df["confidence_label"] = df["confidence"].apply(confidence_to_label)
df.head(10)


Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,prediction,confidence,confidence_label
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,CONFIRMED,0.941075,very high
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,CONFIRMED,0.976496,very high
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.0,0,0,0,0,...,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,CANDIDATE,0.926103,very high
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,FALSE POSITIVE,0.984494,very high
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,CONFIRMED,0.87436,high
5,10872983,K00756.01,Kepler-228 d,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714,CONFIRMED,0.942152,very high
6,10872983,K00756.02,Kepler-228 c,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714,CONFIRMED,0.93176,very high
7,10872983,K00756.03,Kepler-228 b,CONFIRMED,CANDIDATE,0.992,0,0,0,0,...,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714,CONFIRMED,0.72887,medium
8,6721123,K00114.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,1,0,...,-0.098,1.958,0.322,-0.483,298.86435,42.151569,12.66,FALSE POSITIVE,0.97166,very high
9,10910878,K00757.01,Kepler-229 c,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-0.028,0.848,0.033,-0.072,286.99948,48.37579,15.841,CONFIRMED,0.813459,high


In [9]:
df.to_csv("koi_predictions.csv", index=False)


In [10]:
from google.colab import files
files.download("koi_predictions.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
FEATURES = [
    "koi_period",
    "koi_duration",
    "koi_depth",
    "koi_prad",
    "koi_teq",
    "koi_insol",
    "koi_model_snr",
    "koi_steff",
    "koi_slogg",
    "koi_srad",
    "koi_kepmag"
]


In [12]:
import pandas as pd

df = pd.read_csv("koi_predictions.csv")  # Replace with your downloaded file name


In [13]:
df_features = df[FEATURES]


In [14]:
import os

os.makedirs("single_exoplanet_csvs", exist_ok=True)

for i, row in df_features.iterrows():
    # Use Kepler name or KOI name if available, otherwise index
    filename = row.get("kepler_name") if pd.notna(row.get("kepler_name")) else f"exoplanet_{i}"
    # Make filename safe
    filename = filename.replace(" ", "_").replace("/", "_")
    row.to_frame().T.to_csv(f"single_exoplanet_csvs/{filename}.csv", index=False)


In [15]:
import os

os.listdir("single_exoplanet_csvs")


['exoplanet_2385.csv',
 'exoplanet_7951.csv',
 'exoplanet_6410.csv',
 'exoplanet_3688.csv',
 'exoplanet_2323.csv',
 'exoplanet_9348.csv',
 'exoplanet_1644.csv',
 'exoplanet_8732.csv',
 'exoplanet_6858.csv',
 'exoplanet_5354.csv',
 'exoplanet_3759.csv',
 'exoplanet_2370.csv',
 'exoplanet_2044.csv',
 'exoplanet_749.csv',
 'exoplanet_6710.csv',
 'exoplanet_8033.csv',
 'exoplanet_2424.csv',
 'exoplanet_3989.csv',
 'exoplanet_6987.csv',
 'exoplanet_5097.csv',
 'exoplanet_7341.csv',
 'exoplanet_6215.csv',
 'exoplanet_5007.csv',
 'exoplanet_8739.csv',
 'exoplanet_7845.csv',
 'exoplanet_7800.csv',
 'exoplanet_4576.csv',
 'exoplanet_7558.csv',
 'exoplanet_6372.csv',
 'exoplanet_7697.csv',
 'exoplanet_4073.csv',
 'exoplanet_4307.csv',
 'exoplanet_1993.csv',
 'exoplanet_1109.csv',
 'exoplanet_5103.csv',
 'exoplanet_4456.csv',
 'exoplanet_3121.csv',
 'exoplanet_9331.csv',
 'exoplanet_7708.csv',
 'exoplanet_872.csv',
 'exoplanet_5487.csv',
 'exoplanet_8777.csv',
 'exoplanet_5065.csv',
 'exoplanet_4