# INSE6220 Final Project — KOI koi_score Surrogate Model

This notebook follows the agreed framework: clean the raw KOI table, save a clean dataset for reuse, and provide a dedicated "Load Cleaned Data" block for all later steps (EDA, PCA, and modeling).

## How to use this notebook
1. Run the Setup cells below.
2. Run the "Data Cleaning (run once)" cell to generate `data/koi_clean.csv`.
3. For all later work, use the "Load Cleaned Data" cell only — do not re-run cleaning (to avoid leakage and ensure reproducibility).

In [None]:
# Setup: imports and paths
import os
import pandas as pd
import numpy as np

# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Locations (relative to the working directory) where the raw KOI export is
# looked for, in order of preference.
RAW_CANDIDATES = [
    os.path.join('data', 'koi.csv'),
    os.path.join('..', 'data', 'koi.csv'),
]
# First candidate that exists on disk; when none exists, fall back to the
# first candidate.
RAW_PATH = next(filter(os.path.isfile, RAW_CANDIDATES), RAW_CANDIDATES[0])
# Every candidate ends in a file name, so the data directory is its parent folder.
DATA_DIR = os.path.dirname(RAW_PATH)
# The clean CSV is written next to the raw file.
CLEAN_PATH = os.path.join(DATA_DIR, 'koi_clean.csv')

In [None]:
# Column roles (kept consistent with the framework)

# Regression target
TARGET_COL = 'koi_score'

# Identifier columns
IDENTIFIER_COLS = [
    'kepid',        # Kepler Catalog ID
    'kepoi_name',   # KOI Name
    'kepler_name',  # Official Kepler Planet Name (if any)
]

# Disposition labels and false-positive (FP) flags
LABEL_COLS = [
    # Dispositions
    'koi_disposition',   # Exoplanet Archive Disposition
    'koi_pdisposition',  # Disposition Using Kepler Data
    # FP flags
    'koi_fpflag_nt',     # Not Transit-Like
    'koi_fpflag_ss',     # Stellar Eclipse
    'koi_fpflag_co',     # Centroid Offset
    'koi_fpflag_ec',     # Ephemeris Match
]

# Feature columns, assembled group by group (order matters downstream)
FEATURE_COLS = (
    # Transit geometry & shape
    ['koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_model_snr']
    # Planet properties & irradiation
    + ['koi_prad', 'koi_teq', 'koi_insol']
    # Stellar properties
    + ['koi_steff', 'koi_slogg', 'koi_srad']
    # Sky position & brightness
    + ['ra', 'dec', 'koi_kepmag']
)

# Metadata that must never enter the feature matrix, per framework.
# These names are not part of ALL_KEEP_COLS below.
DROP_ALWAYS = ['koi_tce_plnt_num', 'koi_tce_delivname']

# Full set of retained columns: identifiers, target, labels, then features
ALL_KEEP_COLS = [*IDENTIFIER_COLS, TARGET_COL, *LABEL_COLS, *FEATURE_COLS]

## Data Cleaning (run once)
Cleans the raw KOI table according to the framework and writes `data/koi_clean.csv`.

Notes:
- Skips comment lines from the raw NASA export.
- Retains identifier columns, target `koi_score`, label/flag columns, and the primary feature columns (central values only).
- Drops uncertainty columns (`*_err1`, `*_err2`) implicitly by not selecting them.
- Drops exact duplicate rows.
- Drops rows with missing or out-of-range `koi_score`.
- Does not impute feature missing values here (to avoid leakage) — handle later within ML pipelines.

In [None]:
def clean_koi(raw_path: str, out_path: str) -> pd.DataFrame:
    """
    Build the clean KOI table from the raw export and write it to disk.

    Steps, in order: read the raw CSV (lines starting with '#' are skipped),
    strip whitespace from the column names, keep only the columns of
    ALL_KEEP_COLS that are present, drop exact duplicate rows, coerce the
    numeric columns (unparseable values become NaN), drop rows whose
    koi_score is missing or outside [0, 1], and strip whitespace from the
    string columns. Missing feature values are left untouched.

    Parameters
    ----------
    raw_path : str
        Path of the raw KOI CSV.
    out_path : str
        Destination of the clean CSV. A missing parent directory is created;
        a bare file name (no directory part) is written to the current
        working directory.

    Returns
    -------
    pd.DataFrame
        The cleaned table that was written to ``out_path``.

    Raises
    ------
    KeyError
        If the target column is not present in the raw table.
    """
    # Load raw CSV; ignore NASA header comments starting with '#'
    df = pd.read_csv(raw_path, comment='#', low_memory=False)
    orig_rows = len(df)
    df.columns = df.columns.str.strip()

    # Keep only the specified columns if present
    keep_cols = [c for c in ALL_KEEP_COLS if c in df.columns]
    if TARGET_COL not in keep_cols:
        # Fail early with a clear message instead of deep inside dropna()
        raise KeyError(f'Target column {TARGET_COL!r} not found in {raw_path}')
    df = df.loc[:, keep_cols].copy()

    # Drop exact duplicate rows (compared on the retained columns only)
    before_dupes = len(df)
    df = df.drop_duplicates()
    dropped_dupes = before_dupes - len(df)

    # Coerce numeric columns: features, target, kepid and the FP flags
    numeric_wanted = set(FEATURE_COLS + [TARGET_COL, 'kepid'])
    numeric_wanted.update(c for c in LABEL_COLS if c.startswith('koi_fpflag_'))
    numeric_cols = [c for c in keep_cols if c in numeric_wanted]
    for c in numeric_cols:
        df[c] = pd.to_numeric(df[c], errors='coerce')

    # Remove rows with missing target (includes values that failed coercion)
    before_drop_y_na = len(df)
    df = df.dropna(subset=[TARGET_COL])
    dropped_y_na = before_drop_y_na - len(df)

    # Enforce koi_score within [0, 1]
    before_range = len(df)
    mask_valid = (df[TARGET_COL] >= 0.0) & (df[TARGET_COL] <= 1.0)
    df = df.loc[mask_valid].copy()
    dropped_out_of_range = before_range - len(df)

    # Normalize string columns
    for c in ['kepoi_name', 'kepler_name', 'koi_disposition', 'koi_pdisposition']:
        if c in df.columns:
            df[c] = df[c].astype('string').str.strip()

    # Save clean CSV. os.makedirs('') raises FileNotFoundError, so only create
    # the parent directory when out_path actually has one.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(out_path, index=False)

    print(f'Loaded rows: {orig_rows}')
    print(f'Dropped exact duplicates: {dropped_dupes}')
    print(f'Dropped missing koi_score: {dropped_y_na}')
    print(f'Dropped out-of-range koi_score: {dropped_out_of_range}')
    print(f'Kept columns ({len(keep_cols)}): {keep_cols}')
    print(f'Final shape: {df.shape}')
    print(f'Saved to: {out_path}')
    return df

# Produce the clean CSV at CLEAN_PATH unless it is already on disk
FORCE_REBUILD = False  # flip to True to regenerate and overwrite the existing clean file
if os.path.exists(CLEAN_PATH) and not FORCE_REBUILD:
    # Nothing to do: keep the existing file untouched
    print(f'Clean file already exists at {CLEAN_PATH}. Set FORCE_REBUILD=True and re-run this cell to rebuild.')
else:
    _df_clean = clean_koi(RAW_PATH, CLEAN_PATH)

## Load Cleaned Data (use this for EDA/PCA/Models)
This block loads only the clean dataset. Use it for all downstream analysis.

In [None]:
# Load the cleaned dataset
import os
import pandas as pd

# Places (relative to the working directory) where the clean CSV may live
CLEAN_CANDIDATES = [
    os.path.join(folder, 'koi_clean.csv')
    for folder in ('data', os.path.join('..', 'data'))
]
# First existing candidate; falls back to the first candidate when none exists
CLEAN_PATH = next(filter(os.path.isfile, CLEAN_CANDIDATES), CLEAN_CANDIDATES[0])
df = pd.read_csv(CLEAN_PATH, low_memory=False)

# Column groups are declared again so the notebook can be run from this cell onward
TARGET_COL = 'koi_score'
IDENTIFIER_COLS = ['kepid', 'kepoi_name', 'kepler_name']
LABEL_COLS = [
    'koi_disposition', 'koi_pdisposition',
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
]
FEATURE_COLS = [
    'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_model_snr',
    'koi_prad', 'koi_teq', 'koi_insol',
    'koi_steff', 'koi_slogg', 'koi_srad',
    'ra', 'dec', 'koi_kepmag',
]

print(f'Loaded clean dataset: {df.shape[0]} rows x {df.shape[1]} columns')
# Preview of the first rows (rendered as the cell output)
df.head()

## Box plots with automatic log transform (recommended)
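Decides per column whether to plot on a linear, log10, or log10(1+x) scale (based on skewness and the ratio of the 95th to the 5th percentile), prints the decision, and draws the box plots accordingly. `ra`, `dec`, and `koi_kepmag` are always kept linear, and outlier points are hidden (`showfliers=False`) so the boxes stay readable.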

In [None]:
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Column selection ---
# Candidate numeric columns
all_numeric = df.select_dtypes(include=['number']).columns.tolist()
# Exclude id/target and fp flags (treated as discrete)
exclude_base = {'kepid', 'koi_score'} | {c for c in df.columns if c.startswith('koi_fpflag_')}
# Numeric columns with at most 5 distinct values are treated as discrete and skipped
discrete_small = [c for c in all_numeric if df[c].nunique(dropna=True) <= 5]
numeric_cols = [c for c in all_numeric if c not in exclude_base and c not in discrete_small]

# Columns we keep linear due to physical meaning (angles/magnitude)
never_log = {'ra', 'dec', 'koi_kepmag'}

# --- Scale decision per column ---
# A column is log-scaled when it is right-skewed (skew > 0.75) or spans a wide
# range (95th/5th percentile ratio > 20; treated as infinite when the 5th
# percentile is not positive). Strictly positive columns use log10, columns
# with a minimum of exactly 0 use log10(1 + x), and columns with negative
# values stay linear.
log10_cols, log1p_cols, linear_cols = [], [], []
for col in numeric_cols:
    s = pd.to_numeric(df[col], errors='coerce').dropna()
    # Forced-linear columns and columns with fewer than 3 values are not analysed
    if col in never_log or len(s) < 3:
        linear_cols.append(col)
        continue
    skew = float(s.skew())
    q95, q05 = s.quantile(0.95), s.quantile(0.05)
    ratio = (q95 / max(q05, 1e-12)) if q05 > 0 else np.inf
    criterion = (skew > 0.75) or (ratio > 20)
    minv = float(s.min())
    if criterion and minv > 0:
        log10_cols.append(col)
    elif criterion and minv >= 0:
        log1p_cols.append(col)
    else:
        linear_cols.append(col)

# --- Transformed copy used only for plotting (df itself is left untouched) ---
plot_df = df.copy()
for col in log10_cols:
    plot_df[col] = np.log10(plot_df[col])
for col in log1p_cols:
    plot_df[col] = np.log10(plot_df[col] + 1.0)

print('Skipped discrete/binary columns:', discrete_small)
print('Log10 columns:', log10_cols)
print('Log10(1+x) columns:', log1p_cols)
print('Linear columns:', linear_cols)

# --- Box plots, using transformed data where applicable ---
cols_to_plot = numeric_cols
ncols = 4
if not cols_to_plot:
    # plt.subplots() rejects a grid with zero rows, so skip plotting entirely
    print('No continuous numeric columns to plot.')
else:
    nrows = math.ceil(len(cols_to_plot) / ncols)
    fig, axes = plt.subplots(nrows, ncols, figsize=(4.8 * ncols, 3.0 * nrows), squeeze=False)

    # axes.flat walks the grid row by row, matching the column order
    for ax, col in zip(axes.flat, cols_to_plot):
        sns.boxplot(x=plot_df[col].dropna(), ax=ax, color='#9467bd', orient='h', whis=1.5, showfliers=False)
        tr = ' (log10)' if col in log10_cols else (' (log10(1+x))' if col in log1p_cols else '')
        ax.set_title(col + tr)
        ax.grid(True, axis='x', linestyle=':', alpha=0.35)

    # Hide the unused axes at the end of the last row
    for ax in axes.flat[len(cols_to_plot):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()


## Attribute–koi_score correlation and covariance
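Absolute Pearson correlation and covariance with `koi_score` for the model-relevant features in `FEATURE_COLS` (excludes ID, label, and flag columns). Note that covariance depends on each feature's units and scale, so |cov| values are not directly comparable across features; use |corr| to rank the strength of linear association.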

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use model-relevant numeric features only to avoid label leakage
features = [c for c in FEATURE_COLS if c in df.columns]

# Pairwise statistics of every feature against the target, computed on the
# rows where both values are present
y = pd.to_numeric(df[TARGET_COL], errors='coerce')
rows = []
for col in features:
    x = pd.to_numeric(df[col], errors='coerce')
    m = x.notna() & y.notna()
    if m.sum() < 3:
        # Fewer than 3 complete pairs: no statistic is reported
        corr = cov = np.nan
    else:
        corr = x[m].corr(y[m])
        cov = x[m].cov(y[m])
    rows.append({
        'feature': col,
        'corr': corr,
        'abs_corr': np.abs(corr) if pd.notna(corr) else np.nan,
        'cov': cov,
        'abs_cov': np.abs(cov) if pd.notna(cov) else np.nan,
    })

# Passing the column names explicitly keeps the frame well-formed when no
# feature is available; otherwise dropna(subset=...) raises KeyError on an
# empty, column-less frame.
stats = pd.DataFrame(
    rows, columns=['feature', 'corr', 'abs_corr', 'cov', 'abs_cov']
).dropna(subset=['abs_corr'])
stats_corr = stats.sort_values('abs_corr', ascending=False)
stats_cov = stats.dropna(subset=['abs_cov']).sort_values('abs_cov', ascending=False)

# Plot absolute correlation (left) and absolute covariance (right), top 20 each
N = min(20, len(stats_corr))
M = min(20, len(stats_cov))
fig, axes = plt.subplots(1, 2, figsize=(12, max(4, 0.35 * max(N, 8))), sharex=False)

if N > 0:
    sub = stats_corr.head(N)
    sns.barplot(ax=axes[0], y='feature', x='abs_corr', data=sub, color='#1f77b4')
    axes[0].set_title(f'Abs Pearson correlation with koi_score (top {N})')
    axes[0].set_xlabel('|corr|')
    axes[0].set_ylabel('feature')
else:
    axes[0].set_visible(False)

if M > 0:
    sub2 = stats_cov.head(M)
    sns.barplot(ax=axes[1], y='feature', x='abs_cov', data=sub2, color='#ff7f0e')
    axes[1].set_title(f'Abs covariance with koi_score (top {M})')
    axes[1].set_xlabel('|cov|')
    axes[1].set_ylabel('feature')
else:
    axes[1].set_visible(False)

plt.tight_layout()
plt.show()

print('Top 10 by |corr|:')
print(stats_corr[['feature', 'corr']].head(10).to_string(index=False))
print()
print('Top 10 by |cov|:')
print(stats_cov[['feature', 'cov']].head(10).to_string(index=False))


## Distribution of koi_score
Basic distribution of the target to assess skew and mass near 0/1.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (40 bins) of the non-missing target values with a KDE overlay
plt.figure(figsize=(7, 4))
ax = sns.histplot(
    df['koi_score'].dropna(),
    bins=40,
    kde=True,
    color='#1f77b4',
    edgecolor='white',
)
# Title, axis labels and x-range are set in a single call
ax.set(
    title='Distribution of koi_score',
    xlabel='koi_score',
    ylabel='Count',
    xlim=(0, 1),
)
plt.tight_layout()
plt.show()

# Numeric summary printed under the figure
print('koi_score summary:', df['koi_score'].describe().to_string(), sep='\n')
