In [None]:
import pandas as pd
import numpy as np
import difflib
from pathlib import Path

# --- 1. Read robustly ---
# Lines starting with '#' are treated as comments and blank lines are skipped.
df = pd.read_csv('TESS.csv', comment='#', sep=',', skip_blank_lines=True)
df.columns = df.columns.str.strip()   # remove stray whitespace from header names

# Save a full backup of the parsed table (re-written on every run of this cell)
backup_path = Path('TESS_original_backup.csv')
df.to_csv(backup_path, index=False)
print("Backup saved to:", backup_path)

# --- 2. Show columns and quick diagnostics ---
print("Columns count:", len(df.columns))
print(df.columns.tolist())

# --- 3. Safe keep selection (intersection) ---
keep_columns = [
    'tfopwg_disp', 'ra', 'dec', 'st_pmra', 'st_pmdec',
    'pl_tranmid','pl_orbper','pl_trandurh','pl_trandep','pl_rade',
    'pl_insol','pl_eqt','st_tmag','st_dist','st_teff','st_logg','st_rad'
]

present = [c for c in keep_columns if c in df.columns]
# Built in list order (not via set difference) so the report is deterministic.
missing = [c for c in keep_columns if c not in df.columns]
print("Present keep columns:", present)
print("Missing expected columns (investigate):", missing)

for m in missing:
    print(m, "-> close matches:", difflib.get_close_matches(m, df.columns, n=6, cutoff=0.5))

# Working copy
# NOTE(review): `keep_columns` / `present` are only reported here; df_work still
# carries every column of df (identifiers and error columns included).
# Confirm whether the selection was meant to be applied.
df_work = df.copy()

# --- 4. TARGET MAPPING & CLEANING ---
print("="*60)
print("TARGET MAPPING")
print("="*60)

# Drop FA entirely. The explicit .copy() makes df_work an independent frame so
# the column assignments below do not trigger SettingWithCopyWarning.
df_work = df_work.loc[~df_work['tfopwg_disp'].isin(['FA'])].copy()

# Map remaining classes to our 3-class scheme
target_mapping = {
    'CP': 'Confirmed Planet',
    'KP': 'Confirmed Planet',
    'PC': 'Planet Candidate',
    'FP': 'False Positive'
}

df_work['tfopwg_disp_mapped'] = df_work['tfopwg_disp'].map(target_mapping)

# Drop rows whose disposition is missing or not covered by target_mapping,
# and say how many were removed instead of discarding them silently.
unmapped_mask = df_work['tfopwg_disp_mapped'].isna()
if unmapped_mask.any():
    print(f"Dropping {int(unmapped_mask.sum())} rows with missing/unmapped disposition")
df_work = df_work.loc[~unmapped_mask].copy()

# Encode labels: 0=Confirmed, 1=Candidate, 2=False Positive
label_map = {
    'Confirmed Planet': 0,
    'Planet Candidate': 1,
    'False Positive': 2
}
df_work['label'] = df_work['tfopwg_disp_mapped'].map(label_map)

# Summary
print("Mapped target distribution:")
print(df_work['tfopwg_disp_mapped'].value_counts())
print("\nLabel mapping:")
for k, v in label_map.items():
    count = (df_work['label'] == v).sum()
    print(f"  {k:18s} -> {v} ({count} samples)")

# --- 5. Save intermediate cleaned dataset ---
df_work.to_csv('TESS_cleaned_for_review.csv', index=False)
print("Intermediate cleaned file written to 'TESS_cleaned_for_review.csv'")


Backup saved to: TESS_original_backup.csv
Columns count: 87
['rowid', 'toi', 'toipfx', 'tid', 'ctoi_alias', 'pl_pnum', 'tfopwg_disp', 'rastr', 'ra', 'raerr1', 'raerr2', 'decstr', 'dec', 'decerr1', 'decerr2', 'st_pmra', 'st_pmraerr1', 'st_pmraerr2', 'st_pmralim', 'st_pmrasymerr', 'st_pmdec', 'st_pmdecerr1', 'st_pmdecerr2', 'st_pmdeclim', 'st_pmdecsymerr', 'pl_tranmid', 'pl_tranmiderr1', 'pl_tranmiderr2', 'pl_tranmidlim', 'pl_tranmidsymerr', 'pl_orbper', 'pl_orbpererr1', 'pl_orbpererr2', 'pl_orbperlim', 'pl_orbpersymerr', 'pl_trandurh', 'pl_trandurherr1', 'pl_trandurherr2', 'pl_trandurhlim', 'pl_trandurhsymerr', 'pl_trandep', 'pl_trandeperr1', 'pl_trandeperr2', 'pl_trandeplim', 'pl_trandepsymerr', 'pl_rade', 'pl_radeerr1', 'pl_radeerr2', 'pl_radelim', 'pl_radesymerr', 'pl_insol', 'pl_insolerr1', 'pl_insolerr2', 'pl_insollim', 'pl_insolsymerr', 'pl_eqt', 'pl_eqterr1', 'pl_eqterr2', 'pl_eqtlim', 'pl_eqtsymerr', 'st_tmag', 'st_tmagerr1', 'st_tmagerr2', 'st_tmaglim', 'st_tmagsymerr', 'st_dis

In [None]:
# ============================================================
# MISSING VALUE TREATMENT & FEATURE ENGINEERING
# ============================================================

import numpy as np
from sklearn.impute import SimpleImputer

print("="*60)
print("MISSING VALUE TREATMENT & FEATURE ENGINEERING")
print("="*60)

X = df_work.copy()  # operate on working copy

# 1️⃣ Boolean flags: if any '_is_limit' columns exist, fill missing with 0
bool_cols = [c for c in X.columns if '_is_limit' in c]
for col in bool_cols:
    before = X[col].isnull().sum()
    X[col] = X[col].fillna(0).astype(int)
    print(f"Filled {col}: {before} missing -> 0")

# 2️⃣ Relative errors: fill missing with median
err_cols = [c for c in X.columns if '_err_rel' in c]
for col in err_cols:
    before = X[col].isnull().sum()
    if before > 0:
        median_val = X[col].median()
        X[col] = X[col].fillna(median_val)
        print(f"Filled {col}: {before} missing -> median {median_val:.6f}")

# 3️⃣ Core numeric features: fill missing with median (prevent dataset shrinkage)
core_features = [
    'pl_orbper', 'pl_trandurh', 'pl_trandep', 'pl_rade',
    'st_teff', 'st_rad', 'st_logg', 'st_tmag'
]
for col in core_features:
    if col not in X.columns:
        continue
    before = X[col].isnull().sum()
    if before > 0:
        median_val = X[col].median()
        X[col] = X[col].fillna(median_val)
        print(f"Filled core feature {col}: {before} missing -> median {median_val:.4g}")

# 4️⃣ Replace infinities with NaN
X = X.replace([np.inf, -np.inf], np.nan)

# 5️⃣ Fill remaining numeric NaNs with median (only numeric columns)
numeric_cols = X.select_dtypes(include=np.number).columns

# A column with no observed value has a NaN median, so fillna() cannot repair
# it. Drop such columns instead of carrying all-NaN features downstream.
# Core features are left in place because the engineering step indexes them.
empty_cols = [c for c in numeric_cols if c not in core_features and X[c].isna().all()]
if empty_cols:
    X = X.drop(columns=empty_cols)
    numeric_cols = numeric_cols.drop(empty_cols)
    print(f"Dropped {len(empty_cols)} numeric columns with no observed values: {empty_cols}")

# Count BEFORE filling so the message reports what was actually imputed.
remaining_nans = int(X[numeric_cols].isnull().sum().sum())
if remaining_nans > 0:
    X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())
    print(f"Filled remaining {remaining_nans} numeric NaNs with median")

print(f"\nFinal missing values after imputation: {X.isnull().sum().sum()}")

# --- FEATURE ENGINEERING ---
columns_before_engineering = set(X.columns)

# Mask values that are invalid as a divisor / log / sqrt argument so that they
# become NaN (imputed below) rather than +/-inf or a NumPy RuntimeWarning.
positive_period = X['pl_orbper'].where(X['pl_orbper'] > 0)
nonneg_depth = X['pl_trandep'].where(X['pl_trandep'] >= 0)

X['transit_signal_strength'] = X['pl_trandep'] * X['pl_trandurh']
X['planet_star_radius_ratio'] = np.sqrt(nonneg_depth / 1e6)
X['stellar_density_proxy'] = (X['pl_trandurh'] / positive_period) ** 2

if 'pl_insol' in X.columns and 'pl_eqt' in X.columns:
    positive_insol = X['pl_insol'].where(X['pl_insol'] > 0)
    X['insol_temp_ratio'] = X['pl_eqt'] / np.sqrt(positive_insol)

err_cols_present = [c for c in err_cols if c in X.columns]
if err_cols_present:
    X['avg_measurement_error'] = X[err_cols_present].mean(axis=1)

if 'pl_eqt' in X.columns:
    X['is_hot_jupiter'] = (X['pl_eqt'] > 1000).astype(int)

X['is_small_planet'] = (X['pl_rade'] < 2.0).astype(int)
X['is_bright_star'] = (X['st_tmag'] < 12.0).astype(int)

if 'st_pmra' in X.columns and 'st_pmdec' in X.columns:
    X['proper_motion_total'] = np.sqrt(X['st_pmra']**2 + X['st_pmdec']**2)

X['log_period'] = np.log10(positive_period)

# The cleaning above ran before these columns existed, so repeat it for them.
engineered_cols = [c for c in X.columns if c not in columns_before_engineering]
X[engineered_cols] = X[engineered_cols].replace([np.inf, -np.inf], np.nan)
engineered_nans = int(X[engineered_cols].isnull().sum().sum())
if engineered_nans > 0:
    X[engineered_cols] = X[engineered_cols].fillna(X[engineered_cols].median())
    print(f"Filled {engineered_nans} NaN/inf values in engineered features with median")

print(f"\n{'='*60}")
print(f"Final feature count: {X.shape[1]} features")
print(f"Sample count: {X.shape[0]} samples")
print("="*60)


# ============================================================
# DATA SPLITTING & SCALING
# ============================================================

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

print("=" * 60)
print("DATA SPLITTING & SCALING")
print("=" * 60)

# --- 1️⃣ Separate the target from the feature table ---
# NOTE(review): every remaining column of X is treated as a feature here,
# including whatever identifier columns df_work carried — confirm this is wanted.
y = X['label']
X = X.drop(['tfopwg_disp', 'tfopwg_disp_mapped', 'label'], axis=1, errors='ignore')

# --- 2️⃣ Stratified 75/25 hold-out split with a fixed seed ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

print(f"Train samples: {X_train.shape[0]}")
print(f"Test samples:  {X_test.shape[0]}")

# --- 3️⃣ Standardize the numeric columns ---
# Non-numeric columns are left out of the scaled frames entirely.
X_train_num = X_train.select_dtypes(include=np.number)
X_test_num = X_test.select_dtypes(include=np.number)

# The scaler learns its statistics from the training rows only and is then
# applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_num),
    columns=X_train_num.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_num),
    columns=X_test_num.columns,
    index=X_test.index,
)

print("Scaling complete — numeric features standardized.")
print("=" * 60)

MISSING VALUE TREATMENT & FEATURE ENGINEERING
Filled core feature pl_orbper: 99 missing -> median 4.116
Filled core feature pl_rade: 459 missing -> median 10.5
Filled core feature st_teff: 148 missing -> median 5789
Filled core feature st_rad: 460 missing -> median 1.23
Filled core feature st_logg: 759 missing -> median 4.334
Filled remaining 85716 numeric NaNs with median

Final missing values after imputation: 85716

Final feature count: 98 features
Sample count: 7143 samples
DATA SPLITTING & SCALING
Train samples: 5357
Test samples:  1786
Scaling complete — numeric features standardized.


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [None]:
import numpy as np

# --- 6. MISSING VALUE TREATMENT & FEATURE ENGINEERING ---
print("="*60)
print("MISSING VALUE TREATMENT & FEATURE ENGINEERING")
print("="*60)

# NOTE(review): this cell repeats the imputation / feature-engineering cell above
# and rebinds X to a fresh copy of df_work (label columns included).
# Confirm it is still needed.
X = df_work.copy()  # operate on working copy

# 1️⃣ Boolean flags: if any '_is_limit' columns exist, fill missing with 0
bool_cols = [c for c in X.columns if '_is_limit' in c]
for col in bool_cols:
    before = X[col].isnull().sum()
    X[col] = X[col].fillna(0).astype(int)
    print(f"Filled {col}: {before} missing -> 0")

# 2️⃣ Relative errors: fill missing with median
err_cols = [c for c in X.columns if '_err_rel' in c]
for col in err_cols:
    before = X[col].isnull().sum()
    if before > 0:
        median_val = X[col].median()
        X[col] = X[col].fillna(median_val)
        print(f"Filled {col}: {before} missing -> median {median_val:.6f}")

# 3️⃣ Core numeric features: fill missing with median (prevent dataset shrinkage)
core_features = [
    'pl_orbper', 'pl_trandurh', 'pl_trandep', 'pl_rade',
    'st_teff', 'st_rad', 'st_logg', 'st_tmag'
]
for col in core_features:
    if col not in X.columns:
        continue
    before = X[col].isnull().sum()
    if before > 0:
        median_val = X[col].median()
        X[col] = X[col].fillna(median_val)
        print(f"Filled core feature {col}: {before} missing -> median {median_val:.4g}")

# 4️⃣ Replace infinities with NaN
X = X.replace([np.inf, -np.inf], np.nan)

# 5️⃣ Fill remaining numeric NaNs with median (only numeric columns)
numeric_cols = X.select_dtypes(include=np.number).columns

# A column with no observed value has a NaN median, so fillna() cannot repair
# it. Drop such columns instead of carrying all-NaN features downstream.
# Core features are left in place because the engineering step indexes them.
empty_cols = [c for c in numeric_cols if c not in core_features and X[c].isna().all()]
if empty_cols:
    X = X.drop(columns=empty_cols)
    numeric_cols = numeric_cols.drop(empty_cols)
    print(f"Dropped {len(empty_cols)} numeric columns with no observed values: {empty_cols}")

# Count BEFORE filling so the message reports what was actually imputed.
remaining_nans = int(X[numeric_cols].isnull().sum().sum())
if remaining_nans > 0:
    X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())
    print(f"Filled remaining {remaining_nans} numeric NaNs with median")

print(f"\nFinal missing values after imputation: {X.isnull().sum().sum()}")

# --- FEATURE ENGINEERING ---
columns_before_engineering = set(X.columns)

# Mask values that are invalid as a divisor / log / sqrt argument so that they
# become NaN (imputed below) rather than +/-inf or a NumPy RuntimeWarning.
positive_period = X['pl_orbper'].where(X['pl_orbper'] > 0)
nonneg_depth = X['pl_trandep'].where(X['pl_trandep'] >= 0)

X['transit_signal_strength'] = X['pl_trandep'] * X['pl_trandurh']
X['planet_star_radius_ratio'] = np.sqrt(nonneg_depth / 1e6)
X['stellar_density_proxy'] = (X['pl_trandurh'] / positive_period) ** 2

if 'pl_insol' in X.columns and 'pl_eqt' in X.columns:
    positive_insol = X['pl_insol'].where(X['pl_insol'] > 0)
    X['insol_temp_ratio'] = X['pl_eqt'] / np.sqrt(positive_insol)

err_cols_present = [c for c in err_cols if c in X.columns]
if err_cols_present:
    X['avg_measurement_error'] = X[err_cols_present].mean(axis=1)

if 'pl_eqt' in X.columns:
    X['is_hot_jupiter'] = (X['pl_eqt'] > 1000).astype(int)

X['is_small_planet'] = (X['pl_rade'] < 2.0).astype(int)
X['is_bright_star'] = (X['st_tmag'] < 12.0).astype(int)

if 'st_pmra' in X.columns and 'st_pmdec' in X.columns:
    X['proper_motion_total'] = np.sqrt(X['st_pmra']**2 + X['st_pmdec']**2)

X['log_period'] = np.log10(positive_period)

# The cleaning above ran before these columns existed, so repeat it for them.
engineered_cols = [c for c in X.columns if c not in columns_before_engineering]
X[engineered_cols] = X[engineered_cols].replace([np.inf, -np.inf], np.nan)
engineered_nans = int(X[engineered_cols].isnull().sum().sum())
if engineered_nans > 0:
    X[engineered_cols] = X[engineered_cols].fillna(X[engineered_cols].median())
    print(f"Filled {engineered_nans} NaN/inf values in engineered features with median")

print(f"\n{'='*60}")
print(f"Final feature count: {X.shape[1]} features")
print(f"Sample count: {X.shape[0]} samples")
print("="*60)


MISSING VALUE TREATMENT & FEATURE ENGINEERING
Filled core feature pl_orbper: 99 missing -> median 4.116
Filled core feature pl_rade: 459 missing -> median 10.5
Filled core feature st_teff: 148 missing -> median 5789
Filled core feature st_rad: 460 missing -> median 1.23
Filled core feature st_logg: 759 missing -> median 4.334
Filled remaining 85716 numeric NaNs with median

Final missing values after imputation: 85716

Final feature count: 98 features
Sample count: 7143 samples


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# --- BASELINE MODEL: RANDOM FOREST ---
banner = "=" * 60
print(banner)
print("BASELINE MODEL: RANDOM FOREST")
print(banner)

# 200 unbounded-depth trees; 'balanced' re-weights classes inversely to their
# frequency in y_train. Seed fixed for repeatability.
rf_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight='balanced',
    random_state=42,
)

# Fit on the scaled training frame, then score the held-out frame
rf_clf.fit(X_train_scaled, y_train)
y_pred = rf_clf.predict(X_test_scaled)

# Display names for labels 0, 1, 2 (in that order)
target_names_str = ['Confirmed Planet', 'Planet Candidate', 'False Positive']

# Per-class precision / recall / F1
report_text = classification_report(y_test, y_pred, target_names=target_names_str)
print("\nClassification Report:")
print(report_text)

# Rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)


BASELINE MODEL: RANDOM FOREST

Classification Report:
                  precision    recall  f1-score   support

Confirmed Planet       0.80      0.49      0.61       317
Planet Candidate       0.77      0.94      0.85      1170
  False Positive       0.74      0.42      0.54       299

        accuracy                           0.77      1786
       macro avg       0.77      0.62      0.66      1786
    weighted avg       0.77      0.77      0.75      1786


Confusion Matrix:
[[ 155  156    6]
 [  33 1100   37]
 [   6  168  125]]


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# --- XGBOOST CLASSIFIER WITH HYPERPARAMETER TUNING ---
print("="*60)
print("XGBOOST CLASSIFIER WITH RANDOMIZED SEARCH")
print("="*60)

# Initialize base XGBClassifier.
# `use_label_encoder` is intentionally not passed: the installed XGBoost ignores
# it and emits a "Parameters: { "use_label_encoder" } are not used." warning on
# every fit of the search.
xgb_clf = XGBClassifier(
    objective='multi:softmax',  # multi-class classification
    num_class=3,
    eval_metric='mlogloss',
    random_state=42
)

# Hyperparameter search space (each list is sampled uniformly)
param_grid = {
    'n_estimators': [100, 200, 400, 600],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2.0]
}

# Randomized search
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    n_iter=20,           # number of random combinations to try
    scoring='f1_macro',  # focus on macro F1
    cv=3,                # 3-fold CV
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit search (model selection uses the training split only)
random_search.fit(X_train_scaled, y_train)

# Best model, refit on the full training split by RandomizedSearchCV
best_xgb = random_search.best_estimator_
print("\nBest hyperparameters found:")
print(random_search.best_params_)

# Predict on the held-out split
y_pred_xgb = best_xgb.predict(X_test_scaled)

# Evaluation
target_names_str = ['Confirmed Planet', 'Planet Candidate', 'False Positive']
print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=target_names_str))

print("\nConfusion Matrix:")
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
print(cm_xgb)


XGBOOST CLASSIFIER WITH RANDOMIZED SEARCH
Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best hyperparameters found:
{'subsample': 1.0, 'reg_lambda': 1.5, 'reg_alpha': 0.5, 'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.6}

Classification Report:
                  precision    recall  f1-score   support

Confirmed Planet       0.77      0.59      0.67       317
Planet Candidate       0.81      0.93      0.86      1170
  False Positive       0.75      0.51      0.60       299

        accuracy                           0.80      1786
       macro avg       0.77      0.67      0.71      1786
    weighted avg       0.79      0.80      0.79      1786


Confusion Matrix:
[[ 187  120   10]
 [  45 1083   42]
 [  11  136  152]]


In [None]:
# Keep only numeric columns that have at least one observed value in the
# training split. SimpleImputer(strategy='median') cannot compute a median for
# an all-NaN column: it discards such columns and emits a warning. Dropping them
# explicitly yields the same matrix without the warning and keeps the surviving
# column names available in `observed_cols`.
X_train_num = X_train_scaled.select_dtypes(include=np.number)
observed_cols = X_train_num.columns[X_train_num.notna().any(axis=0)]
X_train_num = X_train_num[observed_cols]
X_test_num = X_test_scaled[observed_cols]

# Impute any remaining NaNs with the training-split median
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
X_train_num = imputer.fit_transform(X_train_num)   # ndarray, columns ordered as observed_cols
X_test_num = imputer.transform(X_test_num)


 'pl_insollim' 'pl_insolsymerr' 'pl_eqterr1' 'pl_eqterr2' 'pl_eqtlim'
 'pl_eqtsymerr']. At least one non-missing value is needed for imputation with strategy='median'.
 'pl_insollim' 'pl_insolsymerr' 'pl_eqterr1' 'pl_eqterr2' 'pl_eqtlim'
 'pl_eqtsymerr']. At least one non-missing value is needed for imputation with strategy='median'.


In [None]:
# --- Use numeric, imputed features ---
# Keep only numeric columns with at least one observed value in the training
# split. SimpleImputer(strategy='median') would otherwise discard the all-NaN
# columns itself and emit a warning; dropping them up front gives the same
# matrix and records the surviving column names in `observed_cols`.
X_train_num = X_train_scaled.select_dtypes(include=np.number)
observed_cols = X_train_num.columns[X_train_num.notna().any(axis=0)]
X_train_num = X_train_num[observed_cols]
X_test_num = X_test_scaled[observed_cols]

# Impute any remaining NaNs with the training-split median
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
X_train_num = imputer.fit_transform(X_train_num)
X_test_num = imputer.transform(X_test_num)

# --- Stacking ensemble ---
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Base learners.
# The XGBoost learner uses 'multi:softprob' because the stack consumes class
# probabilities from its base learners; hard predictions are unchanged.
estimators = [
    ('rf', RandomForestClassifier(n_estimators=300, max_depth=10, random_state=42)),
    ('xgb', XGBClassifier(
        objective='multi:softprob', num_class=3, eval_metric='mlogloss',
        n_estimators=400, max_depth=7, learning_rate=0.05, random_state=42
    )),
]

# Meta-learner.
# `multi_class` is no longer passed: the argument is deprecated in recent
# scikit-learn, and the default solver already fits a multinomial model when
# there are three or more classes.
meta_clf = LogisticRegression(max_iter=1000)

# Stacking: out-of-fold base-learner probabilities (3 folds) plus the original
# features (passthrough=True) are fed to the meta-learner.
stack_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_clf,
    cv=3,
    n_jobs=-1,
    passthrough=True
)

# Fit on numeric, imputed data
stack_clf.fit(X_train_num, y_train)

# Predict
y_pred_stack = stack_clf.predict(X_test_num)

# Predict probabilities for ROC-AUC
y_pred_stack_proba = stack_clf.predict_proba(X_test_num)

# Evaluation
# Defined here as well so this cell does not depend on an earlier model cell
# having been run; order matches labels 0, 1, 2.
target_names_str = ['Confirmed Planet', 'Planet Candidate', 'False Positive']
print("\nStacked Model Classification Report:")
print(classification_report(y_test, y_pred_stack, target_names=target_names_str))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_stack))

# Calculate ROC-AUC
# For multi-class, use 'ovo' (one vs one) or 'ovr' (one vs rest)
roc_auc = roc_auc_score(y_test, y_pred_stack_proba, multi_class='ovo')
print(f"\nROC-AUC Score (One vs One): {roc_auc:.4f}")

 'pl_insollim' 'pl_insolsymerr' 'pl_eqterr1' 'pl_eqterr2' 'pl_eqtlim'
 'pl_eqtsymerr']. At least one non-missing value is needed for imputation with strategy='median'.
 'pl_insollim' 'pl_insolsymerr' 'pl_eqterr1' 'pl_eqterr2' 'pl_eqtlim'
 'pl_eqtsymerr']. At least one non-missing value is needed for imputation with strategy='median'.



Stacked Model Classification Report:
                  precision    recall  f1-score   support

Confirmed Planet       0.79      0.60      0.68       317
Planet Candidate       0.81      0.93      0.86      1170
  False Positive       0.73      0.49      0.59       299

        accuracy                           0.80      1786
       macro avg       0.78      0.67      0.71      1786
    weighted avg       0.79      0.80      0.78      1786


Confusion Matrix:
[[ 191  119    7]
 [  39 1084   47]
 [  12  141  146]]

ROC-AUC Score (One vs One): 0.8870
