In [5]:
#import packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [6]:
# Load the data
data = pd.read_excel("C:/Users/noelj/Downloads/kepler_data.xlsx",skiprows=52)

In [7]:
# Inspect the data
print("Data Sample:")
print(data.head())

Data Sample:
      kepid kepoi_name   kepler_name koi_disposition koi_pdisposition  \
0  10797460  K00752.01  Kepler-227 b       CONFIRMED        CANDIDATE   
1  10797460  K00752.02  Kepler-227 c       CONFIRMED        CANDIDATE   
2  10811496  K00753.01           NaN       CANDIDATE        CANDIDATE   
3  10848459  K00754.01           NaN  FALSE POSITIVE   FALSE POSITIVE   
4  10854555  K00755.01  Kepler-664 b       CONFIRMED        CANDIDATE   

   koi_score  koi_fpflag_nt  koi_fpflag_ss  koi_fpflag_co  koi_fpflag_ec  ...  \
0      1.000              0              0              0              0  ...   
1      0.969              0              0              0              0  ...   
2      0.000              0              0              0              0  ...   
3      0.000              0              1              0              0  ...   
4      1.000              0              0              0              0  ...   

   koi_steff_err2  koi_slogg  koi_slogg_err1  koi_slogg_err2 

In [8]:
print("\nColumns in Data:")
print(data.columns)


Columns in Data:
Index(['kepid', 'kepoi_name', 'kepler_name', 'koi_disposition',
       'koi_pdisposition', 'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss',
       'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_period_err1',
       'koi_period_err2', 'koi_time0bk', 'koi_time0bk_err1',
       'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1', 'koi_impact_err2',
       'koi_duration', 'koi_duration_err1', 'koi_duration_err2', 'koi_depth',
       'koi_depth_err1', 'koi_depth_err2', 'koi_prad', 'koi_prad_err1',
       'koi_prad_err2', 'koi_teq', 'koi_teq_err1', 'koi_teq_err2', 'koi_insol',
       'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num',
       'koi_tce_delivname', 'koi_steff', 'koi_steff_err1', 'koi_steff_err2',
       'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad',
       'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')


In [9]:
# Preprocess Data
# Dropping columns that may not contribute to model performance
data = data.drop(['rowid', 'kepid', 'kepler_name'], axis=1, errors='ignore')  # Modify these column names as needed

In [10]:
# Handling missing values
#data = data.dropna()  # Dropping rows with missing values. Alternatively, we could impute them.

In [11]:
# Separating features and target variable
X = data.drop(['koi_disposition', 'kepoi_name'], axis=1)  # Drop the 'kepoi_name' column
y = data['koi_disposition']

In [12]:
# Encode the target variable (Candidate, False Positive, Confirmed)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [13]:
# Identify columns with object (string) dtype
object_cols = X.select_dtypes(include=['object']).columns

In [14]:
# Encode object columns using Label Encoding
for col in object_cols:
    label_encoder = LabelEncoder()  # Create a new encoder for each column
    X[col] = label_encoder.fit_transform(X[col])

In [15]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Scale features (standardize data)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [17]:
# Model Training
# Using Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [20]:
# Model Prediction
y_pred = clf.predict(X_test)

In [21]:
# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

Model Accuracy: 0.9189754312598014


In [22]:
# Detailed Classification Report
print("\nClassification Report:")

# Get unique labels from y_test and y_pred
unique_labels = np.unique(np.concatenate((y_test, y_pred)))

# Filter target names to match unique labels
filtered_target_names = [label_encoder.classes_[i] for i in unique_labels]
print(classification_report(y_test, y_pred, target_names=filtered_target_names))


Classification Report:
                 precision    recall  f1-score   support

     q1_q16_tce       0.86      0.81      0.84       484
q1_q17_dr24_tce       0.83      0.87      0.85       490
q1_q17_dr25_tce       1.00      1.00      1.00       939

       accuracy                           0.92      1913
      macro avg       0.90      0.89      0.89      1913
   weighted avg       0.92      0.92      0.92      1913

