In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

import pandas as pd
import os

## Import Data

In [2]:
koi_df = pd.read_csv("cumulative.csv")
print(koi_df.head())

   rowid     kepid kepoi_name   kepler_name koi_disposition koi_pdisposition  \
0      1  10797460  K00752.01  Kepler-227 b       CONFIRMED        CANDIDATE   
1      2  10797460  K00752.02  Kepler-227 c       CONFIRMED        CANDIDATE   
2      3  10811496  K00753.01           NaN  FALSE POSITIVE   FALSE POSITIVE   
3      4  10848459  K00754.01           NaN  FALSE POSITIVE   FALSE POSITIVE   
4      5  10854555  K00755.01  Kepler-664 b       CONFIRMED        CANDIDATE   

   koi_score  koi_fpflag_nt  koi_fpflag_ss  koi_fpflag_co  ...  \
0      1.000              0              0              0  ...   
1      0.969              0              0              0  ...   
2      0.000              0              1              0  ...   
3      0.000              0              1              0  ...   
4      1.000              0              0              0  ...   

   koi_steff_err2  koi_slogg  koi_slogg_err1  koi_slogg_err2  koi_srad  \
0           -81.0      4.467           0.064    

In [3]:
len_df = len(koi_df)
len_con = len(koi_df[koi_df['koi_disposition'] == 'CONFIRMED'])
len_can = len(koi_df[koi_df['koi_disposition'] == 'CANDIDATE'])
len_fal = len(koi_df[koi_df['koi_disposition'] == 'FALSE POSITIVE'])

# Total Length
print(f"Number of entries: {len_df}")

# Confirmed
print(f"Number of confirmed: {len_con}")

# Candidate
print(f"Number of candidates: {len_can}")

# False Positives
print(f"Number of false positives: {len_fal}")

Number of entries: 9564
Number of confirmed: 2293
Number of candidates: 2248
Number of false positives: 5023


## Data Cleaning and Feature Selection

In [4]:
# Flags:
#        koi_fpflag_nt = 1 (Not Transit-Like Flag)
#        koi_fpflag_ss = 1 (Stellar Eclipse Flag)
#        koi_fpflag_co = 1 (Centroid Offset Flag)
#        koi_fpflag_ec = 1 (Ephemeris Match Indicates Contamination Flag)

# Integer Encode
koi_df.loc[koi_df.koi_disposition == 'CONFIRMED', 'category'] = 0 
koi_df.loc[koi_df.koi_disposition == 'CANDIDATE', 'category'] = 1
koi_df.loc[koi_df.koi_disposition == 'FALSE POSITIVE', 'category'] = 2

# Force null uncertainty to 0
koi_df['koi_duration_err1'] = koi_df['koi_duration_err1'].fillna(0)
koi_df['koi_depth_err1'] = koi_df['koi_depth_err1'].fillna(0)
koi_df['koi_period_err1'] = koi_df['koi_period_err1'].fillna(0)

# Correlation
koi_df.corr().to_csv("correlation.csv", index=True)

''' 
Using:
    kepid: unique id (Display only),
    kepler_name: name (Display only),
    koi_disposition: category (label),
    koi_duration: duration of transit (feature),
    koi_duration_err1: duration uncertainty (feature)
    koi_depth: stellar flux lost during transit (feature)
    koi_depth_err1: depth uncertainty (feature)
    koi_period: The interval between consecutive planetary transits(feature)
    koi_period_err1: Period uncertainty (feature)
    prad: eccentricity value(feature)
'''

koi_df = koi_df[['kepid', 
                'kepler_name', 
                 'koi_disposition', 
                 'koi_duration', 
                 'koi_duration_err1',
                 'koi_depth',
                 'koi_depth_err1',
                 'koi_period',
                 'koi_period_err1',
                 'koi_prad',
                 'koi_fpflag_nt', 
                 'koi_fpflag_ss', 
                 'koi_fpflag_co', 
                 'category']]

# Depth has null values. lets remove.
koi_df.dropna(subset = ["koi_depth"], inplace=True)

## Typical Stats

In [5]:
# Duration Mean / STD
duration_mean = koi_df[['koi_duration']].mean()
duration_std = koi_df[['koi_duration']].std()
duration_max = koi_df['koi_duration'].max()
duration_min = koi_df['koi_duration'].min()
duration_spread = duration_max - duration_min
print(f"Duration mean: {duration_mean.values[0]}, Duration std: {duration_std.values[0]}")
print(f"Duration max: {duration_max}, Duration min: {duration_min}, Duration spread: {duration_spread}")

# Depth Mean / STD
depth_mean = koi_df[['koi_depth']].mean()
depth_std = koi_df[['koi_depth']].std()
depth_std = koi_df[['koi_depth']].std()
depth_max = koi_df['koi_depth'].max()
depth_min = koi_df['koi_depth'].min()
depth_spread = depth_max - depth_min
print(f"Depth mean: {depth_mean.values[0]}, Depth std: {depth_std.values[0]}")
print(f"Depth max: {depth_max}, Depth min: {depth_min}, Depth spread: {depth_spread}")

# Correlation
print(koi_df[['kepid', 
                'kepler_name', 
                 'koi_disposition', 
                 'koi_duration',
                 'koi_duration_err1', 
                 'koi_depth',
                 'koi_depth_err1',
                 'koi_period',
                 'koi_period_err1',
                 'koi_prad',
                 'koi_fpflag_nt', 
                 'koi_fpflag_ss', 
                 'koi_fpflag_co', 
                 'category']].corr())

Duration mean: 5.6546922649711995, Duration std: 6.49970265482651
Duration max: 138.54, Duration min: 0.052, Duration spread: 138.488
Depth mean: 23791.335898271933, Depth std: 82242.68314876728
Depth max: 1541400.0, Depth min: 0.0, Depth spread: 1541400.0
                      kepid  koi_duration  koi_duration_err1  koi_depth  \
kepid              1.000000     -0.025658          -0.014655  -0.017483   
koi_duration      -0.025658      1.000000           0.372224   0.067275   
koi_duration_err1 -0.014655      0.372224           1.000000  -0.130779   
koi_depth         -0.017483      0.067275          -0.130779   1.000000   
koi_depth_err1    -0.016069      0.002231           0.019478   0.228531   
koi_period         0.011527      0.037294           0.032982  -0.009180   
koi_period_err1    0.011932      0.266048           0.543926  -0.070543   
koi_prad           0.003681      0.036573           0.021426   0.002558   
koi_fpflag_nt     -0.035018      0.288012           0.362135  -0.045

## Plots

In [6]:
%matplotlib notebook
import matplotlib.pyplot as plt

# # Candidates
# plt.figure("Candidates", figsize=(15,7))
# plt.scatter(koi_df['kepid'][:100], koi_df['category'][:100])
# plt.ticklabel_format(useOffset=False, style='plain')
# plt.xlabel("ID")
# plt.ylabel("Category")
# plt.show()

# # duration
# plt.figure("Duration", figsize=(15,7))
# plt.scatter(koi_df['kepid'][:100], koi_df['koi_duration'][:100])
# plt.ticklabel_format(useOffset=False, style='plain')
# plt.xlabel("ID")
# plt.ylabel("Duration")
# plt.show()

# # # depth
# plt.figure("Depth", figsize=(15,7))
# plt.plot(koi_df['koi_depth'])
# plt.xlabel("ID")
# plt.ylabel("Depth")
# plt.show()

# Against
# plt.figure("Against", figsize=(15,7))
# plt.scatter(x=koi_df.loc[koi_df['category'] == 1], y=koi_df.loc[koi_df['category'] == 1, 'koi_duration'])

## Standardize

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# scale
koi_df[['koi_depth', 'koi_duration', 'koi_prad']] = scaler.fit_transform(koi_df[['koi_depth', 'koi_duration','koi_prad']])
koi_df = koi_df.reset_index()

## Train/Test Split

In [8]:
# 2/3 split

# Flags included
x_train = koi_df[['koi_duration', 'koi_duration_err1', 'koi_depth', 'koi_depth_err1', 'koi_prad', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co']][:7356]
x_test = koi_df[['koi_duration', 'koi_duration_err1', 'koi_depth', 'koi_depth_err1', 'koi_prad', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co']][7357:]
y_train = koi_df[['category']][:7356]
y_test = koi_df[['category']][7357:]

## Sklearn log reg

In [9]:
from sklearn.linear_model import LogisticRegression

# Build and fit (regularized by default)
regressor = LogisticRegression(max_iter=1000)
history = regressor.fit(x_train, y_train.values.ravel())

print(regressor.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## Prediction

In [12]:
from sklearn.metrics import confusion_matrix
import numpy as np

predictions = regressor.predict(x_test)
print(predictions)
print(y_test)
pred_df = pd.DataFrame(predictions, columns=["y_hat"])
pred_df['y'] = y_test.values
pred_df.to_csv("preds.csv", index=False)

# Accuracy
correct = len(pred_df.loc[pred_df['y'] == pred_df['y_hat']])
print(f"Correct Predictions: {correct}")
accuracy = (correct / len(pred_df)) * 100
print(f"Accuracy: {accuracy} %")

# Confusion Matrix
confusion_matrix(predictions, y_test, labels=[0, 1, 2])

[2. 2. 2. ... 1. 2. 2.]
      category
7357       2.0
7358       2.0
7359       2.0
7360       2.0
7361       2.0
...        ...
9196       2.0
9197       2.0
9198       1.0
9199       2.0
9200       2.0

[1844 rows x 1 columns]
Correct Predictions: 1679
Accuracy: 91.05206073752711 %


array([[  24,   83,    9],
       [  11,  321,   46],
       [   3,   13, 1334]])

## Analysis