# Using Deep Learning to Find Hot-Jupiters

## 1 Find Training Set
### Uncleaned Dataset (Given by DSECOP Tutorials)

In [2]:
import pandas as pd
exoplanets = pd.read_csv('Data/NASAExoplanetsData.csv')
exoplanets.head()

Unnamed: 0.1,Unnamed: 0,loc_rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,label
0,1,2,10666592,K00002.01,Kepler-2 b,CONFIRMED,CANDIDATE,,0,1,...,4.021,0.011,-0.011,1.991,0.018,-0.018,292.24728,47.969521,10.463,1.0
1,3,4,3861595,K00004.01,Kepler-1658 b,CONFIRMED,CANDIDATE,,0,1,...,3.657,0.205,-0.107,2.992,0.469,-0.743,294.35654,38.94738,11.432,1.0
2,5,6,3248033,K00006.01,,FALSE POSITIVE,FALSE POSITIVE,,0,0,...,4.106,0.175,-0.152,1.58,0.415,-0.34,294.59955,38.366772,12.161,1.0
3,6,7,11853905,K00007.01,Kepler-4 b,CONFIRMED,CANDIDATE,,0,0,...,4.105,0.01,-0.01,1.533,0.04,-0.04,285.61533,50.13575,12.211,1.0
4,7,8,5903312,K00008.01,,FALSE POSITIVE,FALSE POSITIVE,,0,0,...,4.433,0.062,-0.156,0.985,0.187,-0.079,298.66101,41.13789,12.45,1.0


## 2 Find and Isolate our needed parameters

### Cleaned Dataset
I've isolated the values we will be utilizing as inputs for our training.

In [4]:
exoplanets = pd.read_csv('Data/NASAExoplanetsDataCleaned-full.csv')
exoplanets.head()

Unnamed: 0.1,Unnamed: 0,koi_period,koi_duration,koi_prad,koi_teq,koi_steff,koi_slogg,koi_srad,label
0,1,2.204735,3.88216,16.39,2025,6350,4.021,1.991,1
1,3,3.849372,2.6605,13.1,2035,6244,3.657,2.992,1
2,5,1.334104,3.0142,50.73,2166,6178,4.106,1.58,1
3,6,3.213669,3.99355,4.14,1507,5781,4.105,1.533,1
4,7,1.160153,1.4127,2.0,1752,5842,4.433,0.985,1


### Split between a training set and a testing set
I split the data, each with half Hot Jupiters and half not Hot Jupiters, into two csv files: one for training and one for testing.

In [6]:
exoplanets_train = pd.read_csv('Data/LearnData.csv')
exoplanets_train.head()

Unnamed: 0,koi_period,koi_duration,koi_prad,koi_teq,koi_steff,koi_slogg,koi_srad,label
0,2.204735,3.88216,16.39,2025,6350,4.021,1.991,1
1,3.849372,2.6605,13.1,2035,6244,3.657,2.992,1
2,1.334104,3.0142,50.73,2166,6178,4.106,1.58,1
3,3.213669,3.99355,4.14,1507,5781,4.105,1.533,1
4,1.160153,1.4127,2.0,1752,5842,4.433,0.985,1


In [7]:
exoplanets_test = pd.read_csv('Data/TestData.csv')
exoplanets_test.head()

Unnamed: 0,koi_period,koi_duration,koi_prad,koi_teq,koi_steff,koi_slogg,koi_srad,label
0,1.636689,1.353,11.55,2560,5234,3.436,3.739,1
1,0.616388,0.8228,2.9,3451,5667,3.625,3.049,1
2,0.895725,0.839,1.53,1786,5897,4.56,0.834,1
3,2.20922,2.73,0.88,1507,5991,4.375,1.092,1
4,0.519439,2.1631,40.61,2403,6177,4.462,1.011,1


## 3 Set Hyperparameters

| Value | Description |
| ----- | ----------- |
| Learning Rate | $α = 0.03$  |
| Activation Function (between input layer and until after hidden layer 2) | $g(z) = tanh(z) $ |
| Activation Function (Between final layer and output) | $g(z) = σ(z)$ |
| Hidden Layers | 3 |
| Nodes in Hidden Layers | [4, 3, 1] |
| Iterations | 5000 |

**Note**: This means we will not stop based on the value of our cost function, we will stop after the given number of iterations.

The following is a diagram depicting the information in the table.

![simplified graphic of my neural network](Resources/Flowchart.png)

## 4 Define the Loss and Cost Function, Generalize Gradient Descent method

Our Loss function is the Log-Likelihood Loss function, defined as the following:

$L(a,y^i)=-y^ilog(a)-(1-y^i)log(1-a)$.

We define our Cost function as the following:

$J(ω,b)=\frac{1}{m}Σ^m_{i=1}L(a,y^i)$.

Using the Log-Likelihood Loss function, we define our Cost function as the following:

$J(ω,b)=\frac{1}{m}Σ^m_{i=1}[-y^ilog(a)-(1-y^i)log(1-a)]$

We will define our Gradient Descent method as the following.


In [36]:
# Generalized Gradient Descent Method
# Start with one layer and go from there

import numpy as np
import math

def read_in_dataset(file_loc):
    num = np.genfromtxt(file_loc, dtype=float, delimiter=",", skip_header=True)
    return num

def sigmoid(x):
    return(1 / (1 + math.exp(-x)))

def tanActive(x):
    return(np.tanh(x))

def gradient_descent(exoplanet_vars, Y, alpha, iter):
    # Define variables
    a = []
    z = []
    m = len(Y)
    n = exoplanet_vars.shape[0]
    w = np.random.rand(1, n) * np.sqrt(1/n)
    b = np.random.rand()

    for k in range(iter):
        z = np.dot(w, exoplanet_vars) + b
        a = tanActive(z)
        dz = a - Y
        w -= (1/m) * np.dot(dz, exoplanet_vars.T)
        b -= alpha * dz

    return(w, b)

def main():
    n_iteration = 100_000
    learning_rate = 0.0008

    dataset_location = "data/LearnData.csv"
    num_planets = read_in_dataset(dataset_location)
    num_planets = num_planets.T
    Y = num_planets[7:].flatten() # use flatten to make it one dimensional after using the "label" column
    num_planets = num_planets[:-1]
    print(gradient_descent(num_planets, Y, learning_rate, n_iteration))
    
if __name__ == "__main__":
    main()

(array([[-4.44123655e+02, -2.14984460e+01, -1.12768721e+02,
         5.50965039e+03, -1.15085138e+03, -1.93245515e+00,
         1.36047526e+00]]), array([[0.55861351, 0.55861351, 0.55861351, ..., 0.54421351, 0.54421351,
        0.54421351]]))


In [44]:
import numpy as np
from sklearn import linear_model

def read_in_dataset(file_loc):
    num = np.genfromtxt(file_loc, dtype=float, delimiter=",", skip_header=True)
    return num

def main():
    n_iteration = 100_000
    learning_rate = 0.0008

    dataset_location = "data/LearnData.csv"
    num_planets = read_in_dataset(dataset_location)
    Y = (num_planets.T)[7:].flatten() # use flatten to make it one dimensional after using the "label" column
    num_planets = num_planets[:,:-1]

    SGDClf = linear_model.SGDClassifier(loss="log_loss", alpha=learning_rate, max_iter=n_iteration)
    SGDClf.fit(num_planets, Y)

    test_data = read_in_dataset("data/TestData.csv")
    testY = (test_data.T)[7:].flatten()
    test_data = test_data[:,:-1]

    # let's predict and compare...how well do our predicted values perform?

    predictedY = set(SGDClf.predict(test_data))
    realY = set(testY)

    percentMatch = len(predictedY.intersection(realY)) / len(predictedY) * 100
    print(percentMatch)
    
if __name__ == "__main__":
    main()

100.0
