In [2]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [1]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
# !pip install joblib

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the CSV and Perform Basic Data Cleaning

In [32]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()



Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [3]:
# Set features. This will also be used as your x values.
selected_features = df.drop(['koi_disposition'], axis=1)
print(selected_features.columns)


Index(['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
       'koi_period', 'koi_period_err1', 'koi_period_err2', 'koi_time0bk',
       'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1',
       'koi_impact_err2', 'koi_duration', 'koi_duration_err1',
       'koi_duration_err2', 'koi_depth', 'koi_depth_err1', 'koi_depth_err2',
       'koi_prad', 'koi_prad_err1', 'koi_prad_err2', 'koi_teq', 'koi_insol',
       'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num',
       'koi_steff', 'koi_steff_err1', 'koi_steff_err2', 'koi_slogg',
       'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad', 'koi_srad_err1',
       'koi_srad_err2', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')


# Create a Train Test Split

Use `koi_disposition` for the y values

In [67]:
y = df["koi_disposition"]
X = selected_features
y.unique()

array(['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE'], dtype=object)

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [10]:
X_train

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
3875,1,0,0,0,11.377528,1.111000e-04,-1.111000e-04,132.584800,0.008330,-0.008330,...,-305,4.162,0.128,-0.192,1.660,0.528,-0.352,300.19409,45.145741,13.930
1768,0,0,0,0,2.215713,7.630000e-06,-7.630000e-06,131.821070,0.002980,-0.002980,...,-169,4.447,0.108,-0.162,0.901,0.195,-0.130,297.02008,43.432549,15.392
3250,0,0,0,0,7.785911,2.034000e-04,-2.034000e-04,137.873400,0.027900,-0.027900,...,-180,4.479,0.054,-0.216,0.954,0.305,-0.102,290.42307,51.388729,13.515
6574,1,0,1,1,2.404557,3.730000e-06,-3.730000e-06,131.676160,0.001440,-0.001440,...,-175,4.013,0.259,-0.130,1.775,0.411,-0.503,293.21356,46.175129,13.474
2815,0,0,0,0,110.461746,2.985000e-03,-2.985000e-03,147.546500,0.023800,-0.023800,...,-206,4.434,0.054,-0.216,1.058,0.349,-0.116,287.03952,46.481701,15.092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,0,1,0,0,8.268081,6.340000e-07,-6.340000e-07,135.056330,0.000064,-0.000064,...,-190,4.502,0.050,-0.200,0.922,0.273,-0.091,292.53125,46.728699,15.768
5191,0,0,0,0,11.161938,1.677000e-04,-1.677000e-04,133.553800,0.013000,-0.013000,...,-124,4.072,0.188,-0.101,1.640,0.281,-0.343,295.21268,49.562180,13.374
5226,0,1,0,0,6.150251,7.000000e-07,-7.000000e-07,134.422825,0.000088,-0.000088,...,-458,3.896,0.270,-0.180,2.867,0.988,-1.087,297.18176,45.988441,10.622
5390,1,0,0,0,3.343285,4.380000e-05,-4.380000e-05,134.845100,0.011200,-0.011200,...,-197,3.773,0.293,-0.098,2.652,0.433,-0.939,296.86258,41.147419,13.276


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [45]:
# Scale your data
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

print(encoded_y_train)


[2 1 0 ... 2 2 2]


# Train the Model



In [50]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

model.fit(X_train_scaled, y_train_categorical)
predict_y=model.predict(X_test_scaled)

In [63]:
# print(model.intercept_)
# print(model.coef_)

# print(f"Training Data Score: {model.score(X_train_scaled, y_train_categorical)}")
# print(f"Testing Data Score: {model.score(X_test_scaled, y_test_categorical)}")


from sklearn.metrics import mean_squared_error
print("MSE:" ,mean_squared_error(y_test_categorical, predict_y))

# small RSS means a tighter fit to the the line.  Diff between predicted and actual.  How far the actual point 
# from the predicted line
print("Residual sum of squares: %.2f"
              % np.mean((model.predict(X_test_scaled) - y_test_categorical) ** 2))

MSE: 0.10347205362035207
Residual sum of squares: 0.10


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [61]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV


parameters = {'copy_X':[1,5,10],
                      "penalty":["l1","l2"]} 
grid = GridSearchCV(model,parameters, cv=None)
grid.fit(X_train_scaled, y_train_categorical)
print("r2 / variance : ", grid.best_score_)
print("Residual sum of squares: %.2f"
              % np.mean((grid.predict(X_test_scaled) - y_test_categorical) ** 2))



ValueError: Invalid parameter penalty for estimator LinearRegression(copy_X=1, fit_intercept=True, n_jobs=None, normalize=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [17]:
model.get_params().keys()

dict_keys(['copy_X', 'fit_intercept', 'n_jobs', 'normalize'])

In [25]:
grid.fit(X_train_scaled, y_train_categorical)





GridSearchCV(cv=None, error_score=nan,
             estimator=LinearRegression(copy_X=True, fit_intercept=True,
                                        n_jobs=None, normalize=False),
             iid='deprecated', n_jobs=None,
             param_grid={'copy_X': [1, 5, 10], 'fit_intercept': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [26]:
# Train the model with GridSearch

In [26]:
print(grid.best_params_)
print(grid.best_score_)

{'copy_X': 1, 'fit_intercept': 'l1'}
0.5141292552678381


# Save the Model

In [30]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'mult_regression.sav'
joblib.dump(model, filename)

['mult_regression.sav']