In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade
!pip install joblib
#restart kernel.

Collecting sklearn
  Downloading https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Collecting scikit-learn (from sklearn)
  Downloading https://files.pythonhosted.org/packages/7e/e5/888491b7e2c16718a68dfd8498325e8927003410b2d19ba255d8751338a5/scikit_learn-0.23.1-cp38-cp38-win_amd64.whl (6.8MB)
Collecting scipy>=0.19.1 (from scikit-learn->sklearn)
  Downloading https://files.pythonhosted.org/packages/e3/dd/c6b603e261c828c38ad95b6e902abb27e1797e3bc7a10b92e389bd57eb5a/scipy-1.5.1-cp38-cp38-win_amd64.whl (31.4MB)
Collecting joblib>=0.11 (from scikit-learn->sklearn)
  Downloading https://files.pythonhosted.org/packages/51/dd/0e015051b4a27ec5a58b02ab774059f3289a94b0906f880a3f9507e74f38/joblib-0.16.0-py3-none-any.whl (300kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn->sklearn)
  Downloading https://files.pythonhosted.org/packages/f7/12/ec3f2e203afa394a149911729357aa48affc59c20e2c1c8297a60f33f133/threadpoolctl-

In [1]:
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler #used to scale the numerical data
from sklearn.model_selection import train_test_split #Split data into test & training datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV #gdrisearch tuning
from sklearn.metrics import classification_report

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the raw cumulative table from the data folder.
df = pd.read_csv("../data/cumulative.csv")

# Columns removed by name because they are not used as model features.
columnsToDrop = [
    'kepid',
    'rowid',
    'kepoi_name',
    'kepler_name',
    'koi_pdisposition',
    'koi_score',
    'koi_tce_delivname',
]
# Every column whose name contains "err" is removed as well
# (presumably the measurement-uncertainty columns -- confirm against the data dictionary).
columnsToDrop.extend(name for name in df.columns if 'err' in name)

# Build the feature frame in one pass:
#   1. drop the unwanted columns,
#   2. drop columns in which every value is null,
#   3. drop any row that still contains a null.
filteredDF = (
    df.drop(columns=columnsToDrop)
    .dropna(axis='columns', how='all')
    .dropna()
)
filteredDF.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,...,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,170.53875,0.146,2.9575,615.8,...,793.0,93.59,35.8,1.0,5455.0,4.467,0.927,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,...,443.0,9.11,25.8,2.0,5455.0,4.467,0.927,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,...,638.0,39.3,76.3,1.0,5853.0,4.544,0.868,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,...,1395.0,891.96,505.6,1.0,5805.0,4.564,0.791,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,...,1406.0,926.16,40.9,1.0,6031.0,4.438,1.046,288.75488,48.2262,15.509


# Create a Train Test Split

Use `koi_disposition` for the y values

In [3]:
# Target labels: the koi_disposition column on its own.
yValues = filteredDF['koi_disposition']
# Feature matrix: every column except koi_disposition.
xValues = filteredDF.drop(columns=['koi_disposition'])

# Split features and labels into training and testing sets.
# A fixed random_state makes the shuffle (and therefore the split) repeatable across runs.
xTraining, xTesting, yTraining, yTesting = train_test_split(
    xValues,
    yValues,
    random_state=42,
)
xTraining.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
2225,0,0,0,0,13.297069,140.94606,0.03,3.283,219.7,0.91,515.0,16.64,14.9,2.0,4500.0,4.632,0.633,284.82803,44.60873,14.993
6264,1,0,0,0,273.169948,309.84052,0.3259,2.701,283.3,2.8,362.0,4.08,6.2,1.0,5880.0,4.035,1.65,283.16446,41.778519,13.831
1691,0,0,0,0,9.613753,132.185679,0.604,4.654,797.9,3.27,931.0,177.26,108.6,1.0,5997.0,4.364,1.11,290.35815,44.518749,14.283
4576,0,0,0,0,40.351171,153.40933,0.079,3.97,533.5,1.88,479.0,12.46,11.3,1.0,5665.0,4.568,0.82,291.93756,39.419941,15.676
159,0,0,0,0,27.807566,186.713552,0.782,4.21587,25391.0,13.84,554.0,22.24,748.1,1.0,5862.0,4.584,0.785,296.88821,41.396091,15.482


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [4]:
# Scale the features with MinMaxScaler (feature selection was already done when
# the data was cleaned). The scaler is fitted on the training split only and the
# same fitted scaler is then applied to both splits.
xTrainingFit = MinMaxScaler()
xTrainingFit.fit(xTraining)

xTrainingScaled = xTrainingFit.transform(xTraining)
xTestingScaled = xTrainingFit.transform(xTesting)

# Train the Model



In [5]:
# Baseline classifier: LogisticRegression with its default settings,
# fitted on the scaled training features.
model = LogisticRegression()
model.fit(xTrainingScaled, yTraining)

# Score the fitted model on both splits to compare training vs. held-out performance.
trainingScore = model.score(xTrainingScaled, yTraining)
testingScore = model.score(xTestingScaled, yTesting)
print(f"Training Data Score: {trainingScore}")
print(f"Testing Data Score: {testingScore}")
print(model)

Training Data Score: 0.8112701252236136
Testing Data Score: 0.8095663835493965
LogisticRegression()


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [9]:
# Set up the grid search: four candidate values for the C parameter of the
# logistic-regression model, evaluated with up to 4 parallel jobs and
# progress messages enabled.
param_grid = {'C': [1, 10, 100, 1000]}
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=4,
    verbose=1,
)

In [10]:
# Run the grid search on the scaled training data; every candidate in
# param_grid is fitted and cross-validated here.
grid.fit(X=xTrainingScaled, y=yTraining)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    1.4s finished


GridSearchCV(estimator=LogisticRegression(), n_jobs=4,
             param_grid={'C': [1, 10, 100, 1000]}, verbose=1)

In [11]:
# Show the winning parameter combination and its cross-validation score, one per line.
print(grid.best_params_, grid.best_score_, sep="\n")

# Predict the held-out test features with the tuned search object ...
predictions = grid.predict(xTestingScaled)
# ... and compare those predictions with the true test labels, class by class.
print(classification_report(y_true=yTesting, y_pred=predictions))

{'C': 1000}
0.8221540968047736
                precision    recall  f1-score   support

     CANDIDATE       0.64      0.56      0.60       506
     CONFIRMED       0.66      0.71      0.68       577
FALSE POSITIVE       0.98      1.00      0.99      1154

      accuracy                           0.82      2237
     macro avg       0.76      0.76      0.76      2237
  weighted avg       0.82      0.82      0.82      2237



# Save the Model

In [12]:
# Save the trained model to disk with joblib.
# (Assignment template: update "your_name" with your name and "your_model" with
# your model variable, and be sure to turn the file in to BCS.)
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib

# Save the file in the data folder.
# NOTE(review): the file name says "LinearRegression" although the estimator is a
# LogisticRegression; the name is kept unchanged so existing references to it still work.
filename = '../data/AlexanderBurch_LinearRegression.sav'

# Dump the tuned estimator chosen by the grid search. The original cell dumped
# `model`, which is the untuned baseline: GridSearchCV fits clones of the
# estimator, so `model` itself never receives the best C value.
# The estimator was trained on MinMax-scaled features, so inputs must be scaled
# the same way before calling predict on the reloaded model.
joblib.dump(grid.best_estimator_, filename)

['../data/AlexanderBurch_LinearRegression.sav']