In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\adria\anaconda3\envs\pythondata\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the exoplanet table and clean it in one chain:
#   1. drop the columns in which every value is null
#   2. drop every remaining row that still contains a null
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)


In [5]:
# Print a summary of the cleaned frame: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6991 entries, 0 to 6990
Data columns (total 41 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   koi_disposition    6991 non-null   object 
 1   koi_fpflag_nt      6991 non-null   int64  
 2   koi_fpflag_ss      6991 non-null   int64  
 3   koi_fpflag_co      6991 non-null   int64  
 4   koi_fpflag_ec      6991 non-null   int64  
 5   koi_period         6991 non-null   float64
 6   koi_period_err1    6991 non-null   float64
 7   koi_period_err2    6991 non-null   float64
 8   koi_time0bk        6991 non-null   float64
 9   koi_time0bk_err1   6991 non-null   float64
 10  koi_time0bk_err2   6991 non-null   float64
 11  koi_impact         6991 non-null   float64
 12  koi_impact_err1    6991 non-null   float64
 13  koi_impact_err2    6991 non-null   float64
 14  koi_duration       6991 non-null   float64
 15  koi_duration_err1  6991 non-null   float64
 16  koi_duration_err2  6991 

# Select your features (columns)

In [None]:
# Pick the predictor columns. The resulting sub-frame doubles as the X values.
selected_features = df.loc[
    :,
    ['koi_fpflag_nt', 'koi_model_snr', 'koi_fpflag_co', 'koi_fpflag_ss', 'koi_prad'],
]

In [None]:
# Print the summary (columns, non-null counts, dtypes) of the chosen feature columns
selected_features.info()

# Create a Train Test Split

Use `koi_disposition` for the y values

In [None]:
# The disposition column is the label to predict (the y values)
target = df.loc[:, 'koi_disposition']
# Preview the first few labels
target.head()

In [None]:
from sklearn.model_selection import train_test_split

# Split features and labels into train and test portions.
# random_state pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    selected_features,
    target,
    random_state=42,
)

In [None]:
# Preview the first rows of the (still unscaled) training features
X_train.head()

# Pre-processing

Scale the features using the `MinMaxScaler`, then label-encode and one-hot encode the target values

In [None]:
# Scale your data
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# The scaler is fitted on the training rows only and then reused, unchanged,
# to transform both the training and the test rows.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# One-hot encoding
from tensorflow.keras.utils import to_categorical

# Step 1: label-encode the target.
# The encoder is fitted on the training labels and then applied to both splits.
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train, encoded_y_test = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)


In [None]:
# Step 2: convert the encoded labels of both splits to one-hot encoding
y_train_categorical, y_test_categorical = map(
    to_categorical, (encoded_y_train, encoded_y_test)
)

# Train the Model



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest trained on the scaled training features.
# random_state is fixed so the fitted forest is reproducible from run to run;
# without it every re-run grows a different forest.
# NOTE(review): the target passed here is the one-hot matrix, which scikit-learn
# handles as a multilabel problem -- confirm this is intended rather than
# fitting on the 1-D encoded_y_train labels.
rf = RandomForestClassifier(random_state=42)
rf = rf.fit(X_train_scaled, y_train_categorical)

In [None]:
# Score the baseline forest on both splits first, then report the two numbers
train_score = rf.score(X_train_scaled, y_train_categorical)
test_score = rf.score(X_test_scaled, y_test_categorical)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

In [None]:
# Random Forests in sklearn will automatically calculate feature importance.
# Keep the fitted forest's importance array under its own name ...
importances = rf.feature_importances_
# ... and show it as the cell output.
importances

In [None]:
# Pair every importance with its column name, then list the pairs from the
# highest importance to the lowest.
importance_by_feature = zip(rf.feature_importances_, selected_features.columns)
sorted(importance_by_feature, reverse=True)

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters for the random forest.
# 'auto' is deliberately absent from max_features: for classifiers it was only
# an alias of 'sqrt', and scikit-learn 1.3 removed it, so listing it makes the
# search fail on an up-to-date install.
param_grid = {
    'n_estimators': [200, 300, 600],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}
# verbose=3 makes the search log its progress while fitting.
grid = GridSearchCV(rf, param_grid, verbose=3)

In [None]:
# Train the model with GridSearch: run the search over param_grid
# on the scaled training features and the one-hot training labels
grid.fit(X_train_scaled, y_train_categorical)

In [None]:
# Pull the winning parameter combination and its score out of the fitted search
best_params = grid.best_params_
best_score = grid.best_score_

# List the best parameters for this dataset, followed by the best score
print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")

In [None]:
# Score the tuned search object on both splits, then report the two numbers
train_grid_score = grid.score(X_train_scaled, y_train_categorical)
test_grid_score = grid.score(X_test_scaled, y_test_categorical)
print(f"Training Grid Score: {train_grid_score}")
print(f"Testing Grid Score: {test_grid_score}")

In [None]:
# Make predictions with the hypertuned model on the scaled test features
predictions = grid.predict(X_test_scaled)

In [None]:
# Calculate classification report
from sklearn.metrics import classification_report

# Compare the one-hot test labels against the tuned model's predictions
report = classification_report(y_test_categorical, predictions)
print(report)

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
# Output file for the serialized model, written relative to the working directory
filename = 'adrianaovalle_rf.sav'
# NOTE(review): this persists `rf`, the baseline forest, not the tuned search
# result held in `grid` -- confirm that the baseline is the model meant to be saved.
# NOTE(review): X_scaler and label_encoder are not saved alongside the model,
# so whoever loads this file needs them from elsewhere -- confirm that is acceptable.
joblib.dump(rf, filename)