# Predicting Pulsar Stars using scikit-learn

In [1]:
import gc
import time

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.svm import SVC

# Classifier Setup

In [2]:
data = pd.read_csv('pulsar_stars.csv')
x_data = data.loc[:, data.columns != "y"]
y_data = data.loc[:, "y"]

In [3]:
random_state = 100
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3,
                                                    shuffle=True,
                                                    random_state=random_state)

# Linear Regression 

In [5]:
regr = LinearRegression()
regr.fit(x_train, y_train)

LinearRegression()

In [6]:
y_train_regr_pred = regr.predict(x_train)
y_train_regr_pred[y_train_regr_pred >= 0.5] = 1
y_train_regr_pred[y_train_regr_pred < 0.5] = 0
print(accuracy_score(y_train, y_train_regr_pred))

0.970705619412516


In [6]:
y_test_regr_pred = regr.predict(x_test)
y_test_regr_pred[y_test_regr_pred >= 0.5] = 1
y_test_regr_pred[y_test_regr_pred < 0.5] = 0
print(accuracy_score(y_test, y_test_regr_pred))

0.9720670391061452


# Random Forest Classifier

In [7]:
# WARNING: ignore "FutureWarning: The default value of n_estimators will change
# from 10 in version 0.20 to 100 in 0.22."10 in version 0.20 to 100 in 0.22."
rfclf = RandomForestClassifier()
rfclf.fit(x_train, y_train)

RandomForestClassifier()

In [8]:
y_train_rfclf_pred = rfclf.predict(x_train)
print(accuracy_score(y_train, y_train_rfclf_pred))

0.9959291187739464


In [9]:
y_test_rfclf_pred = rfclf.predict(x_test)
print(accuracy_score(y_test, y_test_rfclf_pred))

0.9811918063314712


## Feature Importance

In [10]:
# Show the feature importance as evaluated by the Random Forest Classifier.
print(rfclf.feature_importances_)

[0.20461882 0.05611513 0.312567   0.23724652 0.03613368 0.06802656
 0.0561676  0.0291247 ]


In [11]:
# Sort the features by importance in the descending order
print(np.argsort(rfclf.feature_importances_)[::-1])

[2 3 0 5 6 1 4 7]


## Hyper-parameter Tuning

In [12]:
# Tune the hyper-parameters 'n_estimators' and 'max_depth'.
rf_parameters = {'n_estimators': [10, 100, 500], 'max_depth':[2, 5, 8, None]}
tuned_rfclf = GridSearchCV(rfclf, rf_parameters, cv=10)
tuned_rfclf.fit(x_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='w

In [13]:
# Show the best combination of parameters
print(tuned_rfclf.best_params_)

{'max_depth': None, 'n_estimators': 500}


In [14]:
# Show the best score
print(tuned_rfclf.best_score_)

0.978448275862069


# Support Vector Machine

## Pre-processing

In [15]:
# Standardize (or normalize) it, otherwise the grid search will take much longer.
# Warning: ignore "FutureWarning: The default value of gamma will change from
# 'auto' to 'scale' in version 0.22 to account better for unscaled features.
# Set gamma explicitly to 'auto' or 'scale' to avoid this warning".
scaler = StandardScaler()
scaler.fit(x_train)
x_train_std = scaler.transform(x_train)
x_test_std = scaler.transform(x_test)

In [16]:
svcclf = SVC(gamma="auto")
svcclf.fit(x_train_std, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [17]:
# WARNING: ignore "FutureWarning: The default value of gamma will change from
# 'auto' to 'scale' in version 0.22 to account better for unscaled features.
# Set gamma explicitly to 'auto' or 'scale' to avoid this warning".
y_train_svcclf_pred = svcclf.predict(x_train_std)
print(accuracy_score(y_train, y_train_svcclf_pred))

0.978448275862069


In [18]:
# WARNING: ignore "FutureWarning: The default value of gamma will change from
# 'auto' to 'scale' in version 0.22 to account better for unscaled features.
# Set gamma explicitly to 'auto' or 'scale' to avoid this warning".
y_test_svcclf_pred = svcclf.predict(x_test_std)
print(accuracy_score(y_test, y_test_svcclf_pred))

0.9797020484171323


## Hyper-parameter Tuning

In [18]:
# Tune the hyper-parameters 'C' and 'kernel' (testing on rbf and linear).
svc_parameters = {'kernel':('linear', 'rbf'), 'C':[0.001, 0.01, 0.1, 1, 10]}
tuned_svcclf = GridSearchCV(svcclf, svc_parameters, cv=10)
tuned_svcclf.fit(x_train_std, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10],
                         'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [19]:
# Show the best score.
print(tuned_svcclf.best_score_)

0.9783684546615581


# Support Vector Machine

In [20]:
# Calculate the accuracies for both training and test sets after hyperparameter
# tuning and normalization. 
y_train_tuned_svcclf_pred = tuned_svcclf.predict(x_train_std)
svc_train_accuracy = accuracy_score(y_train, y_train_tuned_svcclf_pred)
y_test_tuned_svcclf_pred = tuned_svcclf.predict(x_test_std)
svc_test_accuracy = accuracy_score(y_test, y_test_tuned_svcclf_pred)

In [21]:
print(svc_train_accuracy)

0.9806034482758621


In [22]:
print(svc_test_accuracy)

0.9813780260707635


In [23]:
# Calculate the rank test score, mean testing score and mean fit time for the
# all hyperparameter values obtained above. The GridSearchCV class holds a 
# ‘cv_results_’ dictionary that helps us report these metrics easily.
rank_test_score = tuned_svcclf.cv_results_["rank_test_score"]
mean_test_score = tuned_svcclf.cv_results_["mean_test_score"]
mean_fit_time = tuned_svcclf.cv_results_["mean_fit_time"]

In [24]:
# Show the rank test score for all hyperparameter values obtained.
print(rank_test_score)

[ 8 10  7  9  5  6  3  4  2  1]


In [25]:
# Show mean testing score for all of hyperparameter values obtained.
print(mean_test_score)

[0.96847063 0.90876437 0.97413793 0.9683908  0.97717114 0.97573436
 0.97788953 0.97757024 0.97796935 0.97836845]


In [26]:
# Show mean fit time for all of hyperparameter values obtained.
print(mean_fit_time)

[0.22501905 0.75369732 0.18820636 0.65757122 0.18741305 0.39506247
 0.21553993 0.27932367 0.63216114 0.37264616]


# PCA

In [27]:
# Perform dimensionality reduction on the data.
pca = PCA(n_components=8, svd_solver='full')
pca.fit(x_data)

PCA(copy=True, iterated_power='auto', n_components=8, random_state=None,
    svd_solver='full', tol=0.0, whiten=False)

In [28]:
# Show Percentages of variance explained by each principal component.
print(pca.explained_variance_ratio_)

[8.71053041e-01 7.81934383e-02 4.11562290e-02 6.15967691e-03
 2.43717952e-03 9.58578490e-04 3.90570992e-05 2.79968662e-06]


In [29]:
# Show the singular values corresponding to each principal component.
print(pca.singular_values_)

[14430.28004546  4323.5214088   3136.68022725  1213.47665361
   763.30168073   478.70316467    96.62788002    25.87063984]
