# Predicting Pulsar Stars using scikit-learn

In [1]:
import gc
import time

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.svm import SVC

# Classifier Setup

In [2]:
data = pd.read_csv('pulsar_stars.csv')
x_data = data.loc[:, data.columns != "y"]
y_data = data.loc[:, "y"]

In [3]:
random_state = 100
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3,
                                                    shuffle=True,
                                                    random_state=random_state)

# Linear Regression 

In [4]:
regr = LinearRegression()
regr.fit(x_train, y_train)

LinearRegression()

In [5]:
y_train_regr_pred = regr.predict(x_train)
y_train_regr_pred[y_train_regr_pred >= 0.5] = 1
y_train_regr_pred[y_train_regr_pred < 0.5] = 0
print(accuracy_score(y_train, y_train_regr_pred))

0.970705619412516


In [6]:
y_test_regr_pred = regr.predict(x_test)
y_test_regr_pred[y_test_regr_pred >= 0.5] = 1
y_test_regr_pred[y_test_regr_pred < 0.5] = 0
print(accuracy_score(y_test, y_test_regr_pred))

0.9720670391061452


# Random Forest Classifier

In [7]:
# WARNING: ignore "FutureWarning: The default value of n_estimators will change
# from 10 in version 0.20 to 100 in 0.22."10 in version 0.20 to 100 in 0.22."
rfclf = RandomForestClassifier()
rfclf.fit(x_train, y_train)

RandomForestClassifier()

In [8]:
y_train_rfclf_pred = rfclf.predict(x_train)
print(accuracy_score(y_train, y_train_rfclf_pred))

1.0


In [9]:
y_test_rfclf_pred = rfclf.predict(x_test)
print(accuracy_score(y_test, y_test_rfclf_pred))

0.982122905027933


## Feature Importance

In [10]:
# Show the feature importance as evaluated by the Random Forest Classifier.
print(rfclf.feature_importances_)

[0.25426836 0.04221348 0.2764117  0.19695874 0.07062171 0.06899106
 0.047483   0.04305194]


In [11]:
# Sort the features by importance in the descending order
print(np.argsort(rfclf.feature_importances_)[::-1])

[2 0 3 4 5 6 7 1]


## Hyper-parameter Tuning

In [12]:
# Tune the hyper-parameters 'n_estimators' and 'max_depth'.
rf_parameters = {'n_estimators': [10, 100, 500], 'max_depth':[2, 5, 8, None]}
tuned_rfclf = GridSearchCV(rfclf, rf_parameters, cv=10)
tuned_rfclf.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 5, 8, None],
                         'n_estimators': [10, 100, 500]})

In [13]:
# Show the best combination of parameters
print(tuned_rfclf.best_params_)

{'max_depth': None, 'n_estimators': 500}


In [14]:
# Show the best score
print(tuned_rfclf.best_score_)

0.97844865613263


# Support Vector Machine

## Pre-processing

In [15]:
# Standardize (or normalize) it, otherwise the grid search will take much longer.
# Warning: ignore "FutureWarning: The default value of gamma will change from
# 'auto' to 'scale' in version 0.22 to account better for unscaled features.
# Set gamma explicitly to 'auto' or 'scale' to avoid this warning".
scaler = StandardScaler()
scaler.fit(x_train)
x_train_std = scaler.transform(x_train)
x_test_std = scaler.transform(x_test)

In [16]:
svcclf = SVC(gamma="auto")
svcclf.fit(x_train_std, y_train)

SVC(gamma='auto')

In [17]:
# WARNING: ignore "FutureWarning: The default value of gamma will change from
# 'auto' to 'scale' in version 0.22 to account better for unscaled features.
# Set gamma explicitly to 'auto' or 'scale' to avoid this warning".
y_train_svcclf_pred = svcclf.predict(x_train_std)
print(accuracy_score(y_train, y_train_svcclf_pred))

0.978448275862069


In [18]:
# WARNING: ignore "FutureWarning: The default value of gamma will change from
# 'auto' to 'scale' in version 0.22 to account better for unscaled features.
# Set gamma explicitly to 'auto' or 'scale' to avoid this warning".
y_test_svcclf_pred = svcclf.predict(x_test_std)
print(accuracy_score(y_test, y_test_svcclf_pred))

0.9797020484171323


## Hyper-parameter Tuning

In [19]:
# Tune the hyper-parameters 'C' and 'kernel' (testing on rbf and linear).
svc_parameters = {'kernel':('linear', 'rbf'), 'C':[0.001, 0.01, 0.1, 1, 10]}
tuned_svcclf = GridSearchCV(svcclf, svc_parameters, cv=10)
tuned_svcclf.fit(x_train_std, y_train)

GridSearchCV(cv=10, estimator=SVC(gamma='auto'),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10],
                         'kernel': ('linear', 'rbf')})

In [20]:
# Show the best score.
print(tuned_svcclf.best_score_)

0.9782890392132366


# Support Vector Machine

In [21]:
# Calculate the accuracies for both training and test sets after hyperparameter
# tuning and normalization. 
y_train_tuned_svcclf_pred = tuned_svcclf.predict(x_train_std)
svc_train_accuracy = accuracy_score(y_train, y_train_tuned_svcclf_pred)
y_test_tuned_svcclf_pred = tuned_svcclf.predict(x_test_std)
svc_test_accuracy = accuracy_score(y_test, y_test_tuned_svcclf_pred)

In [22]:
print(svc_train_accuracy)

0.9806034482758621


In [23]:
print(svc_test_accuracy)

0.9813780260707635


In [24]:
# Calculate the rank test score, mean testing score and mean fit time for the
# all hyperparameter values obtained above. The GridSearchCV class holds a 
# ‘cv_results_’ dictionary that helps us report these metrics easily.
rank_test_score = tuned_svcclf.cv_results_["rank_test_score"]
mean_test_score = tuned_svcclf.cv_results_["mean_test_score"]
mean_fit_time = tuned_svcclf.cv_results_["mean_fit_time"]

In [25]:
# Show the rank test score for all hyperparameter values obtained.
print(rank_test_score)

[ 8 10  7  9  5  6  2  4  3  1]


In [26]:
# Show mean testing score for all of hyperparameter values obtained.
print(mean_test_score)

[0.96839113 0.9087644  0.97413823 0.96831126 0.97717147 0.97581447
 0.97804942 0.97757064 0.97796961 0.97828904]


In [27]:
# Show mean fit time for all of hyperparameter values obtained.
print(mean_fit_time)

[0.18931091 0.63046844 0.13762269 0.49669123 0.14785182 0.28201809
 0.22270532 0.26109602 0.66548913 0.37899776]


# PCA

In [28]:
# Perform dimensionality reduction on the data.
pca = PCA(n_components=8, svd_solver='full')
pca.fit(x_data)

PCA(n_components=8, svd_solver='full')

In [29]:
# Show Percentages of variance explained by each principal component.
print(pca.explained_variance_ratio_)

[8.71053041e-01 7.81934383e-02 4.11562290e-02 6.15967691e-03
 2.43717952e-03 9.58578490e-04 3.90570992e-05 2.79968662e-06]


In [30]:
# Show the singular values corresponding to each principal component.
print(pca.singular_values_)

[14430.28004546  4323.5214088   3136.68022725  1213.47665361
   763.30168073   478.70316467    96.62788002    25.87063984]
