<a href="https://colab.research.google.com/github/bahadirbasaran/pulsarDetection/blob/master/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Environment**

In [0]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
from matplotlib.colors import ListedColormap
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    cohen_kappa_score,
    confusion_matrix,
    mean_squared_error,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, learning_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

**Function & Class Definitions**

In [0]:
def plot_2d_space(X, y, label='Classes', saveName=None):
  """Scatter the first two columns of X, one colour/marker per class in y.

  X is indexed as X[mask, 0] and X[mask, 1], so it must support NumPy-style
  2-D boolean indexing. Only two colours and two markers are defined, so at
  most the first two distinct values of y are drawn.

  The plot is titled with `label`. It is displayed when `saveName` is None
  and written to `saveName` otherwise.
  """
  palette = ('#1F77B4', '#FF7F0E')
  shapes = ('o', 's')
  for cls, colour, shape in zip(np.unique(y), palette, shapes):
    # Select the rows that belong to the current class.
    mask = y == cls
    plt.scatter(X[mask, 0], X[mask, 1], c=colour, label=cls, marker=shape)
  plt.title(label)
  plt.legend(loc='upper right')
  if saveName is not None:
    plt.savefig(saveName)
    return
  plt.show()

def plot_roc_curve(fpr, tpr, auc_score, saveName=None):
  """Plot a ROC curve with the chance diagonal and the AUC in the legend.

  Args:
    fpr: False positive rates (x values of the curve).
    tpr: True positive rates (y values of the curve).
    auc_score: Numeric area under the curve, shown in the legend.
    saveName: When None the plot is displayed; otherwise it is saved to
      this path.
  """
  # The legend label must be a string; passing a tuple would render its repr.
  plt.plot(fpr, tpr, color='orange', label='AUC Score: {:.4f}'.format(auc_score))
  # Diagonal reference line of a classifier that guesses at random.
  plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.title('Receiver Operating Characteristic (ROC) Curve')
  plt.legend(loc='best')
  if saveName is None:
    plt.show()
  else:
    plt.savefig(saveName)

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
  """Draw training and cross-validation scores against training-set size.

  A new figure is opened, `learning_curve` is run with the given estimator,
  data, `cv`, `n_jobs` and `train_sizes`, and the per-size mean score of each
  series is plotted with a shaded band of one standard deviation.

  `ylim`, when given, is unpacked into `plt.ylim`. Returns the `plt` module
  so the caller can show or save the figure.
  """
  plt.figure()
  plt.title(title)
  if ylim is not None:
    plt.ylim(*ylim)
  plt.xlabel("Training examples")
  plt.ylabel("Score")

  sizes, fit_scores, cv_scores = learning_curve(
      estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

  # (mean, std, colour, legend label) per curve; statistics are taken across folds.
  series = (
      (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1), "r", "Training score"),
      (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1), "g", "Cross-validation score"),
  )

  plt.grid()
  # Shaded +/- one standard deviation bands first, then the mean curves.
  for mean, std, colour, _ in series:
    plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
  for mean, _, colour, legend_label in series:
    plt.plot(sizes, mean, 'o-', color=colour, label=legend_label)
  plt.legend(loc="best")

  return plt

def evaluateClassifier(classifier, y_true, y_pred, probabilities, feature_importances):
  """Print evaluation metrics for a classifier and plot its diagnostics.

  Prints accuracy, the classification report, Cohen's kappa and the mean
  squared error, then draws the confusion matrix, the ROC curve and a bar
  plot of the feature importances.

  Args:
    classifier: Only used to label the printed metrics and plot titles.
    y_true: Ground-truth labels.
    y_pred: Predicted labels.
    probabilities: Positive-class scores passed to `roc_curve` and
      `roc_auc_score`.
    feature_importances: Importances whose `.index` holds the feature names
      (e.g. a pandas Series); values are plotted against that index.
  """
  score_acc = accuracy_score(y_true, y_pred)
  print("\nAccuracy Score of {}: {}".format(classifier, score_acc))

  classificationReport = classification_report(y_true, y_pred)
  print("\nClassification Report of {}: {}".format(classifier, classificationReport))

  score_cohenKappa = cohen_kappa_score(y_true, y_pred)
  print("\nCohen Kappa Score of {}: {}".format(classifier, score_cohenKappa))

  score_mse = mean_squared_error(y_true, y_pred)
  print("\nMean Squared Error Score of {}: {}".format(classifier, score_mse))

  plt.figure(figsize=(13,10))
  plt.subplot(221)
  sb.heatmap(confusion_matrix(y_true, y_pred), annot=True, fmt="d", linecolor="k", linewidths=3)
  plt.title("Confusion Matrix of {}".format(classifier), fontsize=20)

  # No plt.show() here: showing now would flush the figure before the ROC
  # panel is added next to the confusion matrix.
  false_positive_rate, true_positive_rate, _ = roc_curve(y_true, probabilities)
  plt.subplot(222)
  # plot_roc_curve displays the current figure because no saveName is given.
  plot_roc_curve(false_positive_rate, true_positive_rate, roc_auc_score(y_true, probabilities))

  # NOTE(review): with backends that close a figure once it has been shown
  # (e.g. the notebook inline backend) this panel lands on a new figure.
  plt.subplot(223)
  sb.barplot(x=feature_importances, y=feature_importances.index)
  plt.title('Feature importances')
  plt.xlabel('Score')
  plt.show()

def executeClassifier(classifier, xTrain, yTrain, xTest, yTest, n_fold, paramSet):
  """Tune a classifier with grid search and evaluate it on the test split.

  Args:
    classifier: Estimator to tune.
    xTrain, yTrain: Training features and labels used for the grid search
      and the learning curve.
    xTest, yTest: Held-out features and labels used for evaluation.
    n_fold: Passed to `GridSearchCV` as `cv`.
    paramSet: Dict mapping parameter names to candidate values.

  Feature names come from `xTrain.columns` when available; otherwise the
  module-level `dataset` is consulted (all columns but the last).
  """
  gs = GridSearchCV(
                    estimator = classifier,
                    param_grid = paramSet,
                    cv = n_fold,
                    n_jobs = -1
                   )
  gs.fit(xTrain, yTrain)
  bestEstimator = gs.best_estimator_

  print("\n{} Best Score: {}".format(classifier, gs.best_score_))
  print("\n{} Best Parameters Set:\n".format(classifier))
  bestParams = bestEstimator.get_params()
  for param in sorted(paramSet.keys()):
    print("Parameter {}: {}".format(param, bestParams[param]))

  y_pred = gs.predict(xTest)

  # Positive-class scores for the ROC curve. Estimators without
  # predict_proba (e.g. SVC with probability=False) fall back to their
  # decision function, which roc_curve accepts as well.
  if hasattr(gs, "predict_proba"):
    probs = gs.predict_proba(xTest)[:, 1]
  else:
    probs = gs.decision_function(xTest)

  featureNames = getattr(xTrain, "columns", None)
  if featureNames is None:
    # xTrain is a plain array (e.g. after scaling); use the global dataset.
    featureNames = dataset.columns[:-1]

  # GridSearchCV itself has no feature_importances_; only the refitted best
  # estimator may expose it, and only for some model families.
  if hasattr(bestEstimator, "feature_importances_"):
    importances = bestEstimator.feature_importances_
  else:
    # Model-agnostic fallback for estimators such as SVC or k-NN.
    from sklearn.inspection import permutation_importance
    importances = permutation_importance(bestEstimator, xTest, yTest, random_state=0).importances_mean

  featureImportances = pd.Series(importances, index=featureNames).sort_values(ascending=False)

  evaluateClassifier(classifier, yTest, y_pred, probs, featureImportances)

  # plot_learning_curve opens its own figure, so no subplot is selected
  # first. The tuned estimator is used so that the grid search is not
  # repeated for every fold and training size.
  plot_learning_curve(bestEstimator, "Learning Curve", xTrain, yTrain)
  plt.show()

def createDataSplits(dataset, list_dataColumns, targetColumn, testSizeRatio, randomState=42, oversample=True, normalize=True):
  """Split a dataset into train/test sets with optional scaling and SMOTE.

  Args:
    dataset: DataFrame holding both features and target.
    list_dataColumns: Positional indices of the feature columns.
    targetColumn: Positional index of the target column.
    testSizeRatio: Passed to `train_test_split` as `test_size`.
    randomState: Seed for the split and for SMOTE.
    oversample: When true, oversample the minority class of the training
      set with SMOTE and plot the result.
    normalize: When true, standardise the features. The scaler is fitted on
      the training split only and then applied to the test split.

  Returns:
    X_train, X_test, y_train, y_test. The X values are arrays when
    `normalize` is true; otherwise they keep the type produced by the split
    (and by SMOTE, for the training set).
  """
  X = dataset[dataset.columns[list_dataColumns]]
  y = dataset[dataset.columns[targetColumn]]

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSizeRatio, random_state=randomState)

  if normalize:
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

  # The split keeps the original row labels on y. Reset them so the targets
  # are positionally aligned with the (possibly re-created) feature arrays.
  y_train = y_train.reset_index(drop=True)
  y_test = y_test.reset_index(drop=True)

  # Oversampling to balance the training set; the test set is left untouched.
  if oversample:
    # Imported here so imbalanced-learn is only required when oversampling.
    from imblearn.over_sampling import SMOTE
    sm = SMOTE(sampling_strategy='minority', random_state=randomState)
    # fit_resample is the current name of the former fit_sample method.
    X_train, y_train = sm.fit_resample(X_train, y_train)
    # plot_2d_space needs NumPy-style indexing, so convert for plotting only.
    plot_2d_space(np.asarray(X_train), np.asarray(y_train), label='Dataset After Oversampling')

  return X_train, X_test, y_train, y_test

**Data Importation**

In [0]:
# Read the pulsar CSV directly from its raw GitHub URL into a DataFrame.
datasetUrl = 'https://raw.githubusercontent.com/bahadirbasaran/pulsarDetection/master/pulsar_stars.csv?token=ALRHARXNN6L47R4KLJOGKP26LQWMU'
dataset = pd.read_csv(datasetUrl)

**Data Visualization**

In [0]:
dataset.info()
datasetStats = dataset.describe()

# Every column except the class label is a feature to visualise.
featureColumns = [column for column in dataset.columns if column != "target_class"]

# Correlation Matrix Heatmap
_, ax = plt.subplots(figsize = (10,10))
hm = sb.heatmap(dataset.corr(), annot = True, linewidths=.5, fmt = ".3f", ax=ax)
# Widen the y-limits by half a cell at both ends.
# NOTE(review): this looks like the usual workaround for matplotlib releases
# that clipped the first and last heatmap rows; confirm it is still needed.
bottom, top = hm.get_ylim()
hm.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

# Pair Plot
# The hue column is left out of `vars`: it is constant within each hue group,
# which makes its diagonal density estimate degenerate.
sb.pairplot(data=dataset,
             palette="husl",
             hue="target_class",
             vars=featureColumns)
plt.tight_layout()
plt.show()

# Violin Plot
plt.figure(figsize=(17,15))
for position, column in enumerate(featureColumns, start=1):
    plt.subplot(4,4,position)
    sb.violinplot(data=dataset, y=column, x="target_class")
plt.show()

# Bar Plot
_, ax1 = plt.subplots(3,3, figsize=(15,10))
# Axes are filled row by row, one feature per axes.
for axis, column in zip(ax1.flat, featureColumns):
    sb.barplot(x='target_class', y=column, data=dataset, ax=axis)
# Hide the grid cells that did not receive a feature.
for axis in ax1.flat[len(featureColumns):]:
    axis.set_visible(False)
plt.show()

# Joint Plot
sb.jointplot(x=dataset.columns[0], y=dataset.columns[1], data=dataset, kind='kde', height=12)


**Train/Test Split**

In [0]:
# Feature columns are at positions 0-7, the target at position 8, and 30% of
# the rows are held out as the test set.
X_train, X_test, y_train, y_test = createDataSplits(
    dataset,
    list_dataColumns=list(range(8)),
    targetColumn=8,
    testSizeRatio=0.3,
)
