In [None]:
import numpy as np
import pandas as pd
import os

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    """Reset the default TensorFlow graph and reseed TensorFlow and NumPy.

    Args:
        seed: Integer seed passed to both ``tf.set_random_seed`` and
            ``np.random.seed`` (defaults to 42).
    """
    # Imported locally because this helper is defined in the first cell,
    # before the notebook's `import tensorflow as tf` cell has run; without
    # this, calling it early raises NameError on `tf`.
    import tensorflow as tf

    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

In [None]:
# dataset from: https://www.kaggle.com/pavanraj159/predicting-a-pulsar-star
# Load the CSV into a DataFrame; the path is relative to the notebook's
# working directory.
df = pd.read_csv('data/pulsar_stars.csv')

In [None]:
# Preview the first rows to sanity-check that the columns parsed as expected.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Column labels; 'target_class' is the label column used in the cells below.
df.columns

In [None]:
# no NaN data
# Per-column flag: True if that column contains at least one missing value.
df.isnull().any()

In [None]:
# heavily skewed toward negative class
# Row count for each distinct value of the label column.
df['target_class'].value_counts()

In [None]:
# correlation matrix
# Pairwise correlation between all columns (pandas' default method), shown
# as a table shaded with the diverging 'coolwarm' colour map.
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# disconcerting that multiple factors weigh heavily on target class
# Correlation of every column with the label, largest value first.
corr['target_class'].sort_values(ascending=False)

In [None]:
# One histogram per column, drawn on a large (40 x 30 inch) figure.
df.hist(figsize=(40,30))
plt.show()

In [None]:
#PROBLEM WITH PROPORTIONING DATASET


# stratified split so the test set keeps the same class proportions
# as the full dataset
# from sklearn.model_selection import StratifiedShuffleSplit

# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# for train_index, test_index in split.split(df, df['target_class']):
#     train_set = df.loc[train_index]
#     test_set = df.loc[test_index]

# train_set['target_class'].value_counts()

# #split into train 1s and 0s
# pos_train = train_set.loc[train_set['target_class'] == 1]
# neg_train = train_set.loc[train_set['target_class'] == 0]
# pos_test = test_set.loc[test_set['target_class'] == 1]
# neg_test = test_set.loc[test_set['target_class'] == 0]


In [None]:

# Split the rows by label: 1 = positive class, 0 = negative class.
pos = df.loc[df['target_class'] == 1]
neg = df.loc[df['target_class'] == 0]

# Number of negative rows to keep: 2.5x the positive count. (The variable
# name says "three" but the factor is 2.5; the name is kept so that any other
# reference to it keeps working.)
threeXpos = round(pos.shape[0]*2.5)
# if I wanted to take a random batch of the negative class
# neg_batch_seed = np.random.randint(0, (neg.shape[0] - threeXpos))

# 2.5 to 1 neg. to pos.: take the first `threeXpos` negatives in file order
proportioned_neg = neg[0:threeXpos]
# the remaining negatives are held out and later appended to the TEST set
neg_test = neg[threeXpos:]
print(neg_test.shape)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat performs the same row-wise stacking and keeps the original index.
# NOTE: from here on `pos` is the balanced frame (positives + kept negatives).
pos = pd.concat([pos, proportioned_neg])

pos['target_class'].value_counts()


In [None]:
# convert df into an array
# NOTE: `pos` here is the balanced frame built in the previous cell
# (positives plus the selected negatives), not only the positive rows.
array = pos.values
print(type(array))
print(array.shape)
array[0]  # first row, displayed as the cell output

In [None]:
# will append to testing set
# np.asarray (instead of `.values`) accepts both the DataFrame produced by the
# balancing cell and the ndarray left behind by an earlier run of this cell,
# so re-running the cell no longer fails with AttributeError.
neg_test = np.asarray(neg_test)
neg_testX = neg_test[:, :8]  # first 8 columns: input features
neg_testY = neg_test[:, 8]   # column index 8: labels

In [None]:
#normalize input values between 0 and 1
def normalize(array, col_min=None, col_max=None):
    """Min-max scale every column of a 2-D array.

    Each column is mapped with ``(x - min) / (max - min)``. Unlike the
    previous implementation this never writes into ``array``: the result is
    always a new float array, so integer input is not truncated and views of
    the caller's data are left untouched.

    Args:
        array: Array-like of shape (n_rows, n_columns).
        col_min: Optional per-column minima to use instead of the ones
            computed from ``array`` (e.g. statistics fitted on training data).
        col_max: Optional per-column maxima, same idea as ``col_min``.

    Returns:
        A new float ndarray with the same shape as ``array``. With the default
        statistics every non-constant column spans [0, 1]; a column whose
        range is zero is returned as all zeros instead of NaN.
    """
    data = np.array(array, dtype=float)  # np.array copies, so the input is never mutated
    if data.shape[0] == 0:
        # Nothing to scale, and min()/max() would raise on an empty axis.
        return data

    lo = data.min(axis=0) if col_min is None else np.asarray(col_min, dtype=float)
    hi = data.max(axis=0) if col_max is None else np.asarray(col_max, dtype=float)

    span = hi - lo
    # Constant columns have zero range; divide by 1 so they become 0, not NaN.
    span = np.where(span == 0, 1.0, span)
    return (data - lo) / span
                                                               

In [None]:
# Features: the first 8 columns, copied as floats so that scaling never writes
# back into `array`.
X_s = array[:, :8].astype(float)

# Record the per-column min / range of the balanced set BEFORE scaling, so the
# held-out negatives can be mapped with exactly the same transformation below.
feat_min = X_s.min(axis=0)
feat_range = X_s.max(axis=0) - feat_min
feat_range[feat_range == 0] = 1.0  # constant column: avoid dividing by zero

X_s = normalize(X_s)

print(X_s.shape)
# Labels: astype returns a new array, so the result has to be assigned
# (the original call discarded it and the labels stayed floats).
y_s = array[:, 8].astype(int)

print(y_s.shape)
print('_________')

# shuffle - then split into train and test sets
# A dedicated seeded generator makes the split reproducible without touching
# NumPy's global random state.
rnd_inds = np.random.RandomState(42).permutation(len(X_s))

# reorder xs and corresponding ys
X_s = X_s[rnd_inds,:]
y_s = y_s[rnd_inds]


#STRATIFIED SAMPLING, GET EQUAL AMT OF ONES AND ZEROS... SKLEARN
# (TODO from the original author: the split below is a plain shuffled split,
# not a stratified one.)


# 85% of dataset put into training set
train_len = int(np.round(X_s.shape[0]*.85))
print(train_len)

#assign training and test set
X_train = X_s[:train_len] # everything before train_len
y_train = y_s[:train_len]

# Test set = remaining 15% of the balanced data + the unused negatives.
# The unused negatives were never passed through normalize(), so they are
# scaled here with the balanced set's min/range; otherwise the test set would
# mix [0, 1]-scaled rows with raw-scale rows.
neg_testX_scaled = (np.asarray(neg_testX, dtype=float) - feat_min) / feat_range
X_test = np.append(X_s[train_len:], neg_testX_scaled, axis=0)

y_test = np.append(y_s[train_len:], np.asarray(neg_testY).astype(int), axis=0)


print(X_train.shape)
print('_________')


# take validation set from training data
# (used during training to pick which model checkpoint to keep)
# its size is 10% of the whole balanced set, carved from the front of X_train
valid_len = int(np.round(X_s.shape[0]*.1))
print(valid_len)


X_valid, X_train = X_train[:valid_len], X_train[valid_len:]
y_valid, y_train = y_train[:valid_len], y_train[valid_len:]
print(X_train.shape)
print(y_train.shape)
print('_________')
print(X_test.shape)
print(y_test.shape)

In [None]:
import tensorflow as tf

In [None]:
# Start from a clean graph and fixed seeds before any graph node is created.
reset_graph()
num_epochs = int(5e5) # number of training-loop iterations (one mini-batch update each), not full passes over the data
batch_size = 1 # NOTE: ONLY SEEMS TO WORK W/ BATCH SIZE 1

n_inputs = 8 # feature columns per example
n_outputs = 2 # binary classifier
num_nodes_l1 = 100 # units in hidden layer 1 (author's note: the exact value doesn't seem to matter)
num_nodes_l2 = 100 # units in hidden layer 2

In [None]:
# Graph inputs; concrete values are supplied through feed_dict at run time.
xs = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X") # feature rows; first (batch) dimension left open
# NOTE(review): `(None)` is plain None, not the 1-tuple `(None,)`, so this
# placeholder's shape is left unspecified — confirm that is intended.
ys = tf.placeholder(tf.int64, shape=(None), name="y") #labels


In [None]:
# design network
# Two fully connected ReLU hidden layers followed by a linear output layer
# that produces one logit per class.
hidden1 = tf.layers.dense(xs, num_nodes_l1, name="hidden1",
                              activation=tf.nn.relu) #first layer connected to input layer (placeholder)
hidden2 = tf.layers.dense(hidden1, num_nodes_l2, name="hidden2",
                              activation=tf.nn.relu)
logits = tf.layers.dense(hidden2, n_outputs, name="outputs")

#Computes sparse softmax cross entropy between logits and labels.
# "sparse" = labels are class indices rather than one-hot vectors.
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=ys, logits=logits)

# Scalar training loss: mean cross-entropy over the fed batch.
loss = tf.reduce_mean(xentropy, name="loss")

    

In [None]:
learning_rate = 1e-3

# (unused alternative loss kept from an earlier experiment)
# cost = tf.reduce_mean(tf.square(outputs-ys)) # mse ... tf.losses.mean_squared_error(labels, predictions)
optimizer = tf.train.AdamOptimizer( learning_rate )

training_op = optimizer.minimize(loss) # one Adam update step that reduces the mean cross-entropy `loss` (not MSE)



In [None]:
# Says whether the targets are in the top K predictions.
# With K = 1 this is True exactly where the largest logit's index equals the label.
correct = tf.nn.in_top_k(logits, ys, 1)

# Fraction of correct predictions: booleans cast to 0.0 / 1.0, then averaged.
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))


In [None]:
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Baseline: validation accuracy of the freshly initialised network.
    last_accuracy = accuracy.eval(feed_dict={xs: X_valid, ys: y_valid})
    # Write a checkpoint immediately so the restore cell always finds one from
    # THIS run. Previously a checkpoint was only written on a strict
    # improvement, so a run that never improved left either no checkpoint or a
    # stale one from an earlier run.
    save_path = saver.save(sess, "./my_model.ckpt")

    for i in range(num_epochs):
        # select random subsets of data to train on (aka batches)
        rnd_idx = np.random.permutation(len(X_train))[:batch_size]
        X_batch = X_train[rnd_idx]
        y_batch = y_train[rnd_idx]

        # every 10,000 iterations: report progress and keep the best model
        if not(i%(10000)):
            # print iteration counter + validation accuracy
            print('epoch: ', i)
            accuracy_val = accuracy.eval(feed_dict={xs: X_valid, ys: y_valid})
            print(accuracy_val)

            # save best model (only on a strict improvement)
            if (accuracy_val > last_accuracy):
                last_accuracy = accuracy_val
                save_path = saver.save(sess, "./my_model.ckpt")
                print("Model saved in path: %s" % save_path)

        sess.run(training_op, feed_dict={xs: X_batch, ys: y_batch}) #feed batch to optimizer

    # The loop evaluates BEFORE each 10,000th update, so the updates after the
    # last report would otherwise never be scored; evaluate once more here.
    print('epoch: ', num_epochs)
    accuracy_val = accuracy.eval(feed_dict={xs: X_valid, ys: y_valid})
    print(accuracy_val)
    if (accuracy_val > last_accuracy):
        last_accuracy = accuracy_val
        save_path = saver.save(sess, "./my_model.ckpt")
        print("Model saved in path: %s" % save_path)
    

In [None]:
# Open a fresh session, load the saved weights and compute test-set outputs.
with tf.Session() as sess:
    # restore the checkpoint written by the training cell
    saver.restore(sess, "./my_model.ckpt")
    # raw output-layer scores (logits, one column per class) for every test row
    pred_ys = sess.run(logits, feed_dict={xs:X_test})


In [None]:
# Eyeball the first ten true labels next to the corresponding predicted logits.
print(y_test[:10])
pred_ys[:10]

In [None]:
# precision // recall scores // F1 score

from sklearn.metrics import precision_score, recall_score, f1_score

# get class prediction of each instance: the index of the larger logit in
# each row (vectorised replacement for the per-row argmax loop)
preds = np.argmax(pred_ys, axis=1)

print(precision_score(y_test, preds))
# precision = TP / (TP + FP); a high value means few false positives
# (an earlier run gave ~97.7%)

print(recall_score(y_test, preds))
# recall = TP / (TP + FN); a high value means few false negatives
# (an earlier run gave ~93.6% — note this is recall, not accuracy)

print(f1_score(y_test, preds))
# F1 = harmonic mean of precision and recall (an earlier run gave ~95.6%)

In [None]:
from sklearn.metrics import precision_recall_curve

# precision_recall_curve needs a continuous score per row. Feeding it the hard
# 0/1 predictions (as before) yields only a couple of thresholds, so the plot
# carries no information. Use the softmax probability of class 1 computed from
# the logits instead.
shifted_logits = pred_ys - np.max(pred_ys, axis=1, keepdims=True)  # subtract row max for numerical stability
exp_logits = np.exp(shifted_logits)
pos_scores = exp_logits[:, 1] / np.sum(exp_logits, axis=1)

precisions, recalls, thresholds = precision_recall_curve(y_test, pos_scores)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    Args:
        precisions: Precision values as returned by ``precision_recall_curve``
            (one element longer than ``thresholds``).
        recalls: Recall values, same length as ``precisions``.
        thresholds: Decision thresholds used for the x-axis.
    """
    # The final precision/recall entry has no matching threshold, hence [:-1].
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])


plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

# Counts of (true class, predicted class) pairs: rows are the true labels,
# columns are the predicted labels.
confusion_matrix(y_test, preds)