In [1]:
import pandas as pd
import copy
import matplotlib.pyplot as plt
import numpy as np
from imblearn.over_sampling import SMOTE
from collections import Counter
import math
from collections import OrderedDict
from sklearn.preprocessing import RobustScaler 
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import SimpleRNN, LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from keras.optimizers import SGD
import warnings
warnings.filterwarnings('ignore')

# The Search for Exoplanets
### Anika Das, Ayan Chowdhury, Gary Shetye, Grace Zhang

# Data Preprocessing

In [2]:
# reading in dataframes
planets_train = pd.read_csv('exoTrain.csv')
planets_test = pd.read_csv('exoTest.csv')

In [3]:
# converting the labels of (1, 2) to (0, 1)
def convert_labels_to_binary(labels, label_one, label_two):
    
    binary_labels = []
    
    # for each label 
    for label in labels:
        # append 0 if the label is label_one (1)
        if (label == label_one):
            binary_labels.append(0)
        # append 1 if the label is label_two (2)
        elif (label == label_two):
            binary_labels.append(1)
        else:
            print("error")
            
    return binary_labels

In [4]:
# get the labels of train and test data to have 0, 1's
planets_train_new_labels = convert_labels_to_binary(planets_train["LABEL"].tolist(), 1, 2)
planets_test_new_labels = convert_labels_to_binary(planets_test["LABEL"].tolist(), 1, 2)

# change dataframe outcome column to new labels 
planets_train["LABEL"] = planets_train_new_labels
planets_test["LABEL"] = planets_test_new_labels

In [5]:
def split_x_and_outcome(df, outcome_col):
    
    # get df of all features (no outcome column)
    features = df.loc[:, df.columns != outcome_col]
    # get Series object of outcome column
    outcome = df[outcome_col]
    
    return features, outcome

In [6]:
# split training and testing dataframes into their features and labels
planets_features_train = split_x_and_outcome(planets_train, "LABEL")[0]
planets_labels_train = split_x_and_outcome(planets_train, "LABEL")[1]

planets_features_test = split_x_and_outcome(planets_test, "LABEL")[0]
planets_labels_test = split_x_and_outcome(planets_test, "LABEL")[1]

In [7]:
# Robust scaler
def scaler(df):
    scaler = RobustScaler()
    # takes a dataframe, transposes it, scales each column (each star's flux values)
    # based on median and IQR, and transposes it back
    scaled = pd.DataFrame(scaler.fit_transform(df.T).T,columns=df.columns)
    return scaled

# Synthetic minority sampling technique (SMOTE) to balance 0/1 classes
def minority_sampling(df, labels, neighbors):
    # initialize SMOTE object
    oversample = SMOTE(k_neighbors=neighbors)
    # fit SMOTE to create df with synthetic samples
    df, labels = oversample.fit_resample(df, labels)
    return df, labels

In [8]:
# Apply Robust scaler to training and testing features
features_scaled = scaler(planets_features_train)
features_test_scaled = scaler(planets_features_test)

In [9]:
# Create equal number of 0/1 samples 
added_rows = minority_sampling(features_scaled, planets_labels_train, 3) # shape (10100, 3197)
added_rows_test = minority_sampling(features_test_scaled, planets_labels_test, 3) # shape (1130, 3197)

# KNN Model

In [10]:
# helper function to calculate euclidean distance
def euclidean_dist(r1, r2):
    r1_np = np.array(r1)
    r2_np = np.array(r2)
    
    dist = np.linalg.norm(r1_np - r2_np)
    return dist

# helper function to calculate manhattan distance
def manhattan(a, b):
    return sum(abs(val1-val2) for val1, val2 in zip(a,b))


# helper function returning most common value in a list
def most_common(speaker_preds):
    # returns most common value in a list
    data = Counter(speaker_preds)
    return data.most_common(1)[0][0]

In [11]:
def model_metrics(y_actuals, y_pred):
    
    '''
    Given a series of actual labels (y), and a series of predicted outcomes (ypred), 
    returns the model accuracy, sensitivity, specificity, precision, and f1-score. 
    
    '''
    
    # initializing counts + metrics variables 
    false_pos = 0 
    false_neg = 0
    true_pos = 0
    true_neg = 0
    
    recall = 0
    precision = 0
    f1_score = 0
    accuracy = 0
    specificity = 0
    
    # for each prediction
    for i in range(len(y_pred)):
        # if the prediction was obama
        if (y_pred[i] == 0):
            # sentence actually obama's
            if (y_actuals[i] == 0):
                # true obama found
                true_neg += 1
            else:
                # sentence actually trump's
                false_neg += 1
        # if the prediction was trump
        else:
            # if the sentence actually trump's
            if (y_actuals[i] == 1):
                # true positive found
                true_pos += 1
            else:
                # sentence actually obama's
                false_pos += 1
    
    # no true positives --> f1 must be undefined
    if (true_pos == 0):
        f1_score = None
        # no false positives means precision is undefined
        if (false_pos == 0):
            precision = None
        else:
            # precision is 0 if there are some false positives
            precision = 0
        # no false negatives means that recall is undefined
        if (false_neg == 0):
            recall = None
        else:
            # recall is 0 if there are some false negatives
            recall = 0
    else:
        # calculating precision, recall, f1 outside of edge cases
        precision = round((true_pos) / (true_pos + false_pos), 7)
        recall = round((true_pos) / (true_pos + false_neg), 7)
        f1_score = round(2*((recall * precision) / (recall + precision)), 7)
       
    # calculating accuracy, specificity
    try:
        specificity = round((true_neg) / (true_neg + false_pos), 3)
        accuracy = round((true_pos + true_neg) / len(y_pred), 3)
    except ZeroDivisionError:
        specificity = 0
        accuracy = 0
  
    return accuracy, precision, specificity, recall, f1_score

In [12]:
def get_neighbors(train_data, train_labels, val_sentence, num_neighbors, metric):
    
    preds = []
    distances = {}
        
    # for every training vector
    for i in range(len(train_data) - 1):
        # calculate euclidean/manhattan distance between training and validation vector. add this distance + train label as 
        # key-value pair
        distances[metric(val_sentence, train_data[i])] = train_labels[i]
    
    # sort dictionary using OrderedDict
    distances = OrderedDict(sorted(distances.items()))
    distances = list(distances.items())
    # get list of nearest neighbors (train vectors with smallest distances)
    neighbors = distances[:num_neighbors]

    # get neighbor vector's labels
    for pred in neighbors:
        preds.append(pred[1])
    
    # return most common label
    return most_common(preds)


# k fold validation
def split_df_into_folds(df, k):
    
    # randomly shuffle the dataframe
    # shuffle_df = df.sample(frac=1)
    # split the dataframe into k folds, store each in an np array
    shuffle_df = np.array_split(df, k)
    return shuffle_df

# accuracy metric
def avg_accuracy(y_preds, actual):
    
    total_accuracy = 0
    idx = 0
    
    true_neg = 0
    true_pos = 0
    
    for y in y_preds:
        
        if (y == 0):
            # trump prediction is correct
            if (actual[idx] == 0):
                # true negative found
                true_neg += 1
        if (y == 1):
            # obama prediction is correct
            if (actual[idx] == 1):
                # true negative found
                true_pos += 1
        
        idx += 1
    
    # return true predictions over all predictions
    return (true_pos + true_neg) / len(actual)


def k_fold_validation(df, num_folds, penalty, solver, k, outcome_col, label_one, label_two, classifier, metric):
    
    algorithm = 0
    if (classifier == get_neighbors):
        algorithm = 1
        
    # get np array of dataframes for number of folds
    dfs = split_df_into_folds(df, num_folds)
    
    fold_accuracies = {}
    evals = []
    
    for i in range(len(dfs)):
        # create new DF
        train_dfs = pd.DataFrame()
        
        # the current df is the validation df, split it into x and outcome
        val_df_x = split_x_and_outcome(dfs[i], outcome_col)[0]
        val_df_y = convert_labels_to_binary(split_x_and_outcome(dfs[i], outcome_col)[1].squeeze(), label_one, label_two)
        # all other df's are for training
        others = dfs[:i] + dfs[i+1:]
        for dataframe in others:
            # make one big DF for training from other df's 
            train_dfs = pd.concat([train_dfs, dataframe])
    
        # split training df's into x and outcome
        train_dfs_x = split_x_and_outcome(train_dfs, outcome_col)[0]
        train_dfs_y = convert_labels_to_binary(split_x_and_outcome(train_dfs, outcome_col)[1].squeeze(), label_one, label_two)
        
        training_vectors = train_dfs_x.values.tolist()
        
        if (algorithm == 1):
            y_preds = []

            # for each sentence, 
            for i in range(len(val_df_x)):
                y_preds.append(classifier(training_vectors, train_dfs_y, val_df_x.iloc[i], k, metric))
        else:
            y_preds = []
            y_preds = classifier(training_vectors, train_dfs_y, val_df_x, k, penalty, solver)
        
        # add accuracy to dictionary
        fold_accuracies[i] = avg_accuracy(y_preds, val_df_y)
        evals.append(model_metrics(val_df_y, y_preds))
    
    # return dictionary of accuracies for each fold's training result
    return fold_accuracies, evals


def split_x_and_outcome(df, outcome_col):
    
    # get df of all features (no outcome column)
    features = df.loc[:, df.columns != outcome_col]
    # get Series object of outcome column
    outcome = df[outcome_col]
    
    return features, outcome


def get_avg_accuracy_per_fold(df, num_folds, penalty, solver, k, outcome_col, label_one, label_two, classifier, metric):
    
    # get the dictionary of accuracies from each fold's training result
    accuracies_per_fold = k_fold_validation(df, num_folds, penalty, solver, k, outcome_col, label_one, label_two, classifier, metric)[0]
    total_accuracy = 0
    
    # for each accuracy, add it to total_accuracy
    for accuracy in list(accuracies_per_fold.values()):
        total_accuracy += accuracy
    
    # return average accuracy (total accuracy / number of accuracies calculated)
    return total_accuracy / len(list(accuracies_per_fold.values()))

In [13]:
def knn_hypertune(df, num_folds, ks: list, outcome_col, label_one, label_two, classifier, metrics: list):
    
    best_score = 0
    best_vals = []
    # for each k value
    for k in ks:
        # for each metric
        for metric in metrics:
            # get average accuracy per k-fold
            curr_accuracy = get_avg_accuracy_per_fold(df[0].sample(frac=0.1), num_folds, k, 0, 1, outcome_col, label_one, label_two, classifier, metric)
            print("metric: ", metric)
            print("k: ", k)
            print("current_Acc: ", curr_accuracy)
            # if current accuracy with parameters is better than then current best accuracy
            # update the best hyperparameters
            if (curr_accuracy > best_score):
                best_score = curr_accuracy
                best_vals = [k, metric]
    
    return best_score, best_vals   

In [14]:
# hyperparameters for KNN: nearest-neighbors, distance measurement for instances
ks = [5, 7]
metrics = [euclidean_dist, manhattan]

# add labels back to dataframe
added_rows[0]["LABEL"] = added_rows[1]

# get the best score and values
best_score, best_vals = knn_hypertune(added_rows, 5, ks, "LABEL", 0, 1, get_neighbors, metrics)
print(best_score)
print(best_vals)

metric:  <function euclidean_dist at 0x7fcf19fff7a0>
k:  5
current_Acc:  0.698019801980198
metric:  <function manhattan at 0x7fcf19fff050>
k:  5
current_Acc:  0.7722772277227723
metric:  <function euclidean_dist at 0x7fcf19fff7a0>
k:  7
current_Acc:  0.7722772277227723
metric:  <function manhattan at 0x7fcf19fff050>
k:  7
current_Acc:  0.698019801980198
0.7722772277227723
[5, <function manhattan at 0x7fcf19fff050>]


In [15]:
def knn_test_with_best_parameters(test_features, test_labels, best_k, best_metric):
    
    y_preds = []
    # get the test instances as a list of lists
    test_features_list = test_features.values.tolist()
    # for each start instance
    for i in range(len(test_features)):
        rest_features = []
        # get the particular instance we are at
        test_row = test_features_list[i]
        # make a list of lists of every other instance
        rest_features = test_features_list[:i] + test_features_list[i:]
        # call get neighbors and append predicted class to y_pred
        y_preds.append(get_neighbors(rest_features, test_labels.tolist(), test_row, best_k, best_metric))
    
    # calculate model metrics of y_preds and test labels
    metrics = model_metrics(test_labels, y_preds)
    
    return metrics

In [16]:
# calculating testing metrics for KNN
metrics = knn_test_with_best_parameters(added_rows_test[0], added_rows_test[1], 7, euclidean_dist)
print(metrics)

(0.983, 0.9674658, 0.966, 1.0, 0.9834639)


# Logistic Regression

In [17]:
def logistic_regression(training_features, training_labels, test_features, k, pen, sol):
    
    # instantiate the model (using the default parameters)
    logreg = LogisticRegression(penalty=pen, solver=sol, random_state=16)

    # fit the model with data
    logreg.fit(training_features, training_labels)
    
    # call predict function
    y_pred = logreg.predict(test_features)
    return y_pred

In [18]:
def logistic_hypertune(df, num_folds, penalties: list, solvers: list, ks: [], outcome_col, label_one, label_two, classifier, metrics: list):
    
    best_score = 0
    best_vals = []
    # for each penalty 
    for penalty in penalties:
        # for each solver
        for solver in solvers:
            # get average accuracy per k-fold
            curr_accuracy = get_avg_accuracy_per_fold(df[0].sample(frac=0.1), num_folds, penalty, solver, 0, outcome_col, label_one, label_two, classifier, [])
            # if current accuracy with parameters is better than then current best accuracy
            # update the best hyperparameters
            if (curr_accuracy > best_score):
                best_score = curr_accuracy
                best_vals = [solver, penalty]
    
    return best_score, best_vals 

In [19]:
# regularization hyperparameters
penalties = ["l1", "l2"]
solvers = ['liblinear', 'saga']

# add labels back to the dataframe
added_rows[0]["LABEL"] = added_rows[1]

# gets the best score and parameters for logistic regression
best_score, best_params = logistic_hypertune(added_rows, 5, penalties, solvers, [], 'LABEL', 1, 0, logistic_regression, [])

print(best_score)
print(best_params)

0.9475247524752476
['liblinear', 'l1']


In [20]:
# remove labels again 
added_rows_cleaned = added_rows[0].drop(columns="LABEL")

# calculate logistic regression test predictions 
test_preds = logistic_regression(added_rows_cleaned, added_rows[1], added_rows_test[0], 0, 'l1', 'liblinear')

# gets metrics
best_metrics_logreg = model_metrics(added_rows_test[1].tolist(), test_preds)
print(best_metrics_logreg)

(0.573, 0.7862069, 0.945, 0.2017699, 0.3211267)


# RNN With LSTM 

In [21]:
# add labels back to training/testing dataframes
added_rows[0]["LABEL"] = added_rows[1]
added_rows_test[0]["LABEL"] = added_rows_test[1]

# shuffle the dataframes and sample random 50% of data for RNN training
added_rows_shuffled = added_rows[0].sample(frac=0.5).reset_index(drop=True)
added_rows_test_shuffled = added_rows_test[0].sample(frac=0.5).reset_index(drop=True)

# split the training/testing dataframes to their respective features and labels
added_rows_shuffled_features = split_x_and_outcome(added_rows_shuffled, "LABEL")[0].to_numpy()
added_rows_shuffled_labels = split_x_and_outcome(added_rows_shuffled, "LABEL")[1].to_numpy(dtype="float64")
added_rows_shuffled_features_test = split_x_and_outcome(added_rows_test_shuffled, "LABEL")[0].to_numpy()
added_rows_shuffled_labels_test = split_x_and_outcome(added_rows_test_shuffled, "LABEL")[1].to_numpy(dtype="float64")

# print shapes of the training data
print("training shapes: ", added_rows_shuffled_features.shape, added_rows_shuffled_labels.shape)

training shapes:  (5050, 3197) (5050,)


In [22]:
def preprocess_features_for_rnn(features, num_blocks):
    # get the length of each block of fluxes
    block_len = features.shape[1] // num_blocks
    # split up the features into blocks, truncate features to be divisible
    # by block length
    features_blocks = features[:, :num_blocks*block_len] 
    # reshape the array
    features_blocks = features_blocks.reshape((features.shape[0], num_blocks, block_len))
    return features_blocks

In [23]:
def preprocess_labels_for_rnn(labels, num_blocks):
    # for each row, make an array of length num_blocks of the label of that row 
    # e.g (0, 0, 0, 0) or (1, 1, 1, 1)
    labels_blocks = np.repeat(labels[:, np.newaxis], num_blocks, axis=1)
    return labels_blocks

In [24]:
# preprocess the features for the RNN
flux_for_training = preprocess_features_for_rnn(added_rows_shuffled_features, 5)
flux_for_testing = preprocess_features_for_rnn(added_rows_shuffled_features_test, 5)

# preprocess the labels for RNN
flux_labels_for_training = preprocess_labels_for_rnn(added_rows_shuffled_labels, 5)
flux_labels_for_testing = preprocess_labels_for_rnn(added_rows_shuffled_labels_test, 5)

In [25]:
# printing the shapes of each of the training/testing features/labels
print(flux_for_training.shape)
print(flux_labels_for_training.shape)
print(flux_for_testing.shape)
print(flux_labels_for_testing.shape)

(5050, 5, 639)
(5050, 5)
(565, 5, 639)
(565, 5)


In [26]:
def init_rnn(X, y, num_epochs=3, learning_rate=0.01, lstm_units=5, batch_size=64):
    # initializing Sequential input layer
    model = Sequential()
    # LSTM layer with parameterized # of units
    model.add(LSTM(lstm_units, input_shape=(X.shape[1]*X.shape[2], 1)))
    # Dense output layer using sigmoid for activation (binary classification)
    model.add(Dense(1, activation='sigmoid'))
    # initialize stochastic gradient descent optimizer with parameterized learning rate
    optimizer = SGD(learning_rate=learning_rate)
    # compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    print(model.summary())
    # reshape features
    X = X.reshape((X.shape[0], X.shape[1]*X.shape[2], 1))
    # get y's as integers
    y_binary = np.array([int(np.all(row)) for row in y])
    # fit the model
    history = model.fit(X, y_binary, epochs=num_epochs, batch_size=64)
    return model, history

In [27]:
# create fit of model on training data and return the accuracy for the epoch


def calc_avg_val_accuracy_rnn(training_features, training_labels, num_epochs, learning_rate, lstm_units):
    
    best_vals = []
    best_accuracy = 0
    # for each epoch
    for epoch in num_epochs:
        # for each learning rate
        for rate in learning_rate:
            # for each number of LSTM units
            for units in lstm_units:   
                # run the RNN and get epoch data
                history = init_rnn(training_features, training_features, epoch, rate, units)[1]
                # calculate the average accuracy of the training iteration
                avg_acc = np.mean(history.history['accuracy'])
                # if the average accuracy greater than the best, keep track of parameters
                if avg_acc > best_accuracy:
                    best_accuracy = avg_acc
                    best_vals = [epoch, rate, units]
    
    # return best hyperparmaeters and best accuracy for RNN training
    return best_accuracy, best_vals

In [28]:
# hyperparameters for RNN
num_epochs = [3, 5, 7]
learning_rates = [0.01, 0.001]
lstm_cells = [3, 7]

# get best hyperparameters for validation
best_acc, best_params = calc_avg_val_accuracy_rnn(flux_for_training, flux_labels_for_training, num_epochs, learning_rates, lstm_cells)
print(best_acc)
print(best_params)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 3)                 60        
                                                                 
 dense (Dense)               (None, 1)                 4         
                                                                 
Total params: 64
Trainable params: 64
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2

In [29]:
# Reshape the testing features to have shape (565, 3195, 1)
flux_for_test_reshaped = np.reshape(flux_for_testing, (flux_for_testing.shape[0], flux_for_testing.shape[1]*flux_for_testing.shape[2], 1))
flux_for_test_labels_reshaped = np.all(flux_labels_for_testing, axis=1).astype(int)[:, np.newaxis]  # Reshape to (565, 1)


# Evaluate the model on the reshaped testing data
model = init_rnn(flux_for_test_reshaped, flux_for_test_labels_reshaped, 5, 0.01, 7, 64)[0]

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_12 (LSTM)              (None, 7)                 252       
                                                                 
 dense_12 (Dense)            (None, 1)                 8         
                                                                 
Total params: 260
Trainable params: 260
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Ep

In [30]:
# Make predictions on the reshaped testing data
y_pred_reshaped = model.predict(flux_for_test_reshaped)

# the model outputs predictions between 0-1. Convert them to 0's and 1's based on if
# they are less than or greater than 0.5
def make_preds_into_binary(preds):
    y_preds = []
    for pred in preds:
        if pred < 0.5:
            y_preds.append(0)
        else:
            y_preds.append(1)
            
    return y_preds
    
y_preds = make_preds_into_binary(y_pred_reshaped)

# get the first element of each list in a list of lists
def extract(lst):
    return [int(item[0]) for item in lst]

# calculate metrics on the test predictions from RNN
rnn_metrics = model_metrics(extract(flux_labels_for_testing.tolist()), y_preds)
print(rnn_metrics)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
(0.361, 0.366548, 0.362, 0.3601399, 0.3633157)
