In [186]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy

from lpproj import LocalityPreservingProjection

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector

from sklearn.pipeline import Pipeline

# Exoplanet Transit Classification (Kepler Mission)
## Data Preprocessing

### X|y split + training and hold-out dataset creation

In [117]:
feature_df_path = "..\\data\\feat_df_tot.csv"

df_exo = pd.read_csv(feature_df_path, index_col = ['KIC_ID', 'TCE_num'])
cols_to_drop = ["time_complexity", "Depth", "rms"]
df_exo = df_exo.drop(columns = cols_to_drop)
df_exo.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Period,Duration,target_label,even_odd_stat,p_secondary,max,min,LCBIN_0,LCBIN_1,LCBIN_2,...,LCBIN_131,LCBIN_132,LCBIN_133,LCBIN_134,LCBIN_135,LCBIN_136,LCBIN_137,LCBIN_138,LCBIN_139,LCBIN_140
KIC_ID,TCE_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
757450,1,8.884923,2.07004,1,0.925362,0.007862,0.000177,-0.015739,-0.003788,-0.002984,-0.002181,...,0.003811,0.003485,0.00431,0.005135,0.005959,0.006784,0.007609,0.008434,0.009258,0.010083
1026032,1,8.460439,4.73492,2,0.97888,0.0,0.00036,-0.077604,0.001511,0.001346,0.001181,...,0.001283,0.001274,0.001265,0.001134,0.001003,0.000872,0.000741,0.00087,0.000999,0.001129
1293031,1,0.539366,5.434,3,0.499758,0.289257,2.6e-05,-1.7e-05,-1.0,-0.979731,-0.959462,...,0.417232,0.435126,0.453019,0.470913,0.488807,0.5067,0.524594,0.542487,0.560381,0.578274


In [118]:
# define X and y
X = df_exo.drop(columns = ['target_label'])
y = df_exo['target_label']

#now create train + hold out set

In [119]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, stratify = y)

In [120]:
y_test.value_counts()

1    351
2    321
3    132
Name: target_label, dtype: int64

In [121]:
y_train.value_counts()

1    1983
2    1819
3     749
Name: target_label, dtype: int64

In [122]:
# At this point, X_train, y_train, x_test, y_test will be saved as separate files. That way the modeling is being evaluated from a particular train/test sample as saved on file.

base_path = "..\\data\\"
datasplit_dict = {"X_train": X_train, "y_train": y_train, "X_test": X_test, "y_test": y_test}

for name, data in datasplit_dict.items():
    data.to_csv(base_path + name + ".csv")


### Data preprocessing on train/hold-out test data from file

In [218]:
#transformer class to perform sqrt log transform on p_secondary and even_odd_stat

class SqrtLogZeroExceptionTransformer(BaseEstimator, TransformerMixin):

    def __init__(self):
        super().__init__()
        self.trans_evenodd_max_ = None
        self.trans_psec_max_ = None

    def fit(self, X, y = None):
        cols_to_take = ['even_odd_stat', 'p_secondary'] #these are the columns for which we'll do the sqrt log transformation

        with np.errstate(divide='ignore'):
            X_trans = np.sqrt(np.abs(np.log10(X[cols_to_take])))
        # cuts out infinities and gets maxes --> will use this to impute infinities (which are zeros in the original feature set)
        maxsqrtlog = X_trans[~(X_trans == np.inf)].max()

        self.trans_evenodd_max_ = maxsqrtlog['even_odd_stat']
        self.trans_psec_max_ = maxsqrtlog['p_secondary']

        return self
    
    def transform(self, X, y = None):
        cols_to_take = ['even_odd_stat', 'p_secondary'] #these are the columns for which we'll do the sqrt log transformation

        # this will issue some divide by zero warnings. im going to suppress this warning.
        with np.errstate(divide='ignore'):
            X_trans = np.sqrt(np.abs(np.log10(X[cols_to_take])))


        X_toreturn = deepcopy(X)
        X_toreturn['even_odd_stat'] = X_trans['even_odd_stat'].replace(np.inf, self.trans_evenodd_max_)
        X_toreturn['p_secondary'] = X_trans['p_secondary'].replace(np.inf, self.trans_psec_max_)

        return X_toreturn

    def fit_transform(self, X, y = None):
        return self.fit(X).transform(X)

#### Set up column transformer for LPP dim reduction on LCBIN data and then construct transformation pipeline

In [237]:
# transformer class to do the LPP dimensional reduction on the LCBIN subset.
#  the columntransformer outputs 

LCBINselector = make_column_selector(pattern = "^LCBIN_")
trans = [('lpp', LocalityPreservingProjection(n_components=2), LCBINselector(X_train) )]

steps = [('sqrtlog', SqrtLogZeroExceptionTransformer()), ('lpptrans', ColumnTransformer(transformers = trans, remainder = 'passthrough') )]

# fit_transforming the pipeline will output a numpy array (LocalityPreservingProjection outputs numpy array). 
# column order is (LPP_1, LPP_2, Period, Duration, even_odd_stat, p_secondary, max, min)

datatrans_pipe = Pipeline(steps)

In [228]:
data_trans.fit(X_train)

Pipeline(steps=[('sqrtlog', SqrtLogZeroExceptionTransformer()),
                ('lpptrans',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('lpp',
                                                  LocalityPreservingProjection(),
                                                  ['LCBIN_0', 'LCBIN_1',
                                                   'LCBIN_2', 'LCBIN_3',
                                                   'LCBIN_4', 'LCBIN_5',
                                                   'LCBIN_6', 'LCBIN_7',
                                                   'LCBIN_8', 'LCBIN_9',
                                                   'LCBIN_10', 'LCBIN_11',
                                                   'LCBIN_12', 'LCBIN_13',
                                                   'LCBIN_14', 'LCBIN_15',
                                                   'LCBIN_16', 'LCBIN_17',
                                           