## Read in Candidate/Confirmed Keys

In [1]:
import numpy as np
import pandas as pd
import glob

In [None]:
# Path to the K2 candidates table export (CSV).
# NOTE: the name `confirmed` is rebound to a DataFrame in a later cell.
confirmed = "./data/raw/k2candidates_2020.10.28_02.32.01.csv"

# Load the full table; the columns of interest are selected in the next cell.
con_df = pd.read_csv(confirmed)

# Preview the first rows.
con_df.head()

In [None]:
# Narrow the table to identifiers, campaign and disposition, then discard
# every row whose disposition is FALSE POSITIVE.
con_df = con_df.loc[:, ['epic_name', 'epic_candname', 'k2_campaign', 'k2c_disp']]
con_df = con_df.loc[con_df['k2c_disp'] != "FALSE POSITIVE"]

# One row per star: the first row encountered for each epic_name is kept.
con_df = con_df.drop_duplicates(subset=['epic_name'])
print(con_df['k2_campaign'].unique())

# Index by EPIC name and expose the disposition as a single 'label' column.
con_df = (
    con_df.loc[:, ['epic_name', 'k2c_disp']]
    .set_index('epic_name')
    .rename(columns={'k2c_disp': 'label'})
)

con_df

In [None]:
# Partition the labelled stars by disposition.
# NOTE: `confirmed` held the CSV path until now; from here on it is a DataFrame.
confirmed = con_df.loc[con_df['label'] == "CONFIRMED"]
candidates = con_df.loc[con_df['label'] == "CANDIDATE"]

# Report (rows, columns) of each subset.
print(f"Confirmed Planets: {confirmed.shape}\n")
print(f"Candidate Planets: {candidates.shape}\n")

candidates

## Read in Light Curves

In [None]:
import sys

def _epic_from_filename(path):
    """Return the EPIC id embedded in a light-curve file name, or None.

    Only the final path component is inspected, so directory names that
    happen to contain 'lightcurve', '_' or '-' cannot corrupt the result.
    The id is the text between the first '_' following the 'lightcurve'
    marker and the next '-'.
    """
    import os  # local import: the notebook's import cell does not provide os

    name = os.path.basename(path)
    marker = name.find('lightcurve')
    if marker == -1:
        return None
    start = name.find('_', marker)
    if start == -1:
        return None
    end = name.find('-', start)
    if end == -1:
        return None
    return name[start + 1:end] or None


def combine_lightcurves(folder_path, recursive=True):
    """Load every light-curve text file into one wide DataFrame.

    Parameters
    ----------
    folder_path : str
        When ``recursive`` is true, a glob pattern that expands to the
        folders to scan (e.g. ``'data/raw/Campaigns/*'``); otherwise a single
        folder.
    recursive : bool, default True
        Whether ``folder_path`` is expanded with ``glob`` or used as-is.

    Returns
    -------
    pandas.DataFrame
        One column per distinct EPIC, named ``'EPIC <id>'``, each with a fresh
        0..n-1 index. Shorter series are padded with NaN at the end by the
        column-wise concat.

    Raises
    ------
    ValueError
        If no usable light-curve file is found.

    Notes
    -----
    Folders and files are visited in sorted order, so when the same EPIC
    appears in several folders the first one in sorted order is the one kept.
    Files whose name has no parsable EPIC id are skipped and counted.
    """
    if recursive:
        # Expand the pattern into the individual folders
        folders = sorted(glob.glob(folder_path))
    else:
        # Treat the given path as the only folder
        folders = [folder_path]

    frames = []
    seen_epics = set()  # set gives O(1) duplicate checks across thousands of files
    skipped = 0
    for folder in folders:
        sys.stdout.write(f'\nReading Folder: {folder}\n')
        files = sorted(glob.glob(folder + '/*.txt'))
        sys.stdout.write('Importing EPICs: ')
        for i, file in enumerate(files):
            epic = _epic_from_filename(file)
            if epic is None:
                skipped += 1
                continue
            if epic in seen_epics:
                continue
            seen_epics.add(epic)

            # Lightweight progress indicator
            if i % 100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()

            df = pd.read_csv(file)
            df = df.drop(columns=[' Corrected Flux'])
            # Assumes exactly one column remains after the drop; pandas raises
            # otherwise. NOTE(review): which quantity that column holds depends
            # on the file layout (header vs. row field count) - confirm.
            df.columns = [f'EPIC {epic}']
            frames.append(df.reset_index(drop=True))

    if skipped:
        print(f'\nSkipped {skipped} files without a parsable EPIC id')
    if not frames:
        raise ValueError(f'No light curve files found under {folder_path!r}')

    print(f'\nCombining {len(frames)} DataFrames')
    # Side-by-side concat: one column per EPIC
    return pd.concat(frames, axis=1)

def shift_and_fill(df):
    """Return a gap-free copy of ``df`` with each series roughly centred.

    For every column the NaN count is taken and the column is shifted down by
    half that count; remaining gaps are then forward-filled and finally
    back-filled. The shift assumes the NaNs sit at the end of the column
    (padding from a column-wise concat); interior NaNs are counted too.

    Parameters
    ----------
    df : pandas.DataFrame
        One series per column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame of the same shape. A column that is entirely NaN stays
        NaN.
    """
    # Work on a copy: assigning shifted columns onto the caller's frame would
    # silently shift the "unfilled" data as well.
    result = df.copy()
    for col in result.columns:
        n_missing = int(result[col].isna().sum())
        if n_missing:
            result[col] = result[col].shift(n_missing // 2)

    # Forward-fill the tail first, then back-fill the head opened by the shift
    return result.ffill().bfill()

def combine_labels(labels, df):
    """Transpose ``df`` to one row per series and attach a ``label`` column.

    ``labels`` is left-joined on the index, so every row of the transposed
    frame is kept; rows with no match in ``labels`` get the label ``'NONE'``.
    The input ``df`` is not modified.
    """
    per_row = df.copy(deep=True).T
    joined = pd.merge(
        per_row,
        labels,
        how='left',
        left_index=True,
        right_index=True,
    )
    # Rows without a matching entry come back as NaN from the left join
    return joined.assign(label=joined['label'].fillna('NONE'))

In [None]:
# Build one column per EPIC from every folder matching the pattern,
# then derive a version with the gaps shifted and filled.
unfilled = combine_lightcurves('data/raw/Campaigns/*', recursive=True)
filled = shift_and_fill(unfilled)

In [None]:
# Visual sanity check: plot the first five series of each frame, one subplot per series.
columns = unfilled.columns
unfilled[columns[:5]].plot(subplots=True, figsize=(20,10))

# Same preview for the filled frame; `columns` now refers to filled.columns.
columns = filled.columns
filled[columns[:5]].plot(subplots=True, figsize=(20,10))

In [None]:
# Force every column to a numeric dtype; values that cannot be parsed become NaN.
# NOTE: this runs after shift_and_fill, so any value coerced here leaves a NaN in `filled` too.
unfilled = unfilled.apply(pd.to_numeric, errors='coerce')
filled = filled.apply(pd.to_numeric, errors='coerce')

In [None]:
# Transpose to one row per EPIC and attach the disposition from con_df;
# EPICs with no entry in con_df are labelled 'NONE'.
unfilled = combine_labels(con_df, unfilled)
filled = combine_labels(con_df, filled)

In [None]:
# Select the rows labelled CONFIRMED.
con = unfilled.query('label == "CONFIRMED"')
# Plot the last ten of them: transpose back to one column per EPIC and drop the
# 'label' row so only the numeric samples are drawn; figure height scales with the row count.
con[-10:].T.drop('label').plot(subplots=True, figsize=(20, len(con[-10:]) * 4))

In [None]:
# Select the rows labelled CANDIDATE.
can = unfilled.query('label == "CANDIDATE"')
# Plot the last ten of them: transpose back to one column per EPIC and drop the
# 'label' row so only the numeric samples are drawn; figure height scales with the row count.
can[-10:].T.drop('label').plot(subplots=True, figsize=(20, len(can[-10:]) * 4))

In [None]:
# Persist both labelled frames to the working directory under the key 'stage';
# mode='w' replaces any existing file of the same name.
unfilled.to_hdf('unfilled_transit_data.h5', key='stage', mode='w', index=True)
filled.to_hdf('filled_transit_data.h5', key='stage', mode='w', index=True)

# Load HDF5 files rather than re-extracting

In [2]:
# Reload the frames written by the export cell so the extraction need not be re-run.
# No key is passed, which pandas allows only when the file holds a single object.
unfilled = pd.read_hdf('./unfilled_transit_data.h5')
filled = pd.read_hdf('./filled_transit_data.h5')

In [17]:
# Kaggle exoplanet train/test CSVs
kaggle_test = "./data/raw/exoTest.csv"
kaggle_ds = "./data/raw/exoTrain.csv"
test_df = pd.read_csv(kaggle_test)
kg_df = pd.read_csv(kaggle_ds)

# Both frames get the same treatment: lower-case the headers, keep 'label'
# first, strip the five-character prefix from every other header, and give
# each row a synthetic 'EPIC <n>' index so it resembles the K2 frames.
for frame in (test_df, kg_df):
    lowered = [name.lower() for name in frame.columns]
    frame.columns = ['label'] + [name[5:] for name in lowered if 'label' not in name]
    frame.index = ['EPIC ' + str(pos) for pos in frame.index]

test_df

Unnamed: 0,label,1,2,3,4,5,6,7,8,9,...,3188,3189,3190,3191,3192,3193,3194,3195,3196,3197
EPIC 0,2,119.88,100.21,86.46,48.68,46.12,39.39,18.57,6.98,6.63,...,14.52,19.29,14.44,-1.62,13.33,45.50,31.93,35.78,269.43,57.72
EPIC 1,2,5736.59,5699.98,5717.16,5692.73,5663.83,5631.16,5626.39,5569.47,5550.44,...,-581.91,-984.09,-1230.89,-1600.45,-1824.53,-2061.17,-2265.98,-2366.19,-2294.86,-2034.72
EPIC 2,2,844.48,817.49,770.07,675.01,605.52,499.45,440.77,362.95,207.27,...,17.82,-51.66,-48.29,-59.99,-82.10,-174.54,-95.23,-162.68,-36.79,30.63
EPIC 3,2,-826.00,-827.31,-846.12,-836.03,-745.50,-784.69,-791.22,-746.50,-709.53,...,122.34,93.03,93.03,68.81,9.81,20.75,20.25,-120.81,-257.56,-215.41
EPIC 4,2,-39.57,-15.88,-9.16,-6.37,-16.13,-24.05,-0.90,-45.20,-5.04,...,-37.87,-61.85,-27.15,-21.18,-33.76,-85.34,-81.46,-61.98,-69.34,-17.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EPIC 565,1,374.46,326.06,319.87,338.23,251.54,209.84,186.35,167.46,135.45,...,-123.55,-166.90,-222.44,-209.71,-180.16,-166.83,-235.66,-213.63,-205.99,-194.07
EPIC 566,1,-0.36,4.96,6.25,4.20,8.26,-9.53,-10.10,-4.54,-11.55,...,-12.40,-5.99,-17.94,-11.96,-12.11,-13.68,-3.59,-5.32,-10.98,-11.24
EPIC 567,1,-54.01,-44.13,-41.23,-42.82,-39.47,-24.88,-31.14,-24.71,-13.12,...,-0.73,-1.64,1.58,-4.82,-11.93,-17.14,-4.25,5.47,14.46,18.70
EPIC 568,1,91.36,85.60,48.81,48.69,70.05,22.30,11.63,37.86,28.27,...,2.44,11.53,-16.42,-17.86,21.10,-10.25,-37.06,-8.43,-6.48,17.60


In [18]:
# Normalise the training rows; the label column is set aside while scaling
labels = kg_df[['label']]
kg_df = kg_df.drop(columns='label')

# Rescale to the same range as the K2 data:
# standardise each row (population std, ddof=0), shrink by 10, centre on 1
row_mean = kg_df.mean(axis=1)
row_std = kg_df.std(axis=1, ddof=0)
kg_df = kg_df.sub(row_mean, axis=0).div(row_std, axis=0) / 10 + 1

# Put the label back as the last column
kg_df = pd.concat([kg_df, labels], axis=1)

In [20]:
# Same normalisation for the test rows; the label column is set aside while scaling
labels = test_df[['label']]
test_df = test_df.drop(columns='label')

# Rescale to the same range as the K2 data:
# standardise each row (population std, ddof=0), shrink by 10, centre on 1
row_mean = test_df.mean(axis=1)
row_std = test_df.std(axis=1, ddof=0)
test_df = test_df.sub(row_mean, axis=0).div(row_std, axis=0) / 10 + 1

# Put the label back as the last column
test_df = pd.concat([test_df, labels], axis=1)

In [21]:
# Write the rescaled Kaggle frames to the working directory, keeping the 'EPIC <n>' index as the first column.
kg_df.to_csv('kaggle_processed_train.csv', index=True)
test_df.to_csv('kaggle_processed_test.csv', index=True)