# Preparing an augmented table of TCE transits for quality vetting

In [14]:
%matplotlib inline
import matplotlib.pyplot as pl
from matplotlib import rcParams
rcParams["savefig.dpi"] = 100
import seaborn as sns

import numpy as np
import pandas as pd
import itertools
from tqdm import tqdm

## Step 1: load the TCE catalog

Load the DR25 TCE table. You can obtain this file from the NASA Exoplanet Archive (hosted by NExScI) as follows:

    wget -O q1_q17_dr25_tce.csv "http://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI?table=q1_q17_dr25_tce&select=*"

The columns in the TCE table are documented at:

    http://exoplanetarchive.ipac.caltech.edu/docs/API_tce_columns.html

In [15]:
# Load the DR25 TCE catalog from the local data directory.
tcedf = pd.read_csv('../data/q1_q17_dr25_tce.csv')
# Build the unique TCE ID used by the RoboVetter output for every row:
# zero-padded 9-digit kepid, a dash, then the zero-padded 2-digit planet number.
tce_ids = []
for kepid, plnt_num in zip(tcedf.kepid, tcedf.tce_plnt_num):
    tce_ids.append('{:09d}-{:02d}'.format(kepid, plnt_num))
tcedf.loc[:, 'tce'] = tce_ids

## Step 2: Create a list of all transits for >100-day period TCEs

We'll need a table of all the TCE transits.  Let's create a new dataframe containing one row per transit:

In [16]:
# Time-range constants. KEPLER_END_BK is the upper limit used when stepping
# through transit times; presumably both values are BKJD days -- TODO confirm.
KEPLER_BEGIN_BK = 130
KEPLER_END_BK = 1582

In [17]:
def _transit_times(epoch, period, end):
    """Yield epoch, epoch + period, ... for as long as the time stays below `end`."""
    current = epoch
    while current < end:
        yield current
        current += period


# Keep only the TCEs with a period above 100 days.
mask = tcedf.tce_period > 100
long_period_tces = tcedf[mask]

# One output row per predicted transit of every selected TCE.
transitrows = []
for mytce in tqdm(long_period_tces.itertuples()):
    for mytime in _transit_times(mytce.tce_time0bk, mytce.tce_period, KEPLER_END_BK):
        transitrows.append({
            'transit_time': mytime,
            'tce': mytce.tce,
            'kepid': mytce.kepid,
            'tce_plnt_num': mytce.tce_plnt_num,
            'tce_period': mytce.tce_period,
            'tce_max_mult_ev': mytce.tce_max_mult_ev,
        })

transits = pd.DataFrame(transitrows)
# Save the per-transit table so later steps can reload it.
transits.to_hdf('all-long-period-tce-transits.h5', key='transits')

12968it [00:00, 48114.86it/s]


In [18]:
# Preview the first few rows of the per-transit table.
transits.head()

Unnamed: 0,kepid,tce,tce_max_mult_ev,tce_period,tce_plnt_num,transit_time
0,2304168,002304168-02,12.22,431.719,2,219.298
1,2304168,002304168-02,12.22,431.719,2,651.017
2,2304168,002304168-02,12.22,431.719,2,1082.736
3,2304168,002304168-02,12.22,431.719,2,1514.455
4,2303102,002303102-10,11.51,480.481,10,161.57


## Step 3: Add a `quarter` column

In [19]:
# Lookup table of Kepler quarters; mjd2quarter reads its `quarter`,
# `first_lc_mjd` and `last_lc_mjd` columns.
KEPLER_QUARTERS = pd.read_csv('../data/kepler-quarters.csv')

def mjd2quarter(mjd):
    """Return the quarter whose [first_lc_mjd, last_lc_mjd] window contains `mjd`.

    The window is widened by 0.01 on each side. When several rows of
    KEPLER_QUARTERS match, the first one wins; when none match, None is
    returned.
    """
    starts_before = KEPLER_QUARTERS.first_lc_mjd < mjd + 0.01
    ends_after = KEPLER_QUARTERS.last_lc_mjd > mjd - 0.01
    matching = KEPLER_QUARTERS.loc[starts_before & ends_after, 'quarter']
    if len(matching) == 0:
        return None
    return matching.values[0]

def bkjd_to_mjd_approximate(bkjd):
    """Approximately convert Barycentric Kepler Julian Date (BKJD) to Modified Julian Date (MJD).

    Only the two constant offsets are applied. The TIMECORR and TIMSLICE
    corrections are ignored, which is why the result is inexact.
    """
    bkjd_zero_point = 2454833   # added to the BKJD value first
    mjd_zero_point = 2400000.5  # then subtracted to obtain MJD
    julian_date = bkjd + bkjd_zero_point
    return julian_date - mjd_zero_point

def bkjd2quarter(bkjd):
    """Return the quarter for a BKJD time, using the approximate BKJD-to-MJD conversion."""
    approx_mjd = bkjd_to_mjd_approximate(bkjd)
    return mjd2quarter(approx_mjd)

In [21]:
# Look up the quarter of every transit and attach the result as a new column.
quarter_column = [
    mjd2quarter(bkjd_to_mjd_approximate(transit.transit_time))
    for transit in tqdm(transits.itertuples())
]
transits['quarter'] = quarter_column

66047it [00:49, 1347.82it/s]


## Step 4: Add channel information

In [94]:
import requests

def get_channel_module_output(kepler_id, transit_time):
    """Look up the CCD channel, module and output of a target at a given time.

    The transit time is mapped to a quarter, then the MAST Kepler data search
    is queried for that target and quarter.

    Parameters
    ----------
    kepler_id : int-like
        Target identifier; converted with `int()` before being put in the URL.
    transit_time : float
        Time handed to `bkjd_to_mjd_approximate` (i.e. a BKJD value).

    Returns
    -------
    tuple
        `(channel, module, output)` as ints, taken from the first record of
        the response. `(None, None, None)` when the time does not map to a
        quarter or the response contains 'no rows found'.
    """
    quarter = mjd2quarter(bkjd_to_mjd_approximate(transit_time))
    try:
        quarter = int(quarter)
    except (TypeError, ValueError):
        # mjd2quarter returns None when the time is outside every quarter.
        return None, None, None
    # Workaround: a randomised max_records value keeps MAST from failing with
    # a 'Max retries exceeded with url' error.
    max_records = np.random.randint(10000, 9990000)
    url = ('http://archive.stsci.edu/kepler/data_search/search.php?'
           'target={}&sci_data_quarter={}'
           '&action=Search&outputformat=JSON'
           '&max_records={}').format(int(kepler_id), quarter, max_records)
    resp = requests.get(url)
    if 'no rows found' in str(resp.content):
        return None, None, None
    # Decode the JSON payload once instead of once per field.
    record = resp.json()[0]
    return int(record['Channel']), int(record['Module']), int(record['Output'])

def transit_to_channel_module_output(transit_idx):
    """Fetch the CCD information for one row of the `transits` table.

    Parameters
    ----------
    transit_idx
        Index label of the row in `transits`.

    Returns
    -------
    dict
        Keys 'idx', 'kepid', 'transit_time', 'channel', 'module' and 'output';
        the last three are None when `get_channel_module_output` finds nothing.
    """
    # `.ix` was removed in pandas 1.0. `transit_idx` is an index label, so
    # the label-based `.loc` is the equivalent lookup.
    mytransit = transits.loc[transit_idx]
    ch, mod, out = get_channel_module_output(mytransit.kepid, mytransit.transit_time)
    return {'idx': transit_idx,
            'kepid': mytransit.kepid,
            'transit_time': mytransit.transit_time,
            'channel': ch,
            'module': mod,
            'output': out}

In [95]:
# Inspect the transit row with index label 3 (`.ix` was removed in pandas 1.0; use `.loc`).
transits.loc[3]










kepid                   2304168
tce                002304168-02
tce_max_mult_ev           12.22
tce_period              431.719
tce_plnt_num                  2
transit_time            1514.46
quarter                      16
Name: 3, dtype: object








In [87]:
# Try the lookup on a single transit (index label 3) before running it on the whole table.
transit_to_channel_module_output(3)

{'channel': 84,
 'idx': 3,
 'kepid': 2304168,
 'module': 24,
 'output': 4,
 'transit_time': 1514.4550000000002}




In [70]:
# Show the first ten index labels that will be handed to the worker pool.
list(transits.index[0:10])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [96]:
import multiprocessing

# Run the lookup for every transit on a pool of 15 worker processes.
# `imap` yields the results in the same order as `transits.index`.
pool = multiprocessing.Pool(processes=15)
results = []
try:
    for result in tqdm(pool.imap(transit_to_channel_module_output, transits.index),
                       total=len(transits), mininterval=120, maxinterval=240):
        results.append(result)
finally:
    # Always release the worker processes, including when a lookup raises
    # part-way through; results gathered so far stay in `results`.
    pool.terminate()
    pool.join()








  0%|          | 0/66047 [00:00<?, ?it/s]






  2%|▏         | 1110/66047 [02:00<1:57:12,  9.23it/s]






  4%|▎         | 2446/66047 [04:00<1:48:57,  9.73it/s]






  6%|▌         | 3818/66047 [06:00<1:41:51, 10.18it/s]






  8%|▊         | 5175/66047 [08:00<1:36:45, 10.49it/s]






 10%|▉         | 6545/66047 [10:01<1:32:20, 10.74it/s]






 12%|█▏        | 7901/66047 [12:01<1:28:54, 10.90it/s]






 14%|█▍        | 9190/66047 [14:01<1:27:21, 10.85it/s]






 16%|█▌        | 10413/66047 [16:01<1:27:09, 10.64it/s]






 18%|█▊        | 11631/66047 [18:03<1:26:53, 10.44it/s]






 20%|█▉        | 13095/66047 [20:03<1:20:54, 10.91it/s]






 23%|██▎       | 15042/66047 [22:03<1:10:17, 12.09it/s]






 26%|██▌       | 17028/66047 [24:03<1:02:07, 13.15it/s]






 29%|██▉       | 19007/66047 [26:04<56:00, 14.00it/s]  






 32%|███▏      | 21033/66047 [28:04<50:51, 14.75it/s]






 35%|███▍      | 23019/66047 [30:04<47:01, 15.25it/s]






 38%|███▊      | 25097/660

In [97]:
# Gather the per-transit lookups into a table indexed by `idx`, keeping `idx`
# as a regular column as well.
ccdinfo = pd.DataFrame(results).set_index('idx', drop=False)
ccdinfo.head()

Unnamed: 0_level_0,channel,idx,kepid,module,output,transit_time
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,4.0,0,2304168,2.0,4.0,219.298
1,56.0,1,2304168,16.0,4.0,651.017
2,56.0,2,2304168,16.0,4.0,1082.736
3,84.0,3,2304168,24.0,4.0,1514.455
4,32.0,4,2303102,10.0,4.0,161.57


In [98]:
# Inner join on the shared index. Columns present in both frames (kepid,
# transit_time) receive the `_x` / `_y` suffixes.
transits_with_ccdinfo = pd.merge(transits, ccdinfo, how='inner',
                                 left_index=True, right_index=True,
                                 suffixes=('_x', '_y'))

In [99]:
# Number of rows in the merged table.
len(transits_with_ccdinfo)

66047

In [103]:
# Preview the merged table.
transits_with_ccdinfo.head()

Unnamed: 0,kepid_x,tce,tce_max_mult_ev,tce_period,tce_plnt_num,transit_time_x,quarter,channel,idx,kepid_y,module,output,transit_time_y
0,2304168,002304168-02,12.22,431.719,2,219.298,2.0,4.0,0,2304168,2.0,4.0,219.298
1,2304168,002304168-02,12.22,431.719,2,651.017,7.0,56.0,1,2304168,16.0,4.0,651.017
2,2304168,002304168-02,12.22,431.719,2,1082.736,11.0,56.0,2,2304168,16.0,4.0,1082.736
3,2304168,002304168-02,12.22,431.719,2,1514.455,16.0,84.0,3,2304168,24.0,4.0,1514.455
4,2303102,002303102-10,11.51,480.481,10,161.57,1.0,32.0,4,2303102,10.0,4.0,161.57


In [101]:
# Write the merged table to an HDF5 file under the key 'transits'.
transits_with_ccdinfo.to_hdf('all-long-period-tce-transits-with-ccd-info.h5', key='transits')