In [1]:
# This script will analyse the vetting of all candidates. Then we will decide which threshold
# we will use during training and prediction.
import numpy as np
import pandas as pd
from astropy.table import Table
import matplotlib.pylab as plt

# Plot defaults. Presumably consumed by plotting cells elsewhere in the notebook;
# they are not referenced in the cells visible here — confirm before removing.
figSize  = (16, 10)
fontSize = 20
%matplotlib inline

In [2]:
# Load the vetting table. The following cells use its 'transientid' column and
# the label column named by `threshold`.
vetting_table = pd.read_csv('../data/voting_5000.csv')

In [3]:
# Load the image table. It is filtered by its 'transientid' column further down.
image_table = pd.read_csv('../data/image_data_5000.csv')

In [4]:
# Name of the label column to analyse; change this value to select a different threshold.
threshold = 'label_9'
# Keep only the transient id and the label column for the chosen threshold.
vetting_table = vetting_table.loc[:, ['transientid', threshold]]

In [5]:
# Rows whose label at the chosen threshold is NaN; these sources are discarded
# from the labelled set.
sel_nan_vet = vetting_table.loc[vetting_table[threshold].isna()]
print('The number of sources to be discarded = {}'.format(len(sel_nan_vet)))
sel_nan_vet.head()

The number of sources to be discarded = 623


Unnamed: 0,transientid,label_9
7,5408,
8,5490,
33,6930,
39,7132,
41,7296,


In [6]:
# Rows without any NaN, i.e. sources that carry a label. The threshold-specific
# column is exposed under the generic name 'label'.
sel_vetted_sources = vetting_table.dropna().rename(columns={threshold: 'label'})
print('The number of sources to include in ML pipeline is {}'.format(len(sel_vetted_sources)))
sel_vetted_sources.head()

The number of sources to include in ML pipeline is 4377


Unnamed: 0,transientid,label
0,510,0.0
1,607,0.0
2,707,0.0
3,725,0.0
4,5233,1.0


In [7]:
# Select the image rows whose transient id was dropped during vetting, i.e. has
# no label at the chosen threshold (the original note refers to the 8-2 split).
# Matches are gathered in the order of `sel_nan_vet` and concatenated once:
# `DataFrame.append` was removed in pandas 2.0 and re-copies the accumulated
# frame on every iteration.
unlabelled_images = [
    image_table[image_table.transientid == transient_id]
    for transient_id in sel_nan_vet.transientid
]
if unlabelled_images:
    image_table_ = pd.concat(unlabelled_images)
else:
    # pd.concat rejects an empty list; keep the image columns so that a later
    # merge on 'transientid' still works when nothing was dropped.
    image_table_ = image_table.iloc[:0]
image_table_.shape

(623, 5)

## Assign a table with highest probability

In [8]:
# Directory and file name of the FITS vetting table; the two are concatenated
# to form the path that is read next.
data_dir = '../data/'
fits_file = 'vetting_4Zafiirah_20200714102528.fits'

In [9]:
# Read the FITS vetting file into an astropy Table.
table = Table.read(data_dir+fits_file)

In [10]:
# Convert the FITS table to a pandas data frame. Only columns whose entries are
# at most one-dimensional are kept; the image columns are left out because they
# are too high-dimensional for a flat data frame.
names = []
for column_name in table.colnames:
    if len(table[column_name].shape) <= 1:
        names.append(column_name)
print(names)
param_table = table[names].to_pandas()
param_table.head()

['transientid', 'username', 'vettingdate', 'vetclas', 'number', 'image', 'date-obs', 'filter', 'object', 't-ntrans', 'pc-zpstd', 'psf-fwhm', 's-seeing', 's-seestd', 'x_peak', 'y_peak', 'ra_peak', 'dec_peak', 'flux_peak', 'fluxerr_peak', 'mag_peak', 'magerr_peak']


Unnamed: 0,transientid,username,vettingdate,vetclas,number,image,date-obs,filter,object,t-ntrans,...,s-seeing,s-seestd,x_peak,y_peak,ra_peak,dec_peak,flux_peak,fluxerr_peak,mag_peak,magerr_peak
0,510,b'dpieterse',b'2020-05-08 00:06:45.492000',b'bogus',547,5,b'2017-09-01 17:36:00.000000',b'q',90005,170,...,2.601214,0.078431,9848,8471,233.559933,-13.202148,105.561905,7.361342,18.841297,0.075714
1,510,b'pmv',b'2020-05-08 00:09:02.602000',b'bogus',547,5,b'2017-09-01 17:36:00.000000',b'q',90005,170,...,2.601214,0.078431,9848,8471,233.559933,-13.202148,105.561905,7.361342,18.841297,0.075714
2,510,b'pgroot',b'2020-05-08 09:12:52.507000',b'bogus',547,5,b'2017-09-01 17:36:00.000000',b'q',90005,170,...,2.601214,0.078431,9848,8471,233.559933,-13.202148,105.561905,7.361342,18.841297,0.075714
3,510,b'simon',b'2020-05-08 09:20:26.132000',b'bogus',547,5,b'2017-09-01 17:36:00.000000',b'q',90005,170,...,2.601214,0.078431,9848,8471,233.559933,-13.202148,105.561905,7.361342,18.841297,0.075714
4,510,b'nblago',b'2020-05-08 10:33:52.220000',b'bogus',547,5,b'2017-09-01 17:36:00.000000',b'q',90005,170,...,2.601214,0.078431,9848,8471,233.559933,-13.202148,105.561905,7.361342,18.841297,0.075714


In [11]:
# Tally the human votes for every transient that has no label at the chosen
# threshold and derive a majority label from them.
#
# For each transient id the resulting frame holds
#   bogus / real     - number of votes whose `vetclas` contains b"bogus" / b"real"
#   HUMAN_PROB_REAL  - real / (bogus + real)
#   label            - 0 when bogus votes outnumber real votes, otherwise 1
#                      (so a tie is labelled 1)
#
# NOTE: an id with neither bogus nor real votes raises ZeroDivisionError.
table1 = param_table
transientids = sel_nan_vet.transientid.unique()
print(transientids.shape)

# Collect plain records and build the frame once: `DataFrame.append` was removed
# in pandas 2.0 and copies the whole accumulated frame on every call.
vote_records = []
for ids in transientids:
    votes = table1.loc[table1.transientid == ids, 'vetclas'].values
    n_bogus = sum(1 for vote in votes if b"bogus" in vote)
    n_real = sum(1 for vote in votes if b"real" in vote)
    vote_records.append({
        'transientid': ids,
        'bogus': n_bogus,
        'real': n_real,
        'HUMAN_PROB_REAL': n_real / (n_bogus + n_real),
        'label': 0 if n_bogus > n_real else 1,
    })

# Explicit column list keeps the expected schema even when there are no records.
final_table = pd.DataFrame(
    vote_records,
    columns=['transientid', 'bogus', 'real', 'HUMAN_PROB_REAL', 'label'],
)
    
    

(623,)


In [12]:
# Display the per-transient vote summary.
final_table

Unnamed: 0,transientid,bogus,real,HUMAN_PROB_REAL,label
0,5408,3,7,0.7,1
1,5490,3,7,0.7,1
2,6930,2,8,0.8,1
3,7132,3,7,0.7,1
4,7296,2,8,0.8,1
...,...,...,...,...,...
618,2997022,3,7,0.7,1
619,3000742,2,8,0.8,1
620,3002428,2,8,0.8,1
621,3003014,3,7,0.7,1


In [21]:
# Keep just the transient id, the human "real" probability and the majority label.
confused_label = final_table.loc[:, ['transientid', 'HUMAN_PROB_REAL', 'label']]
confused_label.head()

Unnamed: 0,transientid,HUMAN_PROB_REAL,label
0,5408,0.7,1
1,5490,0.7,1
2,6930,0.8,1
3,7132,0.7,1
4,7296,0.8,1


In [22]:
# Attach the vote-derived probability and label to each selected image row
# (inner join on the transient id).
confused_dataset = image_table_.merge(confused_label, on='transientid', how='inner')
confused_dataset.head()

Unnamed: 0,transientid,image_scorr,image_difference,image_ref,real_image,HUMAN_PROB_REAL,label
0,5408,3c0628406fd6104075a40740ac990a40fefdeb3fd3c584...,45a34b42010070415691f441939e8ac10b6a5fc11e8d6e...,b0c81cc200b00340004885c0609a85c140632341007553...,40293c42189098c2e0e6eec1603bd241a0dfbd4100542e...,0.7,1
1,5490,9040bc3f2265553fa3dd873eb57269bc3cd8923c65b903...,5f9e63c208c95fc26a0af041b77cacc20fa1a24135f593...,007ec1bf205eeac100d006bf8053e74040fc08c200a27a...,8033a641003f04c2801fa340d0a203c2004960400047fc...,0.7,1
2,6930,24b003c0f84320c03c4922c096fa17c0ad050bc0eba7f8...,30fe8dc2d70b8dc243553d418b8b1bc2d27c9bc0e3846e...,b0d90e420011244180c0ab41005a44c100b02b41205c48...,e064dbc1802e9a40a000c9c1b02b07c2f00a094200ff2e...,0.8,1
3,7132,5cab58bf4f40dbbe29e0b2bd2064a2bde63be4bef8135c...,a0821ac211d10fc2c2cf1d4233603b42a4f80542970285...,00dcabbfb00b12c2c0cb42410034c7c0405ce2c1404b2b...,60e8b9c1b04f884270863d42a0499641c09d5fc100992a...,0.7,1
4,7296,b536efbf75960ac0835305c069ffd3bfb9108fbfc9260a...,6203a64185ac97c1c329ccc18c531c42a0508f404cb7c0...,40bc84c10034eb414049734160d9f5c160d4ca41304587...,b05c0dc2806b384100de903fc06a58c20078bfbfc08e06...,0.8,1


In [23]:
# Save the merged images and labels without the row index; the file name is
# prefixed with the chosen threshold.
output_csv = '../data/' + threshold + '_confused_image_data_with_label_and_prob.csv'
confused_dataset.to_csv(output_csv, index=False)

In [24]:
# Show the (rows, columns) of the merged dataset.
confused_dataset.shape

(623, 7)