In [1]:
# This notebook analyses the vetting of all candidates, so that we can then decide which
# threshold to use during training and prediction.
import numpy as np
import pandas as pd
from astropy.table import Table
import matplotlib.pylab as plt

# Matplotlib text/font settings: render text through LaTeX and set the font families.
plt.rc('text', usetex=True)
plt.rc('font', family='DejaVu Sans', serif=['Palatino'])

# Figure size and font size values (not referenced by the cells visible in this notebook).
fontSize = 20
figSize = (16, 10)
%matplotlib inline

In [2]:
# Name of the input FITS file and the directory (relative to this notebook) it is read from.
# `data_dir` keeps its trailing slash because the path is built by plain string concatenation.
fits_file = 'vetting_4Zafiirah_20200714102528.fits'
data_dir = '../data/'

In [3]:
# Load the FITS file into an astropy Table.
table = Table.read(f'{data_dir}{fits_file}')

In [4]:
# Build a pandas data frame from the FITS table.
# Only columns with at most one dimension are kept: the multi-dimensional
# columns (the images) are left out because of their high dimensionality.
names = list(
    filter(lambda column: len(table[column].shape) <= 1, table.colnames)
)
print(names)

param_table = table[names].to_pandas()
param_table.head()

['transientid', 'username', 'vettingdate', 'vetclas', 'number', 'image', 'date-obs', 'filter', 'object', 't-ntrans', 'pc-zpstd', 'psf-fwhm', 's-seeing', 's-seestd', 'x_peak', 'y_peak', 'ra_peak', 'dec_peak', 'flux_peak', 'fluxerr_peak', 'mag_peak', 'magerr_peak']


Unnamed: 0,transientid,username,vettingdate,vetclas,number,image,date-obs,filter,object,t-ntrans,...,s-seeing,s-seestd,x_peak,y_peak,ra_peak,dec_peak,flux_peak,fluxerr_peak,mag_peak,magerr_peak
0,510,b'dpieterse',b'2020-05-08 00:06:45.492000',b'bogus',547,5,b'2017-09-01 17:36:00.000000',b'q',90005,170,...,2.601214,0.078431,9848,8471,233.559933,-13.202148,105.561905,7.361342,18.841297,0.075714
1,510,b'pmv',b'2020-05-08 00:09:02.602000',b'bogus',547,5,b'2017-09-01 17:36:00.000000',b'q',90005,170,...,2.601214,0.078431,9848,8471,233.559933,-13.202148,105.561905,7.361342,18.841297,0.075714
2,510,b'pgroot',b'2020-05-08 09:12:52.507000',b'bogus',547,5,b'2017-09-01 17:36:00.000000',b'q',90005,170,...,2.601214,0.078431,9848,8471,233.559933,-13.202148,105.561905,7.361342,18.841297,0.075714
3,510,b'simon',b'2020-05-08 09:20:26.132000',b'bogus',547,5,b'2017-09-01 17:36:00.000000',b'q',90005,170,...,2.601214,0.078431,9848,8471,233.559933,-13.202148,105.561905,7.361342,18.841297,0.075714
4,510,b'nblago',b'2020-05-08 10:33:52.220000',b'bogus',547,5,b'2017-09-01 17:36:00.000000',b'q',90005,170,...,2.601214,0.078431,9848,8471,233.559933,-13.202148,105.561905,7.361342,18.841297,0.075714


In [5]:
# Distinct user names that appear in the vetting table.
param_table['username'].unique()

array([b'dpieterse', b'pmv', b'pgroot', b'simon', b'nblago', b'jkersten',
       b'sbloemen', b'naomi', b'AstroLauraD', b'zafiirah', b'dmodiano',
       b'mbeijer', b'rruizc'], dtype=object)

In [6]:
# Report how many distinct transient ids the vetting table contains.
print(
    "The number of sources in dataset {}".format(
        np.unique(param_table['transientid']).size
    )
)

The number of sources in dataset 5000


In [7]:
# Reshape the vetting table from one row per vote to one row per source:
#   transientid, volunteer_1, volunteer_2, ..., volunteer_k
# where volunteer_i holds the i-th vote recorded for that source (in table order),
# encoded as 0 for 'bogus' and 1 for 'real'. Sources with fewer votes than the
# maximum get NaN in the trailing volunteer columns.
table1 = param_table
transientids = table1.transientid.unique()
print(transientids.shape)

# Integer encoding of the vetting classes. Any other label is stored as NaN
# instead of silently reusing the value of the previous vote.
label_codes = {'bogus': 0, 'real': 1}

# Collect one plain dict per source and build the data frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is quadratic.
rows = []
# sort=False keeps the sources in order of first appearance, i.e. the order of `transientids`.
for ids, source in table1.groupby('transientid', sort=False):
    row = {'transientid': ids}
    for i, label in enumerate(source['vetclas'], start=1):
        # The labels come out of the FITS table as bytes; accept already-decoded strings too.
        if isinstance(label, bytes):
            label = label.decode()
        row['volunteer_' + str(i)] = label_codes.get(label, np.nan)
    rows.append(row)

# Fix the column order explicitly: transientid first, then volunteer_1..volunteer_k,
# where k is the largest number of votes any single source received.
max_votes = max((len(row) - 1 for row in rows), default=0)
columns = ['transientid'] + ['volunteer_' + str(i) for i in range(1, max_votes + 1)]
final_table = pd.DataFrame(rows, columns=columns)

print(final_table.shape)

(5000,)
(5000, 12)


In [8]:
# Preview the first five rows of the per-source vote table.
final_table.head(n=5)

Unnamed: 0,transientid,volunteer_1,volunteer_2,volunteer_3,volunteer_4,volunteer_5,volunteer_6,volunteer_7,volunteer_8,volunteer_9,volunteer_10,volunteer_11
0,510,0,0,0,0,0,0,0,0,0,0,
1,607,0,0,0,0,0,0,0,0,0,1,
2,707,0,0,0,0,0,0,0,0,0,0,
3,725,0,0,0,0,0,0,0,0,0,0,
4,5233,1,1,1,1,1,1,1,1,1,1,


In [9]:
# Keep the source id plus the votes of the first 10 volunteers (the CSV written
# afterwards is named "10_volunteers_...").
# The columns are selected by name rather than by dropping the last column, so the
# result does not depend on how many volunteer columns `final_table` happens to have.
# Columns that do not exist (fewer than 10 votes available) are simply skipped.
n_volunteers = 10
keep_columns = ['transientid'] + ['volunteer_' + str(i) for i in range(1, n_volunteers + 1)]
table = final_table.loc[:, [col for col in keep_columns if col in final_table.columns]]

In [10]:
# Write the per-source votes to CSV in the same data directory the input was read from.
# index=False is the documented way to leave the row index out of the file.
table.to_csv(data_dir + '10_volunteers_voting_5000.csv', index=False)