# MeerCRAB prediction phase on new candidate files
This script will be integrated into BlackBOX to make predictions on new candidate files.

In [1]:
import os
import numpy as np
import pandas as pd
from meerCRAB_code.model import compile_model,model_save 
import matplotlib.pylab as plt
from keras.utils import np_utils
from time import gmtime, strftime
from meerCRAB_code.util import makedirs, ensure_dir
from meerCRAB_code.prediction_phase import load_new_candidate, realbogus_prediction


Using TensorFlow backend.
  return f(*args, **kwds)


## Load csv file

The CSV file has the following format: each row has 24 columns separated by semicolons.

In order they are:

- transientid: ID of source in DB
- username:
- vettingdate:
- vetclas: can be one of real, bogus, bogus_cosmicray, bogus_subtract, bogus_spike or bogus_ghost
- number: number of source in orig. FITS file
- image: ID of image/FITS file in DB
- date-obs:
- filter:
- object: the MeerLICHT/BlackGEM tile of observation
- psf-fwhm:
- s-seeing:
- s-seestd:
- x_peak: integer x position (no python index) of peak in Scorr image
- y_peak: idem y
- ra_peak: corresponding ra [degrees]
- dec_peak: corresponding dec [degrees]
- flux_peak: corresponding calibrated flux [microJy]
- fluxerr_peak: flux uncertainty [microJy]
- mag_peak: corresponding calibrated magnitude [AB magn.]
- magerr_peak: magn. uncertainty [AB magn.]
- thumbnail_red: 100x100 thumbnail
- thumbnail_ref:
- thumbnail_d:
- thumbnail_scorr:

Note that the thumbnails are 2D numpy arrays of 32-bit floats, and are 
written as binary large objects (BLOBs). 


In [2]:
# Read the semicolon-separated candidate dump (no header row, so columns are
# addressed by position) and keep only the first row for each value of
# column 0, the transient ID.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the dump before running.
data = (
    pd.read_csv(
        '/Volumes/ZAFIIRAH/meerlicht-project/Bogus_against_interesting/data/dumpformachinelearning_20200114161507.csv',
        sep=';',
        header=None,
    )
    .drop_duplicates(subset=0, keep="first")
)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,9184,nblago,2019-11-05 20:23:46.518000,bogus,395,37,2017-09-01 18:30:00.000000,q,90005,3.056144,...,234.133234,-14.702009,46.36245,7.713804,19.73465,0.180645,A0E80BC240CF3AC200F8BD3F608BD441706D224200001D...,F06705C280BFE3C170B10F42403909C2401746C1A041DB...,575FDB406D7D9AC2D85514420177743F4F446140A113A9...,BBA2273F32826E3F899BA63FD24CE83FD54E1040BDFA0C...
3,9210,nblago,2019-11-05 20:24:02.033000,bogus,515,37,2017-09-01 18:30:00.000000,q,90005,3.056144,...,233.819094,-14.61232,50.849087,7.740949,19.634357,0.165286,6068E3C140616341C090AAC1005CE5C010410DC2300F0A...,00B2704060C559C280582641A04045C2803FBC4000ED23...,0484EB41AC3D82C154F6C541EFCA96407E925EC1281912...,FC739F3F65BA8C3F90CF333FF7710D3EBE6F68BE19C2CD...
6,12110,nblago,2019-11-05 20:24:24.389000,real,1295,47,2017-09-01 19:17:50.000000,q,90004,3.398105,...,273.16855,-31.58585,177.599289,16.582558,18.276463,0.101376,7C9369440EDB2E44FC74DD43E8489743A85A8243402102...,18DC384454900B440CABD3438836B14350FF6543682C64...,82C3F3420D4806433E048AC2F618374297DEA7417BB908...,9FCE663E69D5B43D4D2ED2BECEB284BFB76CCEBFA2FFF3...
9,13585,nblago,2019-11-05 20:24:38.576000,bogus_subtract,1767,49,2017-09-01 19:23:40.000000,q,90004,3.13496,...,273.659695,-31.597761,110.529106,16.169716,18.791374,0.158836,80ED6341C0E5FFC2C0700742F0299042603568429007AD...,C03886C2B0E2CCC20008823F80CA3E4140BC2F42C043D8...,078BC8C263E57242AC90BBC0ECB221C271570B4250A660...,EE37A53FA68E813F61464F3F4072263F5EE7283FC26D67...
12,14590,nblago,2019-11-05 20:24:49.179000,bogus,775,50,2017-09-01 19:24:50.000000,q,90004,3.253015,...,273.958172,-31.143006,75.347374,10.79938,19.207396,0.155616,A0661DC240FCD7C1B09986C240C912C2603B49C2303BB9...,80F972C2E09EA1C2804E90C260D63AC280CE32410065D4...,A1E48A4248945042A64E64C132C71BC202BA5D4283EA03...,3E238E3F4C24573F77480F3FC10F023F80D22D3F23DE58...


# Load the new candidates
- ID: The first column of the csv file is 'transientid' --> data.iloc[:,0]
- full_data : Here we provide the whole csv file as above; the code will automatically select the last 4 or 3 columns to extract the images. The last four columns are the candidate images, in the order (20) New Image, (21) Ref Image, (22) Diff Image, (23) Scorr Image.

- n_images: Which images to consider (4, 3, 2 or 1 of them), given as a code string: 'NRDS', 'NRD', 'NR', 'D' or 'S'. Be careful here: if the network selected below was trained on 3 images, we need to use the matching 3-image setting (e.g. n_images='NRD').
- min_pix: value ranges from 0 to 100 (applied when cropped = True). Be careful here: if the network selected below was trained on 30X30 images, we need to use min_pix=35.
- max_pix: value ranges from 0 to 100 (applied when cropped = True). Be careful here: if the network selected below was trained on 30X30 images, we need to use max_pix=65.
- cropped: True - cropping is done from the centre. If we want 30X30 pixel images, then min_pix=35, max_pix=65.

In [3]:
# --- Settings to adjust before running the prediction ---

# Number of classes being classified: Real and Bogus.
nClasses = 2

# Lower and upper pixel bounds used when cropping each image
# (35 and 65 give a 30x30 cut-out).
minPix = 35
maxPix = 65

# Code string selecting which images are used; one of 'NRDS', 'NRD', 'NR', 'D', 'S'.
# Presumably N/R/D/S stand for the New, Ref, Diff and Scorr images; confirm in load_new_candidate.
num_images = 'NRD'

# Vetting threshold tag of the trained network: 'threshold_9' means at least
# 9 people vetted a source as either real or bogus; 'threshold_8' can also be used.
threshold = 'threshold_9'

# Name of the saved network: <architecture>_<threshold>_<images>.
# Architectures to choose from: 'NET1', 'NET2', 'NET3', 'NET1_32_64', 'NET1_64_128', 'NET1_128_256'.
model_cnn_name = '_'.join(['NET3', threshold, num_images])

# Passed to realbogus_prediction; presumably the cut on the real-probability
# used to assign the label (TODO confirm).
probability_threshold = 0.5

# Directory that holds the saved models.
model_path = "./meerCRAB_model/"

In [4]:
# Build the image array and the matching transient IDs for the candidates in the CSV.
test, ID_test = load_new_candidate(
    ID=data.iloc[:, 0].values,  # first CSV column: transientid
    full_data=data,
    n_images=num_images,
    minPix=minPix,
    maxPix=maxPix,
    cropped=True,
)
# NOTE(review): the message says "training instances", but these are the new
# candidates to be classified; the wording is kept to match the stored output.
print("Total number of training instances: {}".format(len(ID_test)))
print("The Shape of the test set is {}".format(test.shape))

Total number of training instances: 134
The Shape of the test set is (134, 30, 30, 3)


# Prediction on new candidate files
Here we load the pre-existing trained model using the parameters below.

INPUTS:
- model_name: model_cnn_name = 'NET1_32_64'
- X_test : should have shape (Nimages,100,100,3), (Nimages,30,30,3) or (Nimages,30,30,4). This will vary depending on the values used for min_pix, max_pix and num_images.
- ID: The transient ID extracted from the csv file ID=data.iloc[:,0]

OUTPUTS:
- overall_real_prob: An array with the probability that each candidate is real
- overall_dataframe: A table with columns transientid, ML_PROB_REAL and label

In [5]:
# Load the saved network named `model_cnn_name` from `model_path` and score every candidate.
overall_real_prob, overall_dataframe = realbogus_prediction(
    model_name=model_cnn_name,
    X_test=test,
    ID=ID_test,
    probability_threshold=probability_threshold,
    model_path=model_path,
)


Loaded model:NET3_threshold_9_NRD from disk


In [6]:
# Transient IDs of the candidates, as returned by load_new_candidate
# (presumably in the same order as the images in `test`; TODO confirm).
ID_test

array([  9184,   9210,  12110,  13585,  14590,  21103,  21355,  24134,
        25362,  27118,  31383,  38607,  46836,  46970,  47752,  51364,
        51594,  52013,  53320,  53465,  54064,  54202,  54423,  60038,
        61252,  62892,  66172,  66249,  66558,  68367,  75021,  75152,
        75621,  76630,  78401,  78888,  80804,  80875,  87566,  97416,
        97494,  99695, 103858, 104758, 113393, 115324, 131115, 132902,
       134030, 134334, 134850, 147178, 147296, 149201, 149297, 149488,
       150770, 152647, 156544, 157838, 158087, 165939, 166129, 170915,
       171648, 172410, 173430, 178944, 182469, 184310, 190724, 191986,
       198859, 203779, 204589, 218048, 222098, 224815, 225928, 225986,
       230616, 230800, 241567, 243336, 247661, 247797, 251461, 251743,
       254029, 259438, 259793, 266891, 269943, 271943, 272473, 272652,
       273131, 274010, 274616, 275065, 280030, 281340, 288492, 290158,
       294475, 297652, 301444, 303095, 304415, 306058, 308004, 308992,
      

In [7]:
# Probability that each candidate is a real source (between 0 and 1),
# as returned by realbogus_prediction.
overall_real_prob

array([1.31654243e-08, 1.26158457e-05, 1.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 3.74807087e-07, 2.82803194e-06, 1.57054619e-05,
       5.67756445e-07, 1.00000000e+00, 3.14599532e-36, 9.91135370e-03,
       7.69833207e-07, 3.27276561e-36, 6.06705116e-13, 2.57796913e-16,
       1.00000000e+00, 3.25098608e-06, 4.40713984e-06, 1.89745464e-04,
       1.00000000e+00, 2.42841881e-17, 2.75323458e-28, 3.97761779e-09,
       1.27492435e-20, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       4.93719792e-15, 1.00000000e+00, 5.68406016e-04, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       4.67047556e-09, 1.28934010e-24, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 3.37149389e-03, 1.00000000e+00,
       1.00000000e+00, 1.36183007e-05, 1.38836938e-07, 9.98423934e-01,
       9.08938350e-16, 1.00000000e+00, 1.36255677e-14, 1.00000000e+00,
       1.00000000e+00, 7.47383165e-05, 8.18805006e-07, 3.90982132e-06,
      

In [8]:
# Dataframe with one row per candidate: its transient ID, the probability that
# it is a real source (ML_PROB_REAL), and a 0/1 label
# (presumably ML_PROB_REAL cut at probability_threshold; TODO confirm).
overall_dataframe

Unnamed: 0,transientid,ML_PROB_REAL,label
0,9184,1.316542e-08,0.0
1,9210,1.261585e-05,0.0
2,12110,1.000000e+00,1.0
3,13585,1.000000e+00,1.0
4,14590,0.000000e+00,0.0
...,...,...,...
129,373673,2.041226e-03,0.0
130,374696,6.538528e-03,0.0
131,379978,1.154433e-02,0.0
132,387543,6.551196e-03,0.0
