In [1]:
# This notebook merges the vetting labels with the image data. Afterwards, the merged dataset is split into 75% training and 25% testing sets.
import pandas as pd

In [2]:
# Read the vetting (voting) results CSV into a DataFrame.
vetting_csv_path = '../data/voting_5000.csv'
vetting_table = pd.read_csv(vetting_csv_path)


In [3]:
# Read the image data CSV into a DataFrame.
image_csv_path = '../data/image_data_5000.csv'
image_table = pd.read_csv(image_csv_path)

In [4]:
# Name of the label column to use. Change this value to select which
# threshold we want.
threshold = 'label_9'
# Keep only the transient identifier and the chosen label column.
columns_to_keep = ['transientid', threshold]
vetting_table = vetting_table[columns_to_keep]

In [5]:
# Rows whose label for the chosen threshold is missing (NaN).
# These sources will be discarded.
missing_label_mask = vetting_table[threshold].isna()
sel_nan_vet = vetting_table.loc[missing_label_mask]
print('The number of sources to be discarded = {}'.format(len(sel_nan_vet)))
sel_nan_vet.head()

The number of sources to be discarded = 623


Unnamed: 0,transientid,label_9
7,5408,
8,5490,
33,6930,
39,7132,
41,7296,


In [6]:
# Keep only the rows without any missing value, i.e. the sources that
# actually carry a label.
sel_vetted_sources = vetting_table.dropna(axis=0, how='any')
n_vetted_sources = len(sel_vetted_sources)
print('The number of sources to include in ML pipeline is {}'.format(n_vetted_sources))
# Give the threshold-specific column the generic name 'label'.
sel_vetted_sources.columns = ['transientid', 'label']
sel_vetted_sources.head()

The number of sources to include in ML pipeline is 4377


Unnamed: 0,transientid,label
0,510,0.0
1,607,0.0
2,707,0.0
3,725,0.0
4,5233,1.0


In [7]:
# Drop the image rows whose transient id was dropped during vetting, i.e. the
# ids listed in `sel_nan_vet` (sources without a label for the selected
# threshold).
# A single vectorised `isin` mask replaces the original per-id Python loop,
# which re-filtered the whole image table once for every discarded id.
discarded_ids = sel_nan_vet.transientid
image_table = image_table[~image_table.transientid.isin(discarded_ids)]
image_table.shape

(4377, 5)

In [8]:
# Attach each image row to its label by joining the two tables on their
# shared 'transientid' column.
final_dataset = image_table.merge(sel_vetted_sources, on='transientid')
final_dataset.head()

Unnamed: 0,transientid,image_scorr,image_difference,image_ref,real_image,label
0,510,8d8a88bd7f3e46befdf0efbe58d20cbfc7749fbee68920...,a15e0d4213091d42bb6934c2056f82c1dba00b3f0a0b08...,e0c086c180d186c1400a8e4140e6db41009980c1c0335c...,20cf2fc280e34242c088b04100caf2bf2067b741600397...,0.0
1,607,249f21bf06c047be191df33e7e3a8b3f9477b73f56ecc0...,a1e2c6405e74f4c1e0e3fb41d1509b424b9262c089c888...,00ae13c060660742001b8140a03771c2803c1dc2d0d00d...,009302c2a0eea54180434d410030babf60dbb14100378c...,0.0
2,707,d11bba3eef35283f055c9c3fda18e23f1543f43f5852c3...,f2f87b3f37732ec2f17e84c0764330422820c940270e28...,805f99c0a08b1c42c0551fc1801c49c200a0eb3ee0d8be...,0080004080b0dbc140f5cd410041eb400006abbf00f86d...,0.0
3,725,749523c05f6b0ec07ebab4bf93bb3cbf0f1817bfbea518...,2b7210c2dd0db9c148d605c2df31514223e31142bee1ef...,8075f04000d6c1bfc047a541409d50c130e312c240c553...,00f434c2f0b13f42a088c0c160b102c2e0aa874100f929...,0.0
4,5233,b9039fbf1860c4be3dba943e1fdda43e4658c5be7a25a9...,1fce21c1548e9041a1b00a43f1a2da417bcc39c2c6092a...,10d80042303906c2f0e635c26015e8c140f052c1506b2b...,000ef1c0a07c95c1585a9242c0cf36c2809d834040d989...,1.0


In [9]:
# Write the merged image + label table to a CSV file named after the selected
# threshold; the DataFrame index is not written.
merged_csv_path = '../data/' + threshold + '_image_data_with_label.csv'
final_dataset.to_csv(merged_csv_path, index=None)

# Split into training and test set

In [None]:
# Separate the two classes (label 0 -> bogus, label 1 -> real) and shuffle the
# rows of each one.
# A fixed random_state makes the shuffle, and therefore the train/test split
# derived from it, reproducible across kernel restarts.
class_shuffle_seed = 42
bogus_table = final_dataset[final_dataset.label == 0].sample(frac=1, random_state=class_shuffle_seed)
real_table  = final_dataset[final_dataset.label == 1].sample(frac=1, random_state=class_shuffle_seed)
print('Number of bogus objects is {}'.format(bogus_table.shape[0]))
print('Number of real objects is {}'.format(real_table.shape[0]))

In [None]:
# Fraction of each class that goes to the training set; the remainder goes to
# the test set.
percentage_split = 0.75

# Row position at which each class table is cut (the product is truncated to
# an integer).
n_train_bogus = int(percentage_split * len(bogus_table))
n_train_real = int(percentage_split * len(real_table))

# Split each class separately so both sets keep the same class proportions.
train_bogus_table = bogus_table.iloc[:n_train_bogus]
test_bogus_table = bogus_table.iloc[n_train_bogus:]
train_real_table = real_table.iloc[:n_train_real]
test_real_table = real_table.iloc[n_train_real:]

bogus_shapes = (train_bogus_table.shape, test_bogus_table.shape)
real_shapes = (train_real_table.shape, test_real_table.shape)
print('Bogus objects in training set is {} and in test set is {}'.format(*bogus_shapes))
print('Real objects in training set is {} and in test set is {}'.format(*real_shapes))


In [None]:
# Combine the per-class tables into the final training and testing sets.
trainingset_merge = [train_bogus_table, train_real_table]
testingset_merge  = [test_bogus_table, test_real_table]

# Shuffle BOTH sets after concatenation and renumber their rows. Previously
# only the test set was shuffled, which left the training set ordered as all
# bogus rows followed by all real rows. A fixed random_state keeps the row
# order reproducible between runs.
set_shuffle_seed = 42
trainingset = (
    pd.concat(trainingset_merge)
    .sample(frac=1, random_state=set_shuffle_seed)
    .reset_index(drop=True)
)
testingset = (
    pd.concat(testingset_merge)
    .sample(frac=1, random_state=set_shuffle_seed)
    .reset_index(drop=True)
)
print('The training set contains {} objects'.format(trainingset.shape[0]))
print('The test set contains {} objects'.format(testingset.shape[0]))


In [None]:
# Write the training and testing sets to CSV files named after the selected
# threshold; the DataFrame index is not written.
training_csv_path = '../data/' + threshold + '_training_set.csv'
testing_csv_path = '../data/' + threshold + '_testing_set.csv'
trainingset.to_csv(training_csv_path, index=None)
testingset.to_csv(testing_csv_path, index=None)