# Here are the list of datasets

In [1]:
import glob
val_samples = sorted(glob.glob('../*validation*'))
quasar_val_samples = sorted(glob.glob('../original_samples/*validation*'))

print("validation samples:\n * ", end='')
print("\n * ".join(val_samples))
print("quasar validation samples:\n * ", end='')
print("\n * ".join(quasar_val_samples))


validation samples:
 * ..\candidates_width70_sig6_validation_64plates_sample0.json
 * ..\candidates_width70_sig6_validation_64plates_sample1.json
 * ..\candidates_width70_sig6_validation_64plates_sample2.json
 * ..\candidates_width70_sig6_validation_64plates_sample3.json
 * ..\candidates_width70_sig6_validation_64plates_sample4.json
 * ..\candidates_width70_sig6_validation_64plates_sample5.json
 * ..\candidates_width70_sig6_validation_64plates_sample6.json
 * ..\candidates_width70_sig6_validation_64plates_sample7.json
quasar validation samples:
 * ../original_samples\quasars_validation_64plates_sample0.json
 * ../original_samples\quasars_validation_64plates_sample1.json
 * ../original_samples\quasars_validation_64plates_sample2.json
 * ../original_samples\quasars_validation_64plates_sample3.json
 * ../original_samples\quasars_validation_64plates_sample4.json
 * ../original_samples\quasars_validation_64plates_sample5.json
 * ../original_samples\quasars_validation_64plates_sample6.json
 

# Load datasets

In [2]:
from squeze.squeze_common_functions import load_json, deserialize
val_dfs = [deserialize(load_json(sample)) for sample in val_samples]
quasar_val_dfs = [deserialize(load_json(sample)) for sample in quasar_val_samples]

# Compute purity and completeness

In [3]:
from candidates import CombCandidates
candidates = CombCandidates(mode="test")

In [4]:
prob = 0.32

for (i, data_frame), quasar_df in zip(enumerate(val_dfs), quasar_val_dfs):
    print("\n-----------------------------------")
    print("Statistics for validation sample {}".format(i))
    print("-----------------------------------\n")
    candidates.find_completeness_purity(quasar_df, data_frame[(data_frame["prob"] > prob) &
                                                              ~(data_frame["duplicated"]) &
                                                              (data_frame["z_conf_person"] == 3)])


-----------------------------------
Statistics for validation sample 0
-----------------------------------

There are 6302 candidates  
for 6176 catalogued quasars 
number of quasars = 6176 
found quasars = 6053 
completeness = 98.01% 
completeness z>=1 = 98.67% 
completeness z>=2.1 = 99.08% 
purity = 96.05% 
purity z >=1 = 98.01% 
purity z >=2.1 = 99.77% 

-----------------------------------
Statistics for validation sample 1
-----------------------------------

There are 6499 candidates  
for 6441 catalogued quasars 
number of quasars = 6441 
found quasars = 6288 
completeness = 97.62% 
completeness z>=1 = 98.54% 
completeness z>=2.1 = 98.89% 
purity = 96.75% 
purity z >=1 = 98.01% 
purity z >=2.1 = 99.65% 

-----------------------------------
Statistics for validation sample 2
-----------------------------------

There are 6415 candidates  
for 6353 catalogued quasars 
number of quasars = 6353 
found quasars = 6201 
completeness = 97.61% 
completeness z>=1 = 98.40% 
completeness z>

# Reproduce SQUEzE I results

## List of datasets

In [5]:
import glob
test_samples = sorted(glob.glob('../*test*'))
quasar_test_samples = sorted(glob.glob('../original_samples/*test*'))

print("test samples:\n * ", end='')
print("\n * ".join(val_samples))
print("quasar test samples:\n * ", end='')
print("\n * ".join(quasar_test_samples))

test samples:
 * ..\candidates_width70_sig6_validation_64plates_sample0.json
 * ..\candidates_width70_sig6_validation_64plates_sample1.json
 * ..\candidates_width70_sig6_validation_64plates_sample2.json
 * ..\candidates_width70_sig6_validation_64plates_sample3.json
 * ..\candidates_width70_sig6_validation_64plates_sample4.json
 * ..\candidates_width70_sig6_validation_64plates_sample5.json
 * ..\candidates_width70_sig6_validation_64plates_sample6.json
 * ..\candidates_width70_sig6_validation_64plates_sample7.json
quasar test samples:
 * ../original_samples\quasars_test_64plates_sample0.json
 * ../original_samples\quasars_test_64plates_sample1.json
 * ../original_samples\quasars_test_64plates_sample2.json
 * ../original_samples\quasars_test_64plates_sample3.json
 * ../original_samples\quasars_test_64plates_sample4.json
 * ../original_samples\quasars_test_64plates_sample5.json
 * ../original_samples\quasars_test_64plates_sample6.json
 * ../original_samples\quasars_test_64plates_sample7.js

## Load datasets

In [6]:
from squeze.squeze_common_functions import load_json, deserialize
test_dfs = [deserialize(load_json(sample)) for sample in test_samples]
quasar_test_dfs = [deserialize(load_json(sample)) for sample in quasar_test_samples]

## compute purity and completeness

In [8]:
from candidates import CombCandidates
candidates = CombCandidates(mode="test")

In [9]:
prob = 0.32

results = {}
for (i, data_frame), quasar_df in zip(enumerate(test_dfs), quasar_test_dfs):
    print("\n-----------------------------------")
    print("Statistics for test sample {}".format(i))
    print("-----------------------------------\n")
    results[i] = candidates.find_completeness_purity(quasar_df, data_frame[(data_frame["prob"] > prob) &
                                                                           ~(data_frame["duplicated"])],# &
                                                                           #(data_frame["z_conf_person"] == 3)],
                                                     get_results=True)


-----------------------------------
Statistics for test sample 0
-----------------------------------

There are 6526 candidates  
for 6476 catalogued quasars 
number of quasars = 6476 
found quasars = 6325 
completeness = 97.67% 
completeness z>=1 = 98.55% 
completeness z>=2.1 = 99.01% 
purity = 96.92% 
purity z >=1 = 98.66% 
purity z >=2.1 = 99.62% 

-----------------------------------
Statistics for test sample 1
-----------------------------------

There are 6580 candidates  
for 6680 catalogued quasars 
number of quasars = 6680 
found quasars = 6468 
completeness = 96.83% 
completeness z>=1 = 97.80% 
completeness z>=2.1 = 98.63% 
purity = 98.28% 
purity z >=1 = 99.11% 
purity z >=2.1 = 99.63% 

-----------------------------------
Statistics for test sample 2
-----------------------------------

There are 6964 candidates  
for 6961 catalogued quasars 
number of quasars = 6961 
found quasars = 6784 
completeness = 97.46% 
completeness z>=1 = 98.39% 
completeness z>=2.1 = 98.89% 
pur

# Combine the results

In [10]:
import pandas as pd
results_df = pd.DataFrame.from_dict(results, orient="index", columns=["purity", "completeness", "found quasars"])
results_df = results_df.drop("found quasars", axis=1)
results_df

Unnamed: 0,purity,completeness
0,0.9692,0.976683
1,0.982827,0.968263
2,0.974153,0.974573
3,0.969413,0.973302
4,0.978603,0.974463
5,0.972969,0.9789
6,0.974022,0.977399
7,0.970656,0.972609


In [11]:
pd.options.display.float_format = '{:,.2%}'.format
results_df.mean()

purity         97.40%
completeness   97.45%
dtype: float64

In [12]:
results_df.std()

purity         0.47%
completeness   0.33%
dtype: float64