In [1]:
import numpy as np
import pandas as pd
from penquins import Kowalski

# Authenticate kowalski, gloria (first time)

In [2]:
kowalski = Kowalski(username = '',
                   password = '',
                   )

gloria = Kowalski(username = '',
                  password = '',
                  host = "gloria.caltech.edu")

In [3]:
# Generate tokens (no need to do this every time)
k_token = kowalski.authenticate()
g_token = gloria.authenticate()

# Create kowalski, gloria instances using token

In [4]:
kowalski = Kowalski(
    token=k_token,
    protocol="https",
    host="kowalski.caltech.edu",
    port=443
)

gloria = Kowalski(
        token=g_token,
        protocol="https",
        host="gloria.caltech.edu",
        port=443
)

In [5]:
# Test connection
kowalski.ping(), gloria.ping()

(True, True)

# Get Kowalski, Gloria catalog names

In [6]:
query = {
    "query_type": "info",
    "query": {
        "command": "catalog_names",
    }
}

response_k = kowalski.query(query=query)
data_k = response_k.get("data")

response_g = gloria.query(query=query)
data_g = response_g.get("data")

In [7]:
data_k, data_g

(['sdss_ellipticals',
  'mzls_ellipticals',
  'milliquas_v6',
  'legacysurveys_photoz_DR7',
  'legacysurveys_photoz_DR6',
  'galaxy_redshifts_20200522',
  'cfht_w3_photozs',
  'ZUDS_alerts_aux',
  'ZUDS_alerts',
  'ZTF_sources_20200401',
  'ZTF_source_features_20201201',
  'ZTF_source_features_20200401_20_fields',
  'ZTF_source_features_20191101_20_fields',
  'ZTF_source_features_20191101',
  'ZTF_source_classifications_dr2_20_fields',
  'ZTF_source_classifications_dr2',
  'ZTF_source_classifications_20191101',
  'ZTF_ops',
  'ZTF_exposures_20201201',
  'ZTF_exposures_20200401',
  'ZTF_exposures_20191101',
  'ZTF_alerts_aux',
  'ZTF_alerts',
  'VLASS_DR1',
  'TNS',
  'TGSS_ADR1',
  'RFC_2019d',
  'RFC_2019a',
  'PS1_STRM',
  'PS1_DR1',
  'PGIR_alerts_aux',
  'PGIR_alerts',
  'NVSS_41',
  'LAMOST_DR5_v3',
  'LAMOST_DR4_v2',
  'Known_lenses_20180901',
  'IPHAS_DR2',
  'IGAPS_DR2',
  'Gaia_EDR3',
  'Gaia_DR2_light_curves',
  'Gaia_DR2_WD',
  'Gaia_DR2_2MASS_best_neighbour',
  'Gaia_DR2',


# Query Kowalski/Gloria for ZTF high-confidence IDs

In [None]:
qry = gloria.query({
            "query_type": "find", # find documents
            "query": {
            "catalog": "ZTF_source_classifications_DR5", # catalog name
            "filter": {
                "$or":[
                    {'%s_xgb' % 'vnv': {'$gt': 0.9}}, # filter by vnv_xgb > 0.9 or vnv_dnn > 0.9
                                {'%s_dnn' % 'vnv': {'$gt': 0.9}},
                ]}
            },
            "kwargs": {
                       "max_time_ms": 10000 # adjustable max time
            },
            "projection": {"_id": 1 # only return the ids
                          }
            })
data = qry.get('data')

Query times out - we have to query in batches 

# Query Kowalski/Gloria for ZTF DR5 classifications

## Get estimated document count

In [8]:
qry = gloria.query({
    "query_type": "estimated_document_count",
    "query": {
    "catalog": "ZTF_source_classifications_DR5"
        }
    }
)

estimated_nlightcurves = qry.get('data')
estimated_nlightcurves

45038327

## Get collection info

In [10]:
qry = gloria.query({
    "query_type": "info",
    "query": {
    "catalog": "ZTF_source_classifications_DR5",
        "command": "catalog_info"
        }
    }
)

In [None]:
qry

## Count number of high-vnv-confidence documents

In [11]:
qry = gloria.query({
    "query_type": "count_documents",
    "query": {
    "catalog": "ZTF_source_classifications_DR5",
    "filter": {
        "$or":[
                {'%s_xgb' % 'vnv': {'$gt': 0.9}},
                {'%s_dnn' % 'vnv': {'$gt': 0.9}},
        ]}
    }
})

nlightcurves = qry.get('data')
nlightcurves

758223

# Get scores using batch query

In [17]:
ids = []
batch_size=10000
nb = np.ceil(nlightcurves/batch_size).astype(int)
df_all = []

print(nb)
for n in range(nb):
    print(n)
    qry = gloria.query({
        "query_type": "find",
        "query": {
            "catalog": "ZTF_source_classifications_DR5",
            "filter": {
                "$or":[
                    {'%s_xgb' % 'vnv': {'$gt': 0.9}},
                    {'%s_dnn' % 'vnv': {'$gt': 0.9}},
                ]}
            },
        "kwargs": {"skip": int(n*batch_size),
                   "limit": int(batch_size),
                   "max_time_ms": 10000
            }
        })
    data = qry.get('data')
    df_tmp = pd.DataFrame.from_records(data)
    df_all += [df_tmp]
    
df_scores = pd.concat(df_all).reset_index(drop=True)
    

76
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75


In [18]:
# examine last entry of data
data[-1]

{'_id': 10488251097041,
 'agn_xgb': 0.00261604692786932,
 'e_xgb': 0.0017702399054542184,
 'bis_xgb': 0.0011740883346647024,
 'blyr_xgb': 0.0023729673121124506,
 'ceph_xgb': 0.0006138907046988606,
 'dscu_xgb': 0.0005540123675018549,
 'ea_xgb': 0.0017192184459418058,
 'eb_xgb': 0.000995049369521439,
 'el_xgb': 4.684308441937901e-05,
 'ew_xgb': 0.0005296574090607464,
 'fla_xgb': 0.015532546676695347,
 'i_xgb': 0.42541176080703735,
 'lpv_xgb': 0.7397453784942627,
 'msms_xgb': 1.574631909306845e-07,
 'osarg_xgb': 0.0017720566829666495,
 'pnp_xgb': 0.03704237937927246,
 'puls_xgb': 0.200709268450737,
 'rrlyr_xgb': 0.00018412555800750852,
 'rrlyrab_xgb': 3.9919304981594905e-05,
 'rrlyrc_xgb': 3.342087802593596e-05,
 'rrlyrd_xgb': 0.000148708961205557,
 'rscvn_xgb': 0.0018924509640783072,
 'saw_xgb': 0.002017577178776264,
 'sin_xgb': 0.009976361878216267,
 'srv_xgb': 0.596572995185852,
 'vnv_xgb': 0.735502302646637,
 'wuma_xgb': 3.954729254473932e-05,
 'yso_xgb': 0.2991492450237274,
 'agn_dnn

In [19]:
df_scores

Unnamed: 0,_id,agn_xgb,e_xgb,bis_xgb,blyr_xgb,ceph_xgb,dscu_xgb,ea_xgb,eb_xgb,el_xgb,...,ew_dnn,fla_dnn,i_dnn,pnp_dnn,puls_dnn,rrlyr_dnn,rscvn_dnn,srv_dnn,vnv_dnn,yso_dnn
0,10682173007525,0.002441,0.989391,0.995640,0.002024,0.000069,0.000601,0.001321,0.009229,0.000003,...,0.602747,0.076606,0.052485,0.929672,0.024490,0.071283,0.255771,0.004792,0.995866,0.016794
1,10563032034383,0.002441,0.863971,0.688059,0.003376,0.000466,0.000273,0.000646,0.001388,0.000008,...,0.600870,0.001860,0.040677,0.951871,0.465572,0.205893,0.391641,0.004092,0.994549,0.002346
2,10852041010293,0.002441,0.993425,0.997304,0.000126,0.000146,0.000541,0.000481,0.000244,0.000017,...,0.946621,0.004293,0.036726,0.980536,0.312045,0.113829,0.135175,0.001828,0.994974,0.022615
3,10487402158260,0.002441,0.998359,0.999413,0.000714,0.000032,0.000405,0.000535,0.000815,0.000015,...,0.927954,0.001444,0.030237,0.967157,0.072791,0.180559,0.070040,0.000192,0.996949,0.000157
4,10718391003193,0.002441,0.999245,0.999622,0.000678,0.000015,0.000275,0.000450,0.000521,0.000006,...,0.954129,0.002294,0.017033,0.983989,0.015397,0.036002,0.047459,0.000336,0.999765,0.001013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
758218,10778302017608,0.002441,0.039295,0.026859,0.009575,0.000023,0.000414,0.050351,0.009166,0.000075,...,0.037600,0.207177,0.131862,0.152606,0.005543,0.040023,0.082671,0.013626,0.900012,0.079024
758219,10488102087494,0.002673,0.004058,0.000396,0.000034,0.000009,0.000456,0.000465,0.000039,0.000030,...,0.143293,0.083975,0.141822,0.260008,0.059695,0.125980,0.001191,0.003675,0.900010,0.008878
758220,10718363000424,0.002460,0.169008,0.064293,0.000153,0.000019,0.002716,0.001485,0.000034,0.000939,...,0.170544,0.004620,0.082079,0.692576,0.074918,0.071031,0.547728,0.002784,0.900004,0.042545
758221,10682202021304,0.002855,0.043243,0.052567,0.000082,0.000638,0.001064,0.007278,0.000282,0.000013,...,0.132807,0.057038,0.077609,0.589368,0.196420,0.083802,0.516842,0.021589,0.900002,0.297400


In [20]:
df_scores.columns

Index(['_id', 'agn_xgb', 'e_xgb', 'bis_xgb', 'blyr_xgb', 'ceph_xgb',
       'dscu_xgb', 'ea_xgb', 'eb_xgb', 'el_xgb', 'ew_xgb', 'fla_xgb', 'i_xgb',
       'lpv_xgb', 'msms_xgb', 'osarg_xgb', 'pnp_xgb', 'puls_xgb', 'rrlyr_xgb',
       'rrlyrab_xgb', 'rrlyrc_xgb', 'rrlyrd_xgb', 'rscvn_xgb', 'saw_xgb',
       'sin_xgb', 'srv_xgb', 'vnv_xgb', 'wuma_xgb', 'yso_xgb', 'agn_dnn',
       'bis_dnn', 'blyr_dnn', 'ceph_dnn', 'dscu_dnn', 'e_dnn', 'ea_dnn',
       'eb_dnn', 'ew_dnn', 'fla_dnn', 'i_dnn', 'pnp_dnn', 'puls_dnn',
       'rrlyr_dnn', 'rscvn_dnn', 'srv_dnn', 'vnv_dnn', 'yso_dnn'],
      dtype='object')

In [None]:
df_scores.to_csv('ZTF_DR5_vnv_gt0.9score_classifications.csv',index=False)

# Batch query to get features

In [21]:
source_ids = df_scores['_id'].values.tolist()
features_catalog = 'ZTF_source_features_DR5'
limit=1000

In [22]:
# Get features and dmdt from Gloria
id = 0
df_collection = []
dmdt_collection = []

while 1:
    query = {
        "query_type": "find",
        "query": {
            "catalog": features_catalog,
            "filter": {"_id": {"$in": source_ids[id * limit : (id + 1) * limit]}},
        },
    }
    response = gloria.query(query=query)
    source_data = response.get("data")

    if source_data is None:
        print(response)
        raise ValueError(f"No data found for source ids {source_ids}")

    df_temp = pd.DataFrame.from_records(source_data)
    df_collection += [df_temp]
    try:
        dmdt_temp = np.expand_dims(
            np.array([d for d in df_temp['dmdt'].values]), axis=-1
        )
    except Exception as e:
        print("Error", e)
        print(df_temp)
    dmdt_collection += [dmdt_temp]

    if ((id + 1) * limit) > len(source_ids):
        break
    id += 1
    if (id * limit) % 5000 == 0:
        print(id * limit, "done")

df_features = pd.concat(df_collection, axis=0)
dmdt = np.vstack(dmdt_collection)

5000 done
10000 done
15000 done
20000 done
25000 done
30000 done
35000 done
40000 done
45000 done
50000 done
55000 done
60000 done
65000 done
70000 done
75000 done
80000 done
85000 done
90000 done
95000 done
100000 done
105000 done
110000 done
115000 done
120000 done
125000 done
130000 done
135000 done
140000 done
145000 done
150000 done
155000 done
160000 done
165000 done
170000 done
175000 done
180000 done
185000 done
190000 done
195000 done
200000 done
205000 done
210000 done
215000 done
220000 done
225000 done
230000 done
235000 done
240000 done
245000 done
250000 done
255000 done
260000 done
265000 done
270000 done
275000 done
280000 done
285000 done
290000 done
295000 done
300000 done
305000 done
310000 done
315000 done
320000 done
325000 done
330000 done
335000 done
340000 done
345000 done
350000 done
355000 done
360000 done
365000 done
370000 done
375000 done
380000 done
385000 done
390000 done
395000 done
400000 done
405000 done
410000 done
415000 done
420000 done
425000 done


In [23]:
df_features 

Unnamed: 0,_id,AllWISE___id,AllWISE__ph_qual,AllWISE__w1mpro,AllWISE__w1sigmpro,AllWISE__w2mpro,AllWISE__w2sigmpro,AllWISE__w3mpro,AllWISE__w3sigmpro,AllWISE__w4mpro,...,roms,significance,skew,smallkurt,stetson_j,stetson_k,sw,welch_i,wmean,wstd
0,10296051000973,1.600197e+17,AABU,11.187,0.023,11.185,0.022,11.062,0.135,9.023,...,4.625000,97.765030,114.424396,2236.506294,38.249536,0.868645,0.906189,316.113600,13.000808,0.082622
1,10296052001893,1.600197e+17,AABU,11.187,0.023,11.185,0.022,11.062,0.135,9.023,...,6.125746,137.257889,69.310499,4469.140562,-79.369418,0.875632,0.705628,-509.791831,12.635400,0.092647
2,10296231003763,1.740182e+17,AABU,10.845,0.022,10.858,0.021,10.735,0.126,9.017,...,4.104021,75.630920,-7.845135,948.906288,15.611241,0.866148,0.950899,109.426372,12.289038,0.102297
3,10296312006200,1.420182e+17,AABU,13.256,0.025,13.275,0.030,11.970,0.273,8.691,...,5.397115,114.061401,70.303084,2579.232084,43.776211,0.884509,0.923027,330.052081,14.695383,0.078901
4,10296391001693,1.720167e+17,AACU,11.948,0.024,11.976,0.023,12.273,0.380,9.056,...,6.243889,82.965607,261.402524,7234.012488,-7.467630,0.851542,0.748122,-112.468225,13.745763,0.121387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,10853102034170,3.460165e+18,AABU,12.573,0.023,12.633,0.026,12.006,0.213,9.639,...,0.904977,8.530197,0.214546,-0.041025,16.201437,0.833071,0.970661,28.767007,15.471221,0.014086
219,10853112006974,3.488167e+18,AAUU,13.031,0.025,13.121,0.027,12.925,,9.540,...,1.257178,16.880486,0.040782,5.747449,95.700872,0.860170,0.642862,173.506551,15.465366,0.019727
220,10853291011691,3.441168e+18,AAAU,11.207,0.023,11.176,0.020,11.341,0.092,9.445,...,1.235708,9.276795,0.821836,8.024979,37.413922,0.834867,0.976947,78.802218,14.532259,0.025039
221,10853401008278,3.510171e+18,AABU,12.271,0.023,12.260,0.022,12.140,0.281,8.519,...,1.400709,8.282205,-0.233651,13.706485,48.324903,0.840434,0.978532,89.106400,16.083099,0.026479


In [24]:
df_features.columns

Index(['_id', 'AllWISE___id', 'AllWISE__ph_qual', 'AllWISE__w1mpro',
       'AllWISE__w1sigmpro', 'AllWISE__w2mpro', 'AllWISE__w2sigmpro',
       'AllWISE__w3mpro', 'AllWISE__w3sigmpro', 'AllWISE__w4mpro',
       'AllWISE__w4sigmpro', 'Gaia_EDR3___id',
       'Gaia_EDR3__astrometric_excess_noise', 'Gaia_EDR3__parallax',
       'Gaia_EDR3__parallax_error', 'Gaia_EDR3__phot_bp_mean_mag',
       'Gaia_EDR3__phot_bp_rp_excess_factor', 'Gaia_EDR3__phot_g_mean_mag',
       'Gaia_EDR3__phot_rp_mean_mag', 'Gaia_EDR3__pmdec',
       'Gaia_EDR3__pmdec_error', 'Gaia_EDR3__pmra', 'Gaia_EDR3__pmra_error',
       'PS1_DR1___id', 'PS1_DR1__gMeanPSFMag', 'PS1_DR1__gMeanPSFMagErr',
       'PS1_DR1__iMeanPSFMag', 'PS1_DR1__iMeanPSFMagErr',
       'PS1_DR1__qualityFlag', 'PS1_DR1__rMeanPSFMag',
       'PS1_DR1__rMeanPSFMagErr', 'PS1_DR1__yMeanPSFMag',
       'PS1_DR1__yMeanPSFMagErr', 'PS1_DR1__zMeanPSFMag',
       'PS1_DR1__zMeanPSFMagErr', 'ad', 'ccd', 'chi2red', 'coordinates', 'dec',
       'dmdt', 'f

In [25]:
# Merge features, labels
df_merge = pd.merge(df_scores, df_features, on='_id')

In [None]:
df_merge.to_csv('golden_merged_scores_features.csv',index=False)

In [None]:
df_merge.to_pickle('golden_merged_scores_features.pkl')