# SCoPe data wrangling notebook

*Note that you need to be using scope-ml installed from the source (as opposed to PyPI) for this notebook to run.*

When you want to run SCoPe feature generation on specific sources instead of fields, you will need to begin with a source list. This is an example notebook demonstrating the data wrangling required to format such lists in a way compatible with SCoPe feature generation code. The two most important aspects of this process to remember are:
- **Column names**: files input to SCoPe feature generation must contain "**ztf_id**" and "**coordinates**" columns in the appropriate format (see below)
- **Number of sources per job**: on SDSC Expanse, GPU jobs are capped at 48 hours of runtime. If more than ~100,000 light curves require feature generation, be sure to save multiple files, each containing approximately this number of light curves. **Remember that one source can have multiple associated ZTF light curves.**

**See also the `generated_features_underMS` directory for an example slurm script to run on Expanse.**

In [None]:
from tools.get_quad_ids import get_cone_ids
import pandas as pd
import numpy as np
from scope.utils import write_parquet, read_parquet
from scope.fritz import radec_to_iau_name
import os

# Modify underMS_sample.csv sources list

In [2]:
# Load the example source list and keep only the rows without missing values.
fritz_sources = pd.read_csv('underMS_sample.csv').dropna()
fritz_sources

Unnamed: 0,designation,ra,dec,parallax,parallax_over_error,phot_g_mean_mag,phot_bp_mean_mag,phot_rp_mean_mag
0,Gaia DR3 2266288490722687872,278.259303,72.869800,4.005372,12.346051,19.754240,19.966717,19.629162
1,Gaia DR3 2266292029775762176,278.113637,72.929177,3.154598,22.756111,18.335480,18.228960,18.481825
2,Gaia DR3 2266294228799045376,278.151456,73.028259,3.618712,12.450397,19.614813,20.628176,18.546251
3,Gaia DR3 2266297905289660672,280.912052,72.555164,0.684440,6.363533,18.399763,18.909082,17.703053
4,Gaia DR3 2263116193518195328,295.271479,70.123637,7.748245,126.810470,17.360775,18.108380,16.413525
...,...,...,...,...,...,...,...,...
383772,Gaia DR3 6030019189004253952,256.860522,-28.247218,1.172813,5.126069,18.618620,19.140526,17.614138
383773,Gaia DR3 6030019356453422464,256.888673,-28.207527,1.341936,5.237698,18.894125,19.352600,17.952679
383774,Gaia DR3 6030020185437869824,256.784336,-28.190900,2.018885,6.161395,19.203940,19.568428,18.362045
383775,Gaia DR3 6030020254157574528,256.777798,-28.175627,1.143710,5.135977,18.730840,19.084480,17.964695


In [4]:
# Derive a coordinate-based name for every source from its ra/dec pair
# (the saved output shows names of the form ZTFJhhmmss.ss+ddmmss.s).
fritz_sources['obj_id'] = fritz_sources.apply(
    lambda src: radec_to_iau_name(src['ra'], src['dec']),
    axis='columns',
)

In [5]:
# Display the table to check the newly added obj_id column.
fritz_sources

Unnamed: 0,designation,ra,dec,parallax,parallax_over_error,phot_g_mean_mag,phot_bp_mean_mag,phot_rp_mean_mag,obj_id
0,Gaia DR3 2266288490722687872,278.259303,72.869800,4.005372,12.346051,19.754240,19.966717,19.629162,ZTFJ183302.23+725211.3
1,Gaia DR3 2266292029775762176,278.113637,72.929177,3.154598,22.756111,18.335480,18.228960,18.481825,ZTFJ183227.27+725545.0
2,Gaia DR3 2266294228799045376,278.151456,73.028259,3.618712,12.450397,19.614813,20.628176,18.546251,ZTFJ183236.35+730141.7
3,Gaia DR3 2266297905289660672,280.912052,72.555164,0.684440,6.363533,18.399763,18.909082,17.703053,ZTFJ184338.89+723318.6
4,Gaia DR3 2263116193518195328,295.271479,70.123637,7.748245,126.810470,17.360775,18.108380,16.413525,ZTFJ194105.16+700725.1
...,...,...,...,...,...,...,...,...,...
383772,Gaia DR3 6030019189004253952,256.860522,-28.247218,1.172813,5.126069,18.618620,19.140526,17.614138,ZTFJ170726.53-281450.0
383773,Gaia DR3 6030019356453422464,256.888673,-28.207527,1.341936,5.237698,18.894125,19.352600,17.952679,ZTFJ170733.28-281227.1
383774,Gaia DR3 6030020185437869824,256.784336,-28.190900,2.018885,6.161395,19.203940,19.568428,18.362045,ZTFJ170708.24-281127.2
383775,Gaia DR3 6030020254157574528,256.777798,-28.175627,1.143710,5.135977,18.730840,19.084480,17.964695,ZTFJ170706.67-281032.3


In [6]:
# The Gaia designation is the source name passed to the cone search below,
# so expose it under the fritz_name column.
fritz_sources = fritz_sources.rename(columns={"designation": "fritz_name"})

In [7]:
# Display the table to confirm that designation is now called fritz_name.
fritz_sources

Unnamed: 0,fritz_name,ra,dec,parallax,parallax_over_error,phot_g_mean_mag,phot_bp_mean_mag,phot_rp_mean_mag,obj_id
0,Gaia DR3 2266288490722687872,278.259303,72.869800,4.005372,12.346051,19.754240,19.966717,19.629162,ZTFJ183302.23+725211.3
1,Gaia DR3 2266292029775762176,278.113637,72.929177,3.154598,22.756111,18.335480,18.228960,18.481825,ZTFJ183227.27+725545.0
2,Gaia DR3 2266294228799045376,278.151456,73.028259,3.618712,12.450397,19.614813,20.628176,18.546251,ZTFJ183236.35+730141.7
3,Gaia DR3 2266297905289660672,280.912052,72.555164,0.684440,6.363533,18.399763,18.909082,17.703053,ZTFJ184338.89+723318.6
4,Gaia DR3 2263116193518195328,295.271479,70.123637,7.748245,126.810470,17.360775,18.108380,16.413525,ZTFJ194105.16+700725.1
...,...,...,...,...,...,...,...,...,...
383772,Gaia DR3 6030019189004253952,256.860522,-28.247218,1.172813,5.126069,18.618620,19.140526,17.614138,ZTFJ170726.53-281450.0
383773,Gaia DR3 6030019356453422464,256.888673,-28.207527,1.341936,5.237698,18.894125,19.352600,17.952679,ZTFJ170733.28-281227.1
383774,Gaia DR3 6030020185437869824,256.784336,-28.190900,2.018885,6.161395,19.203940,19.568428,18.362045,ZTFJ170708.24-281127.2
383775,Gaia DR3 6030020254157574528,256.777798,-28.175627,1.143710,5.135977,18.730840,19.084480,17.964695,ZTFJ170706.67-281032.3


# Get/save light curve ids from Kowalski using cone search

In [9]:
# Cone-search the ZTF source catalog around every source position.
# The names, right ascensions and declinations are passed positionally, in that order.
# max_distance=2.0 is presumably in arcseconds (the CSV saved later is tagged
# "2arcsec") -- confirm against get_cone_ids.
source_names = fritz_sources['fritz_name'].values
source_ra = fritz_sources['ra'].values
source_dec = fritz_sources['dec'].values
ids = get_cone_ids(
    source_names,
    source_ra,
    source_dec,
    catalog='ZTF_sources_20240117',
    max_distance=2.0,
    get_coords=True,
)


1000 done
2000 done
3000 done
4000 done
5000 done
6000 done
7000 done
8000 done
9000 done
10000 done
11000 done
12000 done
13000 done
14000 done
15000 done
16000 done
17000 done
18000 done
19000 done
20000 done
21000 done
22000 done
23000 done
24000 done
25000 done
26000 done
27000 done
28000 done
29000 done
30000 done
31000 done
32000 done
33000 done
34000 done
35000 done
36000 done
37000 done
38000 done
39000 done
40000 done
41000 done
42000 done
43000 done
44000 done
45000 done
46000 done
47000 done
48000 done
49000 done
50000 done
51000 done
52000 done
53000 done
54000 done
55000 done
56000 done
57000 done
58000 done
59000 done
60000 done
61000 done
62000 done
63000 done
64000 done
65000 done
66000 done
67000 done
68000 done
69000 done
70000 done
71000 done
72000 done
73000 done
74000 done
75000 done
76000 done
77000 done
78000 done
79000 done
80000 done
81000 done
82000 done
83000 done
84000 done
85000 done
86000 done
87000 done
88000 done
89000 done
90000 done
91000 done
92000 do

In [10]:
# Inspect the cone-search result: one row per matched light curve (_id),
# with its coordinates and the name of the source it was matched to (obj_id).
ids

Unnamed: 0,_id,coordinates,obj_id
0,10865061005695,{'radec_geojson': {'coordinates': [98.25930729...,Gaia DR3 2266288490722687872
1,10865063010636,"{'radec_geojson': {'coordinates': [98.2592401,...",Gaia DR3 2266288490722687872
2,10849571000994,{'radec_geojson': {'coordinates': [98.25920450...,Gaia DR3 2266288490722687872
3,10849573002306,{'radec_geojson': {'coordinates': [98.25925489...,Gaia DR3 2266288490722687872
4,10849572009999,"{'radec_geojson': {'coordinates': [98.2592333,...",Gaia DR3 2266288490722687872
...,...,...,...
1368029,10643261022367,{'radec_geojson': {'coordinates': [142.0212503...,Gaia DR3 1798195814907331328
1368030,10643262027503,{'radec_geojson': {'coordinates': [142.0212449...,Gaia DR3 1798195814907331328
1368031,10643263018372,{'radec_geojson': {'coordinates': [142.0212379...,Gaia DR3 1798195814907331328
1368032,11640412003066,{'radec_geojson': {'coordinates': [142.5311714...,Gaia DR3 1798221786572291456


In [11]:
# Preview the result with repeated _id values removed. The result is not
# assigned, so ids itself is left unchanged by this cell.
ids.drop_duplicates('_id')

Unnamed: 0,_id,coordinates,obj_id
0,10865061005695,{'radec_geojson': {'coordinates': [98.25930729...,Gaia DR3 2266288490722687872
1,10865063010636,"{'radec_geojson': {'coordinates': [98.2592401,...",Gaia DR3 2266288490722687872
2,10849571000994,{'radec_geojson': {'coordinates': [98.25920450...,Gaia DR3 2266288490722687872
3,10849573002306,{'radec_geojson': {'coordinates': [98.25925489...,Gaia DR3 2266288490722687872
4,10849572009999,"{'radec_geojson': {'coordinates': [98.2592333,...",Gaia DR3 2266288490722687872
...,...,...,...
1368029,10643261022367,{'radec_geojson': {'coordinates': [142.0212503...,Gaia DR3 1798195814907331328
1368030,10643262027503,{'radec_geojson': {'coordinates': [142.0212449...,Gaia DR3 1798195814907331328
1368031,10643263018372,{'radec_geojson': {'coordinates': [142.0212379...,Gaia DR3 1798195814907331328
1368032,11640412003066,{'radec_geojson': {'coordinates': [142.5311714...,Gaia DR3 1798221786572291456


In [25]:
# Look at the coordinates column returned alongside the light curve ids.
ids['coordinates']

0         {'radec_geojson': {'coordinates': [109.1761478...
1         {'radec_geojson': {'coordinates': [109.1761687...
2         {'radec_geojson': {'coordinates': [163.7830669...
3         {'radec_geojson': {'coordinates': [163.783053,...
4         {'radec_geojson': {'coordinates': [163.7830786...
                                ...                        
142128    {'radec_geojson': {'coordinates': [11.27874919...
142129    {'radec_geojson': {'coordinates': [19.90753499...
142130    {'radec_geojson': {'coordinates': [19.907579, ...
142131    {'radec_geojson': {'coordinates': [19.9075742,...
142132    {'radec_geojson': {'coordinates': [28.66380950...
Name: coordinates, Length: 142133, dtype: object

In [58]:
# Number of distinct light curve ids among the cone-search matches.
np.unique(ids['_id']).size

133467

In [12]:
# Save one row per unique light curve id, without the DataFrame index.
ids.drop_duplicates(subset='_id').to_csv(
    'underMS_fritz_sources_ids_2arcsec.csv',
    index=False,
)

# Save batches of light curve ids for more reasonable runtime

In [14]:
# Keep one row per light curve, switch to the column names expected by feature
# generation (ztf_id), and renumber the rows from 0.
# NOTE: this rebinds ids and removes the '_id' column, so re-running this cell
# (or any earlier cell that references '_id') afterwards raises a KeyError.
ids = (
    ids
    .drop_duplicates(subset='_id')
    .rename(columns={'_id': 'ztf_id', 'obj_id': 'fritz_name'})
    .reset_index(drop=True)
)

In [18]:
# Split the de-duplicated light curve ids into n batches of (at most) `block`
# rows each and write every batch to its own parquet file, so that each
# feature-generation job stays within a reasonable runtime.
os.makedirs("underMS_ids_DR20", exist_ok=True)

n = 15
# Rows per batch, rounded up so that n batches cover every row.
block = int(np.ceil(len(ids)/n))
print(block)
for i in range(n):
    # Positional, end-exclusive slicing. Label-based .loc slicing includes BOTH
    # endpoints, which made every batch one row too long and put the boundary
    # row into two consecutive files (that light curve was then processed twice).
    slice_ids = ids.iloc[i*block:(i+1)*block]
    write_parquet(slice_ids, f'underMS_ids_DR20/sources_ids_2arcsec_renamed_{i}.parquet')

91195
