# WSU size of computing estimate numbers (Database)

The goal here is to put together some numbers on the types of projects ALMA would process as part of the WSU, to be used to produce a total size-of-computing estimate.

Amanda Kepley (20220921)

In [1]:
import numpy as np
import astropy.units as u
from ast import literal_eval
from astropy import constants as const
from matplotlib import pyplot as plt, ticker as mticker
import re
import math
from astropy.table import Table, QTable, vstack, join, unique
from astropy import constants as const
from importlib import reload
import pickle
import pandas as pd

## Read in massaged cycle 7 and 8 data <a id="readin"></a>

In [2]:
# Load the pre-processed ("massaged") archive tables for cycles 7 and 8.
# The column listing in the next cell shows that these files already carry
# per-row values such as pb, cell, and imsize.
cycle7tab = Table.read('data/result_table_cycle7_with_calc_values_20220923.csv')
cycle8tab = Table.read('data/result_table_cycle8_with_calc_values_20220923.csv')

In [3]:
cycle7tab.columns

<TableColumns names=('obs_publisher_did','facility_name','instrument_name','obs_id','dataproduct_type','calib_level','target_name','s_ra','s_dec','s_fov','s_resolution','t_min','t_max','t_exptime','t_resolution','em_min','em_max','em_res_power','pol_states','o_ucd','access_url','access_format','proposal_id','data_rights','gal_longitude','gal_latitude','band_list','em_resolution','bandwidth','antenna_arrays','is_mosaic','spatial_resolution','frequency_support','frequency','velocity_resolution','publication_year','proposal_abstract','schedblock_name','sensitivity_10kms','cont_sensitivity_bandwidth','pwv','group_ous_uid','member_ous_uid','asdm_uid','type','scan_intent','science_observation','spatial_scale_max','qa2_passed','science_keyword','scientific_category','collections','array','points_per_fov','spw_freq','spw_specwidth','spw_nchan','pb','cell','imsize','spw_nchan_max','mitigated','failed_mitigation_nbin1','failed_mitigation_nbin2','ntarget')>

## Put together WSU mous data base <a id="wsu_db"></a>

In [4]:
import wsu_db

In [5]:
reload(wsu_db)

<module 'wsu_db' from '/Users/akepley/Dropbox/Support/naasc/WSU/big_cubes/wsu_db.py'>

In [6]:
# Build the WSU database from the cycle 7 table; the resulting columns
# (blc_*, wsu_*, nant_*, nbase_*) are listed in the next cell.
result = wsu_db.create_database(cycle7tab)

In [7]:
result.columns

<TableColumns names=('mous','proposal_id','array','nant_typical','nant_array','nant_all','band','ntarget','target_name','s_fov','s_resolution','mosaic','imsize','pb','cell','blc_npol','blc_nspw','blc_specwidth','blc_freq','blc_velres','blc_nchan_agg','blc_nchan_max','blc_bandwidth_max','blc_bandwidth_agg','wsu_freq','wsu_npol','wsu_bandwidth_early','wsu_bandwidth_later_2x','wsu_bandwidth_later_4x','wsu_bandwidth_spw','wsu_nspw_early','wsu_nspw_later_2x','wsu_nspw_later_4x','wsu_specwidth_finest','wsu_chanavg_finest','wsu_velres_finest','wsu_specwidth_stepped','wsu_chanavg_stepped','wsu_velres_stepped','wsu_specwidth_stepped2','wsu_chanavg_stepped2','wsu_velres_stepped2','wsu_tint','wsu_nchan_spw_finest','wsu_nchan_spw_stepped','wsu_nchan_spw_stepped2','wsu_frac_bw_early','wsu_frac_bw_later_2x','wsu_frac_bw_later_4x','wsu_frac_bw_spw','nbase_typical','nbase_array','nbase_all')>

In [8]:
result_c8 = wsu_db.create_database(cycle8tab)

In [9]:
# Save the databases to disk if desired (any existing file of the same name is overwritten).
# Earlier dated outputs, kept for reference:
#result.write('data/cycle7wsu_20221003.fits',overwrite=True)
#result_c8.write('data/cycle8wsu_20221003.fits',overwrite=True)
result.write('data/cycle7wsu_20230103.fits',overwrite=True)
result_c8.write('data/cycle8wsu_20230103.fits',overwrite=True)

In [10]:
result.columns

<TableColumns names=('mous','proposal_id','array','nant_typical','nant_array','nant_all','band','ntarget','target_name','s_fov','s_resolution','mosaic','imsize','pb','cell','blc_npol','blc_nspw','blc_specwidth','blc_freq','blc_velres','blc_nchan_agg','blc_nchan_max','blc_bandwidth_max','blc_bandwidth_agg','wsu_freq','wsu_npol','wsu_bandwidth_early','wsu_bandwidth_later_2x','wsu_bandwidth_later_4x','wsu_bandwidth_spw','wsu_nspw_early','wsu_nspw_later_2x','wsu_nspw_later_4x','wsu_specwidth_finest','wsu_chanavg_finest','wsu_velres_finest','wsu_specwidth_stepped','wsu_chanavg_stepped','wsu_velres_stepped','wsu_specwidth_stepped2','wsu_chanavg_stepped2','wsu_velres_stepped2','wsu_tint','wsu_nchan_spw_finest','wsu_nchan_spw_stepped','wsu_nchan_spw_stepped2','wsu_frac_bw_early','wsu_frac_bw_later_2x','wsu_frac_bw_later_4x','wsu_frac_bw_spw','nbase_typical','nbase_array','nbase_all')>

In [11]:
len(result)

11519

In [12]:
result_c8.columns

<TableColumns names=('mous','proposal_id','array','nant_typical','nant_array','nant_all','band','ntarget','target_name','s_fov','s_resolution','mosaic','imsize','pb','cell','blc_npol','blc_nspw','blc_specwidth','blc_freq','blc_velres','blc_nchan_agg','blc_nchan_max','blc_bandwidth_max','blc_bandwidth_agg','wsu_freq','wsu_npol','wsu_bandwidth_early','wsu_bandwidth_later_2x','wsu_bandwidth_later_4x','wsu_bandwidth_spw','wsu_nspw_early','wsu_nspw_later_2x','wsu_nspw_later_4x','wsu_specwidth_finest','wsu_chanavg_finest','wsu_velres_finest','wsu_specwidth_stepped','wsu_chanavg_stepped','wsu_velres_stepped','wsu_specwidth_stepped2','wsu_chanavg_stepped2','wsu_velres_stepped2','wsu_tint','wsu_nchan_spw_finest','wsu_nchan_spw_stepped','wsu_nchan_spw_stepped2','wsu_frac_bw_early','wsu_frac_bw_later_2x','wsu_frac_bw_later_4x','wsu_frac_bw_spw','nbase_typical','nbase_array','nbase_all')>

In [13]:
len(result_c8)

10843

## Adding in calibration TOS information

This is needed to get the total number of visibilities and the data volume. It is also necessary in order to start refining the data rates.

In [14]:
import large_cubes
from importlib import reload

In [15]:
# Re-import large_cubes so that edits to the module are picked up without restarting the kernel.
reload(large_cubes)
# Build the time-on-source table from the input file. The resulting columns
# (shown below) hold bandpass/flux/phase/pol/check/target times, presumably
# in seconds given the *_s suffix -- TODO confirm.
# The call prints a message for every intent combination it does not recognize.
tos_db = large_cubes.calc_time_on_source('data/project_mous_band_array_eb_size___source_intent_inttime')

Intent not recognized: BANDPASS DIFFGAIN FLUX PHASE WVR
Intent not recognized: BANDPASS DIFFGAIN FLUX PHASE WVR
Intent not recognized: BANDPASS DIFFGAIN FLUX PHASE WVR
Intent not recognized: DIFFGAIN PHASE WVR
Intent not recognized: DIFFGAIN PHASE WVR
Intent not recognized: DIFFGAIN PHASE WVR
Intent not recognized: DIFFGAIN PHASE WVR
Intent not recognized: DIFFGAIN PHASE WVR
Intent not recognized: BANDPASS DIFFGAIN FLUX PHASE WVR
Intent not recognized: BANDPASS DIFFGAIN FLUX PHASE WVR
Intent not recognized: BANDPASS PHASE WVR
Intent not recognized: BANDPASS PHASE WVR
project_id list greater than 1. This shouldn't happen. MOUS: uid://A002/X445835/X6
made it to table creation


In [16]:
tos_db.columns

<TableColumns names=('proposal_id','mous','band','array','bp_time_s','flux_time_s','phase_time_s','pol_time_s','check_time_s','target_time_s','target_name','target_time_tot_s','ntarget','time_tot_s','cal_time_s')>

In [17]:
len(tos_db)

22430

In [18]:
tos_db.write('data/tos_db.ecsv',overwrite=True)

In [19]:
reload(wsu_db)

<module 'wsu_db' from '/Users/akepley/Dropbox/Support/naasc/WSU/big_cubes/wsu_db.py'>

In [85]:
result_tos = wsu_db.add_tos_to_db(result,tos_db)

In [86]:
result_c8_tos = wsu_db.add_tos_to_db(result_c8,tos_db)

In [87]:
result_tos.write('data/result_tos.ecsv',overwrite=True)
result_c8_tos.write('data/result_c8_tos.ecsv',overwrite=True)

In [78]:
result_tos.columns

<TableColumns names=('mous','proposal_id','array','nant_typical','nant_array','nant_all','band','ntarget','target_name','s_fov','s_resolution','mosaic','imsize','pb','cell','blc_npol','blc_nspw','blc_specwidth','blc_freq','blc_velres','blc_nchan_agg','blc_nchan_max','blc_bandwidth_max','blc_bandwidth_agg','wsu_freq','wsu_npol','wsu_bandwidth_early','wsu_bandwidth_later_2x','wsu_bandwidth_later_4x','wsu_bandwidth_spw','wsu_nspw_early','wsu_nspw_later_2x','wsu_nspw_later_4x','wsu_specwidth_finest','wsu_chanavg_finest','wsu_velres_finest','wsu_specwidth_stepped','wsu_chanavg_stepped','wsu_velres_stepped','wsu_specwidth_stepped2','wsu_chanavg_stepped2','wsu_velres_stepped2','wsu_tint','wsu_nchan_spw_finest','wsu_nchan_spw_stepped','wsu_nchan_spw_stepped2','wsu_frac_bw_early','wsu_frac_bw_later_2x','wsu_frac_bw_later_4x','wsu_frac_bw_spw','nbase_typical','nbase_array','nbase_all','bp_time_s','flux_time_s','phase_time_s','pol_time_s','check_time_s','target_time_s','target_time_tot_s','time_t

In [79]:
len(result_tos)

11519

In [80]:
len(result_c8_tos)

10843

## Adding in data rates

In [88]:
reload(large_cubes)
reload(wsu_db)

<module 'wsu_db' from '/Users/akepley/Dropbox/Support/naasc/WSU/big_cubes/wsu_db.py'>

In [89]:
len(result_tos)

11519

In [90]:
# Called for its side effect: no return value is captured, so the rate columns
# appear to be added to result_tos in place -- TODO confirm against wsu_db.
# The number shown below is printed by the call (it matches len(result_tos)).
wsu_db.add_rates_to_db(result_tos)

11519


In [91]:
result_tos.keys()

['mous',
 'proposal_id',
 'array',
 'nant_typical',
 'nant_array',
 'nant_all',
 'band',
 'ntarget',
 'target_name',
 's_fov',
 's_resolution',
 'mosaic',
 'imsize',
 'pb',
 'cell',
 'blc_npol',
 'blc_nspw',
 'blc_specwidth',
 'blc_freq',
 'blc_velres',
 'blc_nchan_agg',
 'blc_nchan_max',
 'blc_bandwidth_max',
 'blc_bandwidth_agg',
 'wsu_freq',
 'wsu_npol',
 'wsu_bandwidth_early',
 'wsu_bandwidth_later_2x',
 'wsu_bandwidth_later_4x',
 'wsu_bandwidth_spw',
 'wsu_nspw_early',
 'wsu_nspw_later_2x',
 'wsu_nspw_later_4x',
 'wsu_specwidth_finest',
 'wsu_chanavg_finest',
 'wsu_velres_finest',
 'wsu_specwidth_stepped',
 'wsu_chanavg_stepped',
 'wsu_velres_stepped',
 'wsu_specwidth_stepped2',
 'wsu_chanavg_stepped2',
 'wsu_velres_stepped2',
 'wsu_tint',
 'wsu_nchan_spw_finest',
 'wsu_nchan_spw_stepped',
 'wsu_nchan_spw_stepped2',
 'wsu_frac_bw_early',
 'wsu_frac_bw_later_2x',
 'wsu_frac_bw_later_4x',
 'wsu_frac_bw_spw',
 'nbase_typical',
 'nbase_array',
 'nbase_all',
 'bp_time_s',
 'flux_time_s

In [92]:
# Called for its side effect: the rate/size columns read in the following cells
# (e.g. 'wsu_nvis_early_stepped2_typical_total') are added to result_c8_tos in
# place; no return value is captured.
wsu_db.add_rates_to_db(result_c8_tos)

10843


In [93]:
result_c8_tos['wsu_nvis_early_stepped2_typical_total']

<Quantity [4.95945504e+03, 8.11182400e-01, 1.78256900e+01, ...,
           6.92185920e-01, 6.87170080e-01, 4.51078200e-01]>

In [94]:
np.max(result_c8_tos['wsu_productsize_early_stepped2'].to(u.TB))

<Quantity 80.47223552 Tbyte>

In [96]:
np.sum(result_c8_tos['wsu_productsize_later_2x_stepped2'].to(u.PB))

<Quantity 4.43559133 Pbyte>

In [97]:
np.sum(result_c8_tos['wsu_productsize_later_4x_stepped2'].to(u.PB))

<Quantity 8.87118265 Pbyte>

# creating per mous version of data base

This is going to be more useful for the DMGs as well as for combining with mitigation information (which is per MOUS). But I don't think the fractions of time are going to be useful, so I'll leave them off and recalculate.

I think that the path forward uses the group_by function and aggregation to get values.

In [98]:
reload(wsu_db)

<module 'wsu_db' from '/Users/akepley/Dropbox/Support/naasc/WSU/big_cubes/wsu_db.py'>

In [99]:
# NOTE(review): as recorded in the output and debug session below, this call
# currently fails with AttributeError ("'Quantity' object has no 'groups'
# member") inside create_per_mous_db when it reaches the 's_fov' column.
test = wsu_db.create_per_mous_db(result_tos)

AttributeError: 'Quantity' object has no 'groups' member

In [100]:
%debug

> [0;32m/Users/akepley/opt/anaconda3/lib/python3.8/site-packages/astropy/units/quantity.py[0m(841)[0;36m__getattr__[0;34m()[0m
[0;32m    839 [0;31m        """
[0m[0;32m    840 [0;31m        [0;32mif[0m [0;32mnot[0m [0mself[0m[0;34m.[0m[0m_include_easy_conversion_members[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 841 [0;31m            raise AttributeError(
[0m[0;32m    842 [0;31m                "'{}' object has no '{}' member".format(
[0m[0;32m    843 [0;31m                    [0mself[0m[0;34m.[0m[0m__class__[0m[0;34m.[0m[0m__name__[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> up
> [0;32m/Users/akepley/Dropbox/Support/naasc/WSU/big_cubes/wsu_db.py[0m(697)[0;36mcreate_per_mous_db[0;34m()[0m
[0;32m    695 [0;31m                       'wsu_visrate_early_stepped2','wsu_visrate_later_2x_stepped2','wsu_visrate_later_4x_stepped2']:
[0m[0;32m    696 [0;31m[0;34m[0m[0m
[0m[0;32m--> 697 [0;31m            [0mnewdb_dict[0m[



*** AttributeError: 'Quantity' object has no 'groups' member
ipdb> mykey
's_fov'
ipdb> mydb['s_fov']
<Quantity [0.13050364, 0.13352957, 0.13357117, ..., 0.00720464, 0.01658833,
           0.01658865] deg>
ipdb> (mydb['s_fov'].value).groups.aggregate(np.max)
*** AttributeError: 'numpy.ndarray' object has no attribute 'groups'
ipdb> mydb['s_fov].groups
*** SyntaxError: EOL while scanning string literal
ipdb> mydb['s_fov'].groups
*** AttributeError: 'Quantity' object has no 'groups' member
ipdb> (mydb['s_fov']).groups
*** AttributeError: 'Quantity' object has no 'groups' member
ipdb> mydb_by_mous.groups.aggregate(np.max)
*** AttributeError: 'Quantity' object has no 'groups' member
ipdb> mydb_by_mous.groups.indices
array([    0,     1,     2, ..., 11517, 11518, 11519])
ipdb> mydb_by_mous['wsu_chanavg_finest'].groups.aggregate(np.max)
<Column name='wsu_chanavg_finest' dtype='float64' length=2719>
  71.0
  82.0
  82.0
  82.0
  82.0
  82.0
  82.0
  82.0
  83.0
  83.0
  83.0
  83.0
   ...
  83

## Adding in current mitigation information

In [None]:
# Load the cycle 7 mitigation statistics gathered from the pipeline weblogs.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
c7_pickle = '/Users/akepley/Dropbox/Support/naasc/WSU/mitigation/weblog_stats/allc7_stats.20220930.pkl'
# Use a context manager so the file handle is closed even if unpickling fails.
with open(c7_pickle,'rb') as pkl_file:
    c7_mit = pickle.load(pkl_file)
# Transpose so that each top-level key of the pickled mapping becomes a row.
c7_rpd = pd.DataFrame(c7_mit).transpose()

In [None]:
# index=True carries the DataFrame index over as a column named 'index';
# rename it to 'mous' (the index values are presumably the MOUS identifiers
# that keyed the pickle -- TODO confirm).
c7_astropy = Table.from_pandas(c7_rpd,index=True)
c7_astropy.rename_column('index','mous')

In [None]:
c7_astropy

In [None]:
reload(large_cubes)

In [None]:
large_cubes.fix_mous_col(c7_astropy)  

In [None]:
# Load the cycle 8 mitigation statistics gathered from the pipeline weblogs.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
c8_pickle = '/Users/akepley/Dropbox/Support/naasc/WSU/mitigation/weblog_stats/cycle8_stats.ignacio.20230103.pkl'
# Use a context manager so the file handle is closed even if unpickling fails.
with open(c8_pickle,'rb') as pkl_file:
    c8_mit = pickle.load(pkl_file)
# Transpose so that each top-level key of the pickled mapping becomes a row.
c8_rpd = pd.DataFrame(c8_mit).transpose()

In [None]:
# index=True carries the DataFrame index over as a column named 'index';
# rename it to 'mous'.
# NOTE(review): the cycle 7 table is additionally passed through
# large_cubes.fix_mous_col; no such call is made for cycle 8 -- confirm
# whether the cycle 8 'mous' values need the same treatment.
c8_astropy = Table.from_pandas(c8_rpd,index=True)
c8_astropy.rename_column('index','mous')

In [None]:
c8_astropy

Things to think about:
* Need to confirm that the MOUS identifiers above are converted to the version with slashes.
* This information is per MOUS not per MOUS/SRC. Can I do some sort of grouping to generate a table that might make sense and calculate totals per MOUS??


In [None]:
# Group the per-MOUS/source rows by MOUS so that per-MOUS totals can be
# aggregated. Table has no `.unique()` method (astropy.table.unique is a
# function), and only a grouped table exposes `.groups.keys`, which the next
# cell prints. The rates are added in place to `result_tos`; no `result_rates`
# table is defined in this notebook.
test = result_tos.group_by('mous')

In [None]:
print(test.groups.keys)

In [None]:
# Number of MOUS/source rows. `result_rates` is not defined in this notebook;
# the rate columns are added in place to `result_tos`.
len(result_tos)

In [None]:
# Load the cycle 7 mitigation statistics (same file as loaded earlier in this section).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
c7_pickle = '/Users/akepley/Dropbox/Support/naasc/WSU/mitigation/weblog_stats/allc7_stats.20220930.pkl'
# Use a context manager so the file handle is closed even if unpickling fails.
with open(c7_pickle,'rb') as pkl_file:
    c7_mit = pickle.load(pkl_file)
# Transpose so that each top-level key of the pickled mapping becomes a row.
c7_rpd = pd.DataFrame(c7_mit).transpose()

## calculating fractions of time

In [None]:
reload(wsu_db)

In [None]:
 wsu_db.calc_frac_time(result_tos,cycle='c7')

In [None]:
np.sum(result_tos['frac_c7_target_time'])

In [None]:
np.max(result_tos['frac_c7_target_time'])

In [None]:
wsu_db.calc_frac_time(result_c8_tos,cycle='c8')

In [None]:
np.sum(result_c8_tos['frac_c8_target_time'])

In [None]:
np.max(result_c8_tos['frac_c8_target_time'])

## Writing out the final file

In [None]:
# per MOUS/src
# overwrite=True matches the earlier write cells and lets the notebook be
# re-run without failing because the output files already exist.
result_tos.write('data/wsu_datarates_per_moussrc_cycle7_20230109.ecsv',overwrite=True)
result_c8_tos.write('data/wsu_datarates_per_moussrc_cycle8_20230109.ecsv',overwrite=True)

## Mosaic imsize investigation

In [None]:
# Select mosaics ('mosaic' flag equal to 'T') whose estimated imsize exceeds
# 5800 (presumably pixels -- TODO confirm) and show the columns that feed the
# image-size estimate.
idx = (result['mosaic'] == 'T') & (result['imsize'] >5800)
result['mous','imsize','cell','s_fov','s_resolution','wsu_freq','pb','mosaic'][idx]

In [None]:
0.01056278408537675 *3600.0  

Image pre-check values for  2019.1.00796.S, uid://A001/X1471/X317	

* beam = 0.0457 x 0.0404 arcsec
* cell = 0.0081 x 0.0081 arcsec

Unmitigated imsize calculated in pipeline for X317 is 7776, 7776 according to SCG tests

Eyeballing the spatial setup, it looks like there are 10-12 arcsec between pointings, and the plot says the primary beam is 26.0 arcsec.

The pipeline math is   

npts <= 3
* nxpix = int((1.65 * beam_radius_v + xspread) / cellx_v)

npts >3
* nxpix = int((1.5 * beam_radius_v + xspread) / cellx_v)

We only have two pointings here.



In [None]:
(26.0 + 10.0)

In [None]:
(1.65 * 26.0 + 10.0)/0.0081

So my estimate is a little on the low end, but not crazy

In [None]:
(0.01044*3600+25.6*0.70)/0.0072

## Imsize investigation

Something is odd with my image sizes. I'm using 2019.1.01463.S uid://A001/X1465/Xc05 as my poster child.

For the unmitigated imaging done by the pipeline, the pipeline calculates the following values:
* beam: 0.0322" x 0.0211"
* cell: 0.0042" x 0.0042"
* imsize: [11250, 11250] pixels
* FOV: 47.25 arcsec

Now let's look at what I get from my calculations

In [None]:
#2019.1.01463.S
# Pick out the rows for the example MOUS.
idx =result['mous'] == 'uid://A001/X1465/Xc05'
# NOTE(review): 'wsu_nchan_final_stepped' and 'wsu_nchan_final_finest' are not
# among the columns of `result` listed after create_database earlier in this
# notebook (which has 'wsu_nchan_spw_stepped' / 'wsu_nchan_spw_finest'); this
# selection presumably dates from an older wsu_db schema -- confirm before
# re-running.
result['mous','s_fov','s_resolution','imsize','wsu_nchan_final_stepped','wsu_nchan_final_finest','mosaic'][idx]

In [None]:
np.log10(237037.03703703705)

In [None]:
np.log10(32921.81069958848)

In [None]:
# the imsize is
0.007157768473981626*3600.00 # arcsec

In [None]:
# What's the estimated imsize at this frequency??
# Scale the reference value of 19.4 at 300 inversely with the observing
# frequency (presumably arcsec and GHz respectively -- TODO confirm).
freq = 218.821  # GHz
19.4 * 300 / freq

This is comparable to the imsize calculated above.

In [None]:
# What pixel size does this imply for five pixels per beam?
0.024588/5.0

In [None]:
# What pixel size does this imply for six pixels per beam?
0.024588/6.0

What happens if I use the points_per_fov value??

In [None]:
idx2 = cycle7tab['member_ous_uid'] == 'uid://A001/X1465/Xc05'
cycle7tab['proposal_id','member_ous_uid','s_fov','s_resolution','points_per_fov','spw_nchan','is_mosaic'][idx2]

In [None]:
# imsize from points per fov value
np.sqrt(1100957.4775723005)*5.0

Matches imsize above.

So it looks like the FOV is the difference:

In [None]:
(47.25/25.76)*5250

Still an underestimate, but closer.

The pipeline calculates the primary beam as

```python
primary_beam_size = \
            1.22 \
            * cqa.getvalue(cqa.convert(cqa.constants('c'), 'm/s')) \
            / ref_frequency \
            / smallest_diameter \
            * (180.0 * 3600.0 / math.pi)
```

In [None]:
# Same formula as the pipeline's primary_beam_size quoted above, evaluated with
# ref_frequency = 218.821e9 (Hz) and smallest_diameter = 12.0 (m, since
# const.c.value is in m/s); the final factor converts radians to arcsec.
1.22 * ((const.c.value /  218.821e9) / (12.0) )*(180*3600.0/math.pi)

Pipeline calculation is here:

```python
beam_radius_v = primary_beam

beam_fwhp = 1.12 / 1.22 * beam_radius_v

nxpix = int(utils.round_half_up(1.1 * beam_fwhp * math.sqrt(-math.log(sfpblimit) / math.log(2.)) / cellx_v))
```

In [None]:
(1.12/1.22)* 28.73

In [None]:
# Numerator of the pipeline nxpix expression quoted above (i.e. before dividing
# by the cell size), with beam_fwhp = 26.38 and sfpblimit = 0.2.
1.1 * 26.38 * math.sqrt(-math.log(0.2) / math.log(2.0))

Okay. This is the value I get above. 

What's the constant??

In [None]:
# Combined multiplier that the pipeline expression applies to the primary beam
# size when sfpblimit = 0.2: 1.1 * (1.12/1.22) * sqrt(-ln(0.2)/ln(2)).
1.1* (1.12/1.22)*math.sqrt(-math.log(0.2) / math.log(2.0))

In [None]:
1.54*25.8

In [None]:
40.0/0.0040