In [1]:
from __future__ import print_function
import datetime
import time
from functools import reduce
from collections import defaultdict
import os
import tqdm
import gzip
import pickle

import pyarrow.parquet as parquet
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
import numba

from fastset import FastSet

In [2]:
#   DBS BLOCKS table schema (data/dbs_blocks.csv is assumed to follow this
#   column order; usecols=(0, 5) below picks BLOCK_ID and BLOCK_SIZE):
#     BLOCK_ID NOT NULL NUMBER(38)
#     BLOCK_NAME NOT NULL VARCHAR2(500)
#     DATASET_ID NOT NULL NUMBER(38)
#     OPEN_FOR_WRITING NOT NULL NUMBER(38)
#     ORIGIN_SITE_NAME NOT NULL VARCHAR2(100)
#     BLOCK_SIZE NUMBER(38)
#     FILE_COUNT NUMBER(38)
#     CREATION_DATE NUMBER(38)
#     CREATE_BY VARCHAR2(500)
#     LAST_MODIFICATION_DATE NUMBER(38)
#     LAST_MODIFIED_BY VARCHAR2(500)
if os.path.exists('data/block_size.npy'):
    # Re-use the two-column (block_id, block_size) array cached by an earlier run
    blocksize = np.load('data/block_size.npy')
else:
    # Parse with pandas: numpy's CSV readers could not handle a file this large
    blocksize = pd.read_csv(
        "data/dbs_blocks.csv",
        dtype='i8',
        usecols=(0,5),
        names=['block_id', 'block_size'],
    ).values
    # Cache the plain array so later runs skip the CSV parse
    np.save('data/block_size.npy', blocksize)

# Sizes are looked up in random order, so keep ids and sizes sorted by block id
bsort = np.argsort(blocksize[:,0])
bsize_index = FastSet(blocksize[bsort, 0])
# Sizes in the same order as the sorted ids, followed by a single 0: an index
# equal to the number of ids (what np.searchsorted yields for an id larger than
# every known one) then reads as size 0 instead of raising IndexError
bsize_values = np.append(blocksize[bsort, 1], 0)
def getsize(s):
    """Return the summed size of the blocks contained in ``s``.

    ``s`` is expected to expose its block ids as ``s._set`` (as FastSet does
    for ``bsize_index``). The ids are located in ``bsize_index._set`` with
    ``np.searchsorted`` and the entries of ``bsize_values`` at those positions
    are added up.

    NOTE(review): ``searchsorted`` returns insertion positions, not matches,
    so this presumably relies on every id in ``s`` being present in the
    index - confirm against the callers.
    """
    positions = np.searchsorted(bsize_index._set, s._set)
    return bsize_values[positions].sum()

FileNotFoundError: [Errno 2] No such file or directory: 'data/dbs_blocks.csv'

In [None]:
blockmap_filename = 'data/blockmap.pkl'
if os.path.exists(blockmap_filename):
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were written by this notebook
    with gzip.open(blockmap_filename) as fin:
        blockmap = pickle.load(fin)
else:
    # Build dataset_id -> FastSet of block ids from columns 0 and 2 of the
    # DBS blocks dump, one row at a time
    blockmap_in = pd.read_csv("data/dbs_blocks.csv", dtype='i8', usecols=(0,2), names=['block_id', 'dataset_id'])
    blockmap = defaultdict(FastSet)
    for bid, dsid in blockmap_in.values:
        blockmap[dsid] += FastSet([bid])
    # The raw frame is no longer needed once the map is built
    del blockmap_in

    # Cache the finished map (gzip-compressed pickle) for later runs
    with gzip.open(blockmap_filename, 'wb') as fout:
        pickle.dump(blockmap, fout)

In [3]:
# Load the CMSSW working-set table and wrap each row's block ids in a FastSet
ws_cmssw = parquet.read_table('data/working_set_cmssw').to_pandas()
ws_cmssw['working_set_blocks'] = ws_cmssw.apply(lambda x: FastSet(x.working_set_blocks), 'columns')
# Keep only rows whose site name starts with neither 'T3' nor 'T0'
ws_cmssw = ws_cmssw[ws_cmssw.site_name.str.startswith('T3') == False]
ws_cmssw = ws_cmssw[ws_cmssw.site_name.str.startswith('T0') == False]
# Normalise site names. The pattern is anchored with '$', so it has to be
# applied as a regular expression: with regex=False the literal text '_DISK$'
# never matches, the suffix stays, and T1 sites end up as '..._DISK_Disk'.
ws_cmssw['site_name'] = ws_cmssw['site_name'].str.replace(r'_DISK$', '', regex=True)
# Every T1 site name gets a single '_Disk' suffix unless it already ends in 'Disk'
ws_cmssw['site_name'] = ws_cmssw.site_name.apply(
    lambda s: s + '_Disk' if s.startswith('T1') and not s.endswith('Disk') else s)

In [5]:
# Load the fwjr working-set table and wrap each row's block ids in a FastSet
ws_fwjr = parquet.read_table('data/working_set_fwjr').to_pandas()
ws_fwjr['working_set_blocks'] = ws_fwjr.apply(lambda x: FastSet(x.working_set_blocks), 'columns')
# Keep only rows whose site name starts with neither 'T3' nor 'T0'
ws_fwjr = ws_fwjr[ws_fwjr.site_name.str.startswith('T3') == False]
ws_fwjr = ws_fwjr[ws_fwjr.site_name.str.startswith('T0') == False]
# Normalise site names. The pattern is anchored with '$', so it has to be
# applied as a regular expression: with regex=False the literal text '_DISK$'
# never matches, the suffix stays, and T1 sites end up as '..._DISK_Disk'.
ws_fwjr['site_name'] = ws_fwjr['site_name'].str.replace(r'_DISK$', '', regex=True)
# Every T1 site name gets a single '_Disk' suffix unless it already ends in 'Disk'
ws_fwjr['site_name'] = ws_fwjr.site_name.apply(
    lambda s: s + '_Disk' if s.startswith('T1') and not s.endswith('Disk') else s)

In [6]:
# Data tier definitions indexed by tier id; used to translate the
# d_data_tier_id column of the working-set frames into tier names
titles = ['id', 'data_tier', 'day', 'user']
datatiers = pd.read_csv('data/dbs_datatiers.csv', names=titles)
datatiers = datatiers.set_index('id')

def add_datatiers(ws):
    """Add a ``data_tier`` column to ``ws`` in place.

    Each value of ``ws.d_data_tier_id`` is looked up by label in the index of
    the module-level ``datatiers`` frame, and the matching ``data_tier``
    values are written to ``ws['data_tier']`` positionally. Nothing is
    returned.
    """
    tier_rows = datatiers.loc[ws.d_data_tier_id]
    ws['data_tier'] = tier_rows.data_tier.values

# Both working-set frames are modified in place
add_datatiers(ws_cmssw)
add_datatiers(ws_fwjr)

In [7]:
def aod(ws):
    """Return the rows of ``ws`` whose ``data_tier`` starts with an AOD-family name.

    ``str.match`` anchors the pattern at the start of the string only, so any
    tier beginning with 'AOD', 'MINIAOD' or 'NANOAOD' is kept (for example
    'AODSIM' as well as 'AOD').
    """
    is_aod = ws['data_tier'].str.match('(|MINI|NANO)AOD')
    return ws[is_aod]

In [8]:
def mini(ws):
    """Return the rows of ``ws`` whose ``data_tier`` starts with 'MINIAOD'.

    ``str.match`` only anchors at the start, so longer tier names that begin
    with 'MINIAOD' (such as 'MINIAODSIM') are kept too.
    """
    is_mini = ws['data_tier'].str.match('MINIAOD')
    return ws[is_mini]

In [9]:
def crab(ws):
    """Return the rows of ``ws`` whose ``is_crab`` value equals True.

    Rows where ``is_crab`` is False or missing are left out.
    """
    crab_rows = ws['is_crab'].eq(True)
    return ws.loc[crab_rows]

In [10]:
def dataByLoc(ws):
    """Return, per ``site_name``, the total size of all blocks touched there.

    For every site the ``working_set_blocks`` sets of all its rows are merged
    with ``FastSet.union`` (starting from an empty FastSet), so a block that
    occurs in several rows is counted once. The merged set is sized with
    ``getsize`` and the result divided by 1e15 (PB, assuming ``getsize``
    returns bytes - TODO confirm).
    """
    def merge_blocks(series):
        return reduce(FastSet.union, series, FastSet())

    blocks_per_site = ws.groupby('site_name').working_set_blocks.agg(merge_blocks)
    return blocks_per_site.map(getsize) / 1e15

In [None]:
def plot(suffix, title, axis):
    """Draw and save the three-panel per-site figure for one selection.

    The data are looked up by name in the notebook's global namespace:
    ``cmssw_set<suffix>`` (top-left panel), ``fwjr_set<suffix>`` (top-right
    panel) and ``combined<suffix>``, whose ``cmssw`` and ``fwjr`` columns are
    overlaid in the large bottom panel on top of their sum.

    The figure is written to ``plots/sites/sites<suffix>.pdf`` and appended
    as a page to the module-level ``PdfPages`` object ``pp``, which must be
    open when this is called.

    Parameters
    ----------
    suffix : str
        Name suffix selecting which global series/frame to plot; also used
        in the output file name.
    title : str
        Text appended to "Data Use by Site" in the figure title.
    axis : str
        y-axis label used on all three panels.

    Returns
    -------
    tuple
        Always the empty tuple (kept as in the original interface).

    Raises
    ------
    KeyError
        If one of the three global names does not exist.
    """
    # Resolve the three inputs once instead of repeating the globals() lookup
    namespace = globals()
    cmssw = namespace['cmssw_set' + suffix]
    fwjr = namespace['fwjr_set' + suffix]
    both = namespace['combined' + suffix]

    fig = plt.figure()
    gs = GridSpec(3, 2, figure=fig)

    # Two small panels on the first row, one large panel spanning rows 2-3
    ax1 = fig.add_subplot(gs[0, 0])
    ax2 = fig.add_subplot(gs[0, 1])
    ax3 = fig.add_subplot(gs[1:, :])

    fig.set_figheight(16)
    fig.set_figwidth(30)

    ax1.set_xticks(np.arange(len(cmssw)))
    ax2.set_xticks(np.arange(len(fwjr)))
    ax3.set_xticks(np.arange(len(both.fwjr)))

    for ax, panel_title in ((ax1, 'CMSSW'), (ax2, 'WMStats fwjr')):
        ax.set_ylabel(axis, fontsize=14)
        ax.tick_params(axis='both', which='major', labelsize=12)
        ax.set_title(panel_title, fontsize=16)
        ax.xaxis.label.set_visible(False)

    ax3.set_ylabel(axis, fontsize=20)
    ax3.tick_params(axis='both', which='major', labelsize=18)
    ax3.xaxis.label.set_visible(False)

    # Backdrop bar: cmssw + fwjr. fill_value=0 keeps the bar for sites that
    # appear in only one source; a plain `+` would give NaN there and the
    # bar would silently vanish.
    full = both.cmssw.add(both.fwjr, fill_value=0)

    cmssw.plot.bar(ax=ax1, align='center', width=0.8,
                   label='CMSSW', alpha=0.5, color='blue')
    fwjr.plot.bar(ax=ax2, align='center', width=0.8,
                  label='WMStats fwjr', alpha=0.5, color='green')
    # Drawing order matters in the bottom panel: opaque backdrop first, then
    # the two half-transparent sources on top of it
    full.plot.bar(ax=ax3, align='center', width=0.8,
                  label='Union', alpha=1, color='pink')
    both.fwjr.plot.bar(ax=ax3, align='center', width=0.8,
                       label='WMStats fwjr', alpha=0.5, color='green')
    both.cmssw.plot.bar(ax=ax3, align='center', width=0.8,
                        label='CMSSW', alpha=0.5, color='blue')

    ax3.legend(prop={'size': 22})
    fig.suptitle("Data Use by Site%s" % title, fontsize=32)

    gs.tight_layout(fig, pad=3)

    fig.savefig("plots/sites/sites%s.pdf" % suffix, pad_inches=0.6)
    # Pass the figure explicitly rather than relying on pyplot's current figure
    pp.savefig(fig, pad_inches=0.6)
    return()

In [12]:
# Per-site data use for every selection. The fwjr frame is never passed
# through crab(), so each fwjr "*_crab" name is simply a second name for the
# corresponding unfiltered fwjr series (same object).

# All data tiers
cmssw_set = dataByLoc(ws_cmssw)
fwjr_set_crab = fwjr_set = dataByLoc(ws_fwjr)
cmssw_set_crab = dataByLoc(crab(ws_cmssw))

# AOD-family tiers
cmssw_set_aod = dataByLoc(aod(ws_cmssw))
fwjr_set_aod_crab = fwjr_set_aod = dataByLoc(aod(ws_fwjr))
cmssw_set_aod_crab = dataByLoc(aod(crab(ws_cmssw)))

# MINIAOD tiers
cmssw_set_mini = dataByLoc(mini(ws_cmssw))
fwjr_set_mini_crab = fwjr_set_mini = dataByLoc(mini(ws_fwjr))
cmssw_set_mini_crab = dataByLoc(mini(crab(ws_cmssw)))

In [13]:
def _side_by_side(cmssw, fwjr):
    """Return a frame with columns 'cmssw' and 'fwjr', rows sorted by index."""
    columns = [cmssw.rename('cmssw'), fwjr.rename('fwjr')]
    return pd.concat(columns, axis=1).sort_index()

# One two-column frame per selection; the "*_crab" frames pair the
# CRAB-filtered CMSSW series with the unfiltered fwjr series
combined = _side_by_side(cmssw_set, fwjr_set)
combined_crab = _side_by_side(cmssw_set_crab, fwjr_set)
combined_aod = _side_by_side(cmssw_set_aod, fwjr_set_aod)
combined_mini = _side_by_side(cmssw_set_mini, fwjr_set_mini)
combined_aod_crab = _side_by_side(cmssw_set_aod_crab, fwjr_set_aod)
combined_mini_crab = _side_by_side(cmssw_set_mini_crab, fwjr_set_mini)

In [14]:
# NOTE(security): read_pickle unpickles whatever the URL serves, and
# unpickling can execute arbitrary code - only run this against a trusted host.
# The row labelled 'Total' is dropped right after loading.
totals = pd.read_pickle(
    'https://ncsmith.web.cern.ch/ncsmith/phedex2rucio/rucio_summary.pkl.gz'
).drop('Total')

In [None]:
def _as_fraction(used, capacity):
    """Divide ``used`` by ``capacity`` label-wise, mapping NaN and +inf to 0.

    Labels missing on either side are treated as 0 before dividing
    (``fill_value=0``); remaining NaN results and divisions by zero are then
    replaced by 0.
    """
    return used.divide(capacity, fill_value=0).fillna(0).replace(np.inf, 0)


# Parallel lists: column of totals['used'], and how it is named in plot titles
store = ['static', 'rucio']
title = ['Static', 'Rucio']
# Parallel lists: global-name suffix of each selection and its title text
kinds = ['', '_aod', '_mini', '_crab', '_aod_crab', '_mini_crab']
var = ['', ' AOD', ' MINIAOD', ' CMSSW CRAB', ' AOD & CMSSW CRAB',  ' MINIAOD & CMSSW CRAB']

# plot() appends pages to the module-level `pp`; the context manager closes
# the PDF even when one of the plots raises.
with PdfPages('plots/sites/all_site_plots.pdf') as pp:
    for kind, kind_label in zip(kinds, var):
        plot(kind, kind_label, 'Data Use [PB]')
        usage = globals()['combined' + kind]
        for space, column, space_label in zip(['_stat', '_rucio'], store, title):
            # Same 1e15 scaling as the usage series, so the ratio is unit-free
            capacity = totals['used'][column] / 1e15
            cmssw_share = _as_fraction(usage.cmssw, capacity)
            fwjr_share = _as_fraction(usage.fwjr, capacity)

            # plot() finds its inputs by name, so publish them as globals
            globals()['cmssw_set' + space + kind] = cmssw_share
            globals()['fwjr_set' + space + kind] = fwjr_share
            globals()['combined' + space + kind] = pd.concat(
                [cmssw_share.rename('cmssw'), fwjr_share.rename('fwjr')],
                axis=1).sort_index()

            plot(space + kind,
                 ' as a Proportion of ' + space_label + ' Disk Space' + kind_label,
                 'Proportion of Data Use')

In [4]:
# Show the CMSSW working-set frame (pandas elides the middle rows in the display)
ws_cmssw

Unnamed: 0,day,input_campaign,d_data_tier_id,site_name,is_crab,is_local,working_set_blocks
0,1609459200,HC,21,T1_US_FNAL_Disk,True,True,"(17787214, 17787215)"
1,1609459200,RunIISummer16MiniAODv3,31224,T2_FR_GRIF_IRFU,True,False,"(19404618, 19407308, 19425635, 19436454, 19445..."
2,1609459200,RunIISummer16MiniAODv3,31224,T2_US_Caltech,True,,"(19411223, 19423777, 19424931, 19425636, 19441..."
3,1609459200,RunIISummer16MiniAODv3,31224,T2_US_MIT,,False,"(20193284, 22305304, 22305788)"
4,1609459200,RunIISummer20UL16RECO,21,T2_FR_GRIF_LLR,,True,"(23448088, 23549449, 23551185, 23551679, 23551..."
...,...,...,...,...,...,...,...
408945,1628726400,Run2018D-PromptReco,31223,T2_US_Purdue,True,True,"(19146439, 19257890, 19447162, 19450308, 19450..."
408946,1628726400,Run2018D-ZMu,125,T2_US_Nebraska,True,,(21904243)
408947,1628726400,RunIIAutumn18MiniAOD,31224,T2_FR_GRIF_LLR,True,,(21568083)
408948,1628726400,RunIISummer20UL16MiniAOD,31224,T2_BR_SPRACE,True,True,"(23883493, 23883513, 24036345, 24121382)"
