In [1]:
# NB: This is for spark running on parquet files converted from
#     baconbits skims. This is a prototype, there is lots of boilerplate.
#     We're making it better :-)

import pyspark.sql
import os
import sys

# Executors must run the same interpreter as the driver. Inside a virtualenv
# they would otherwise pick up the wrong python.
os.environ['PYSPARK_PYTHON'] = sys.executable

# Optional builder settings, currently disabled:
#    .config("spark.driver.extraClassPath","/home/cms.lgray/sparkMeasure/target/scala-2.12/spark-measure_2.12-0.14-SNAPSHOT.jar") \
#    .master('local[*]') \

# Spark settings applied to the session builder, in order.
# NOTE(review): the dynamicAllocation bounds presumably rely on dynamic
# allocation being enabled in the cluster defaults -- confirm.
_spark_settings = [
    ('spark.executor.memory', "16g"),
    ('spark.executor.cores', "4"),
    ('spark.sql.execution.arrow.enabled', "true"),
    ('spark.sql.execution.arrow.maxRecordsPerBatch', 500000),
    ('spark.driver.maxResultSize', 0),
    ('spark.dynamicAllocation.minExecutors', 2),
    ('spark.dynamicAllocation.maxExecutors', 250),
    ('spark.cores.max', 1000),
    ('spark.sql.files.maxPartitionBytes', 2 * 1024 * 1024 * 1024),
]

_builder = pyspark.sql.SparkSession.builder.appName("baconbits-spark")
for _key, _value in _spark_settings:
    _builder = _builder.config(_key, _value)
session = _builder.getOrCreate()

sc = session.sparkContext
sc.setLogLevel("WARN")

# Block-size hints set on the Hadoop configuration used by this context.
# NOTE(review): 'parquet.block.size' is normally given as a plain byte count;
# confirm the '1g' suffix form is accepted before relying on it for writes.
_hadoop_conf = sc._jsc.hadoopConfiguration()
for _key in ("dfs.block.size", "parquet.block.size"):
    _hadoop_conf.set(_key, '1g')

# `spark` is the conventional alias used by the rest of the notebook.
spark = session

#from sparkmeasure import TaskMetrics
#taskmetrics = TaskMetrics(spark)

# Tuning knobs forwarded as keyword arguments to run_spark_job further down.
# NOTE(review): partitionsize is presumably the number of events per
# partition and thread_workers the size of a driver-side thread pool --
# confirm against the coffea version in use.
partitionsize = 200000
thread_workers = 16


In [2]:
import pyspark.sql.functions as fn
from tqdm import tqdm
import json

# Sample group (top-level key of samplefiles.json) to process.
dataset_group = 'Hbb_2017'

# samplefiles.json maps a group name to a dict keyed by dataset name.
with open('metadata/samplefiles.json') as f:
    sample_groups = json.load(f)

# Fail early with a clear message instead of silently continuing with an
# empty selection, which would make the Spark job process nothing.
if dataset_group not in sample_groups:
    raise KeyError("sample group %r not found in metadata/samplefiles.json (available: %s)"
                   % (dataset_group, ', '.join(sorted(sample_groups))))
datasets = sample_groups[dataset_group]

# Each dataset is read from its own parquet directory on HDFS. Only the
# dataset names are needed here; the file lists from the JSON are not used.
skim_root = 'bitsconvert_17042019'
datasets_spark = {
    ds: ['hdfs:///store/parquet/zprimebits/%s/%s/'%(skim_root,ds)]
    for ds in datasets
}


In [3]:
#get the hbb analysis worker from the cloudpickle file
import cloudpickle as cpkl
import lz4.frame as lz4f

# lz4-compressed cloudpickle file that holds the analysis processor object.
processor_pkl = 'boostedHbbProcessor.cpkl.lz4'

# NOTE: unpickling executes arbitrary code -- only load files that you
# produced yourself or otherwise trust.
processor_instance = None
with lz4f.open(processor_pkl, mode="rb") as compressed_stream:
    processor_instance = cpkl.load(compressed_stream)


In [4]:
import time
from coffea.processor import run_spark_job
from coffea.processor.spark.spark_executor import spark_executor

# Run the processor over every dataset on Spark and record the elapsed
# wall-clock time in seconds in `dt` for the throughput report below.
start_time = time.time()
final_accumulator = run_spark_job(
    datasets_spark,
    processor_instance,
    spark_executor,
    spark=spark,
    partitionsize=partitionsize,
    thread_workers=thread_workers,
)
dt = time.time() - start_time


loading:   0%|          | 0/52 [00:00<?, ?datasets/s]

pyspark version: 2.4.1


loading: 100%|██████████| 52/52 [00:29<00:00,  1.79datasets/s]
Processing: 100%|██████████| 52/52 [07:05<00:00,  8.18s/datasets]


In [5]:
# Total number of processed events, summed over the executor's counts.
nevt = sum(spark_executor.counts.values())
print('processed:',nevt,'events')
# dt is in seconds; the total is reported in minutes.
print('total time: ',dt/60)
# Per-event rates are undefined when nothing was processed; skip them
# instead of raising ZeroDivisionError.
if nevt > 0 and dt > 0:
    print('μs/evt', dt/nevt*1e6)
    print('Mevt/s', nevt/dt/1e6)
else:
    print('no events processed; per-event rates are undefined')


processed: 290837857 events
total time:  7.7722529927889505
μs/evt 1.6034198036582872
Mevt/s 0.6236669883448159


In [7]:
from coffea import hist
import gzip
import pickle
import numexpr
import numpy as np

# Gather the arrays stored in the `_sumw` mapping of every histogram in the
# accumulator; entries that are not hist.Hist are skipped.
sumw_arrays = [arr
               for h in final_accumulator.values() if isinstance(h, hist.Hist)
               for arr in h._sumw.values()]

# nbins counts every allocated bin; nfilled counts the bins with a positive
# sum of weights.
nbins = sum(arr.size for arr in sumw_arrays)
nfilled = sum(np.sum(arr>0) for arr in sumw_arrays)
print("Processed %.1fM events" % (nevt/1e6, ))
print("Filled %.1fM bins" % (nbins/1e6, ))
# Guard against an accumulator without histograms so that the summary cannot
# abort the cell before the results are written out.
print("Nonzero bins: %.1f%%" % (100*nfilled/nbins if nbins else 0.0, ))

# Pickle is not very fast or memory efficient, will be replaced by something better soon
with lz4f.open("hists.cpkl.lz4", mode="wb", compression_level=6) as fout:
    cpkl.dump(final_accumulator, fout)

#dt = time.time() - tstart
#print("%.2f us*cpu/event overall" % (1e6*dt*nworkers/final_accumulators['nentries'], ))


Processed 290.8M events
Filled 233.1M bins
Nonzero bins: 0.9%


In [6]:
# Stop the Spark session created at the top of the notebook.
spark.stop()
