### When running this notebook via the Galaxy portal
You can access your data via the dataset number. Using a Python kernel, you can access dataset number 42 with ``handle = open(get(42), 'r')``.
To save data, write your data to a file, and then call ``put('filename.txt')``. The dataset will then be available in your galaxy history.
<br><br>Note that if you are putting/getting to/from a different history than your default history, you must also provide the history-id.
<br><br>More information including available galaxy-related environment variables can be found at https://github.com/bgruening/docker-jupyter-notebook. This notebook is running in a docker container based on the Docker Jupyter container described in that link.


# ATLAS OpenData with RDataFrame

This notebook uses <a href="https://root.cern/doc/master/classROOT_1_1RDataFrame.html" target="_blank">RDataFrame</a> in ROOT to perform an analysis of the 13 TeV ATLAS OpenData. It needs ROOT version >= 6.24/02. 

## Includes and imports

The following cells import the needed libraries as well as a helper module with some useful functions for retrieving all the available samples and categorizing the backgrounds. See the output for more information.

In [None]:
import ROOT
#ROOT.EnableImplicitMT()
import os
import import_ipynb
import setPath
from Input.OpenDataPandaFramework13TeV import *
%jsroot on

In [None]:
import socket

# Show which machine the notebook kernel is running on.
hostname = socket.gethostname()
print(hostname)

In [None]:
# Integrated luminosity in pb^-1. The framework import above already provides
# `lumi`; it is repeated here so the value used for the MC scaling is explicit.
lumi = 10064.0
lumi_in_fb = lumi / 1000.0
print('Run on data corresponding to {:.2f} fb^-1'.format(lumi_in_fb))

## Get the samples and categories

Set the path to the location of the openData ntuples and the <a href="http://opendata.atlas.cern/release/2020/documentation/datasets/files.html" target="_blank">dataset</a> you want to run over. The *initialize()* function checks for all available samples in the directory and categorizes them accordingly.

In [None]:
# Location of the OpenData ntuples and the analysis (dataset) to run over.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept so
# that anything else referring to it keeps working.
dir = "/storage/shared/data/fys5555/ATLAS_opendata/"
#dir = "/storage/shared/data/fys5555/ATLAS_opendata/RNTuples/" #use RNtuple
ana = "2lep"
# os.path.join avoids the doubled "//" that plain string concatenation
# produced, since `dir` already ends with a slash.
mcfiles = initialize(os.path.join(dir, ana, "MC"))
datafiles = initialize(os.path.join(dir, ana, "Data"))
# One {process: info} dictionary holding both simulation and data.
allfiles = {**mcfiles, **datafiles}

In [None]:
# Build one RDataFrame per process. If a merged ("hadded") file named
# <folder>/<process>.root exists next to the first sample file it is used
# instead of the individual sample files.
df = {}
processes = []  # processes for which a dataframe could actually be created
for p, info in allfiles.items():
    samples = list(info["files"])
    if not samples:
        # Without this guard `haddfile` would be undefined (or still hold the
        # value of the previous process) and the wrong file could be opened.
        print("WARNING \t No input files found for %s, skipping it"%p)
        continue
    haddfile = os.path.join(os.path.dirname(samples[0]), "%s.root"%p)
    if os.path.isfile(haddfile):
        print("Using hadded file %s for %s"%(haddfile,p))
        df[p] = ROOT.RDataFrame("mini", haddfile)
    else:
        print("Using %i unhadded files for %s"%(len(samples),p))
        df[p] = ROOT.RDataFrame("mini", samples)
    processes.append(p)

In [None]:
# Shell command: compile the helper C++ source into the shared library
# Cfunctions.so (written to the current working directory), using the compiler
# and linker flags reported by root-config.
! g++ -shared -fPIC -o Cfunctions.so /storage/shared/software/Input/Cfunctions.cxx `root-config --cflags --glibs`

Include a pre-compiled C++ library of useful functions. Call `ROOT.help()` to see its content.

In [None]:
# Make the directory holding the helper code known to ROOT, both as a search
# path for shared libraries and as an include path (the include path is set
# twice: through the ".include" prompt command and through the interpreter API).
ROOT.gSystem.AddDynamicPath("/storage/shared/software/Input/")
ROOT.gROOT.ProcessLine(".include /storage/shared/software/Input/");
ROOT.gInterpreter.AddIncludePath("/storage/shared/software/Input/");
ROOT.gInterpreter.Declare('#include "/storage/shared/software/Input/Cfunctions.h"') # Declare the content of the helper header to the interpreter
ROOT.gSystem.Load("Cfunctions.so") # Load the compiled helper library; NOTE(review): presumably this provides isOS, isSF, ComputeInvariantMass and costhetastar used in the selection - confirm

In [None]:
# According to the notebook text this shows the content of the helper library.
# NOTE(review): presumably help() is declared in Cfunctions.h - confirm.
ROOT.help()

In [None]:
%%time
import time
# Event weight for simulated samples: lepton, trigger and pile-up scale factors
# times the generator weight, normalised to the integrated luminosity. The
# expression does not depend on the process, so it is built once.
mc_weight = "scaleFactor_ELE * scaleFactor_MUON * scaleFactor_LepTRIGGER * scaleFactor_PILEUP * mcWeight * (XSection * {} / SumWeights)".format(lumi)

for p in processes:

    print("Looking at %s"%p)

    selection = (
        df[p]
        # Good leptons: pT > 25 GeV and isolated; then count them
        .Define("goodLEP","lep_pt > 25000 && lep_etcone20/lep_pt < 0.15 && lep_ptcone30/lep_pt < 0.15")
        .Define("n_goodLEP","Sum(goodLEP)")
        # Require exactly two good leptons
        .Filter("n_goodLEP == 2","2 good leptons")
        # Flavour and charge combination of the two leptons
        .Define("isOS","isOS(lep_charge[goodLEP])")
        .Define("isSF","isSF(lep_type[goodLEP])")
        # Keep same-flavour, opposite-sign pairs
        .Filter("isSF","Same flavour")
        .Filter("isOS","Opposite sign")
        # Invariant mass of the lepton pair
        .Define("mll","ComputeInvariantMass(lep_pt[goodLEP],lep_eta[goodLEP],lep_phi[goodLEP],lep_E[goodLEP])")
        # cos(theta*) of the lepton pair
        .Define("costhstar","costhetastar(lep_pt[goodLEP],lep_eta[goodLEP],lep_phi[goodLEP],lep_E[goodLEP])")
    )

    # Weight used when filling histograms (includes the scaling to luminosity);
    # data events are left unweighted.
    if allfiles[p]["type"] == "data":
        weight_expr = "1.0"
    else:
        weight_expr = mc_weight
    df[p] = selection.Define("weight", weight_expr)
        

In [None]:
#hist = df["Gmumu"].Histo1D(ROOT.RDF.TH1DModel(p, "XSection",200, 0, 2), "XSection")
# Print the first rows of the columns that enter the MC event weight, as a
# quick sanity check. Each column is listed once ("XSection" used to be added
# twice).
column_names = [
    "XSection",
    "channelNumber",
    "scaleFactor_ELE",
    "scaleFactor_MUON",
    "scaleFactor_LepTRIGGER",
    "scaleFactor_PILEUP",
    "mcWeight",
    "SumWeights",
]
cols = ROOT.vector('string')()
for name in column_names:
    cols.push_back(name)
d = df["Gee"].Display(cols)
print(d.AsString())

In [None]:
#d2 = df["topX"].Display(cols)
#d2.Print()

In [None]:
# Create canvas with pad
##c = ROOT.TCanvas("c", "", 900, 700)
#c.Draw()
#pad = ROOT.TPad("upper_pad", "", 0, 0, 1, 1)
#pad.SetTickx(False)
#pad.SetTicky(False)
#pad.SetLogy()
#pad.Draw()
#pad.cd()
#hist.Draw()

In [None]:
# List every column available in the Z+jets dataframe, one name per line.
zjets_columns = df["Zjets"].GetColumnNames()
for c in zjets_columns:
    print(c)

Before defining the histograms, the axis ranges and number of bins must be set. There are pre-defined values for some of the available variables already stored in _plotdic_. You can easily add new variables or change the current ones. 

In [None]:
# Show which variables have pre-defined binning and what one entry looks like.
available = ", ".join(plotdic.keys())
print("Available variables : %s"%available)
example_entry = plotdic["lep_type"]
print("Structure of dictionary is as follows : \n",example_entry)

In [None]:
%%time
import time
# Book one weighted 1D histogram per (variable, process). Nothing is filled
# here; the result handles are collected in `allhistos`.
histos = {}
requested = ["costhstar","met_et","mll","lep_pt","lep_E","jet_pt"]
variables = []  # only the variables that were actually booked
allhistos = []
for v in requested:
    if v not in plotdic:
        # Without binning information the histogram cannot be defined; skip the
        # variable instead of failing with a KeyError on the next line.
        print("ERROR \t Could not find plot information for %s"%v)
        continue
    variables.append(v)
    binning = plotdic[v]
    histos[v] = {}
    for p in processes:
        # Histogram name is "<process>_<variable>", title is the variable name.
        model = ROOT.RDF.TH1DModel(p+"_"+v, v, binning['nbin'], binning['nmin'], binning['nmax'])
        histos[v][p] = df[p].Histo1D(model, v, "weight")
        allhistos.append(histos[v][p])

In [None]:
%%time
# Hand all booked histograms to RunGraphs in a single call.
n_booked = len(allhistos)
print("Number of histograms = %i"%n_booked)
ROOT.RDF.RunGraphs(allhistos)

In [None]:
# Sort the processes with respect to size (for plotting)

# First; get the sum of weights of every histogram: dir_sumw[variable][process]
dir_sumw = {
    v: {p: histos[v][p].GetSumOfWeights() for p in processes}
    for v in variables
}

# Second; order the processes of each variable from largest to smallest sum of
# weights. sorted() is stable, so processes with equal sums keep their original
# relative order. Unlike the previous hand-written search this also works for
# sums below -999 and for an empty process list.
sorted_sumw = {
    v: sorted(sumw, key=sumw.get, reverse=True)
    for v, sumw in dir_sumw.items()
}

In [None]:
%%time
#import time
# Retrieve the filled histograms: background histograms per variable go into
# mcbkg[v] (smallest contribution first), the data histogram into data[v].
mcbkg = {}
data = {}
for nv, v in enumerate(variables, start=1):
    # The total used to be printed as len(variables)+1, i.e. one too many.
    print("Doing variable %s (%i/%i)"%(v,nv,len(variables)))
    mcbkg[v] = []
    # sorted_sumw is ordered largest first; reversing it gives ascending order.
    for p in reversed(sorted_sumw[v]):
        sample_type = allfiles[p]["type"]
        if sample_type == "bkg":
            mcbkg[v].append(histos[v][p].GetValue())
        elif sample_type == "data":
            data[v] = histos[v][p].GetValue()

In [None]:
# Show the process ordering for `v`, which still holds the last variable
# iterated over in a previous cell.
sorted_sumw[v]

In [None]:
# Add legend
legend = ROOT.TLegend(0.60, 0.60, 0.8, 0.85)
legend.SetTextFont(42)
legend.SetFillStyle(0)
legend.SetBorderSize(0)
legend.SetTextSize(0.04)
#legend.SetTextAlign(32)

# Processes that are booked but must not enter the stack
donotplot = ["Zjetsincl","Wjetsincl"]

# Stack with the MC contributions
stack = ROOT.THStack()

# Set the variable to plot
v = "costhstar"
name_suffix = "_" + v
for h in mcbkg[v]:
    # Histograms are named "<process>_<variable>". Strip the known suffix so
    # that a process name containing "_" is recovered intact; fall back to the
    # old split for any histogram not following that pattern.
    hname = h.GetName()
    if hname.endswith(name_suffix):
        p = hname[:-len(name_suffix)]
    else:
        p = hname.split("_")[0]
    if p in bkg_plot_dic.keys():
        print(p)
        color = bkg_plot_dic[p]["color"]
        # The dictionary entry is unpacked into the arguments of TColor.GetColor
        fill_color = ROOT.TColor.GetColor(*color)
    else:
        print("Could not find color for %s"%p)
        color = ROOT.kWhite
        # kWhite already is a colour index; unpacking it with * (as was done
        # before) raises a TypeError, so it is used directly.
        fill_color = color
    print(color)
    if p in donotplot: continue
    h.SetLineWidth(1)
    h.SetLineColor(1)
    h.SetFillColor(fill_color)
    h.SetDirectory(0)
    legend.AddEntry(h,"%-s"%p.strip(),"f")
    stack.Add(h)

In [None]:
# Create canvas with an upper pad (stack and data) and a lower pad (ratio)
c = ROOT.TCanvas("c", "", 900, 700)
c.Draw()
ROOT.gStyle.SetOptStat(0)
pad = ROOT.TPad("upper_pad", "", 0, 0.2, 1, 1.0)
pad2 = ROOT.TPad("lower_pad", "", 0, 0, 1, 0.2)
pad.SetTickx(False)
pad.SetTicky(False)
pad.SetBottomMargin(0.005)
pad.SetLogy()
pad.Draw()
pad2.Draw()
pad.cd()
stack.Draw("HIST")

# Ratio of the summed MC to data. GetStack().Last() is the stack's own summed
# histogram, so the division is done on a clone; dividing the original in
# place would change one of the histograms the stack itself holds.
sumMC = stack.GetStack().Last().Clone("sumMC_ratio_" + v)
sumMC.SetDirectory(0)
sumMC.Divide(data[v])

# Style the stack with MC contributions
stack.GetXaxis().SetLabelSize(0.04)
stack.GetXaxis().SetTitleSize(0.045)
stack.GetXaxis().SetTitleOffset(1.3)
# Label the axis with the plotted variable (the previous hard-coded transverse
# mass label did not match any of the variables booked in this notebook).
stack.GetXaxis().SetTitle(v)
stack.GetYaxis().SetTitle("Events")
stack.GetYaxis().SetLabelSize(0.04)
stack.GetYaxis().SetTitleSize(0.045)
stack.SetMaximum(1e7 * lumi*1e3)
stack.SetMinimum(10)

# Draw data
data[v].SetMarkerStyle(20)
data[v].SetMarkerSize(1.2)
data[v].SetLineWidth(2)
data[v].SetLineColor(ROOT.kBlack)
data[v].Draw("E SAME")

# Draw legend
legend.Draw("SAME")

# Add ATLAS label
text = ROOT.TLatex()
text.SetNDC()
text.SetTextFont(72)
text.SetTextSize(0.045)
text.DrawLatex(0.21, 0.86, "ATLAS")
text.SetTextFont(42)
text.DrawLatex(0.21 + 0.09, 0.86, "Open Data")
text.SetTextSize(0.04)
text.DrawLatex(0.21, 0.80, "#sqrt{{s}} = 13 TeV, {:.1f} fb^{{-1}}".format(lumi / 1000.0))

# Lower pad: MC/data ratio
pad2.cd()
pad2.SetGridy()
pad2.SetTopMargin(0.01)
pad2.SetTickx(False)
pad2.SetTicky(False)
sumMC.SetTitle("")
sumMC.GetXaxis().SetLabelSize(0.15)
sumMC.GetYaxis().SetLabelSize(0.15)
# Show ratios between 0 and 2 (minimum and maximum were both set to 2 before,
# which leaves no visible range).
sumMC.SetMaximum(2)
sumMC.SetMinimum(0)
sumMC.Draw("ep")

In [None]:
# Request the cut-flow report of every process' dataframe and keep it under the
# process name.
allcuts = {}
for p in processes:
    report = df[p].Report()
    allcuts[p] = report
    print(p)

In [None]:
def convertRDFCutflowToTex(cutflow, tex = False):
    """Print a cut-flow table with one row per process.

    For every cut four numbers are shown: events passing, events entering,
    the efficiency reported by the cut record (GetEff) and pass/all in percent.

    Args:
        cutflow: mapping of process name to an iterable of cut records. Every
            record must provide GetName(), GetPass(), GetAll() and GetEff().
        tex: if true, print LaTeX table rows (cells separated by "&", rows
            terminated by a double backslash); otherwise print a fixed-width
            plain-text table.

    Returns:
        None. The table is written to standard output.
    """
    def _tex_escape(name):
        # "<" and ">" must be typeset in math mode in LaTeX.
        return name.replace(">","$>$").replace("<","$<$")

    cut_names = []
    tex_rows = []
    std_rows = []
    for p in cutflow.keys():
        cuts = list(cutflow[p])
        # The header is taken from the first process that has any cuts.
        if not cut_names:
            cut_names = [c.GetName() for c in cuts]
        tex_cells = ["%s" %p]
        std_row = "%-10s " %p
        for c in cuts:
            n_pass = c.GetPass()
            n_all = c.GetAll()
            # NOTE(review): this equals what GetEff is expected to return;
            # kept as in the original. Guarded so that a cut which no event
            # reached does not raise ZeroDivisionError.
            percent = (n_pass/n_all)*100. if n_all else 0.0
            numbers = (n_pass, n_all, c.GetEff(), percent)
            std_row += "| %9.0f %9.0f %5.1f %5.1f "%numbers
            tex_cells.append("$%.0f$ & $%.0f$ & $%.2f$ & $%.2f$"%numbers)
        # Joining with " & " also separates the numbers of consecutive cuts,
        # which previously ended up in the same table cell.
        tex_rows.append(" & ".join(tex_cells) + " \\\\ \n")
        std_rows.append(std_row + "\n")

    if tex:
        # Every cut contributes four numeric columns, so its name spans four.
        header_cells = ["Background"]
        header_cells += ["\\multicolumn{4}{c}{%s}" %_tex_escape(n) for n in cut_names]
        print(" & ".join(header_cells) + " \\\\ \n")
        print("".join(tex_rows))
    else:
        stdheadstr = "%10s "%"Background"
        for n in cut_names:
            stdheadstr += "| {:32s}".format(n.strip())
        print(stdheadstr + "\n")
        print("".join(std_rows))

In [None]:
# Print the cut flow of all processes as a plain-text table (tex defaults to False).
convertRDFCutflowToTex(allcuts)

In [None]:
import os
import subprocess

# Ask root-config for the ROOT installation prefix and keep it as a clean string.
root_config = subprocess.run(["root-config", "--prefix"], capture_output=True)
prefix_path = root_config.stdout.decode().strip()