### When running this notebook via the Galaxy portal
You can access your data via the dataset number. Using a Python kernel, you can access dataset number 42 with ``handle = open(get(42), 'r')``.
To save data, write your data to a file, and then call ``put('filename.txt')``. The dataset will then be available in your galaxy history.
<br><br>Note that if you are putting/getting to/from a different history than your default history, you must also provide the history-id.
<br><br>More information including available galaxy-related environment variables can be found at https://github.com/bgruening/docker-jupyter-notebook. This notebook is running in a docker container based on the Docker Jupyter container described in that link.


In [None]:
# Widen the notebook container so that wide tables and figures are not clipped.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:70% !important; }</style>"))
#%jsroot on

In [None]:
from os import listdir
import pandas as pd
import numpy as np
import h5py
import import_ipynb
import setPath
from Input.OpenDataPandaFramework13TeV import *

import os.path
from os import path
import sys

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

#%matplotlib widget

In [None]:
# Directories holding the hdf5 inputs for simulation (MC) and data.
mcdir = "/storage/shared/software/Input/MC/hdf5/"
datadir = "/storage/shared/software/Input/Data/hdf5/"

# os.path.isdir is used instead of the bare name `path`, so that the check
# keeps working even if `path` is rebound to something else in the notebook.
files_exist = os.path.isdir(mcdir) and os.path.isdir(datadir)
if not files_exist:
    print("Can not find hdf5 files. Have you run ConvertNtupToHdf5.ipynb?")

In [None]:
# Repeat the missing-input warning inside a banner of '#' so it stands out.
if not files_exist:
    banner = "#"*100
    print(banner,
          "WARNING \t Can not find hdf5 files. Have you run ConvertNtupToHdf5.ipynb? Make sure these file exists before continuing...",
          banner,
          sep="\n")

In [None]:
# Only files whose name contains this tag are processed.
skimtag = "_4L_"

# Fraction of the events that is sampled into the training set when splitting.
# (rSignalEvents is presumably the analogous fraction for signal samples.)
rBackgroundEvents = 0.5
rSignalEvents = 0.5

# Background categories that are left out.
excluded_backgrounds = ('Wjetsincl', 'Zjetsincl')

# Build a filtered copy rather than calling list.remove(): remove() raises
# ValueError when a category is absent and mutates the list handed out by
# getBkgCategories() in place, which makes re-running this cell fragile.
Backgrounds = [bkg for bkg in getBkgCategories() if bkg not in excluded_backgrounds]
Signals = getSignalCategories()

In [None]:
# Select the MC inputs of the current skim: .h5 files that carry the skim tag
# and are not testing_/training files stored in the same directory.
root_files = [
    fname for fname in listdir(mcdir)
    if fname.endswith('.h5')
    and skimtag in fname
    and 'testing_' not in fname
    and 'training' not in fname
]
input_listing = "\n\t".join(sorted(root_files))
print("Will load the following {:d} files:\n\t{:s}".format(len(root_files), input_listing))

In [None]:
# Split every MC sample (DSID) at random into a training and a testing part and
# store the parts per category in
#   <mcdir>/training_<category>_<skimtag>.h5 and <mcdir>/testing_<category>_<skimtag>.h5
# under the key 'result'.

SPLIT_SEED = 42  # fixed seed: the same split is produced on every run


def _append_result(frame, out_path):
    """Store `frame` under key 'result' in `out_path`, keeping rows already there.

    Writing a fixed-format frame with ``to_hdf(..., key='result', mode='a')``
    replaces an existing 'result' node instead of extending it. Rows written
    earlier in this run are therefore read back and concatenated with `frame`
    before the file is rewritten.
    """
    if os.path.isfile(out_path):
        frame = pd.concat([pd.read_hdf(out_path, 'result'), frame], axis=0)
    print("->Writing to file {:s}".format(out_path))
    frame.to_hdf(out_path, key='result', mode='w')


# Remove the outputs of earlier runs, so that _append_result only ever merges
# with files written by this run.
# NOTE(review): this removes the testing_/training_ files of every skim, not
# only those of the current `skimtag` -- confirm that this is intended.
for stale in listdir(mcdir):
    stale_path = os.path.join(mcdir, stale)
    if (os.path.isfile(stale_path) and stale.endswith(".h5")
            and stale.startswith(("testing_", "training_"))):
        os.remove(stale_path)

split_rng = np.random.RandomState(SPLIT_SEED)

# Read all the files with the given skim. They are processed in sorted order so
# that the seeded split does not depend on the order of the directory listing.
for nfile, f in enumerate(sorted(root_files), start=1):

    print("INFO  \t Opening file {:d}/{:d}: {:s}".format(nfile, len(root_files), f))
    df = pd.read_hdf(os.path.join(mcdir, f), 'mini')

    # Pieces of this file, collected per category so that every event ends up
    # in the output file of its own category.
    train_parts = {}
    test_parts = {}

    # Find the unique DSIDs in the file and split each of them separately.
    dsid = np.unique(df['channelNumber'].to_numpy())
    for nx, ids in enumerate(dsid, start=1):
        print("Doing DSID {:d} i.e. num {:d}. In this file: {:d} new DSIDs".format(int(ids), nx, len(dsid)))
        newdf = df.loc[df['channelNumber'] == ids]

        cats = np.unique(newdf['MCType'].to_numpy())
        if len(cats) > 1:
            print("ERROR \t More than one type (%s) for dsid %s"%(",".join(cats),ids))
            continue
        cat = cats[0]
        print("cat = %s"%cat)

        # Training fraction: signal categories use rSignalEvents, all others
        # rBackgroundEvents.
        frac = rSignalEvents if cat in Signals else rBackgroundEvents

        # Choose the training rows by position (not by index label), so the
        # split stays exact even if index labels are repeated in the frame.
        n_rows = len(newdf)
        in_train = np.zeros(n_rows, dtype=bool)
        in_train[split_rng.choice(n_rows, size=int(round(frac*n_rows)), replace=False)] = True
        train_parts.setdefault(cat, []).append(newdf[in_train])
        test_parts.setdefault(cat, []).append(newdf[~in_train])
        del newdf

    del df

    # Write this file's events, one testing and one training file per category.
    for cat in train_parts:
        _append_result(pd.concat(test_parts[cat], axis=0),
                       os.path.join(mcdir, "testing_%s_%s.h5"%(cat,skimtag)))
        _append_result(pd.concat(train_parts[cat], axis=0),
                       os.path.join(mcdir, "training_%s_%s.h5"%(cat,skimtag)))
    del train_parts, test_parts

In [None]:
# Inspect one testing file of the current skim. The file is looked up from
# `skimtag` instead of a hard-coded name, because the available file names
# depend on the skim tag and on the categories that were processed.
testing_examples = sorted(fname for fname in listdir(mcdir)
                          if fname.startswith("testing_") and fname.endswith(".h5") and skimtag in fname)
if not testing_examples:
    raise FileNotFoundError("No testing file with tag {:s} found in {:s}".format(skimtag, mcdir))
df = pd.read_hdf(os.path.join(mcdir, testing_examples[0]), 'result')
col = df.columns
print(col)

# Show the 'wgt' column of that file.
df[['wgt']]

In [None]:
# Inspect one training file of the current skim. The file is looked up from
# `skimtag` instead of a hard-coded name, because the available file names
# depend on the skim tag and on the categories that were processed.
training_examples = sorted(fname for fname in listdir(mcdir)
                           if fname.startswith("training_") and fname.endswith(".h5") and skimtag in fname)
if not training_examples:
    raise FileNotFoundError("No training file with tag {:s} found in {:s}".format(skimtag, mcdir))
df = pd.read_hdf(os.path.join(mcdir, training_examples[0]), 'result')
col = df.columns
print(col)

# Show the 'wgt' column of that file.
df[['wgt']]

# Plotting
The following cells show an example of how to plot the variables stored in the data frame.

First we retrieve the names of all the training and testing files just created.

In [None]:
# Names of the testing files of the current skim found in mcdir.
testing_files = [name for name in listdir(mcdir)
                 if name.startswith("testing") and skimtag in name and name.endswith('.h5')]
testing_listing = "\n\t".join(sorted(testing_files))
print("TESTING:  Will load the following {:d} files:\n\t{:s}".format(len(testing_files), testing_listing))

# Same selection for the training files.
training_files = [name for name in listdir(mcdir)
                  if name.startswith("training") and skimtag in name and name.endswith('.h5')]
training_listing = "\n\t".join(sorted(training_files))
print("TRAINING: Will load the following {:d} files:\n\t{:s}".format(len(training_files), training_listing))

Then we load the files. Whether to load the whole set (testing + training) or only one of them can be specified in *load_files*.

In [None]:
# Load the training files and combine them into a single data frame, X_train.
# All frames are collected in a list and concatenated once, so re-running the
# cell always rebuilds X_train from scratch. No bare `except` is used to detect
# the first file: it would also swallow a genuine concat error and silently
# restart X_train from the file being read.
nfile = 0
load_files = training_files
train_frames = []
for nfile, f in enumerate(load_files, start=1):
    print("INFO  \t Opening file {:d}/{:d}: {:s}".format(nfile, len(load_files), f))
    train_frames.append(pd.read_hdf(mcdir+"/"+f, 'result'))

if train_frames:
    X_train = pd.concat(train_frames, axis=0)
else:
    print("ERROR \t No training files to load from {:s}".format(mcdir))
    X_train = pd.DataFrame()
del train_frames

In [None]:
# Load the testing files and combine them into a single data frame, X_test.
# All frames are collected in a list and concatenated once, so re-running the
# cell always rebuilds X_test from scratch. No bare `except` is used to detect
# the first file: it would also swallow a genuine concat error and silently
# restart X_test from the file being read.
nfile = 0
load_files = testing_files
test_frames = []
for nfile, f in enumerate(load_files, start=1):
    print("INFO  \t Opening file {:d}/{:d}: {:s}".format(nfile, len(load_files), f))
    test_frames.append(pd.read_hdf(mcdir+"/"+f, 'result'))

if test_frames:
    X_test = pd.concat(test_frames, axis=0)
else:
    print("ERROR \t No testing files to load from {:s}".format(mcdir))
    X_test = pd.DataFrame()
del test_frames

In [None]:
# Show the rows of the testing sample whose 'isSignal' value equals 1.
X_test[X_test['isSignal'].eq(1)]

In [None]:
# Show all columns of one row of the testing sample (the row at position 5).
X_test.iloc[5, :]

In [None]:
# Add data
data_files = [f for f in listdir(datadir) if (f.endswith('.h5') and f.startswith("data"+skimtag))]
print("Will load the following {:d} file(s) for data:\n\t{:s}".format(len(data_files),"\n\t".join(sorted(data_files))))

In [None]:
#notuse = ["Wjetsincl","Zjetsincl"]
#Backgrounds = []
#for tf in testing_files:
#    key = tf.split("_")[1]
#    if key in notuse: continue
#    if key in Signals: continue
#    Backgrounds.append(key)
#Backgrounds

In [None]:
# Concatenate the data (not strictly needed if only 1 file)
nfile = 0
for f in data_files:
    print("INFO  \t Opening file {:d}/{:d}: {:s}".format(nfile+1,len(data_files),f))
    df = pd.read_hdf(datadir+"/"+f, 'mini')
    try:
        X_data = pd.concat([X_data,df],axis=0)
    except:
        X_data = df
    del [df]  
    nfile += 1

Some plotting-specific settings (order of plotting, color of backgrounds). The *stack_order* must have the same keys as in the *MCType* column in the data frame.

In [None]:
# Plotting order: 'Data' first, then the background categories, then "Gee".
stack_order = ['Data'] + Backgrounds + ["Gee"]

# Categories present in the training frame.
bkgs = X_train['MCType'].unique()

# Report every non-data entry of stack_order that is missing from the frame.
for s in stack_order:
    if "Data" not in s and s not in bkgs:
        print("ERROR \t Key {:s} is not in panda".format(s))

In [None]:
# Call getSamplesInCategory for the "Gee" category (the function is not defined
# in this notebook; presumably it comes from the star import of the framework).
# The trailing ';' keeps the notebook from displaying the returned value.
# NOTE(review): the result is discarded -- confirm this call is only wanted for
# whatever it prints.
getSamplesInCategory("Gee");

In [None]:
# For every background category in stack_order, sum the 'wgt' column of the
# training and testing frames and store the ratios
#   (train + test) / test   and   (train + test) / train.
# Entries that are not in Backgrounds keep the initial value 0.
# NOTE(review): a category with a zero weight sum in either frame gives a
# division by zero here -- confirm this cannot happen for the inputs used.
sf = {}
for s in stack_order:
    sf.setdefault(s, {"train":0,"test":0})
    print(s)
    if s not in Backgrounds:
        continue
    rslt_df_train = X_train.loc[X_train['MCType'] == s]
    rslt_df_test  = X_test.loc[X_test['MCType'] == s]
    train_sum = rslt_df_train['wgt'].sum()
    test_sum  = rslt_df_test['wgt'].sum()
    for sample_name, weight_sum in (("Train:", train_sum), ("Test :", test_sum)):
        print("%s %s %s"%(sample_name, s, weight_sum))
    sf[s]["test"] = (train_sum+test_sum)/test_sum
    sf[s]["train"] = (train_sum+test_sum)/train_sum

Extract the numpy arrays from the pandas data frame (specify the variable of interest in *var*). Here the limits, bin width etc. are set.

In [None]:
# Containers for the histogram inputs: backgrounds (MC), data and signal.
mc_mll, mc_weights, mc_colors, mc_labels = [], [], [], []
data_mll, data_mll_errors = [], []
sig_mll, sig_mll_errors, sig_weights = [], [], []

# DSID (channelNumber) of the sample drawn as signal.
signal_dsid = 341122

# Variable to plot and the running maximum of its values over the backgrounds.
var = "lep1_pt"
top = -999

# Histogram range and bin width.
nmax = 1000
nmin = 0
binw = 20


def _scaled_column(frame, name):
    """Return column `name` of `frame` as an (N, 1) numpy array divided by 1000."""
    return frame.iloc[:, [frame.columns.get_loc(name)]].to_numpy()/1000.


# Bin edges from nmin upwards in steps of binw, and the bin centres (x positions of the data points).
n_bins = (nmax-nmin)/binw
if not n_bins.is_integer():
    print("ERROR \t Limits and bin width are not compatible")
bins = [nmin + (x*binw) for x in range(int(n_bins)+1)]
data_x = [low + (high-low)/2 for low, high in zip(bins[:-1], bins[1:])]

# NOTE(review): the 'wgt' column is divided by 1000 exactly like the plotted
# variable -- confirm that scaling the weights this way is intended.
for s in stack_order:
    if s == "Data":
        # Data: bin counts, with sqrt(N) as the uncertainty of each bin.
        data_mll,_ = np.histogram(_scaled_column(X_data, var), bins=bins)
        data_mll_errors = np.sqrt(data_mll)
    elif s in Signals:
        # Signal: training events of the chosen DSID.
        rslt_df = X_train.loc[X_train['channelNumber'] == signal_dsid]
        sig_mll.append(_scaled_column(rslt_df, var))
        sig_weights.append(_scaled_column(rslt_df, "wgt"))
    elif s in Backgrounds:
        # Background: training events of this category.
        rslt_df = X_train.loc[X_train['MCType'] == s]
        mc_mll.append(_scaled_column(rslt_df, var))
        mc_weights.append(_scaled_column(rslt_df, "wgt"))
        mc_colors.append(bkg_plot_dic[s]['color'])
        mc_labels.append(s)
        top = max(top, np.amax(mc_mll[-1]))
        del rslt_df


# One array per background (kept as an object array), and the signal arrays.
mc_mll_array = np.array(mc_mll,dtype='object')
mc_weights_array = np.array(mc_weights,dtype='object')

sig_mll_array = np.array(sig_mll,dtype='float32')[0]
sig_weights_array = np.array(sig_weights,dtype='float32')[0]
data_mll

Finally, do the plotting:

In [None]:
%matplotlib inline
figure(num=None, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
plt.hist(x=mc_mll_array,bins=bins,weights=mc_weights_array,stacked=True, label=mc_labels); #weights=mc_weights,color=mc_colors,
plt.errorbar( x=data_x, y=data_mll, yerr=data_mll_errors, fmt='ko', label='Data')
plt.hist(x=sig_mll_array,bins=bins,stacked=True,weights=sig_weights_array, label=['Signal'])
plt.yscale('log')
plt.ylabel(r'Events',fontname='sans-serif',horizontalalignment='right',y=1.0,fontsize=11)
plt.xlabel(r'$M_{ll}$ [GeV]',fontname='sans-serif',horizontalalignment='right',x=1.0,fontsize=11)

plt.ylim(bottom=0.001,top=50000)#top/40.)

ax = plt.gca()
plt.text(0.05,0.97,r'$\mathbf{{ATLAS}}$ Open Data',ha="left",va="top",family='sans-serif',transform=ax.transAxes,fontsize=13)
plt.text(0.05,0.92,'for education only',ha="left",va="top",family='sans-serif',transform=ax.transAxes,style='italic',fontsize=8)
plt.text(0.05,0.90,r'$\sqrt{s}=13\,\mathrm{TeV},\;\int L\,dt=10\,\mathrm{fb}^{-1}$',ha="left",va="top",family='sans-serif',transform=ax.transAxes)


plt.legend()
