In [1]:
import ast
import os,sys

import numpy as np
import pandas as pd

In [2]:
# Input HEPData tables: one file with the signal templates and one with the cut flow.
# The spin-1 mediator pair is kept commented out; to process it, uncomment these
# two lines and comment out the spin-0 pair below.
#sigTemplatesFile = './HEPData-ins1894408-v2-csv/Signaltemplates,DMsimp,spin-1,Monojet.csv'
#cutFlowFile = './HEPData-ins1894408-v2-csv/CutflowforMonojet,DM,spin-1mediator.csv'

# Spin-0 mediator pair (the one currently used)
sigTemplatesFile = './HEPData-ins1894408-v2-csv/Signaltemplates,DMsimp,spin-0,Monojet.csv'
cutFlowFile = './HEPData-ins1894408-v2-csv/CutflowforMonojet,DM,spin-0mediator.csv'

In [3]:
# Try to evaluate data if possible
def myeval(s):
    """
    Convert a raw text field to a Python value when possible.

    The '#:' tag and all blanks are removed from the text first. The cleaned
    text is then parsed as a Python literal (number, quoted string, tuple, ...).

    :param s: raw field text (string)

    :return: the parsed literal, or the cleaned string itself if it is not
             a valid literal (e.g. a label such as 'Scalar')
    """
    s = s.replace('#:','').strip().replace(' ','')
    try:
        # literal_eval only accepts literals, so text coming from the data file
        # is never executed as code and never resolves to a builtin or module
        # (a plain eval would return the builtin for 'id' and crash on '1/0').
        return ast.literal_eval(s)
    except (ValueError,TypeError,SyntaxError,MemoryError,RecursionError):
        return s

# Set default column names    
def fixColumnLabel(c):
    """
    Convert a raw column label to the default naming used in the data frames.

    'Coupling type' is shortened to 'Coupling' and 'Production mode' to 'Mode';
    the '#:' tag, blanks and commas are removed and trailing underscores are
    stripped.

    :param c: raw column label (string)

    :return: the standardized label (may be an empty string)
    """

    newC = c.replace('Coupling type','Coupling')
    newC = newC.replace('Production mode','Mode')
    newC = newC.replace('#:','').replace(' ','')
    newC = newC.replace(',','')
    # rstrip also copes with labels that end up empty (or made only of '_'),
    # for which looking at the last character would raise an IndexError
    return newC.rstrip('_')

def getDictFrom(block):
    """
    Convert the text of one block into a dictionary of single-element lists.

    Each line with at least two comma-separated fields gives one entry: the
    last field is the value (converted with myeval) and the remaining non-blank
    fields, joined by '_' and cleaned with fixColumnLabel, give the key.
    After the recoil-bin header line is found, every following line is read as
    a bin row: fields 1-3 give the key ('bin_' + the two edges) and the value,
    while the last two fields are stored under the key with the '_ErrorPlus'
    and '_ErrorMinus' suffixes.
    The two table header lines and lines with fewer than two fields are skipped.

    :param block: text of the block (lines separated by newline characters)

    :return: dictionary {column name : [value]}
    """

    binsHeader = 'Recoil (GeV),Recoil (GeV) LOW,Recoil (GeV) HIGH'
    cutFlowHeader = 'Cut stage,Fraction of passing events'

    parsed = {}
    inBinTable = False
    for rawLine in block.split('\n'):
        # Table headers carry no data; the bin header also switches the parsing mode
        if binsHeader in rawLine:
            inBinTable = True
            continue
        if cutFlowHeader in rawLine:
            continue

        fields = rawLine.split(',')
        if len(fields) < 2:
            continue

        errPlus = errMinus = None
        if inBinTable:
            # The uncertainties are taken from the full row before it is trimmed
            errPlus = myeval(fields[-2])
            errMinus = myeval(fields[-1])
            fields = fields[1:4]
            fields[0] = 'bin_'+fields[0]

        labelParts = [f for f in fields[:-1] if f.strip()]
        key = fixColumnLabel('_'.join(labelParts))
        parsed[key] = [myeval(fields[-1])]
        if inBinTable:
            parsed[key+'_ErrorPlus'] = [errPlus]
            parsed[key+'_ErrorMinus'] = [errMinus]

    return parsed

### Get cut flow data

In [4]:
# Get blocks for each parameter point (it is assumed they are separated by the '#: Coupling' tag)
with open(cutFlowFile,'r') as f:
    data = f.read()
# Skip the header (everything before the first tag) and restore the
# 'Coupling' label that the split removed from the start of each block
blocks = ['Coupling'+b for b in data.split('#: Coupling')[1:]]
if not blocks:
    raise ValueError("No '#: Coupling' blocks found in %s" % cutFlowFile)

# Build one single-row data frame per block and concatenate them in one go
# (growing the frame with pd.concat inside a loop re-copies it at every step)
df = pd.concat([pd.DataFrame(getDictFrom(b)) for b in blocks],ignore_index=True)


### Get signal template data

In [5]:
# Get blocks for each parameter point (it is assumed they are separated by the '#: Coupling' tag)
with open(sigTemplatesFile,'r') as f:
    data = f.read()
# Skip the header (everything before the first tag) and restore the
# 'Coupling' label that the split removed from the start of each block
blocks = ['Coupling'+b for b in data.split('#: Coupling')[1:]]
if not blocks:
    raise ValueError("No '#: Coupling' blocks found in %s" % sigTemplatesFile)

# Build one single-row data frame per block and concatenate them in one go
# (growing the frame with pd.concat inside a loop re-copies it at every step)
dfB = pd.concat([pd.DataFrame(getDictFrom(b)) for b in blocks],ignore_index=True)


### Make sure values are standardized

In [6]:
# All LaTeX labels below are raw strings: '\c', '\D', '\p' and '\m' are not valid
# Python escape sequences and trigger a warning when written in plain literals.

# Rename column to match dfB
df = df.rename(columns={r'$g_{\chi}$' : '$g_{DM}$'})
# Rename values to match dfB
df['Mode'] = df['Mode'].replace({r'$\chi\chi+j$' : 'DM+QCDjets',
                                 r'$\chi\chi$+Z(qq)' : 'DM+Z(qq)',
                                 r'$\chi\chi$+W(qq)' : 'DM+W(qq)'})

# Cut labels as produced by the parsing (blanks removed, ',' replaced by '_',
# quotes kept) mapped to readable labels; the same mapping is applied to both frames
cutLabels = {r'"$\Delta\phi(jet_p_{T}^{miss})>0.5$rad"' : r'$\Delta \phi (jet,p_{T}^{miss})>0.5$ rad',
             r'$\Deltap_{T}^{miss}$(PF-Calorimeter)$<0.5$rad' : r'$\Delta p_{T}^{miss}$ (PF-Cal)$<0.5$ rad',
             r'"$\Delta\phi(\mathrm{PF}_\mathrm{Charged})<2.0$rad"' : r'$\Delta \phi (\mathrm{PF}_\mathrm{Charged})<2.0$ rad'}
df = df.rename(columns=cutLabels)
dfB = dfB.rename(columns=cutLabels)


### Merge cut flow and signal template

In [7]:
# Merge on every column present in both frames. The list follows the column
# order of df, so it is identical on every run (iterating over a set of strings is not).
commonColumns = [c for c in df.columns if c in dfB.columns]
# how='right': every signal template row is kept, even without a matching cut flow row
dfComb = pd.merge(df,dfB,on=commonColumns,how='right')

# Filter monojet only (it seems monov is also present for spin0)
dfComb = dfComb[dfComb['Mode'] == 'DM+QCDjets']

# Sort into a new frame instead of modifying the filtered selection in place
dfComb = dfComb.sort_values(['Coupling','Mode','$m_{med}$','$m_{DM}$','Data-takingperiod'],
                            ascending=[False,False,True,True,True])

### Compute total cross-section

In [8]:
# Columns holding the yield of each recoil bin (their uncertainty columns are excluded)
binCols = [c for c in dfComb.columns if ('bin_' in c and 'Error' not in c)]
# Column used as the pre-selection efficiency
# (raw string: '\D', '\p' and '\m' are not valid Python escape sequences)
effCol = r'$\Delta \phi (\mathrm{PF}_\mathrm{Charged})<2.0$ rad'
# Total yield = sum over MET bins/pre-selection efficiency
# (kept as a local series, so no auxiliary column has to be added and dropped)
totalYield = dfComb[binCols].sum(axis=1)/dfComb[effCol]
# Add luminosity
luminosities = {2016: 36.0, 2017 : 41.5, 2018 : 59.7}
# Rows with a data-taking period not listed above keep the 2016 value
dfComb['Luminosity (1/fb)'] = luminosities[2016]
for dp, lum in luminosities.items():
    dfComb.loc[dfComb['Data-takingperiod'] == dp, 'Luminosity (1/fb)'] = lum
# Compute total cross-section for the corresponding luminosities:
# yield/luminosity is in fb and the 1e3 factor converts it to pb
dfComb['Total xsec-pT150 (pb)'] = totalYield/(1e3*dfComb['Luminosity (1/fb)'])

In [9]:
# Show the combined table (the notebook output abbreviates long/wide frames with '...')
dfComb

Unnamed: 0,Coupling,Mode,$m_{med}$,$m_{DM}$,$g_{DM}$,$g_{q}$,Data-takingperiod,Fullsample,Triggeremulation,$p_{T}^{miss}>250$GeV,...,bin_1090.0_1160.0_ErrorPlus,bin_1090.0_1160.0_ErrorMinus,bin_1160.0_1250.0,bin_1160.0_1250.0_ErrorPlus,bin_1160.0_1250.0_ErrorMinus,bin_1250.0_1400.0,bin_1250.0_1400.0_ErrorPlus,bin_1250.0_1400.0_ErrorMinus,Luminosity (1/fb),Total xsec-pT150 (pb)
54,Scalar,DM+QCDjets,10.0,1.0,1.0,1.0,2016,,,,...,0.258970,-0.258970,0.42798,0.247320,-0.247320,0.57527,0.280290,-0.280290,36.0,
154,Scalar,DM+QCDjets,10.0,1.0,1.0,1.0,2017,1.0,0.42684,0.072720,...,0.402590,-0.402590,0.15638,0.156380,-0.156380,0.00000,0.000000,0.000000,41.5,1.940543
253,Scalar,DM+QCDjets,10.0,1.0,1.0,1.0,2018,1.0,0.37859,0.070156,...,0.236330,-0.236330,0.34179,0.341790,-0.341790,1.44020,0.610920,-0.610920,59.7,1.955077
55,Scalar,DM+QCDjets,10.0,4.0,1.0,1.0,2016,,,,...,0.197390,-0.197390,0.00000,0.000000,0.000000,0.13498,0.134980,-0.134980,36.0,
155,Scalar,DM+QCDjets,10.0,4.0,1.0,1.0,2017,1.0,0.42616,0.072360,...,0.269890,-0.269890,0.00000,0.000000,0.000000,0.19305,0.193050,-0.193050,41.5,1.952818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,Pseudoscalar,DM+QCDjets,700.0,1.0,1.0,1.0,2017,1.0,0.70783,0.331170,...,0.041947,-0.041947,0.27017,0.035313,-0.035313,0.29457,0.037139,-0.037139,41.5,0.054702
222,Pseudoscalar,DM+QCDjets,700.0,1.0,1.0,1.0,2018,1.0,0.67941,0.329680,...,0.059003,-0.059003,0.37246,0.051949,-0.051949,0.41745,0.055014,-0.055014,59.7,0.055569
24,Pseudoscalar,DM+QCDjets,800.0,1.0,1.0,1.0,2016,,,,...,0.020855,-0.020855,0.17397,0.018789,-0.018789,0.24293,0.022047,-0.022047,36.0,
124,Pseudoscalar,DM+QCDjets,800.0,1.0,1.0,1.0,2017,1.0,0.72145,0.357200,...,0.025137,-0.025137,0.19303,0.022354,-0.022354,0.20421,0.022418,-0.022418,41.5,0.027736


In [10]:
# Collect the distinct (mDM, mMed) pairs available for the scalar coupling
scalarRows = dfComb[dfComb['Coupling'] == 'Scalar']
massPairs = sorted(set(zip(scalarRows['$m_{DM}$'],scalarRows['$m_{med}$'])))
print(len(massPairs))
print(massPairs)

25
[(1.0, 10.0), (1.0, 50.0), (1.0, 100.0), (1.0, 200.0), (1.0, 300.0), (1.0, 350.0), (1.0, 400.0), (1.0, 450.0), (1.0, 500.0), (1.0, 600.0), (1.0, 700.0), (1.0, 800.0), (4.0, 10.0), (6.0, 10.0), (20.0, 50.0), (22.0, 50.0), (28.0, 50.0), (40.0, 100.0), (45.0, 100.0), (50.0, 500.0), (55.0, 100.0), (150.0, 500.0), (200.0, 500.0), (225.0, 500.0), (275.0, 500.0)]


In [11]:
# Save to pickle file
# The output name is built from the signal template file name: commas become
# underscores, the extension and the first '_'-separated token are dropped
# and '_DF.pcl' is appended.
baseName = os.path.basename(sigTemplatesFile)
stem = os.path.splitext(baseName.replace(',','_'))[0]
pickleFile = stem.split('_',1)[1]+'_DF.pcl'
print('Saving to',pickleFile)
dfComb.to_pickle(pickleFile)

Saving to DMsimp_spin-0_Monojet_DF.pcl
