In [1]:
import numpy as np
import pandas as pd
import os,sys
# Show up to 200 rows when a DataFrame is displayed
pd.options.display.max_rows = 200

In [2]:
# Input CSV files. All of them live in the same directory;
# change `selectedModel` to process a different signal model.
hepDataDir = './HEPData-ins1894408-v2-csv/'
inputFiles = {
    # model : (signal templates file, cut-flow file)
    'spin-1' : ('Signaltemplates,DMsimp,spin-1,Monojet.csv',
                'CutflowforMonojet,DM,spin-1mediator.csv'),
    'spin-0' : ('Signaltemplates,DMsimp,spin-0,Monojet.csv',
                'CutflowforMonojet,DM,spin-0mediator.csv'),
    'ADD' : ('Signaltemplates,ADD,Monojet.csv',
             'CutflowforMonojet,ADD.csv'),
}
selectedModel = 'spin-0'

sigTemplatesFile = hepDataDir + inputFiles[selectedModel][0]
cutFlowFile = hepDataDir + inputFiles[selectedModel][1]


In [3]:
# Try to evaluate data if possible
def myeval(s):
    """
    Convert a raw CSV field to a Python value when possible.

    The '#:' marker, the leading/trailing whitespace and all blanks are removed
    first. The cleaned text is then evaluated as a Python expression
    (e.g. '1.0' -> 1.0, '2016' -> 2016).

    :param s: raw field (string)

    :return: the evaluated value or, if the text can not be evaluated,
             the cleaned string
    """

    s = s.replace('#:','')
    s = s.strip()
    s = s.replace(' ','')
    try:
        # NOTE(review): eval() on file contents is not safe for untrusted input.
        # It is run with an empty namespace and without builtins, so that a label
        # which happens to match a builtin or a notebook variable
        # (e.g. 'max', 'id', 'data') is kept as a string instead of
        # being replaced by that object.
        return eval(s, {'__builtins__' : {}}, {})
    except Exception:
        # Anything which is not a valid expression (labels, '1/0', ...)
        # is returned as plain text
        return s

# Set default column names    
def fixColumnLabel(c):
    """
    Convert a raw label to the default column name.

    'Coupling type' and 'Production mode' are shortened to 'Coupling' and 'Mode',
    the '[TeV]' unit, the '#:' marker, blanks and commas are removed and
    trailing underscores are dropped.

    :param c: raw label (string)

    :return: cleaned label (string). It is empty if nothing is left
             after the clean up.
    """

    newC = c.replace('Coupling type','Coupling')
    newC = newC.replace('Production mode','Mode')
    newC = newC.replace('[TeV]','')
    newC = newC.replace('#:','').replace(' ','')
    newC = newC.replace(',','')
    # rstrip is also safe for empty labels (indexing the last character is not)
    return newC.rstrip('_')

def getDictFrom(block):
    """
    Parse the text of one parameter point into a dictionary of single-entry lists.

    A line 'label,...,value' is stored as {label : [value]}. Once the recoil
    table header has been found, every following line is read as a bin:
    the key 'bin_<field 1>_<field 2>' holds field 3 and the keys with the
    '_ErrorPlus'/'_ErrorMinus' suffixes hold the last two fields of the line.

    :param block: text block (string)

    :return: dictionary {column label : [value]}
    """

    binsHeader = 'Recoil (GeV),Recoil (GeV) LOW,Recoil (GeV) HIGH'
    cutFlowHeader = 'Cut stage,Fraction of passing events'

    parsed = {}
    readingBins = False
    for row in block.split('\n'):
        # Table headers hold no data (the recoil header switches to bin mode)
        if binsHeader in row:
            readingBins = True
            continue
        if cutFlowHeader in row:
            continue

        fields = row.split(',')
        # Lines without a comma can not be a label-value pair
        if len(fields) < 2:
            continue

        if readingBins:
            errPlus = myeval(fields[-2])
            errMinus = myeval(fields[-1])
            # Keep fields 1-3 only (presumably low edge, high edge and
            # bin content -- TODO confirm against the file)
            fields = fields[1:4]
            fields[0] = 'bin_' + fields[0]

        # The label is built from all the non-empty fields before the value
        labelParts = [p for p in fields[:-1] if p.strip()]
        label = fixColumnLabel('_'.join(labelParts))
        parsed[label] = [myeval(fields[-1])]
        if readingBins:
            parsed[label + '_ErrorPlus'] = [errPlus]
            parsed[label + '_ErrorMinus'] = [errMinus]

    return parsed

### Get cut flow data

In [4]:
# Get blocks for each parameter point (it is assumed each point starts
# with one of the tags below; the first tag found in the file is used)
with open(cutFlowFile,'r') as f:
    data = f.read()

blockTags = ['#: Coupling','#: $M_{D}$ [TeV]']
tag = next((t for t in blockTags if t in data), None)
if tag is None:
    # Fail here instead of silently using blocks left over from a previous run
    raise ValueError('None of the tags %s were found in %s' % (blockTags, cutFlowFile))

# Skip header (text before the first tag) and put back
# the tag which was removed by the split
blocks = [tag + b for b in data.split(tag)[1:]]

# Define data frame with one row per block (a single concat,
# instead of growing the data frame block by block)
df = pd.concat([pd.DataFrame(getDictFrom(b)) for b in blocks], ignore_index=True)


### Get signal template data

In [5]:
# Get blocks for each parameter point (it is assumed each point starts
# with one of the tags below; the first tag found in the file is used)
with open(sigTemplatesFile,'r') as f:
    data = f.read()

blockTags = ['#: Coupling','#: d,']
tag = next((t for t in blockTags if t in data), None)
if tag is None:
    # Fail here instead of silently using blocks left over from a previous cell
    raise ValueError('None of the tags %s were found in %s' % (blockTags, sigTemplatesFile))

# Skip header (text before the first tag) and put back
# the tag which was removed by the split
blocks = [tag + b for b in data.split(tag)[1:]]

# Define data frame with one row per block (a single concat,
# instead of growing the data frame block by block)
dfB = pd.concat([pd.DataFrame(getDictFrom(b)) for b in blocks], ignore_index=True)


In [6]:
# Number of rows in the cut-flow and in the signal-template tables
print(df.shape[0], dfB.shape[0])

193 299


### Make sure values are standardized

In [7]:
# Use the same column names in both tables.
# (All LaTeX labels are raw strings: '\c', '\D', '\p' and '\m' are not valid
# escape sequences and trigger a warning in regular strings.)
columnAliases = {r'$g_{\chi}$' : '$g_{DM}$', 'd' : '$d$'}
df = df.rename(columns=columnAliases)
dfB = dfB.rename(columns=columnAliases)
# Tables without these columns get default values
# (the check is made on the cut-flow table and applied to both tables)
if not 'Mode' in df.columns:
    df['Mode'] = 'DM+QCDjets'
    dfB['Mode'] = 'DM+QCDjets'
if not 'Coupling' in df.columns:
    df['Coupling'] = 'ADD'
    dfB['Coupling'] = 'ADD'

# Rename the cut-flow values to match the signal templates
# NOTE(review): the modes are replaced in df only -- presumably dfB already
# uses the 'DM+...' names; confirm against the signal templates file
df['Mode'] = df['Mode'].replace({r'$\chi\chi+j$' : 'DM+QCDjets',
                                 r'$\chi\chi$+Z(qq)' : 'DM+Z(qq)',
                                 r'$\chi\chi$+W(qq)' : 'DM+W(qq)'})
periodAliases = {7 : 2017, 8 : 2018, 6 : 2016}
df['Data-takingperiod'] = df['Data-takingperiod'].replace(periodAliases)
dfB['Data-takingperiod'] = dfB['Data-takingperiod'].replace(periodAliases)

# Readable names for the cuts (labels which are not present are ignored by rename)
cutAliases = {
    r'"$\Delta\phi(jet_p_{T}^{miss})>0.5$rad"' : r'$\Delta \phi (jet,p_{T}^{miss})>0.5$ rad',
    r'$\Deltap_{T}^{miss}$(PF-Calorimeter)$<0.5$rad' : r'$\Delta p_{T}^{miss}$ (PF-Cal)$<0.5$ rad',
    r'"$\Delta\phi(\mathrm{PF}_\mathrm{Charged})<2.0$rad"' : r'$\Delta \phi (\mathrm{PF}_\mathrm{Charged})<2.0$ rad',
}
df = df.rename(columns=cutAliases)
dfB = dfB.rename(columns=cutAliases)


### Merge cut flow and signal template

In [8]:
# Columns present in both tables (kept in the order of the cut-flow table,
# so the printed list is the same in every run)
commonColumns = [c for c in df.columns if c in dfB.columns]
print(commonColumns)

# Keep all the signal template rows
# (points without cut-flow data get NaN in the cut-flow columns)
dfComb = pd.merge(df,dfB,on=commonColumns,how='right')

# Filter monojet only (it seems monov is also present for spin0)
dfComb = dfComb[dfComb['Mode'] == 'DM+QCDjets']

# Sorting columns and whether they are sorted in ascending order
# ('$d$' is the column name after the 'd' -> '$d$' renaming done above)
sortColumnsDict = {'Coupling' : False, 'Mode' : False, '$m_{med}$' : True,
                   '$M_{D}$' : True,'$m_{DM}$' : True,'$d$' : True,'Data-takingperiod' : True}
sortColumns = [c for c in sortColumnsDict if c in dfComb.columns]
ascending = [v for c,v in sortColumnsDict.items() if c in dfComb.columns]
# sort_values returns a new data frame, so the filtered selection
# above is never modified in place
dfComb = dfComb.sort_values(sortColumns,ascending=ascending)

['$m_{med}$', 'Mode', '$m_{DM}$', 'Coupling', '$g_{q}$', '$g_{DM}$', 'Data-takingperiod']


### Compute total cross-section

In [9]:
binCols = [c for c in dfComb.columns if ('bin_' in c and not 'Error' in c)]
# Total yield = sum over MET bins/pre-selection efficiency
# (kept as a Series, so no auxiliary column has to be added and removed;
# the label is a raw string, since '\D', '\p' and '\m' are not valid escapes)
preselEff = dfComb[r'$\Delta \phi (\mathrm{PF}_\mathrm{Charged})<2.0$ rad']
totalYield = dfComb[binCols].sum(axis=1)/preselEff
# Add luminosity (36.0 is used for any period missing from the dictionary)
luminosities = {2016: 36.0, 2017 : 41.5, 2018 : 59.7}
dfComb['Luminosity (1/fb)'] = dfComb['Data-takingperiod'].map(luminosities).fillna(36.0)
# Compute total cross-section for the corresponding luminosities
# (the 1e3 factor converts the luminosity from 1/fb to 1/pb)
dfComb['Total xsec-pT150 (pb)'] = totalYield/(1e3*dfComb['Luminosity (1/fb)'])

In [10]:
# Display the combined table (cut flow + signal templates + cross-section)
dfComb

Unnamed: 0,Coupling,Mode,$m_{med}$,$m_{DM}$,$g_{DM}$,$g_{q}$,Data-takingperiod,Fullsample,Triggeremulation,$p_{T}^{miss}>250$GeV,...,bin_1090.0_1160.0_ErrorPlus,bin_1090.0_1160.0_ErrorMinus,bin_1160.0_1250.0,bin_1160.0_1250.0_ErrorPlus,bin_1160.0_1250.0_ErrorMinus,bin_1250.0_1400.0,bin_1250.0_1400.0_ErrorPlus,bin_1250.0_1400.0_ErrorMinus,Luminosity (1/fb),Total xsec-pT150 (pb)
54,Scalar,DM+QCDjets,10.0,1.0,1.0,1.0,2016,,,,...,0.25897,-0.25897,0.42798,0.24732,-0.24732,0.57527,0.28029,-0.28029,36.0,
154,Scalar,DM+QCDjets,10.0,1.0,1.0,1.0,2017,1.0,0.42684,0.07272,...,0.40259,-0.40259,0.15638,0.15638,-0.15638,0.0,0.0,0.0,41.5,1.940543
253,Scalar,DM+QCDjets,10.0,1.0,1.0,1.0,2018,1.0,0.37859,0.070156,...,0.23633,-0.23633,0.34179,0.34179,-0.34179,1.4402,0.61092,-0.61092,59.7,1.955077
55,Scalar,DM+QCDjets,10.0,4.0,1.0,1.0,2016,,,,...,0.19739,-0.19739,0.0,0.0,0.0,0.13498,0.13498,-0.13498,36.0,
155,Scalar,DM+QCDjets,10.0,4.0,1.0,1.0,2017,1.0,0.42616,0.07236,...,0.26989,-0.26989,0.0,0.0,0.0,0.19305,0.19305,-0.19305,41.5,1.952818
254,Scalar,DM+QCDjets,10.0,4.0,1.0,1.0,2018,1.0,0.37918,0.069837,...,0.62058,-0.62058,0.038339,0.038339,-0.038339,0.9227,0.47641,-0.47641,59.7,1.976468
56,Scalar,DM+QCDjets,10.0,6.0,1.0,1.0,2016,,,,...,0.014679,-0.014679,0.025443,0.014729,-0.014729,0.016362,0.011572,-0.011572,36.0,
156,Scalar,DM+QCDjets,10.0,6.0,1.0,1.0,2017,1.0,0.47423,0.1023,...,0.021725,-0.021725,0.05859,0.024553,-0.024553,0.038261,0.019391,-0.019391,41.5,0.112943
255,Scalar,DM+QCDjets,10.0,6.0,1.0,1.0,2018,1.0,0.42764,0.098906,...,0.033973,-0.033973,0.054059,0.026827,-0.026827,0.043835,0.025594,-0.025594,59.7,0.11444
68,Scalar,DM+QCDjets,50.0,1.0,1.0,1.0,2016,,,,...,0.12679,-0.12679,0.50239,0.25394,-0.25394,0.3737,0.21624,-0.21624,36.0,


In [11]:
# Collect the distinct parameter pairs: (m_DM, m_med) if the table has a
# mediator mass column, (M_D, d) otherwise
if '$m_{med}$' in dfComb.columns:
    pairColumns = ('$m_{DM}$','$m_{med}$')
else:
    pairColumns = ('$M_{D}$','$d$')
massPairs = sorted({(row[pairColumns[0]],row[pairColumns[1]])
                    for _,row in dfComb.iterrows()})
print(len(massPairs))
print(massPairs)

25
[(1.0, 10.0), (1.0, 50.0), (1.0, 100.0), (1.0, 200.0), (1.0, 300.0), (1.0, 350.0), (1.0, 400.0), (1.0, 450.0), (1.0, 500.0), (1.0, 600.0), (1.0, 700.0), (1.0, 800.0), (4.0, 10.0), (6.0, 10.0), (20.0, 50.0), (22.0, 50.0), (28.0, 50.0), (40.0, 100.0), (45.0, 100.0), (50.0, 500.0), (55.0, 100.0), (150.0, 500.0), (200.0, 500.0), (225.0, 500.0), (275.0, 500.0)]


In [12]:
# Save to pickle file
pickleFile = os.path.basename(sigTemplatesFile).replace(',','_')
pickleFile = os.path.splitext(pickleFile)[0]
pickleFile = pickleFile.split('_',1)[1]
pickleFile = pickleFile+'_DF.pcl'
print('Saving to',pickleFile)
dfComb.to_pickle(pickleFile)

Saving to DMsimp_spin-0_Monojet_DF.pcl
