In [1]:
import ast
import os,sys

import numpy as np
import pandas as pd

In [2]:
# Input CSV tables. The directory name suggests HEPData record ins1894408, version 2 (TODO confirm).
# Signal-template table: parsed further below into dfB.
sigTemplatesFile = './HEPData-ins1894408-v2-csv/Signaltemplates,DMsimp,spin-1,Monojet.csv'
# Cut-flow table: parsed further below into df.
cutFlowFile = './HEPData-ins1894408-v2-csv/CutflowforMonojet,DM,spin-1mediator.csv'

In [3]:
# Try to evaluate data if possible
def myeval(s):
    """Convert a raw CSV field into a Python value when it is a plain literal.

    The '#:' tag is removed, surrounding whitespace is stripped and all
    spaces are dropped before the conversion is attempted.

    Parameters
    ----------
    s : str
        Raw text of a single field.

    Returns
    -------
    object
        The parsed literal (int, float, ...) if the cleaned text is a valid
        Python literal, otherwise the cleaned text itself.
    """
    s = s.replace('#:','')
    s = s.strip()
    s = s.replace(' ','')
    try:
        # NOTE: this used to be eval(s). The text comes from an external file, and
        # eval would execute arbitrary expressions, return builtins for bare names
        # such as 'id', and silently compute arithmetic (e.g. '2017-2018' -> -1).
        # literal_eval only accepts literals; everything else is kept as a string.
        return ast.literal_eval(s)
    except (ValueError,SyntaxError,TypeError,MemoryError,RecursionError):
        return s

# Set default column names    
def fixColumnLabel(c):
    """Normalize a column label built from the CSV fields.

    'Coupling type' is shortened to 'Coupling' and 'Production mode' to
    'Mode'; the '#:' tag, all spaces and all commas are removed; trailing
    underscores are stripped.

    Parameters
    ----------
    c : str
        Raw column label.

    Returns
    -------
    str
        The cleaned label (possibly empty).
    """
    newC = c.replace('Coupling type','Coupling')
    newC = newC.replace('Production mode','Mode')
    newC = newC.replace('#:','').replace(' ','')
    newC = newC.replace(',','')
    # rstrip rather than indexing newC[-1] in a loop: the indexed form raised
    # IndexError when the label was empty or consisted only of underscores.
    return newC.rstrip('_')

def getDictFrom(block):
    """Parse one text block into a dict of single-element lists.

    Each comma-separated line with at least two fields yields one entry:
    the label is built from every non-blank field except the last (joined
    with '_' and cleaned by fixColumnLabel) and the value is the last field
    passed through myeval. After the recoil-bin header line has been seen,
    lines are treated as bin rows: fields 1..3 give the 'bin_<f1>_<f2>' label
    and its value, and the last two fields of the row are stored under
    '<label>_ErrorPlus' and '<label>_ErrorMinus'. Header lines are skipped.

    Parameters
    ----------
    block : str
        Text of one block, lines separated by '\\n'.

    Returns
    -------
    dict
        Maps column label -> [value].
    """
    binsHeader = 'Recoil (GeV),Recoil (GeV) LOW,Recoil (GeV) HIGH'
    cutFlowHeader = 'Cut stage,Fraction of passing events'

    parsed = {}
    inBinTable = False
    for row in block.split('\n'):
        # Table headers carry no data; the bin header also switches parsing mode
        if binsHeader in row:
            inBinTable = True
            continue
        if cutFlowHeader in row:
            continue

        fields = row.split(',')
        if len(fields) < 2:
            continue

        errors = None
        if inBinTable:
            # Errors are taken from the full row before it is trimmed
            errors = (myeval(fields[-2]), myeval(fields[-1]))
            fields = fields[1:4]
            fields[0] = 'bin_'+fields[0]

        labelParts = [field for field in fields[:-1] if len(field.strip()) > 0]
        label = fixColumnLabel('_'.join(labelParts))
        parsed[label] = [myeval(fields[-1])]
        if errors is not None:
            parsed[label+'_ErrorPlus'] = [errors[0]]
            parsed[label+'_ErrorMinus'] = [errors[1]]

    return parsed

### Get cut flow data

In [4]:
# Get blocks for each parameter point (it is assumed they are separated by the '#: Coupling' tag)
with open(cutFlowFile,'r') as f:
    data = f.read()

# split() consumes the tag, so the 'Coupling' label is restored on every chunk;
# the first chunk is the file header and is skipped.
blocks = ['Coupling'+b for b in data.split('#: Coupling')[1:]]
if not blocks:
    raise ValueError("No '#: Coupling' blocks found in "+cutFlowFile)

# Build one single-row frame per block and concatenate once
# (concatenating inside the loop re-copies the growing frame at every step).
df = pd.concat([pd.DataFrame(getDictFrom(b)) for b in blocks],ignore_index=True)


### Get signal template data

In [5]:
# Get blocks for each parameter point (it is assumed they are separated by the '#: Coupling' tag)
with open(sigTemplatesFile,'r') as f:
    data = f.read()

# split() consumes the tag, so the 'Coupling' label is restored on every chunk;
# the first chunk is the file header and is skipped.
blocks = ['Coupling'+b for b in data.split('#: Coupling')[1:]]
if not blocks:
    raise ValueError("No '#: Coupling' blocks found in "+sigTemplatesFile)

# Build one single-row frame per block and concatenate once
# (concatenating inside the loop re-copies the growing frame at every step).
dfB = pd.concat([pd.DataFrame(getDictFrom(b)) for b in blocks],ignore_index=True)


### Make sure values are standardized

In [6]:
# Raw strings are used for every LaTeX label: sequences such as '\c', '\D', '\p'
# and '\m' are invalid escapes in normal string literals (deprecated, and a
# SyntaxWarning in recent Python versions). The runtime values are unchanged.

# Rename the DM coupling column of the cut-flow frame (presumably to match the label used in dfB)
df = df.rename(columns={r'$g_{\chi}$' : r'$g_{DM}$'})
# Rename the production-mode values of the cut-flow frame (presumably to match the values used in dfB)
df['Mode'] = df['Mode'].replace({r'$\chi\chi+j$' : 'DM+QCDjets',
                                 r'$\chi\chi$+Z(qq)' : 'DM+Z(qq)',
                                 r'$\chi\chi$+W(qq)' : 'DM+W(qq)'})

# Cut labels: the keys are the labels as produced by the parser (spaces removed,
# quotes kept, inner comma turned into '_'); the values are the readable labels.
# The same mapping is applied to both frames.
cutLabelFixes = {
    r'"$\Delta\phi(jet_p_{T}^{miss})>0.5$rad"' : r'$\Delta \phi (jet,p_{T}^{miss})>0.5$ rad',
    r'$\Deltap_{T}^{miss}$(PF-Calorimeter)$<0.5$rad' : r'$\Delta p_{T}^{miss}$ (PF-Cal)$<0.5$ rad',
    r'"$\Delta\phi(\mathrm{PF}_\mathrm{Charged})<2.0$rad"' : r'$\Delta \phi (\mathrm{PF}_\mathrm{Charged})<2.0$ rad',
}
df = df.rename(columns=cutLabelFixes)
dfB = dfB.rename(columns=cutLabelFixes)


### Merge cut flow and signal template

In [7]:
# Columns present in both frames are used as the merge keys
commonColumns = list(set(df.columns) & set(dfB.columns))
# Right join: every row of dfB is kept; columns coming from df are NaN where no row matches
dfComb = df.merge(dfB,how='right',on=commonColumns)

# Coupling and Mode descending, masses and data-taking period ascending
dfComb.sort_values(by=['Coupling','Mode','$m_{med}$','$m_{DM}$','Data-takingperiod'],
                   ascending=[False,False,True,True,True],
                   inplace=True)

In [8]:
# Display the merged and sorted frame
dfComb

Unnamed: 0,Coupling,Mode,$m_{med}$,$m_{DM}$,$g_{DM}$,$g_{q}$,Data-takingperiod,Fullsample,Triggeremulation,$p_{T}^{miss}>250$GeV,...,bin_1020.0_1090.0_ErrorMinus,bin_1090.0_1160.0,bin_1090.0_1160.0_ErrorPlus,bin_1090.0_1160.0_ErrorMinus,bin_1160.0_1250.0,bin_1160.0_1250.0_ErrorPlus,bin_1160.0_1250.0_ErrorMinus,bin_1250.0_1400.0,bin_1250.0_1400.0_ErrorPlus,bin_1250.0_1400.0_ErrorMinus
221,Vector,DM+Z(qq),100.0,1.0,1.0,0.25,2016,,,,...,-0.266110,0.301740,0.171580,-0.171580,0.165720,0.157060,-0.157060,0.869460,0.208170,-0.208170
479,Vector,DM+Z(qq),100.0,1.0,1.0,0.25,2017,1.0,0.84781,0.23509,...,-0.354550,1.042700,0.359650,-0.359650,0.686150,0.242980,-0.242980,0.792270,0.309570,-0.309570
737,Vector,DM+Z(qq),100.0,1.0,1.0,0.25,2018,1.0,0.81964,0.22973,...,-0.453100,0.213380,0.530980,-0.530980,0.782610,0.386950,-0.386950,1.089600,0.480280,-0.480280
223,Vector,DM+Z(qq),100.0,30.0,1.0,0.25,2016,,,,...,-0.205150,0.292900,0.181050,-0.181050,0.343160,0.159970,-0.159970,0.505050,0.198650,-0.198650
481,Vector,DM+Z(qq),100.0,30.0,1.0,0.25,2017,1.0,0.84875,0.23624,...,-0.341780,0.836170,0.275590,-0.275590,0.487660,0.226090,-0.226090,0.577270,0.267250,-0.267250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288,Axial,DM+QCDjets,2500.0,1000.0,1.0,0.25,2017,1.0,0.90069,0.47335,...,-0.044993,0.381990,0.038068,-0.038068,0.322040,0.034465,-0.034465,0.844780,0.055232,-0.055232
546,Axial,DM+QCDjets,2500.0,1000.0,1.0,0.25,2018,1.0,0.88296,0.47240,...,-0.065088,0.526230,0.059403,-0.059403,0.484880,0.056180,-0.056180,1.129600,0.079979,-0.079979
31,Axial,DM+QCDjets,2500.0,1500.0,1.0,0.25,2016,,,,...,-0.000056,0.000987,0.000049,-0.000049,0.000920,0.000047,-0.000047,0.002347,0.000076,-0.000076
289,Axial,DM+QCDjets,2500.0,1500.0,1.0,0.25,2017,1.0,0.90311,0.49001,...,-0.000105,0.001132,0.000086,-0.000086,0.000962,0.000086,-0.000086,0.002792,0.000139,-0.000139


In [9]:
# Save to pickle file
# Output name: input file name with ',' -> '_', extension removed and the first
# '_'-separated token dropped, followed by '_DF.pcl'
baseName = os.path.basename(sigTemplatesFile)
stem = os.path.splitext(baseName.replace(',','_'))[0]
pickleFile = stem.split('_',1)[1]+'_DF.pcl'
print('Saving to',pickleFile)
dfComb.to_pickle(pickleFile)

Saving to DMsimp_spin-1_Monojet_DF.pcl
