In [1]:
import ast
import os,sys

import numpy as np
import pandas as pd
# Let pandas display up to 200 rows before truncating a printed frame
pd.set_option('display.max_rows', 200)

In [2]:
# Input files: one signal-template file and the matching cut-flow file.
# Exactly one pair should be active; the commented-out pairs select the
# other signal models available in the same directory.

# DMsimp, spin-1 mediator:
# sigTemplatesFile = './HEPData-ins1894408-v2-csv/Signaltemplates,DMsimp,spin-1,Monojet.csv'
# cutFlowFile = './HEPData-ins1894408-v2-csv/CutflowforMonojet,DM,spin-1mediator.csv'

# DMsimp, spin-0 mediator:
# sigTemplatesFile = './HEPData-ins1894408-v2-csv/Signaltemplates,DMsimp,spin-0,Monojet.csv'
# cutFlowFile = './HEPData-ins1894408-v2-csv/CutflowforMonojet,DM,spin-0mediator.csv'

# ADD (active):
sigTemplatesFile = './HEPData-ins1894408-v2-csv/Signaltemplates,ADD,Monojet.csv'
cutFlowFile = './HEPData-ins1894408-v2-csv/CutflowforMonojet,ADD.csv'


In [3]:
# Try to evaluate data if possible
def myeval(s):
    """Convert a raw CSV cell to a Python literal when possible.

    The '#:' marker and all spaces are removed first. The cleaned text is then
    parsed as a Python literal (number, True/False/None, quoted string, ...).

    Parameters
    ----------
    s : str
        Raw cell text.

    Returns
    -------
    object
        The parsed literal, or the cleaned string itself when it is not a
        valid literal (e.g. labels such as 'Axial').
    """
    s = s.replace('#:','')
    s = s.strip()
    s = s.replace(' ','')
    try:
        # literal_eval instead of eval: the text comes from an external file, and
        # eval would execute arbitrary expressions and resolve any cell that happens
        # to match a notebook variable or builtin name to that object.
        return ast.literal_eval(s)
    except (ValueError,TypeError,SyntaxError,MemoryError,RecursionError):
        # Not a literal: keep the cleaned text
        return s

# Set default column names    
def fixColumnLabel(c):
    """Return a normalised column label.

    'Coupling type' and 'Production mode' are shortened to 'Coupling' and
    'Mode'; the '[TeV]' unit, the '#:' marker, spaces and commas are removed;
    trailing underscores are stripped.

    Parameters
    ----------
    c : str
        Raw column label.

    Returns
    -------
    str
        The normalised label. It may be empty when nothing is left after
        the clean-up.
    """
    newC = c.replace('Coupling type','Coupling')
    newC = newC.replace('Production mode','Mode')
    newC = newC.replace('[TeV]','')
    newC = newC.replace('#:','').replace(' ','')
    newC = newC.replace(',','')
    # rstrip also copes with a label that is empty or made only of underscores,
    # where indexing the last character would raise IndexError
    return newC.rstrip('_')

def getDictFrom(block):
    """Parse one text block into a dictionary of single-element lists.

    Every comma-separated line with at least two fields becomes one entry:
    the last field is the value (converted with myeval) and the non-empty
    preceding fields, joined by '_' and cleaned with fixColumnLabel, form the
    key. After the 'Recoil (GeV),...' header line, lines are read as bins:
    fields 1-3 give the key 'bin_<field1>_<field2>' and the value, while the
    last two fields of the line are stored under '<key>_ErrorPlus' and
    '<key>_ErrorMinus'. Header lines and lines with fewer than two fields
    are skipped.
    """
    binsHeader = 'Recoil (GeV),Recoil (GeV) LOW,Recoil (GeV) HIGH'
    cutsHeader = 'Cut stage,Fraction of passing events'

    parsed = {}
    inBins = False
    for rawLine in block.split('\n'):
        # Header lines only switch the parsing mode (bins) or are ignored (cuts)
        if binsHeader in rawLine:
            inBins = True
            continue
        if cutsHeader in rawLine:
            continue

        fields = rawLine.split(',')
        if len(fields) < 2:
            continue

        if inBins:
            # Read the errors before the field list is cut down to three entries
            errUp = myeval(fields[-2])
            errDown = myeval(fields[-1])
            fields = fields[1:4]
            fields[0] = 'bin_'+fields[0]

        labelParts = [part for part in fields[:-1] if part.strip()]
        label = fixColumnLabel('_'.join(labelParts))
        parsed[label] = [myeval(fields[-1])]
        if inBins:
            parsed[label+'_ErrorPlus'] = [errUp]
            parsed[label+'_ErrorMinus'] = [errDown]

    return parsed

### Get cut flow data

In [4]:
# Get blocks for each parameter point. A block is assumed to start at the
# '#: Coupling' tag or, when that tag is absent, at the '#: $M_{D}$ [TeV]' tag.
blockTags = ['#: Coupling','#: $M_{D}$ [TeV]']
with open(cutFlowFile,'r') as f:
    data = f.read()

# Split on the first tag present in the file. The tag is put back in front of
# every piece so the first column label of each block is kept (the '#:' marker
# itself is removed later by fixColumnLabel).
blocks = None
for tag in blockTags:
    if tag in data:
        blocks = [tag+b for b in data.split(tag)]
        break
if blocks is None:
    # Fail loudly rather than continue with an undefined (or stale) `blocks`
    raise ValueError('None of the block tags %s were found in %s' %(blockTags,cutFlowFile))
# Skip header (everything before the first tag)
blocks = blocks[1:]

# Build one single-row data frame per block and concatenate them in one call
df = pd.concat([pd.DataFrame(getDictFrom(b)) for b in blocks],ignore_index=True)


### Get signal template data

In [5]:
# Get blocks for each parameter point. A block is assumed to start at the
# '#: Coupling' tag or, when that tag is absent, at the '#: d,' tag.
blockTags = ['#: Coupling','#: d,']
with open(sigTemplatesFile,'r') as f:
    data = f.read()

# Split on the first tag present in the file. The tag is put back in front of
# every piece so the first column label of each block is kept (the '#:' marker
# itself is removed later by fixColumnLabel).
blocks = None
for tag in blockTags:
    if tag in data:
        blocks = [tag+b for b in data.split(tag)]
        break
if blocks is None:
    # Fail loudly rather than silently reuse `blocks` left over from another cell
    raise ValueError('None of the block tags %s were found in %s' %(blockTags,sigTemplatesFile))
# Skip header (everything before the first tag)
blocks = blocks[1:]

# Build one single-row data frame per block and concatenate them in one call
dfB = pd.concat([pd.DataFrame(getDictFrom(b)) for b in blocks],ignore_index=True)


In [6]:
# Number of rows (one per parsed block) in the cut-flow and signal-template tables
print(len(df),len(dfB))

78 112


### Make sure values are standardized

In [7]:
# Give both tables the same column names and value conventions so they can be merged.
# LaTeX labels are written as raw strings: sequences such as '\c' or '\D' are not
# valid Python escapes and trigger a warning in ordinary string literals.
parameterLabels = {r'$g_{\chi}$' : '$g_{DM}$', 'd' : '$d$'}
df = df.rename(columns=parameterLabels)
dfB = dfB.rename(columns=parameterLabels)
# Files without these columns get a constant label
if not 'Mode' in df.columns:
    df['Mode'] = 'DM+QCDjets'
    dfB['Mode'] = 'DM+QCDjets'
if not 'Coupling' in df.columns:
    df['Coupling'] = 'ADD'
    dfB['Coupling'] = 'ADD'

# Standardise the production-mode labels. The same mapping is applied to both
# tables, since 'Mode' can be one of the merge keys; it is a no-op for values
# that already use the target names.
modeLabels = {r'$\chi\chi+j$' : 'DM+QCDjets',
              r'$\chi\chi$+Z(qq)' : 'DM+Z(qq)',
              r'$\chi\chi$+W(qq)' : 'DM+W(qq)'}
df['Mode'] = df['Mode'].replace(modeLabels)
if 'Mode' in dfB.columns:
    dfB['Mode'] = dfB['Mode'].replace(modeLabels)

# Expand single-digit data-taking periods to the full year
periodLabels = {7 : 2017, 8 : 2018, 6 : 2016}
df['Data-takingperiod'] = df['Data-takingperiod'].replace(periodLabels)
dfB['Data-takingperiod'] = dfB['Data-takingperiod'].replace(periodLabels)

# Readable names for the cut-flow columns whose labels were mangled by the
# clean-up of spaces and commas
cutLabels = {r'"$\Delta\phi(jet_p_{T}^{miss})>0.5$rad"' : r'$\Delta \phi (jet,p_{T}^{miss})>0.5$ rad',
             r'$\Deltap_{T}^{miss}$(PF-Calorimeter)$<0.5$rad' : r'$\Delta p_{T}^{miss}$ (PF-Cal)$<0.5$ rad',
             r'"$\Delta\phi(\mathrm{PF}_\mathrm{Charged})<2.0$rad"' : r'$\Delta \phi (\mathrm{PF}_\mathrm{Charged})<2.0$ rad'}
df = df.rename(columns=cutLabels)
dfB = dfB.rename(columns=cutLabels)


### Merge cut flow and signal template

In [8]:
# Merge on every column the two tables share. The keys are listed in df's column
# order (instead of going through a set) so the result is reproducible between runs.
commonColumns = [c for c in df.columns if c in dfB.columns]
print(commonColumns)

# how='right' keeps every signal-template row, even without a matching cut-flow row
dfComb = pd.merge(df,dfB,on=commonColumns,how='right')

# Filter monojet only (it seems monov is also present for spin0)
dfComb = dfComb[dfComb['Mode'] == 'DM+QCDjets']

# Sort keys (only those present in the table are used) and whether each is ascending.
# The number-of-dimensions column is named '$d$', so that is the key listed here.
sortColumnsDict = {'Coupling' : False, 'Mode' : False, '$m_{med}$' : True,
               '$M_{D}$' : True,'$m_{DM}$' : True,'$d$' :True,'Data-takingperiod' : True}
sortColumns = [c for c in sortColumnsDict if c in dfComb.columns]
ascending = [sortColumnsDict[c] for c in sortColumns]
# Rebind rather than sort in place: dfComb is a filtered selection of the merge
dfComb = dfComb.sort_values(sortColumns,ascending=ascending)

['Coupling', 'Data-takingperiod', 'Mode', '$M_{D}$', '$d$']


### Compute total cross-section

In [9]:
binCols = [c for c in dfComb.columns if ('bin_' in c and not 'Error' in c)]
# Total yield = sum over MET bins/pre-selection efficiency.
# It is kept in a local variable, so no auxiliary column has to be dropped afterwards.
# (raw string: '\D', '\p' and '\m' are not valid escapes in an ordinary literal)
preSelectionEff = dfComb[r'$\Delta \phi (\mathrm{PF}_\mathrm{Charged})<2.0$ rad']
totalYield = dfComb[binCols].sum(axis=1)/preSelectionEff
# Add luminosity (1/fb) for each data-taking period;
# periods missing from the table fall back to defaultLuminosity
luminosities = {2016: 36.0, 2017 : 41.5, 2018 : 59.7}
defaultLuminosity = 36.0
dfComb['Luminosity (1/fb)'] = dfComb['Data-takingperiod'].map(luminosities).fillna(defaultLuminosity)
# Compute total cross-section (total yield/luminosity)
# for the corresponding luminosities; the factor 1e3 converts fb to pb
dfComb['Total xsec-pT150 (pb)'] = totalYield/(1e3*dfComb['Luminosity (1/fb)'])

In [14]:
# Display the merged table (cut-flow columns are NaN for rows where the right merge found no cut-flow match)
dfComb

Unnamed: 0,$M_{D}$,$d$,Data-takingperiod,Fullsample,Triggeremulation,$p_{T}^{miss}>250$GeV,$p_{T}^{miss}$qualityfilters,Electronveto,Muonveto,Tauveto,...,bin_1090.0_1160.0_ErrorPlus,bin_1090.0_1160.0_ErrorMinus,bin_1160.0_1250.0,bin_1160.0_1250.0_ErrorPlus,bin_1160.0_1250.0_ErrorMinus,bin_1250.0_1400.0,bin_1250.0_1400.0_ErrorPlus,bin_1250.0_1400.0_ErrorMinus,Luminosity (1/fb),Total xsec-pT150 (pb)
5,3,4,2016,,,,,,,,...,34.725,-34.725,253.54,32.08,-32.08,319.72,36.155,-36.155,36.0,
6,3,5,2016,,,,,,,,...,20.081,-20.081,143.05,20.838,-20.838,218.12,25.582,-25.582,36.0,
7,3,6,2016,,,,,,,,...,17.966,-17.966,157.23,20.115,-20.115,237.71,24.9,-24.9,36.0,
8,4,3,2016,,,,,,,,...,12.033,-12.033,102.62,12.779,-12.779,242.26,19.783,-19.783,36.0,
9,4,4,2016,,,,,,,,...,7.7138,-7.7138,55.337,6.6829,-6.6829,133.88,10.489,-10.489,36.0,
10,4,5,2016,,,,,,,,...,5.2773,-5.2773,44.143,4.7535,-4.7535,88.576,6.7936,-6.7936,36.0,
11,4,6,2016,,,,,,,,...,3.2362,-3.2362,27.81,3.1761,-3.1761,53.929,4.3688,-4.3688,36.0,
44,4,7,2017,1.0,0.53606,0.25598,0.25541,0.25485,0.25409,0.24805,...,2.9621,-2.9621,23.734,2.6804,-2.6804,46.527,3.823,-3.823,41.5,0.713365
84,4,7,2018,1.0,0.50003,0.25285,0.25244,0.25186,0.25116,0.24483,...,3.8954,-3.8954,29.275,3.7678,-3.7678,63.659,5.3162,-5.3162,59.7,0.719337
12,5,2,2016,,,,,,,,...,10.943,-10.943,95.285,12.433,-12.433,183.2,17.327,-17.327,36.0,


In [11]:
# Collect the distinct parameter pairs found in the merged table:
# ($m_{DM}$, $m_{med}$) when a '$m_{med}$' column exists, ($M_{D}$, $d$) otherwise.
if '$m_{med}$' in dfComb.columns:
    firstKey, secondKey = '$m_{DM}$', '$m_{med}$'
else:
    firstKey, secondKey = '$M_{D}$', '$d$'
massPairs = sorted({(entry[firstKey], entry[secondKey]) for _, entry in dfComb.iterrows()})
print(len(massPairs))
print(massPairs)

50
[(3, 4), (3, 5), (3, 6), (4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (5, 2), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (7, 2), (7, 3), (7, 4), (7, 5), (7, 6), (7, 7), (8, 2), (8, 3), (8, 4), (8, 5), (8, 6), (8, 7), (9, 2), (9, 3), (9, 4), (9, 5), (9, 6), (9, 7), (10, 2), (10, 3), (10, 4), (11, 2), (11, 3), (11, 4), (12, 2), (12, 3), (13, 2), (13, 3), (14, 2), (15, 2)]


In [12]:
# Save to pickle file
# The output name is derived from the template file name: commas become
# underscores, the extension and the first '_'-separated token are dropped,
# and '_DF.pcl' is appended.
templateName = os.path.basename(sigTemplatesFile).replace(',','_')
templateStem = os.path.splitext(templateName)[0]
pickleFile = templateStem.split('_',1)[1] + '_DF.pcl'
print('Saving to',pickleFile)
dfComb.to_pickle(pickleFile)

Saving to ADD_Monojet_DF.pcl
