In [1]:
import ast
import os,sys

import numpy as np
import pandas as pd

In [2]:
# Path to the HEPData CSV file to be converted into a dataframe.
# Only one input is processed per run; uncomment the alternative path below
# (and comment out the active one) to convert that file instead.
# inputFile = './HEPData-ins1894408-v2-csv/Signaltemplates,DMsimp,spin-1,Monojet.csv'
inputFile = './HEPData-ins1894408-v2-csv/CutflowforMonojet,DM,spin-1mediator.csv'

In [3]:
# Try to evaluate data if possible
def myeval(s):
    """Convert a raw text field into a Python value when it is a literal.

    The '#:' marker and all spaces are removed first. The cleaned text is
    then parsed with ``ast.literal_eval``, which only accepts Python literals
    (numbers, strings, tuples, lists, dicts, sets, booleans, None).

    ``eval`` is deliberately not used: the text comes from an external file,
    and ``eval`` would execute arbitrary expressions and resolve any name
    that happens to exist in scope (e.g. 'len' or 'np') to that object
    instead of keeping it as a string.

    Parameters
    ----------
    s : str
        Raw field text.

    Returns
    -------
    object
        The parsed literal, or the cleaned string if it is not a literal.
    """
    s = s.replace('#:','')
    s = s.strip()
    s = s.replace(' ','')
    try:
        return ast.literal_eval(s)
    except (ValueError,SyntaxError,TypeError,MemoryError,RecursionError):
        # Not a Python literal (e.g. 'Vector' or a LaTeX label): keep the text
        return s

# Set default column names
def fixColumnLabel(c):
    """Normalize a raw column label.

    'Coupling type' is shortened to 'Coupling' and 'Production mode' to
    'Mode'; the '#:' marker, spaces and commas are removed; trailing
    underscores are stripped.

    Parameters
    ----------
    c : str
        Raw column label.

    Returns
    -------
    str
        The normalized label. It may be empty if nothing is left after
        cleaning.
    """
    newC = c.replace('Coupling type','Coupling')
    newC = newC.replace('Production mode','Mode')
    newC = newC.replace('#:','').replace(' ','')
    newC = newC.replace(',','')
    # rstrip also copes with empty or all-underscore labels, where indexing
    # the last character would raise IndexError
    return newC.rstrip('_')

def getDictFrom(block):
    """Parse one text block into a dictionary of single-element lists.

    Each line with at least two comma-separated fields yields one entry:
    the last field is the value (converted with ``myeval``) and the
    remaining non-blank fields, joined by '_' and cleaned with
    ``fixColumnLabel``, form the key. Header lines of the recoil-bin table
    and of the cut-flow table are skipped. Once the recoil-bin header has
    been seen, only fields 1 to 3 of each line are used and the key is
    prefixed with 'bin_'.

    Note: lines are split on every comma, so a quoted field that itself
    contains a comma is split as well and keeps its quote characters.

    Parameters
    ----------
    block : str
        Text of one block, with lines separated by newline characters.

    Returns
    -------
    dict
        Maps each column label to a one-element list holding its value. A
        label that appears more than once keeps the last value.
    """
    binsHeader = 'Recoil (GeV),Recoil (GeV) LOW,Recoil (GeV) HIGH'
    cutflowHeader = 'Cut stage,Fraction of passing events'

    parsed = {}
    inBinsSection = False
    for row in block.split('\n'):
        # Table headers carry no data; the bins header also switches mode
        if binsHeader in row:
            inBinsSection = True
            continue
        if cutflowHeader in row:
            continue

        fields = row.split(',')
        if len(fields) < 2:
            continue

        if inBinsSection:
            # Keep only fields 1-3 of a bin line; the last one is the value
            fields = fields[1:4]
            fields[0] = 'bin_' + fields[0]

        *labelParts, rawValue = fields
        label = '_'.join(part for part in labelParts if part.strip())
        column = fixColumnLabel(label)
        parsed[column] = [myeval(rawValue)]

    return parsed

In [4]:
# Read the whole input file; parameter points are assumed to be separated
# by the '#: Coupling' tag
with open(inputFile,'r') as f:
    data = f.read()

# Everything before the first tag is the file header and is skipped. The
# split consumes the tag, so 'Coupling' is put back at the start of each block.
blocks = ['Coupling'+chunk for chunk in data.split('#: Coupling')[1:]]

In [5]:
# Parse every parameter-point block into a one-row dataframe
frames = [pd.DataFrame(getDictFrom(b)) for b in blocks]
if not frames:
    raise ValueError("No parameter-point blocks found (expected the '#: Coupling' tag in the input file)")

# Concatenate all rows in a single call: concatenating inside a loop copies
# the accumulated dataframe at every iteration
df = pd.concat(frames,ignore_index=True)

# Sort according to model point. The result is reassigned instead of sorting
# in place; the row labels assigned by the concatenation are kept.
df = df.sort_values(['Coupling','Mode','$m_{med}$','$m_{DM}$','Data-takingperiod'],
                    ascending=[False,False,True,True,True])

In [6]:
# Show the sorted dataframe as the cell output for a visual check
df

Unnamed: 0,Coupling,Mode,$m_{med}$,$m_{DM}$,$g_{\chi}$,$g_{q}$,Data-takingperiod,Fullsample,Triggeremulation,$p_{T}^{miss}>250$GeV,...,Photonveto,"""$\Delta\phi(jet_p_{T}^{miss})>0.5$rad""",$\Deltap_{T}^{miss}$(PF-Calorimeter)$<0.5$rad,LeadingAK4jet$p_{T}>100$GeV,LeadingAK4jet$\eta<2.4$,LeadingAK4jetenergyfractions,Mono-Voverlapremoval,HCALmitigation(jets),HCALmitigation($\phi^{miss}$),"""$\Delta\phi(\mathrm{PF}_\mathrm{Charged})<2.0$rad"""
18,Vector,$\chi\chi+j$,100.0,1.0,1.0,0.25,2017,1.0,0.84336,0.18074,...,0.17392,0.15822,0.15698,0.14587,0.14587,0.14459,0.13908,0.13908,0.13908,0.13900
285,Vector,$\chi\chi+j$,100.0,1.0,1.0,0.25,2018,1.0,0.82011,0.17389,...,0.16741,0.15271,0.15224,0.14344,0.14344,0.14215,0.13671,0.13059,0.11026,0.11023
28,Vector,$\chi\chi+j$,100.0,30.0,1.0,0.25,2017,1.0,0.84348,0.18037,...,0.17415,0.15827,0.15705,0.14613,0.14613,0.14476,0.13920,0.13920,0.13920,0.13917
261,Vector,$\chi\chi+j$,100.0,30.0,1.0,0.25,2018,1.0,0.81952,0.17432,...,0.16771,0.15239,0.15192,0.14363,0.14363,0.14228,0.13660,0.13052,0.11035,0.11031
75,Vector,$\chi\chi+j$,100.0,40.0,1.0,0.25,2017,1.0,0.84199,0.18031,...,0.17329,0.15739,0.15627,0.14496,0.14496,0.14365,0.13813,0.13813,0.13813,0.13808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,Axial,$\chi\chi$+W(qq),2500.0,750.0,1.0,0.25,2018,1.0,0.91688,0.55378,...,0.51965,0.47329,0.47261,0.46047,0.46047,0.45723,0.29308,0.27749,0.24424,0.24424
178,Axial,$\chi\chi$+W(qq),2500.0,1000.0,1.0,0.25,2017,1.0,0.92768,0.55504,...,0.51920,0.47194,0.47069,0.45716,0.45716,0.45388,0.28909,0.28909,0.28909,0.28905
391,Axial,$\chi\chi$+W(qq),2500.0,1000.0,1.0,0.25,2018,1.0,0.91602,0.55297,...,0.51807,0.47128,0.47070,0.45852,0.45852,0.45515,0.29042,0.27527,0.24203,0.24199
134,Axial,$\chi\chi$+W(qq),2500.0,1500.0,1.0,0.25,2017,1.0,0.93447,0.60072,...,0.56156,0.50820,0.50697,0.49059,0.49059,0.48712,0.31153,0.31153,0.31153,0.31148


In [7]:
# Build the output name from the input file name: drop the directory and the
# extension, replace commas by underscores and append the '_DF.pcl' suffix.
# The pickle is written to the current working directory.
baseName = os.path.splitext(os.path.basename(inputFile))[0]
pickleFile = baseName.replace(',','_')+'_DF.pcl'
print('Saving to',pickleFile)
df.to_pickle(pickleFile)

Saving to CutflowforMonojet_DM_spin-1mediator_DF.pcl
