In [1]:
import ast
import os,sys

import numpy as np
import pandas as pd

In [2]:
# Path of the HEPData CSV table to convert (relative to the working directory).
# The name of the output pickle file is derived from this file name.
inputFile = './HEPData-ins1894408-v2-csv/Signaltemplates,DMsimp,spin-1,Monojet.csv'
# Alternative input from the same directory (judging by its name, the cutflow table);
# uncomment to process it instead of the signal templates.
# inputFile = './HEPData-ins1894408-v2-csv/CutflowforMonojet,DM,spin-1mediator.csv'

In [3]:
# Try to evaluate data if possible
def myeval(s):
    """
    Convert a raw CSV field to a Python literal whenever possible.

    The '#:' marker and all spaces are removed from the field before parsing.

    :param s: raw field read from the CSV file (string)

    :return: the parsed literal (e.g. int or float) if the cleaned field is a valid
             Python literal, otherwise the cleaned string itself
    """
    s = s.replace('#:','')
    s = s.strip()
    s = s.replace(' ','')
    try:
        # literal_eval only accepts Python literals. Unlike eval, it never executes
        # code coming from the input file and it never resolves names, so fields
        # such as 'max' or 'np' stay strings instead of becoming live objects.
        return ast.literal_eval(s)
    except (ValueError,SyntaxError,TypeError,MemoryError,RecursionError):
        # Not a literal (e.g. 'Axial-vector' or an empty field): keep the text
        return s

# Set default column names    
def fixColumnLabel(c):
    """
    Normalize a column label built from the CSV fields.

    'Coupling type' is shortened to 'Coupling' and 'Production mode' to 'Mode',
    the '#:' marker, spaces and commas are removed and trailing underscores
    are stripped.

    :param c: raw column label (string)

    :return: normalized label (string). It is empty if nothing is left
             after the clean-up.
    """

    newC = c.replace('Coupling type','Coupling')
    newC = newC.replace('Production mode','Mode')
    newC = newC.replace('#:','').replace(' ','')
    newC = newC.replace(',','')
    # rstrip also copes with empty or all-underscore labels, for which
    # indexing the last character would raise an IndexError
    return newC.rstrip('_')

def getDictFrom(block):
    """
    Convert the text of one block into a dictionary of single-element lists
    (a form which can be passed directly to pd.DataFrame to get a one-row frame).

    Lines with fewer than two comma-separated fields and the two known table
    header lines are skipped. For any other line the last field is the value
    (converted with myeval) and the non-blank fields before it, joined by '_'
    and cleaned with fixColumnLabel, form the column name.

    Once the 'Recoil (GeV)...' header has been seen, every following line is
    treated as a bin row: fields 1 to 3 are used as label parts and value
    (presumably bin low edge, bin high edge and bin content), the column name
    is prefixed by 'bin_' and the last two fields of the line are stored under
    the additional '<name>_ErrorPlus' and '<name>_ErrorMinus' columns.

    :param block: text of a single block (string with '\n' separated lines)

    :return: dictionary {column name : [value]}
    """

    binHeader = 'Recoil (GeV),Recoil (GeV) LOW,Recoil (GeV) HIGH'
    cutflowHeader = 'Cut stage,Fraction of passing events'

    result = {}
    inBinSection = False
    for row in block.split('\n'):
        # Header lines carry no data; the bin header also switches the parsing mode
        if binHeader in row:
            inBinSection = True
            continue
        if cutflowHeader in row:
            continue

        fields = row.split(',')
        if len(fields) < 2:
            continue

        if inBinSection:
            # The errors are taken from the full row, before it is trimmed down
            errorPlus = myeval(fields[-2])
            errorMinus = myeval(fields[-1])
            fields = fields[1:4]
            fields[0] = 'bin_'+fields[0]

        labelParts = [field for field in fields[:-1] if field.strip()]
        key = fixColumnLabel('_'.join(labelParts))
        result[key] = [myeval(fields[-1])]
        if inBinSection:
            result[key+'_ErrorPlus'] = [errorPlus]
            result[key+'_ErrorMinus'] = [errorMinus]

    return result

In [4]:
# Read the whole file and cut it into one chunk per parameter point
# (it is assumed the points are separated by the '#: Coupling' tag)
with open(inputFile,'r') as f:
    data = f.read()
# The split consumes the tag, so 'Coupling' is put back in front of each chunk.
# The first chunk (file header preceding the first tag) is skipped.
blocks = ['Coupling'+chunk for chunk in data.split('#: Coupling')[1:]]

In [5]:
# Build a one-row data frame for each block and concatenate all of them in a
# single call (calling pd.concat inside the loop copies the accumulated frame
# at every iteration, which scales quadratically with the number of blocks)
if not blocks:
    raise ValueError("No '#: Coupling' blocks found in "+inputFile)
df = pd.concat([pd.DataFrame(getDictFrom(b)) for b in blocks],ignore_index=True)

# Sort according to model point (a new sorted frame is assigned instead of sorting in place)
df = df.sort_values(['Coupling','Mode','$m_{med}$','$m_{DM}$','Data-takingperiod'],
                    ascending=[False,False,True,True,True])

In [6]:
# Save to pickle file, named after the input file:
# commas are replaced by underscores and the extension by '_DF.pcl'
baseName = os.path.basename(inputFile).replace(',','_')
pickleFile = os.path.splitext(baseName)[0]+'_DF.pcl'
print('Saving to',pickleFile)
df.to_pickle(pickleFile)

Saving to Signaltemplates_DMsimp_spin-1_Monojet_DF.pcl
