In [1]:
import re
import numpy as np
import pandas as pd

In [2]:
inputFile = 'davinci_output.dat'
outputFile = '~/data/parsedDaVinciLog.h5'

# Helper functions
Should have made this a class, really.

## File management

In [3]:
def ProcessInputFile(file):
    with open(file) as f:
        lines = [line.rstrip() for line in f]
        
    beginning = "LambdaSel_T.Lam...WARNING -----------------BEGINNING EVENT-----------------"
    end = "LambdaSel_T.Lam...WARNING -------------------------------------------------"

    lindx = lines.index(beginning)
    rindx = len(lines) - lines[::-1].index(end)

    lines = lines[lindx:rindx]
    
    for index in range(len(lines)):
        warningPrologue = "LambdaSel_T.Lam...WARNING "
        if lines[index].startswith(warningPrologue):
            lines[index] = lines[index][len(warningPrologue):]
    
    return lines

## Individual parsing functions

In [4]:
def CheckPrefix(function, prefix, line):
    if not line.startswith(prefix):
        print("ERROR in ", function.__name__, ": prefix\n", prefix, "\ndoes not match line\n", line)

In [5]:
def ProcessTuple(line, prefix):
    CheckPrefix(ProcessTuple, prefix, line)
    return tuple(map(float, line[len(prefix):].split(', ')))

In [6]:
def ProcessTupleWithParentheses(line, prefix):
    CheckPrefix(ProcessTupleWithParentheses, prefix, line)
    return tuple(map(float, line[len(prefix)+1:-1].split(',')))

In [7]:
def ProcessInt(line, prefix):
    CheckPrefix(ProcessInt, prefix, line)
    return int(line[len(prefix):])

In [8]:
def ProcessFloat(line, prefix):
    CheckPrefix(ProcessFloat, prefix, line)
    return float(line[len(prefix):])

In [9]:
def ProcessMatrix(lines, prefix):
    CheckPrefix(ProcessMatrix, prefix, lines[0])
    matrix = []
    for line in lines[1:]:
        if line[0] == '[':
            line = line[1:]
        if line[-1] == ']':
            line = line[:-1]
        
        try:
            matrix.append([float(number) for number in line.split()])
        except ValueError:
            ## Whoever programmed matrix printing didn't account for the minus sign.
            ## As a result, sometimes two elements of the matrix will be squished,
            ## e.g. 0.000841-0.000119364. The above line of code uses whitespace as
            ## separator and doesn't like this. The following is a very simple fix,
            ## but it works.
            line = line.replace("-", " -")
            ## To avoid breaking up exponentials, e.g. 3.43e-5...
            line = line.replace("e -", "e-")
            matrix.append([float(number) for number in line.split()])
            
    return matrix

In [10]:
def ProcessPID(line):
    prefix = "PID: LHCb.ParticleID"
    CheckPrefix(ProcessPID, prefix, line)
    PID = int(line[len(prefix)+1:-1])
    if abs(PID) == 2212:
        return 'proton'
    elif abs(PID) == 211:
        return 'pion'
    else:
        print("ERROR: unrecognized particle in line\n", line)
        return -1

In [11]:
def ProcessStatus(line):
    prefix = "Status: "
    CheckPrefix(ProcessStatus, prefix, line)
    return line[len(prefix):]

## Chunk parsing functions

In [12]:
def ProcessIterationChunk(iterChunk):
    chunkBegin = "Iter: "
    chunkEnd =   "------------------------"
    
    if not (iterChunk[0].startswith(chunkBegin) and iterChunk[-1] == chunkEnd):
        print("ERROR: the following iteration chunk is not standard:\n", iterChunk)
        return -1
    
    iteration = ProcessInt(iterChunk[0],"Iter: ")
    currentVtx = ProcessTuple(iterChunk[1],"x: ")
    previousVtx = ProcessTuple(iterChunk[2],"x0: ")
    ci = ProcessMatrix(iterChunk[3:7], "ci:")
    chi2 = ProcessFloat(iterChunk[7], "chi2: ")
    deltaVtx = ProcessTuple(iterChunk[8], "dx: ")
    deltaDistance = ProcessFloat(iterChunk[9], "d1: ")
    deltaChi2 = ProcessFloat(iterChunk[10], "d2: ")
    
    return iteration, currentVtx, previousVtx, ci, chi2, deltaVtx, deltaDistance, deltaChi2

In [13]:
def ProcessParticleChunk(particleChunk):
    chunkBegin = "PID: LHCb.ParticleID"
    chunkEnd =   "------------------------"
    
    if not (particleChunk[0].startswith(chunkBegin) and particleChunk[-1] == chunkEnd):
        print("ERROR: the following particle chunk is not standard:\n", particleChunk)
        return -1
    
    particleName = ProcessPID(particleChunk[0])
    refPoint = ProcessTupleWithParentheses(particleChunk[1], "Reference point: ")
    fourMom = ProcessTupleWithParentheses(particleChunk[2], "4-momentum: ")
    posMomCovMatrix = ProcessMatrix(particleChunk[3:11], "PosMomCovMatrix:")
    
    return particleName, refPoint, fourMom, posMomCovMatrix

In [14]:
def LineIsGood(line):
    return not line.startswith("DaVinci::ParticleTransporter::")

In [15]:
def CleanChunk(chunk):
    return [line for line in chunk if LineIsGood(line)]

In [16]:
def ProcessChunk(chunk):
    chunkBegin = "-----------------BEGINNING EVENT-----------------"
    chunkEnd =   "-------------------------------------------------"
    
    chunk = CleanChunk(chunk)
    
    if not (chunk[0] == chunkBegin and chunk[-1] == chunkEnd):
        print("ERROR: the following chunk is not standard:\n", chunk)
        return -1
    
    initVtx = ProcessTuple(chunk[2], "Initial vtx: ")
    initChi2 = ProcessFloat(chunk[3], "Initial chi2: ")
    initCi = ProcessMatrix(chunk[4:8], "Initial ci:")
    
    particle1 = ProcessParticleChunk(chunk[9:21])
    particle2 = ProcessParticleChunk(chunk[21:33])
    
    if particle1[0] == 'proton':
        protonInfo = particle1
        pionInfo = particle2
    else:
        pionInfo = particle1
        protonInfo = particle2
    
    lengthOfIterationChunk = 12
    numberOfIterations = len(chunk[33:-2]) / lengthOfIterationChunk
    if int(numberOfIterations) != numberOfIterations:
        raise ValueError("ERROR: number of iterations", numberOfIterations, "is not an integer. Follows the chunk:\n", chunk)
    
    iter_currentVertices = []
    iter_previousVertices = []
    iter_covMatrices = []
    iter_chi2s = []
    iter_deltaVertices = []
    iter_deltaDistances = []
    iter_deltaChi2s = []
    
    for iIter in range(int(numberOfIterations)):
        startIterIndex = 33+iIter*lengthOfIterationChunk
        endIterIndex = startIterIndex + lengthOfIterationChunk
        
        iterationInfo = ProcessIterationChunk(chunk[startIterIndex:endIterIndex])
        
        iter_currentVertices.append(iterationInfo[1])
        iter_previousVertices.append(iterationInfo[2])
        iter_covMatrices.append(iterationInfo[3])
        iter_chi2s.append(iterationInfo[4])
        iter_deltaVertices.append(iterationInfo[5])
        iter_deltaDistances.append(iterationInfo[6])
        iter_deltaChi2s.append(iterationInfo[7])    
  
    status = ProcessStatus(chunk[-2])
    
    dictionary = {
        "seed_vtx": initVtx,
        "seed_chi2": initChi2,
        "seed_ci": initCi,
        "p_refPoint": protonInfo[1],
        "p_momentum": protonInfo[2][:3],
        "p_energy": protonInfo[2][3],
        "p_posMomCovMatrix": pionInfo[3],
        "pim_refPoint": pionInfo[1],
        "pim_momentum": pionInfo[2][:3],
        "pim_energy": pionInfo[2][3],
        "pim_posMomCovMatrix": pionInfo[3],
        "numberOfIterations": int(numberOfIterations),
        "iter_currentVertices": iter_currentVertices,
        "iter_previousVertices": iter_previousVertices,
        "iter_covMatrices": iter_covMatrices,
        "iter_chi2s": iter_chi2s,
        "iter_deltaVertices": iter_deltaVertices,
        "iter_deltaDistances": iter_deltaDistances,
        "iter_deltaChi2s": iter_deltaChi2s,
        "status": status
    }
    
    return dictionary

## Full file parsing function(s)

In [17]:
def ProcessLogFile(lines, verbose=False):
    searchStart = 0
    chunkBegin = "-----------------BEGINNING EVENT-----------------"
    chunkEnd =   "-------------------------------------------------"
    
    listOfEvents = []
    
    while True:
        try:
            beginIndex = lines[searchStart:].index(chunkBegin) + searchStart
        except ValueError:
            break     
        
        endIndex = lines[searchStart:].index(chunkEnd) + searchStart + 1
        
        if verbose:
            print(beginIndex, "\t", lines[beginIndex])
            print(endIndex, "\t", lines[endIndex-1])
        
        listOfEvents.append(ProcessChunk(lines[beginIndex:endIndex]))
        searchStart = endIndex
    
    return listOfEvents

# Helper functions

## Feature component functions
We'll use nested information (lists within the DataFrame), which make it a bit hard to single out individual components. These functions help with that.

In [18]:
## Use 1,2,3 instead of 0,1,2. Trust me, it's easier.
def Feature1DComponent(series, component):
    return series.map(lambda x: x[component-1])

In [19]:
def Feature2DComponent(series, row, column):
    return series.map(lambda x: x[row-1][column-1])

In [20]:
def FeatureCoordinate(series, coordinate):  
    coordToComponent = {'x': 1, 'y': 2, 'z': 3}
    return Feature1DComponent(series, coordToComponent[coordinate])

# Data parsing
Finally!

In [21]:
lines = ProcessInputFile(inputFile)
events = ProcessLogFile(lines)
df_events = pd.json_normalize(events)
df_events

Unnamed: 0,seed_vtx,seed_chi2,seed_ci,p_refPoint,p_momentum,p_energy,p_posMomCovMatrix,pim_refPoint,pim_momentum,pim_energy,pim_posMomCovMatrix,numberOfIterations,iter_currentVertices,iter_previousVertices,iter_covMatrices,iter_chi2s,iter_deltaVertices,iter_deltaDistances,iter_deltaChi2s,status
0,"(147.985, 102.425, 5813.65)",0.0,"[[0.155613, -0.00059156, -0.0018908], [-0.0005...","(137.247, 128.359, 7755.84)","(-138.28, 365.041, 25017.9)",25038.50,"[[0.00543169, 0.0153676, 0.0, -1.88666, -0.435...","(429.459, 173.613, 7862.59)","(835.081, 166.117, 6084.36)",6145.24,"[[0.00543169, 0.0153676, 0.0, -1.88666, -0.435...",10,"[(128.156, 86.9709, 4871.29), (113.514, 77.478...","[(147.985, 102.425, 5813.65), (128.156, 86.970...","[[[0.659589, -0.034261, -0.00900721], [-0.0342...","[3.08255, 3.50117, 1.73936, 4.06577, 2.75733, ...","[(-19.8295, -15.4544, -942.357), (-14.6421, -9...","[942.692, 573.933, 438.001, 278.849, 253.93, 2...","[245.489, 4.36432, 2.09498, 0.739273, 0.616799...",NonConverged
1,"(-146.168, 17.7867, 5942.26)",0.0,"[[0.118983, 8.69493e-05, 0.0132291], [8.69493e...","(-338.922, 27.3306, 7668.24)","(-1036.51, 54.3145, 9281.18)",9386.05,"[[1.64763, -2.50882, 0.0, -236.134, -17.4607, ...","(1431.68, 344.288, 7987.89)","(1562.49, 152.803, 2014.65)",2557.93,"[[1.64763, -2.50882, 0.0, -236.134, -17.4607, ...",5,"[(-127.834, 14.3138, 5384.84), (-121.396, 13.2...","[(-146.168, 17.7867, 5942.26), (-127.834, 14.3...","[[[0.286808, -0.0155054, 0.0219773], [-0.01550...","[38.6035, 24.0006, 20.9086, 20.0475, 19.788]","[(18.3332, -3.47288, -557.426), (6.43798, -1.1...","[557.738, 182.288, 58.0447, 18.2384, 5.70686]","[261.972, 4.86756, 0.247885, 0.0207951, 0.0019...",Converged
2,"(-707.805, 67.6195, 7060.87)",0.0,"[[0.0203782, -0.000126861, 0.00277254], [-0.00...","(-677.459, -145.048, 7853.69)","(233.939, -99.447, 5815.13)",5895.82,"[[0.0278222, 0.00205764, 0.0, -5.51889, 0.0711...","(-1476.38, 944.228, 7862.48)","(-1074.93, 36.0767, 1119.98)",1559.04,"[[0.0278222, 0.00205764, 0.0, -5.51889, 0.0711...",6,"[(-715.67, -52.001, 6263.48), (-696.752, -71.0...","[(-707.805, 67.6195, 7060.87), (-715.67, -52.0...","[[[1.24177, -0.118863, 0.0170006], [-0.118863,...","[23757.3, 4100.97, 513.884, 3.95342, 0.436837,...","[(-7.86515, -119.621, -797.394), (18.9179, -19...","[806.354, 1239.01, 2163.63, 3058.61, 69.2146, ...","[28185.9, 5359.6, 1521.69, 277.699, 0.0655407,...",Converged
3,"(-510.435, -58.8328, 7346.39)",0.0,"[[0.259843, -0.000363629, 0.0170464], [-0.0003...","(-533.746, -59.9612, 7668.24)","(-2631.76, -251.121, 36328.0)",36436.10,"[[0.00076176, 0.000191707, 0.0, 0.0938744, 0.0...","(-529.408, -64.6333, 7668.24)","(-456.258, -113.784, 7741.62)",7757.15,"[[0.00076176, 0.000191707, 0.0, 0.0938744, 0.0...",4,"[(-502.054, -57.5205, 7226.71), (-499.733, -57...","[(-510.435, -58.8328, 7346.39), (-502.054, -57...","[[[383.128, -0.189655, 25.9738], [-0.189655, 7...","[3.03136, 1.67087, 1.5105, 1.49432]","[(8.38128, 1.31236, -119.687), (2.32073, 0.360...","[119.988, 33.2847, 5.48213, 0.657721]","[109.583, 2.01083, 0.0308117, 0.000382704]",Converged
4,"(-12.3806, 92.5885, 5801.55)",0.0,"[[0.262182, -0.000739441, 0.00767508], [-0.000...","(0.3891, 123.46, 7755.84)","(604.198, 1520.0, 92425.2)",92444.50,"[[0.00077284, 0.000184163, 0.0, -0.575792, 0.1...","(-141.191, 126.382, 7755.84)","(-903.316, 226.483, 13704.8)",13737.10,"[[0.00077284, 0.000184163, 0.0, -0.575792, 0.1...",3,"[(-10.7259, 82.9239, 5218.85), (-10.3121, 80.5...","[(-12.3806, 92.5885, 5801.55), (-10.7259, 82.9...","[[[1.31809, -0.0459506, 0.0091716], [-0.045950...","[2.29967, 1.51361, 1.38683]","[(1.65477, -9.66463, -582.702), (0.413816, -2....","[582.784, 143.588, 11.9994]","[140.593, 1.22127, 0.00501794]",Converged
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1085,"(-869.552, -85.4424, 10409.1)",0.0,"[[0.122858, -0.000871916, 0.0172491], [-0.0008...","(-783.797, -42.3525, 7853.69)","(-347.049, -69.6395, 10360.5)",10408.90,"[[0.000961, 0.00151887, 0.0, -1.2709, -0.02359...","(-436.343, -76.9904, 7672.24)","(-1109.36, -38.3318, 7007.87)",7096.61,"[[0.000961, 0.00151887, 0.0, -1.2709, -0.02359...",3,"[(-847.033, -65.421, 10120.3), (-846.99, -64.9...","[(-869.552, -85.4424, 10409.1), (-847.033, -65...","[[[8.28967, -0.0261719, 0.553699], [-0.0261719...","[74.2682, 96.1596, 96.3151]","[(22.5189, 20.0214, -288.802), (0.0423759, 0.4...","[290.37, 1.75959, 0.00762202]","[3606.63, 0.421589, 1.34646e-05]",Converged
1086,"(-6.33693, 47.8751, 4840.09)",0.0,"[[0.213575, 0.00248458, -0.00607877], [0.00248...","(-230.828, 117.343, 7751.84)","(-1287.91, 274.412, 16700.1)",16778.20,"[[0.00159201, -0.0155357, 0.0, -0.888802, 0.13...","(484.395, 2.332, 7672.29)","(1285.45, 31.9525, 7445.99)",7557.49,"[[0.00159201, -0.0155357, 0.0, -0.888802, 0.13...",5,"[(-2.98968, 28.5927, 3453.31), (-6.53087, -4.1...","[(-6.33693, 47.8751, 4840.09), (-2.98968, 28.5...","[[[0.0412808, 0.00956795, -0.000385316], [0.00...","[188.658, 39.9437, 3.45855, 1.60207, 1.53739]","[(3.34724, -19.2824, -1386.78), (-3.54119, -32...","[1386.91, 2580.25, 1981.56, 132.435, 4.65559]","[301.349, 50.7064, 6.47518, 0.0175542, 2.12254...",Converged
1087,"(-2.77537, -19.2735, 4848.62)",0.0,"[[0.213945, 0.00222587, -0.0108378], [0.002225...","(-111.448, -38.3212, 7672.24)","(-1101.28, -160.999, 28618.0)",28655.00,"[[0.00159201, -0.0155357, 0.0, -0.888802, 0.13...","(484.395, 2.332, 7672.29)","(1285.45, 31.9525, 7445.99)",7557.49,"[[0.00159201, -0.0155357, 0.0, -0.888802, 0.13...",4,"[(-8.5721, -15.4629, 3625.76), (-12.8632, -13....","[(-2.77537, -19.2735, 4848.62), (-8.5721, -15....","[[[0.148786, -0.00985489, 0.000870163], [-0.00...","[0.455797, 1.09364, 0.956296, 1.04055]","[(-5.79673, 3.81059, -1222.86), (-4.2911, 2.40...","[1222.88, 575.041, 111.157, 35.1279]","[246.57, 3.22212, 0.0494256, 0.00493449]",Converged
1088,"(161.568, -80.815, 5788.54)",0.0,"[[0.104436, 0.00260099, -0.015545], [0.0026009...","(151.481, -291.649, 7858.86)","(-119.887, -1066.72, 29982.5)",30016.30,"[[0.00159201, -0.0155357, 0.0, -0.888802, 0.13...","(484.395, 2.332, 7672.29)","(1285.45, 31.9525, 7445.99)",7557.49,"[[0.00159201, -0.0155357, 0.0, -0.888802, 0.13...",4,"[(111.327, -97.996, 4735.77), (-17.9353, -58.4...","[(161.568, -80.815, 5788.54), (111.327, -97.99...","[[[0.165085, -0.0188863, -0.0121858], [-0.0188...","[2194.61, 466.424, 1.2325, 0.481157]","[(-50.2405, -17.1811, -1052.77), (-129.263, 39...","[1054.1, 2724.01, 1604.04, 8.49156]","[1019.87, 530.185, 49.2371, 0.00174364]",Converged


In [22]:
Feature1DComponent(df_events['seed_vtx'], 2)

0       102.4250
1        17.7867
2        67.6195
3       -58.8328
4        92.5885
          ...   
1085    -85.4424
1086     47.8751
1087    -19.2735
1088    -80.8150
1089   -108.6520
Name: seed_vtx, Length: 1090, dtype: float64

In [23]:
Feature2DComponent(df_events['seed_ci'], 1, 2)

0      -0.000592
1       0.000087
2      -0.000127
3      -0.000364
4      -0.000739
          ...   
1085   -0.000872
1086    0.002485
1087    0.002226
1088    0.002601
1089   -0.000317
Name: seed_ci, Length: 1090, dtype: float64

In [24]:
FeatureCoordinate(df_events['seed_vtx'], 'x')

0       147.98500
1      -146.16800
2      -707.80500
3      -510.43500
4       -12.38060
          ...    
1085   -869.55200
1086     -6.33693
1087     -2.77537
1088    161.56800
1089    689.09700
Name: seed_vtx, Length: 1090, dtype: float64

In [25]:
print("Preparing to save dataframe in HDF5 file...")
df_events.to_hdf(outputFile, "LHCbMC_Lb", mode='w');
print("File saved.")

Preparing to save dataframe in HDF5 file...
File saved.


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['seed_vtx', 'seed_ci', 'p_refPoint', 'p_momentum', 'p_posMomCovMatrix',
       'pim_refPoint', 'pim_momentum', 'pim_posMomCovMatrix',
       'iter_currentVertices', 'iter_previousVertices', 'iter_covMatrices',
       'iter_chi2s', 'iter_deltaVertices', 'iter_deltaDistances',
       'iter_deltaChi2s', 'status'],
      dtype='object')]

  encoding=encoding,
