In [1]:
import re
import numpy as np
import pandas as pd
import pathlib

In [2]:
## Input log and output HDF5 file live side by side in one directory under the
## user's home. Build the paths with pathlib instead of string concatenation so
## the directory is spelled only once; both names stay plain strings.
dataDir = pathlib.Path.home() / 'data' / 'log_truth_merge_rootfile' / '10_dst'
inputFile = str(dataDir / 'davinci.log')
outputFile = str(dataDir / 'parsedDaVinciLog.h5')

# Helper functions
Should have made this a class, really.

## File management

In [3]:
def ProcessInputFile(file):
    """Read the log file and return only the lines belonging to events.

    Every line is right-stripped. The result spans from the first
    "BEGINNING EVENT" warning line up to and including the last "EVENT END"
    warning line. Lines that start with the "LambdaSel_T.Lam...WARNING "
    prologue have it removed; all other lines are kept unchanged.

    Raises ValueError (from list.index) if either marker line is missing.
    """
    warningPrologue = "LambdaSel_T.Lam...WARNING "
    beginning = "LambdaSel_T.Lam...WARNING -----------------BEGINNING EVENT-----------------"
    end = "LambdaSel_T.Lam...WARNING -------------------EVENT END---------------------"

    with open(file) as handle:
        allLines = list(map(str.rstrip, handle))

    firstKept = allLines.index(beginning)
    ## Searching the reversed list finds the LAST end marker; converting back
    ## gives the index one past it, ready to be used as a slice bound.
    pastLastKept = len(allLines) - allLines[::-1].index(end)

    prologueLength = len(warningPrologue)
    return [
        entry[prologueLength:] if entry.startswith(warningPrologue) else entry
        for entry in allLines[firstKept:pastLastKept]
    ]

## Individual parsing functions

In [4]:
def CheckPrefix(function, prefix, line):
    """Report (by printing) when `line` does not start with `prefix`.

    `function` is the calling parser; only its __name__ is used, to label the
    message. Nothing is raised and nothing is returned, so the caller carries
    on parsing even after a mismatch.
    """
    if line.startswith(prefix):
        return
    print("ERROR in ", function.__name__, ": prefix\n", prefix, "\ndoes not match line\n", line)

In [5]:
def ProcessTuple(line, prefix):
    """Parse the ', '-separated numbers that follow `prefix` into a tuple of floats.

    A prefix mismatch is only reported by CheckPrefix; the first len(prefix)
    characters are dropped regardless. Raises ValueError if a field is not a
    valid float.
    """
    CheckPrefix(ProcessTuple, prefix, line)
    payload = line[len(prefix):]
    return tuple(float(field) for field in payload.split(', '))

In [6]:
def ProcessTupleWithParentheses(line, prefix):
    """Parse a delimited, comma-separated list of numbers into a tuple of floats.

    After `prefix`, one opening character and the final character of the line
    are discarded (the enclosing parentheses, going by the function name), and
    the remainder is split on ','. A prefix mismatch is only reported by
    CheckPrefix. Raises ValueError if a field is not a valid float.
    """
    CheckPrefix(ProcessTupleWithParentheses, prefix, line)
    inner = line[len(prefix)+1:-1]
    return tuple(float(field) for field in inner.split(','))

In [7]:
def ProcessInt(line, prefix):
    """Return the integer written after `prefix` on `line`.

    A prefix mismatch is only reported by CheckPrefix; the first len(prefix)
    characters are dropped regardless. Raises ValueError if the remainder is
    not a valid integer.
    """
    CheckPrefix(ProcessInt, prefix, line)
    valueStart = len(prefix)
    return int(line[valueStart:])

In [8]:
def ProcessFloat(line, prefix):
    """Return the float written after `prefix` on `line`.

    A prefix mismatch is only reported by CheckPrefix; the first len(prefix)
    characters are dropped regardless. Raises ValueError if the remainder is
    not a valid float.
    """
    CheckPrefix(ProcessFloat, prefix, line)
    valueStart = len(prefix)
    return float(line[valueStart:])

In [9]:
def ProcessMatrix(lines, prefix):
    """Parse a matrix printed over several log lines into a list of rows.

    Parameters
    ----------
    lines : list of str
        lines[0] is the header line, expected to start with `prefix`; a
        mismatch is only reported by CheckPrefix. Every following line holds
        one row of whitespace-separated numbers. A single leading '[' and a
        single trailing ']' on a row are dropped.
    prefix : str
        Expected beginning of the header line.

    Returns
    -------
    list of list of float
        One inner list per row line. A blank row line yields an empty list
        instead of raising IndexError.

    Raises
    ------
    ValueError
        If a row still cannot be parsed after separating squished numbers.
    """
    CheckPrefix(ProcessMatrix, prefix, lines[0])
    matrix = []
    for line in lines[1:]:
        row = line.strip()
        ## startswith/endswith are safe on an empty row, unlike row[0]/row[-1].
        if row.startswith('['):
            row = row[1:]
        if row.endswith(']'):
            row = row[:-1]

        try:
            matrix.append([float(number) for number in row.split()])
        except ValueError:
            ## Whoever programmed matrix printing didn't account for the minus sign.
            ## As a result, sometimes two elements of the matrix will be squished,
            ## e.g. 0.000841-0.000119364. Insert a space in front of every minus
            ## sign, except one that directly follows an exponent marker
            ## (3.43e-5 or 3.43E-5), which must stay attached to its number.
            row = re.sub(r'(?<![eE])-', ' -', row)
            matrix.append([float(number) for number in row.split()])

    return matrix

In [10]:
def ProcessPID(line):
    """Translate a "PID: LHCb.ParticleID..." line into a particle name.

    The integer is read after skipping one character past the prefix and
    dropping the last character of the line (presumably a pair of enclosing
    parentheses). The sign of the code is ignored.

    Returns 'proton' for |PID| == 2212 and 'pion' for |PID| == 211. For any
    other code an error is printed and -1 is returned.
    """
    prefix = "PID: LHCb.ParticleID"
    CheckPrefix(ProcessPID, prefix, line)

    knownParticles = {2212: 'proton', 211: 'pion'}
    code = abs(int(line[len(prefix)+1:-1]))
    if code in knownParticles:
        return knownParticles[code]

    print("ERROR: unrecognized particle in line\n", line)
    return -1

In [11]:
def ProcessStatus(line):
    """Return the text that follows "Status: " on `line`, as a string.

    A prefix mismatch is only reported by CheckPrefix; the first
    len("Status: ") characters are dropped regardless.
    """
    prefix = "Status: "
    CheckPrefix(ProcessStatus, prefix, line)
    textStart = len(prefix)
    return line[textStart:]

## Chunk parsing functions

In [12]:
def ProcessParticleChunk(particleChunk):
    """Parse the block of log lines describing a single particle.

    The block must open with a "PID: LHCb.ParticleID" line and close with the
    END PARTICLE marker; otherwise an error is printed and -1 is returned.

    Returns the tuple (particleName, refPoint, fourMom, posMomCovMatrix), read
    from line 0, line 1, line 2 and lines 3-10 of the block respectively.
    """
    header, footer = particleChunk[0], particleChunk[-1]
    if not header.startswith("PID: LHCb.ParticleID") or footer != "---------END PARTICLE---------":
        print("ERROR: the following particle chunk is not standard:\n", particleChunk)
        return -1

    return (
        ProcessPID(header),
        ProcessTupleWithParentheses(particleChunk[1], "Reference point: "),
        ProcessTupleWithParentheses(particleChunk[2], "4-momentum: "),
        ## Header line plus the matrix rows.
        ProcessMatrix(particleChunk[3:11], "PosMomCovMatrix:"),
    )

In [13]:
def ProcessIterationChunk(iterChunk):
    """Parse the block of log lines describing one fit iteration.

    The block must open with an "Iter: " line and close with the ITER END
    marker; otherwise an error is printed and -1 is returned.

    Returns the 10-tuple
    (iteration, particle1, particle2, currentVtx, previousVtx, ci, chi2,
     deltaVtx, deltaDistance, deltaChi2),
    each element read from fixed line positions inside the block.
    """
    header, footer = iterChunk[0], iterChunk[-1]
    if not header.startswith("Iter: ") or footer != "-----------ITER END-----------":
        print("ERROR: the following iteration chunk is not standard:\n", iterChunk)
        return -1

    ## Elements are evaluated left to right, i.e. in the order they appear in the log.
    return (
        ProcessInt(header, "Iter: "),
        ProcessParticleChunk(iterChunk[2:14]),
        ProcessParticleChunk(iterChunk[14:26]),
        ProcessTuple(iterChunk[26], "x: "),
        ProcessTuple(iterChunk[27], "x0: "),
        ProcessMatrix(iterChunk[28:32], "ci:"),
        ProcessFloat(iterChunk[32], "chi2: "),
        ProcessTuple(iterChunk[33], "dx: "),
        ProcessFloat(iterChunk[34], "d1: "),
        ProcessFloat(iterChunk[35], "d2: "),
    )

In [14]:
def LineIsGood(line):
    """Return True unless `line` starts with "DaVinci::ParticleTransporter::"."""
    unwantedPrefix = "DaVinci::ParticleTransporter::"
    return line[:len(unwantedPrefix)] != unwantedPrefix

In [15]:
def CleanChunk(chunk):
    """Return a new list with the lines of `chunk` that LineIsGood accepts, in order."""
    return list(filter(LineIsGood, chunk))

In [16]:
def ProcessChunk(chunk):
    """Parse the lines of one event into a flat dictionary.

    The chunk is first filtered through CleanChunk. It must then start with
    the BEGINNING EVENT marker and finish with the EVENT END marker; if not,
    an error is printed and -1 is returned.

    Layout relied upon (indices into the cleaned chunk):
      2        initial vertex
      3        initial chi2
      4-7      initial ci matrix (header + rows)
      9-20     first particle block
      21-32    second particle block
      34...    iteration blocks, 37 lines each
      -2       status line

    Returns a dict holding the seed quantities, proton ("p_") and pion
    ("pim_") information, one list per "iter_" quantity with one entry per
    iteration, and the status string. In each particle pair, whichever block
    is not named 'proton' is taken to be the pion.

    Raises ValueError if the lines between index 34 and the status line do
    not split into a whole number of 37-line iteration blocks.
    """
    lengthOfIterationChunk = 37
    firstIterationStartingPoint = 34 ## Index corresponding to Iter: 1

    ## Output keys of the per-iteration lists, in the order the values are
    ## assembled inside the loop below.
    iterationKeys = (
        "iter_p_refPoint",
        "iter_p_momentum",
        "iter_p_energy",
        "iter_p_posMomCovMatrix",
        "iter_pim_refPoint",
        "iter_pim_momentum",
        "iter_pim_energy",
        "iter_pim_posMomCovMatrix",
        "iter_currentVertices",
        "iter_previousVertices",
        "iter_covMatrices",
        "iter_chi2s",
        "iter_deltaVertices",
        "iter_deltaDistances",
        "iter_deltaChi2s",
    )

    chunk = CleanChunk(chunk)

    isStandard = (
        chunk[0] == "-----------------BEGINNING EVENT-----------------"
        and chunk[-1] == "-------------------EVENT END---------------------"
    )
    if not isStandard:
        print("ERROR: the following chunk is not standard:\n", chunk)
        return -1

    initVtx = ProcessTuple(chunk[2], "Initial vtx: ")
    initChi2 = ProcessFloat(chunk[3], "Initial chi2: ")
    initCi = ProcessMatrix(chunk[4:8], "Initial ci:")

    initParticle1 = ProcessParticleChunk(chunk[9:21])
    initParticle2 = ProcessParticleChunk(chunk[21:33])
    if initParticle1[0] == 'proton':
        initProtonInfo, initPionInfo = initParticle1, initParticle2
    else:
        initProtonInfo, initPionInfo = initParticle2, initParticle1

    ## Everything between the first iteration and the status line must be
    ## made of complete iteration blocks.
    iterationLines = chunk[firstIterationStartingPoint:-2]
    numberOfIterations = len(iterationLines) / lengthOfIterationChunk
    if not numberOfIterations.is_integer():
        raise ValueError("ERROR: number of iterations", numberOfIterations, "is not an integer. Follows the chunk:\n", chunk)

    perIteration = {key: [] for key in iterationKeys}

    for iIter in range(int(numberOfIterations)):
        startIterIndex = firstIterationStartingPoint + iIter * lengthOfIterationChunk ## Iter: number
        endIterIndex = startIterIndex + lengthOfIterationChunk ## ITER END

        iterationInfo = ProcessIterationChunk(chunk[startIterIndex:endIterIndex])

        if iterationInfo[1][0] == 'proton':
            protonInfo, pionInfo = iterationInfo[1], iterationInfo[2]
        else:
            protonInfo, pionInfo = iterationInfo[2], iterationInfo[1]

        iterationValues = (
            protonInfo[1],
            protonInfo[2][:3],
            protonInfo[2][3],
            protonInfo[3],
            pionInfo[1],
            pionInfo[2][:3],
            pionInfo[2][3],
            pionInfo[3],
            iterationInfo[3],
            iterationInfo[4],
            iterationInfo[5],
            iterationInfo[6],
            iterationInfo[7],
            iterationInfo[8],
            iterationInfo[9],
        )
        for key, value in zip(iterationKeys, iterationValues):
            perIteration[key].append(value)

    status = ProcessStatus(chunk[-2])

    ## Key order matters: it becomes the column order of the DataFrame.
    dictionary = {
        "seed_vtx": initVtx,
        "seed_chi2": initChi2,
        "seed_ci": initCi,
        "p_refPoint": initProtonInfo[1],
        "p_momentum": initProtonInfo[2][:3],
        "p_energy": initProtonInfo[2][3],
        "p_posMomCovMatrix": initProtonInfo[3],
        "pim_refPoint": initPionInfo[1],
        "pim_momentum": initPionInfo[2][:3],
        "pim_energy": initPionInfo[2][3],
        "pim_posMomCovMatrix": initPionInfo[3],
        "numberOfIterations": int(numberOfIterations),
    }
    dictionary.update(perIteration)
    dictionary["status"] = status

    return dictionary

## Full file parsing function(s)

In [17]:
def ProcessLogFile(lines, verbose=False):
    """Split the log into events and parse each one with ProcessChunk.

    Parameters
    ----------
    lines : list of str
        Log lines, with the warning prologue already removed from the marker
        lines (the markers are matched by exact equality).
    verbose : bool, optional
        If True, print the index and text of the first and last line of every
        event found.

    Returns
    -------
    list
        One ProcessChunk result per event, in file order.

    Raises
    ------
    ValueError
        If a BEGINNING EVENT marker has no EVENT END marker after it.
    """
    chunkBegin = "-----------------BEGINNING EVENT-----------------"
    chunkEnd =   "-------------------EVENT END---------------------"

    listOfEvents = []
    searchStart = 0

    while True:
        ## list.index with a start offset scans in place. Slicing
        ## lines[searchStart:] first would copy the remainder of the log for
        ## every single event, which is quadratic in the size of the file.
        try:
            beginIndex = lines.index(chunkBegin, searchStart)
        except ValueError:
            break

        ## Look for the end marker AFTER the begin marker, so that a stray end
        ## marker sitting before it cannot produce an empty or reversed chunk.
        try:
            endIndex = lines.index(chunkEnd, beginIndex) + 1
        except ValueError:
            raise ValueError(
                "event starting at line index " + str(beginIndex) + " has no matching end marker"
            ) from None

        if verbose:
            print(beginIndex, "\t", lines[beginIndex])
            print(endIndex, "\t", lines[endIndex-1])

        listOfEvents.append(ProcessChunk(lines[beginIndex:endIndex]))
        searchStart = endIndex

    return listOfEvents

# DataFrame helper functions

## Feature component functions
We'll use nested information (lists within the DataFrame), which makes it a bit hard to single out individual components. These functions help with that.

In [18]:
## Components are 1-based here: use 1,2,3 instead of 0,1,2.
def Feature1DComponent(series, component):
    """Return a Series holding element number `component` (1-based) of every entry of `series`."""
    position = component - 1

    def pick(entry):
        return entry[position]

    return series.map(pick)

In [19]:
def Feature2DComponent(series, row, column):
    """Return a Series holding element (`row`, `column`), both 1-based, of every entry of `series`."""
    rowIndex, columnIndex = row - 1, column - 1

    def pick(matrix):
        return matrix[rowIndex][columnIndex]

    return series.map(pick)

In [20]:
def FeatureCoordinate(series, coordinate):
    """Return the 'x', 'y' or 'z' component of every entry of `series`.

    The letter is mapped to the 1-based component number (x=1, y=2, z=3) and
    handed to Feature1DComponent. Raises KeyError for any other `coordinate`.
    """
    componentOf = dict(x=1, y=2, z=3)
    component = componentOf[coordinate]
    return Feature1DComponent(series, component)

# Data parsing
Finally!

In [21]:
# Read the log, keeping only the region between the first BEGINNING EVENT and
# the last EVENT END marker, with the warning prologue stripped.
lines = ProcessInputFile(inputFile)
# One entry per event: the dictionary built by ProcessChunk (or -1 if the
# chunk was reported as non-standard).
events = ProcessLogFile(lines)
# Build the DataFrame from the per-event dictionaries; tuples and lists are
# stored as-is inside the cells (object columns).
df_events = pd.json_normalize(events)
# Last expression of the cell: displays the DataFrame.
df_events

Unnamed: 0,seed_vtx,seed_chi2,seed_ci,p_refPoint,p_momentum,p_energy,p_posMomCovMatrix,pim_refPoint,pim_momentum,pim_energy,...,iter_pim_energy,iter_pim_posMomCovMatrix,iter_currentVertices,iter_previousVertices,iter_covMatrices,iter_chi2s,iter_deltaVertices,iter_deltaDistances,iter_deltaChi2s,status
0,"(490.165, -81.5147, 6890.6)",0.0,"[[0.260762, -9.68128e-06, -0.0160285], [-9.681...","(549.518, -95.402, 7668.29)","(3430.55, -685.364, 44948.5)",45094.20,"[[0.00077284, -0.000219784, 0.0, -6.99475, 1.3...","(526.787, -81.7488, 7672.29)","(417.328, -23.5438, 8909.14)",8920.03,...,"[8920.03, 8920.03, 8920.03, 8920.03, 8920.03]","[[[0.0608783, 0.00363242, 0.0, 2.62175, -0.188...","[(466.514, -78.6802, 6560.05), (460.757, -78.0...","[(490.165, -81.5147, 6890.6), (466.514, -78.68...","[[[23.5075, -0.296555, -1.49193], [-0.296555, ...","[0.110586, 2.17313, 1.94896, 2.08533, 2.03551]","[(-23.6508, 2.83455, -330.554), (-5.75665, 0.5...","[331.411, 74.4456, 24.6899, 10.4051, 4.14922]","[316.37, 4.79309, 0.0829486, 0.0147622, 0.0022...",Converged
1,"(320.552, 280.206, 5993.5)",0.0,"[[0.0253826, -0.000505677, -0.003847], [-0.000...","(276.44, 521.059, 7868.09)","(-166.324, 603.669, 6593.58)",6689.38,"[[0.0130188, 0.0751473, 0.0, -0.963057, 2.17, ...","(835.182, 222.187, 7862.48)","(785.515, 25.1585, 2851.02)",2960.65,...,"[2960.65, 2960.65, 2960.65, 2960.65]","[[[42.5846, -5.94154, 0.0, -327.961, -1.82402,...","[(241.136, 219.025, 4797.19), (206.688, 193.67...","[(320.552, 280.206, 5993.5), (241.136, 219.025...","[[[0.0964084, -0.0171089, -0.00361121], [-0.01...","[87.8083, 12.5417, 9.06983, 10.2028]","[(-79.4158, -61.181, -1196.31), (-34.4484, -25...","[1200.51, 504.362, 92.031, 5.36262]","[823.108, 23.4586, 0.611948, 0.00275324]",Converged
2,"(25.4024, 11.0184, 5397.19)",0.0,"[[0.148349, -0.000474292, -0.00380217], [-0.00...","(128.307, 10.7263, 7668.29)","(1137.56, 20.0555, 25103.6)",25146.90,"[[0.00073441, 0.000658215, 0.0, -0.981496, -0....","(-481.751, 105.728, 7862.59)","(-1097.57, 69.3867, 5342.46)",5456.27,...,"[5456.27, 5456.27, 5456.27, 5456.27, 5456.27, ...","[[[91.5888, -4.69065, 0.0, -985.665, 71.3985, ...","[(21.3817, 9.95645, 4581.33), (17.8489, 9.4688...","[(25.4024, 11.0184, 5397.19), (21.3817, 9.9564...","[[[0.310462, -0.0141491, -0.00679856], [-0.014...","[49.9206, 27.0553, 17.6368, 3.4029, 0.0240505,...","[(-4.02071, -1.06197, -815.865), (-3.5328, -0....","[815.876, 494.278, 752.499, 2550.88, 502.771, ...","[199.006, 6.10129, 2.90123, 7.32744, 0.281375,...",Converged
3,"(-21.0909, 12.8729, 5197.18)",0.0,"[[0.261674, -0.000859306, -0.00935076], [-0.00...","(-129.611, 12.2261, 7672.24)","(-1331.74, 50.4607, 30379.4)",30423.10,"[[0.00073441, 0.000670873, 0.0, -1.35959, 0.02...","(282.432, 24.766, 7668.29)","(957.014, 22.8388, 7791.67)",7851.50,...,"[7851.5, 7851.5, 7851.5, 7851.5, 7851.5, 7851.5]","[[[24.8429, 0.335649, 0.0, -313.901, -8.16488,...","[(-10.4738, 9.89561, 4133.44), (0.344467, 7.46...","[(-21.0909, 12.8729, 5197.18), (-10.4738, 9.89...","[[[0.263286, -0.0189893, 0.00373331], [-0.0189...","[14.6889, 5.86996, 3.36358, 1.70314, 2.54495, ...","[(10.6171, -2.97729, -1063.74), (10.8182, -2.4...","[1063.8, 1078.56, 1996.53, 1029.53, 232.534, 2...","[265.583, 7.19944, 0.836226, 0.258878, 0.01797...",Converged
4,"(92.6954, -131.965, 6239.05)",0.0,"[[0.252123, -0.00077747, -0.000134384], [-0.00...","(132.141, -159.991, 7754.79)","(1043.11, -815.329, 40094.5)",40127.30,"[[0.00080656, 0.000742193, 0.0, -1.55671, 1.11...","(55.7184, -173.357, 7750.79)","(-398.533, -417.676, 16294.7)",16305.50,...,"[16305.5, 16305.5, 16305.5, 16305.5]","[[[0.695649, 0.0245866, 0.0, -22.7603, -22.123...","[(87.1915, -120.788, 5756.18), (85.4411, -117....","[(92.6954, -131.965, 6239.05), (87.1915, -120....","[[[3.99382, -0.112449, -0.0398208], [-0.112449...","[3.99005, 1.5046, 1.19377, 1.14857]","[(-5.50388, 11.177, -482.874), (-1.75048, 3.56...","[483.034, 154.944, 36.8421, 7.35844]","[186.837, 2.66068, 0.0743883, 0.00248775]",Converged
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10431,"(-869.552, -85.4424, 10409.1)",0.0,"[[0.122858, -0.000871916, 0.0172491], [-0.0008...","(-783.797, -42.3525, 7853.69)","(-347.049, -69.6395, 10360.5)",10408.90,"[[0.00571536, 0.00487549, 0.0, 1.08656, 0.1888...","(-436.343, -76.9904, 7672.24)","(-1109.36, -38.3318, 7007.87)",7096.61,...,"[7097.3, 7097.3, 7097.3]","[[[0.497066, 0.865569, 0.0, -46.7121, 0.660276...","[(-847.033, -65.421, 10120.3), (-846.99, -64.9...","[(-869.552, -85.4424, 10409.1), (-847.033, -65...","[[[8.28967, -0.0261719, 0.553699], [-0.0261719...","[74.2682, 96.1596, 96.3151]","[(22.5189, 20.0214, -288.802), (0.0423759, 0.4...","[290.37, 1.75959, 0.00762202]","[3606.63, 0.421589, 1.34646e-05]",Converged
10432,"(-6.33693, 47.8751, 4840.09)",0.0,"[[0.213575, 0.00248458, -0.00607877], [0.00248...","(-230.828, 117.343, 7751.84)","(-1287.91, 274.412, 16700.1)",16778.20,"[[0.00081225, 0.000496059, 0.0, -1.38142, 0.28...","(484.395, 2.332, 7672.29)","(1285.45, 31.9525, 7445.99)",7557.49,...,"[7557.49, 7557.49, 7557.49, 7557.49, 7557.49]","[[[68.9682, 0.209614, 0.0, -816.702, -20.4365,...","[(-2.98968, 28.5927, 3453.31), (-6.53087, -4.1...","[(-6.33693, 47.8751, 4840.09), (-2.98968, 28.5...","[[[0.0412808, 0.00956795, -0.000385316], [0.00...","[188.658, 39.9437, 3.45855, 1.60207, 1.53739]","[(3.34724, -19.2824, -1386.78), (-3.54119, -32...","[1386.91, 2580.25, 1981.56, 132.435, 4.65559]","[301.349, 50.7064, 6.47518, 0.0175542, 2.12254...",Converged
10433,"(-2.77537, -19.2735, 4848.62)",0.0,"[[0.213945, 0.00222587, -0.0108378], [0.002225...","(-111.448, -38.3212, 7672.24)","(-1101.28, -160.999, 28618.0)",28655.00,"[[0.00081225, 0.000789541, 0.0, -1.15462, -0.1...","(484.395, 2.332, 7672.29)","(1285.45, 31.9525, 7445.99)",7557.49,...,"[7557.49, 7557.49, 7557.49, 7557.49]","[[[68.0371, 0.205788, 0.0, -811.144, -20.2947,...","[(-8.5721, -15.4629, 3625.76), (-12.8632, -13....","[(-2.77537, -19.2735, 4848.62), (-8.5721, -15....","[[[0.148786, -0.00985489, 0.000870163], [-0.00...","[0.455797, 1.09364, 0.956296, 1.04055]","[(-5.79673, 3.81059, -1222.86), (-4.2911, 2.40...","[1222.88, 575.041, 111.157, 35.1279]","[246.57, 3.22212, 0.0494256, 0.00493449]",Converged
10434,"(161.568, -80.815, 5788.54)",0.0,"[[0.104436, 0.00260099, -0.015545], [0.0026009...","(151.481, -291.649, 7858.86)","(-119.887, -1066.72, 29982.5)",30016.30,"[[0.00725904, 0.0137366, 0.0, -2.48066, -15.60...","(484.395, 2.332, 7672.29)","(1285.45, 31.9525, 7445.99)",7557.49,...,"[7557.49, 7557.49, 7557.49, 7557.49]","[[[11.4766, -0.0345891, 0.0, -330.583, -8.0697...","[(111.327, -97.996, 4735.77), (-17.9353, -58.4...","[(161.568, -80.815, 5788.54), (111.327, -97.99...","[[[0.165085, -0.0188863, -0.0121858], [-0.0188...","[2194.61, 466.424, 1.2325, 0.481157]","[(-50.2405, -17.1811, -1052.77), (-129.263, 39...","[1054.1, 2724.01, 1604.04, 8.49156]","[1019.87, 530.185, 49.2371, 0.00174364]",Converged


In [22]:
# Example: second component (1-based) of the seed vertex of every event.
Feature1DComponent(df_events['seed_vtx'], 2)

0        -81.5147
1        280.2060
2         11.0184
3         12.8729
4       -131.9650
           ...   
10431    -85.4424
10432     47.8751
10433    -19.2735
10434    -80.8150
10435   -108.6520
Name: seed_vtx, Length: 10436, dtype: float64

In [23]:
# Example: element in row 1, column 2 (both 1-based) of seed_ci for every event.
Feature2DComponent(df_events['seed_ci'], 1, 2)

0       -0.000010
1       -0.000506
2       -0.000474
3       -0.000859
4       -0.000777
           ...   
10431   -0.000872
10432    0.002485
10433    0.002226
10434    0.002601
10435   -0.000317
Name: seed_ci, Length: 10436, dtype: float64

In [24]:
# Example: 'x' (i.e. first) component of the seed vertex of every event.
FeatureCoordinate(df_events['seed_vtx'], 'x')

0        490.16500
1        320.55200
2         25.40240
3        -21.09090
4         92.69540
           ...    
10431   -869.55200
10432     -6.33693
10433     -2.77537
10434    161.56800
10435    689.09700
Name: seed_vtx, Length: 10436, dtype: float64

In [25]:
## Write the parsed events to the HDF5 output file, overwriting any existing
## file (mode='w'). `key` is passed by keyword: recent pandas versions deprecate
## passing it positionally. The tuple/list columns are object-typed, so PyTables
## pickles them and emits the performance warning shown in the output below.
print("Preparing to save dataframe in HDF5 file...")
df_events.to_hdf(outputFile, key="LHCbMC_Lb", mode='w')
print("File saved.")

Preparing to save dataframe in HDF5 file...
File saved.


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['seed_vtx', 'seed_ci', 'p_refPoint', 'p_momentum', 'p_posMomCovMatrix',
       'pim_refPoint', 'pim_momentum', 'pim_posMomCovMatrix',
       'iter_p_refPoint', 'iter_p_momentum', 'iter_p_energy',
       'iter_p_posMomCovMatrix', 'iter_pim_refPoint', 'iter_pim_momentum',
       'iter_pim_energy', 'iter_pim_posMomCovMatrix', 'iter_currentVertices',
       'iter_previousVertices', 'iter_covMatrices', 'iter_chi2s',
       'iter_deltaVertices', 'iter_deltaDistances', 'iter_deltaChi2s',
       'status'],
      dtype='object')]

  encoding=encoding,
