Commit

Move again to pandas, because it's very convenient
and effective with the parquet format
bdrum committed Mar 26, 2021
1 parent 01e0d13 commit fa096c0
Showing 8 changed files with 1,087 additions and 648 deletions.
1,333 changes: 903 additions & 430 deletions notebooks/4TracksAnalysis.ipynb

Large diffs are not rendered by default.

337 changes: 133 additions & 204 deletions notebooks/Draft.ipynb

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions notebooks/modules/FourTrackEvents.py
@@ -3,24 +3,24 @@
import pandas as pd
from particle import Particle


Rho0 = Particle.from_pdgid(113)
Pi0 = Particle.from_pdgid(111)
PiPlus = Particle.from_pdgid(211)

# When my data file was 500 MB, working with pandas was comfortable, but now the size is 1.5 GB and parsing into pandas takes forever and eats all my memory, despite the fact that I have 32 GB on my laptop.
# So I'll move to numpy, but for a start only for data loading.


class FourTrackEvents:
-    def __init__(self, path, tree, branches, uprootLibType='np'):
-        self.orig_events = uproot4.open(
-            path, object_cache=5000, num_workers=8)[tree]
-        self.orig_tracks = self.orig_events.arrays(
-            filter_name=branches, library=uprootLibType)  # , entry_stop=100)
+    def __init__(self):
+        self.orig_events = pd.read_parquet(
+            r'D:\GoogleDrive\Job\cern\Alice\analysis\data\RhoPrime\2015\4Prongs2015oEvents.parquet')
+        self.orig_tracks = pd.read_parquet(
+            r'D:\GoogleDrive\Job\cern\Alice\analysis\data\RhoPrime\2015\4Prongs2015oTracks.parquet')

        # four-track events mask
-        # NOTE: It may not be obvious, but a dataframe can work with a mask only when the mask covers the whole dataframe. This means I have to find events with four tracks among all events, then events with zero total charge again among all events, and then put the charge mask on top of the track-count mask. In this case, e.g., we have 6 tracks in an event (four_track_mask is False) but the total charge is zero (zq_mask is True). Such an event is thrown out because four_track_mask is False.
+        # NOTE: It may not be obvious, but a dataframe can work with a mask only when the mask covers the whole dataframe.
+        # This means I have to find events with four tracks among all events, then events with zero total charge again among all events, and then put the zero-charge mask on top of the four-track mask.
+        # In this case, e.g., we have 6 tracks in an event (four_track_mask is False) but the total charge is zero (zq_mask is True).
+        # Such an event is thrown out because four_track_mask is False, even though it could be a useful event (see the sketch after this hunk).

        self._four_tracks_mask = (
            self.orig_tracks.reset_index().groupby('entry').count() == 4)

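For context, a minimal sketch of how the two event-level masks from the NOTE above could combine. Illustrative only: it assumes orig_tracks keeps the per-event 'entry' index and the per-track charge column 'T_Q' used elsewhere in this commit, and zq_mask is a hypothetical name.

# Build both masks over *all* events so they stay aligned, then combine them.
per_event = self.orig_tracks.reset_index().groupby('entry')
four_track_mask = per_event['T_Q'].count() == 4  # exactly four tracks in the event
zq_mask = per_event['T_Q'].sum() == 0            # total charge of the event is zero
good_events = four_track_mask & zq_mask          # an event must pass both cuts
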
4 changes: 2 additions & 2 deletions notebooks/modules/__init__.py
@@ -6,9 +6,9 @@
from particle import Particle


-ccup9_2015 = r'D:\GoogleDrive\Job\cern\Alice\analysis\data\RhoPrime\2015\4Prongs2015o.root'
+# ccup9_2015 = r'D:\GoogleDrive\Job\cern\Alice\analysis\data\RhoPrime\2015\4Prongs2015o.root'

-events = uproot4.open(ccup9_2015)['4Prongs/events']
+# events = uproot4.open(ccup9_2015)['4Prongs/events']
# events.show()
# dfs.loc[0] # get dataframe part with entry = 0

Empty file.
35 changes: 35 additions & 0 deletions notebooks/modules/data/convertion/convert_to_parquet.py
@@ -0,0 +1,35 @@
import uproot4
import numpy as np
import pandas as pd
import concurrent.futures

# TODO: not a real function yet (still a specific script: the hard-coded values below override the parameters)


def convert_root_file_to_parquet(root_path, tree_name, branches, parq_path):

    ccup9_2015_file = r'D:\GoogleDrive\Job\cern\Alice\analysis\data\RhoPrime\2015\4Prongs2015oLast.root'

    tree_name = '4Prongs/events'

    executor = concurrent.futures.ThreadPoolExecutor()

    branches = ['T_Px', 'T_Py', 'T_Pz', 'T_Q', 'T_NumberOfSigmaTPCPion', 'T_TPCRefit', 'T_TPCNCls', 'T_Phi',
                'T_Eta', 'T_HasPointOnITSLayer0', 'T_HasPointOnITSLayer1', 'T_ITSModuleInner', 'T_ITSModuleOuter']

    evColumns = ["RunNum", "PeriodNumber", "OrbitNumber", "BunchCrossNumber", "Mass", "Pt", "Q", "Rapidity", "Phi", "ZNAenergy", "ZNCenergy", "ZPAenergy", "ZPCenergy", "VtxX", "VtxY", "VtxZ", "VtxContrib", "VtxChi2", "VtxNDF", "SpdVtxX", "SpdVtxY",
                 "SpdVtxZ", "SpdVtxContrib", "V0Adecision", "V0Cdecision", "ADAdecision", "ADCdecision", "V0Afired", "V0Cfired", "ADAfired", "ADCfired", "STPfired", "SMBfired", "SM2fired", "SH1fired", "OM2fired", "OMUfired", "IsTriggered", "nTracklets", "nTracks"]

    events = uproot4.open(ccup9_2015_file, object_cache=5000,
                          num_workers=12, interpretation_executor=executor)[tree_name]

    data_tracks = events.arrays(
        filter_name=branches, library='pd', array_cache=5000)  # , entry_stop=1000000)
    data_tracks.to_parquet(
        r'D:\GoogleDrive\Job\cern\Alice\analysis\data\RhoPrime\2015\4Prongs2015oTracks.parquet')
    data_events = events.arrays(filter_name=evColumns, library='pd')
    chips = events.arrays(filter_name=['FORChip'], library='pd')
    chips = chips.groupby('entry').FORChip.apply(list)
    data_events['FORChip'] = chips
    data_events.to_parquet(
        r'D:\GoogleDrive\Job\cern\Alice\analysis\data\RhoPrime\2015\4Prongs2015oEvents.parquet')
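
Following the TODO above, a sketch of how this script might become the function its signature already declares: the same uproot4/pandas calls, with the hard-coded values lifted into the parameters. A hypothetical reworking, not the committed code; it writes one parquet file per call, so the tracks/events outputs above become two calls, and the FORChip list-building step is omitted.

import concurrent.futures

import uproot4


def convert_root_file_to_parquet(root_path, tree_name, branches, parq_path):
    # Read the requested branches of one tree into pandas and dump them to parquet.
    executor = concurrent.futures.ThreadPoolExecutor()
    tree = uproot4.open(root_path, object_cache=5000,
                        num_workers=12, interpretation_executor=executor)[tree_name]
    df = tree.arrays(filter_name=branches, library='pd', array_cache=5000)
    df.to_parquet(parq_path)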
2 changes: 2 additions & 0 deletions notebooks/modules/physics/analysis/ITSvsTPC_events.py
@@ -26,3 +26,5 @@ def ShowComparisonSame(title, arrs, xlabel, labels, colors, nBins=100, ranges=(0
    ax.add_patch(Rectangle((0, 0.15), 0.15, 900,
                           fc='lightgrey', alpha=0.4))
    ax.text(0.15, 0, "0.15", size=14)
+
+    return fig
4 changes: 2 additions & 2 deletions notebooks/modules/physics/kinematics.py
@@ -11,7 +11,7 @@ def pt_events(tracks):


def mass_events(tracks):
-    ETracks = np.sqrt((tracks.T_Px**2 + tracks.T_Py**2 + tracks.T_Pz **
-                       2 + (0.001*PiPlus.mass)**2)).groupby("entry").sum()
+    ETracks = np.sqrt((tracks.T_Px**2 + tracks.T_Py**2 +
+                       tracks.T_Pz**2 + (0.001*PiPlus.mass)**2)).groupby("entry").sum()
    SumTracks = tracks.groupby("entry").sum()
    return np.sqrt(ETracks**2 - SumTracks.T_Px**2 - SumTracks.T_Py**2 - SumTracks.T_Pz**2)
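
For reference, mass_events computes the invariant mass of each event under the charged-pion mass hypothesis for every track (the particle package returns masses in MeV, hence the 0.001 factor to convert to GeV):

E_i = sqrt(px_i^2 + py_i^2 + pz_i^2 + m_pi^2) per track, summed per event, then
M = sqrt((sum E_i)^2 - (sum px_i)^2 - (sum py_i)^2 - (sum pz_i)^2)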
