Commit

Move again to pandas, because it's very convenient
and effective with the parquet format
bdrum committed Mar 26, 2021
1 parent 01e0d13 commit fa096c0
Showing 8 changed files with 1,087 additions and 648 deletions.
1,333 changes: 903 additions & 430 deletions notebooks/4TracksAnalysis.ipynb

Large diffs are not rendered by default.

337 changes: 133 additions & 204 deletions notebooks/Draft.ipynb

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions notebooks/modules/FourTrackEvents.py
@@ -3,24 +3,24 @@
import pandas as pd
from particle import Particle


Rho0 = Particle.from_pdgid(113)
Pi0 = Particle.from_pdgid(111)
PiPlus = Particle.from_pdgid(211)

# When my data file was 500 MB, working with pandas was comfortable, but now the size is 1.5 GB and parsing into pandas takes forever and eats all my memory, despite the fact that I have 32 GB on my laptop.
# So I'll move to numpy, but for a start only for data loading.


class FourTrackEvents:
-    def __init__(self, path, tree, branches, uprootLibType='np'):
-        self.orig_events = uproot4.open(
-            path, object_cache=5000, num_workers=8)[tree]
-        self.orig_tracks = self.orig_events.arrays(
-            filter_name=branches, library=uprootLibType)  # , entry_stop=100)
+    def __init__(self):
+        self.orig_events = pd.read_parquet(
+            r'D:\GoogleDrive\Job\cern\Alice\analysis\data\RhoPrime\2015\4Prongs2015oEvents.parquet')
+        self.orig_tracks = pd.read_parquet(
+            r'D:\GoogleDrive\Job\cern\Alice\analysis\data\RhoPrime\2015\4Prongs2015oTracks.parquet')

        # four-track events mask
-        # NOTE: It may not be obvious, but a dataframe can work with a mask only when the mask covers the whole dataframe. This means I have to find events with four tracks among all events, then events with zero total charge again among all events, and then put the charge mask on top of the track-count mask. In this case, e.g., we have 6 tracks in an event (four_track_mask is False) but the total charge is zero (zq_mask is True). Such an event is thrown out because four_track_mask is False.
+        # NOTE: It may not be obvious, but a dataframe can work with a mask only when the mask covers the whole dataframe.
+        # This means I have to find events with four tracks among all events, then events with zero total charge again among all events, and then put the zero-charge mask on top of the four-track mask.
+        # In this case, e.g., we have 6 tracks in an event (four_track_mask is False) but the total charge is zero (zq_mask is True).
+        # Such an event is thrown out because four_track_mask is False, even though it could be a useful event (see the sketch after this hunk).

        self._four_tracks_mask = (
            self.orig_tracks.reset_index().groupby('entry').count() == 4)

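For context, a minimal sketch of how the two event-level masks from the NOTE above could combine. Illustrative only: it assumes orig_tracks keeps the per-event 'entry' index and the per-track charge column 'T_Q' used elsewhere in this commit, and zq_mask is a hypothetical name.

# Build both masks over *all* events so they stay aligned, then combine them.
per_event = self.orig_tracks.reset_index().groupby('entry')
four_track_mask = per_event['T_Q'].count() == 4  # exactly four tracks in the event
zq_mask = per_event['T_Q'].sum() == 0            # total charge of the event is zero
good_events = four_track_mask & zq_mask          # an event must pass both cuts
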
4 changes: 2 additions & 2 deletions notebooks/modules/__init__.py
@@ -6,9 +6,9 @@
from particle import Particle


-ccup9_2015 = r'D:\GoogleDrive\Job\cern\Alice\analysis\data\RhoPrime\2015\4Prongs2015o.root'
+# ccup9_2015 = r'D:\GoogleDrive\Job\cern\Alice\analysis\data\RhoPrime\2015\4Prongs2015o.root'

-events = uproot4.open(ccup9_2015)['4Prongs/events']
+# events = uproot4.open(ccup9_2015)['4Prongs/events']
# events.show()
# dfs.loc[0] # get dataframe part with entry = 0

Empty file.
35 changes: 35 additions & 0 deletions notebooks/modules/data/convertion/convert_to_parquet.py
@@ -0,0 +1,35 @@
import uproot4
import numpy as np
import pandas as pd
import concurrent.futures

# TODO: not a real function yet (still a specific script: the hard-coded values below override the parameters)


def convert_root_file_to_parquet(root_path, tree_name, branches, parq_path):

    ccup9_2015_file = r'D:\GoogleDrive\Job\cern\Alice\analysis\data\RhoPrime\2015\4Prongs2015oLast.root'

    tree_name = '4Prongs/events'

    executor = concurrent.futures.ThreadPoolExecutor()

    branches = ['T_Px', 'T_Py', 'T_Pz', 'T_Q', 'T_NumberOfSigmaTPCPion', 'T_TPCRefit', 'T_TPCNCls', 'T_Phi',
                'T_Eta', 'T_HasPointOnITSLayer0', 'T_HasPointOnITSLayer1', 'T_ITSModuleInner', 'T_ITSModuleOuter']

    evColumns = ["RunNum", "PeriodNumber", "OrbitNumber", "BunchCrossNumber", "Mass", "Pt", "Q", "Rapidity", "Phi", "ZNAenergy", "ZNCenergy", "ZPAenergy", "ZPCenergy", "VtxX", "VtxY", "VtxZ", "VtxContrib", "VtxChi2", "VtxNDF", "SpdVtxX", "SpdVtxY",
                 "SpdVtxZ", "SpdVtxContrib", "V0Adecision", "V0Cdecision", "ADAdecision", "ADCdecision", "V0Afired", "V0Cfired", "ADAfired", "ADCfired", "STPfired", "SMBfired", "SM2fired", "SH1fired", "OM2fired", "OMUfired", "IsTriggered", "nTracklets", "nTracks"]

    events = uproot4.open(ccup9_2015_file, object_cache=5000,
                          num_workers=12, interpretation_executor=executor)[tree_name]

    data_tracks = events.arrays(
        filter_name=branches, library='pd', array_cache=5000)  # , entry_stop=1000000)
    data_tracks.to_parquet(
        r'D:\GoogleDrive\Job\cern\Alice\analysis\data\RhoPrime\2015\4Prongs2015oTracks.parquet')
    data_events = events.arrays(filter_name=evColumns, library='pd')
    chips = events.arrays(filter_name=['FORChip'], library='pd')
    chips = chips.groupby('entry').FORChip.apply(list)
    data_events['FORChip'] = chips
    data_events.to_parquet(
        r'D:\GoogleDrive\Job\cern\Alice\analysis\data\RhoPrime\2015\4Prongs2015oEvents.parquet')
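
Following the TODO above, a sketch of how this script might become the function its signature already declares: the same uproot4/pandas calls, with the hard-coded values lifted into the parameters. A hypothetical reworking, not the committed code; it writes one parquet file per call, so the tracks/events outputs above become two calls, and the FORChip list-building step is omitted.

import concurrent.futures

import uproot4


def convert_root_file_to_parquet(root_path, tree_name, branches, parq_path):
    # Read the requested branches of one tree into pandas and dump them to parquet.
    executor = concurrent.futures.ThreadPoolExecutor()
    tree = uproot4.open(root_path, object_cache=5000,
                        num_workers=12, interpretation_executor=executor)[tree_name]
    df = tree.arrays(filter_name=branches, library='pd', array_cache=5000)
    df.to_parquet(parq_path)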
2 changes: 2 additions & 0 deletions notebooks/modules/physics/analysis/ITSvsTPC_events.py
@@ -26,3 +26,5 @@ def ShowComparisonSame(title, arrs, xlabel, labels, colors, nBins=100, ranges=(0
    ax.add_patch(Rectangle((0, 0.15), 0.15, 900,
                           fc='lightgrey', alpha=0.4))
    ax.text(0.15, 0, "0.15", size=14)
+
+    return fig
4 changes: 2 additions & 2 deletions notebooks/modules/physics/kinematics.py
@@ -11,7 +11,7 @@ def pt_events(tracks):


def mass_events(tracks):
-    ETracks = np.sqrt((tracks.T_Px**2 + tracks.T_Py**2 + tracks.T_Pz **
-                       2 + (0.001*PiPlus.mass)**2)).groupby("entry").sum()
+    ETracks = np.sqrt((tracks.T_Px**2 + tracks.T_Py**2 +
+                       tracks.T_Pz**2 + (0.001*PiPlus.mass)**2)).groupby("entry").sum()
    SumTracks = tracks.groupby("entry").sum()
    return np.sqrt(ETracks**2 - SumTracks.T_Px**2 - SumTracks.T_Py**2 - SumTracks.T_Pz**2)
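
For reference, mass_events computes the invariant mass of each event under the charged-pion mass hypothesis for every track (the particle package returns masses in MeV, hence the 0.001 factor to convert to GeV):

E_i = sqrt(px_i^2 + py_i^2 + pz_i^2 + m_pi^2) per track, summed per event, then
M = sqrt((sum E_i)^2 - (sum px_i)^2 - (sum py_i)^2 - (sum pz_i)^2)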
