In [1]:
import pandas as pd
import numpy as np

In [11]:
# Read the dielectron CSV file into a DataFrame called "data" and preview it
data_filepath = "../data/dielectron.csv"
data = pd.read_csv(filepath_or_buffer=data_filepath)
data.head()

Unnamed: 0,Run,Event,E1,px1,py1,pz1,pt1,eta1,phi1,Q1,E2,px2,py2,pz2,pt2,eta2,phi2,Q2,M
0,147115,366639895,58.7141,-7.31132,10.531,-57.2974,12.8202,-2.20267,2.17766,1,11.2836,-1.03234,-1.88066,-11.0778,2.14537,-2.34403,-2.07281,-1,8.94841
1,147115,366704169,6.61188,-4.15213,-0.579855,-5.11278,4.19242,-1.02842,-3.00284,-1,17.1492,-11.7135,5.04474,11.4647,12.7536,0.808077,2.73492,1,15.893
2,147115,367112316,25.5419,-11.4809,2.04168,22.7246,11.661,1.42048,2.9656,1,15.8203,-1.4728,2.25895,-15.5888,2.69667,-2.45508,2.14857,1,38.3877
3,147115,366952149,65.3959,7.51214,11.8871,63.8662,14.0619,2.21838,1.00721,1,25.1273,4.08786,2.59641,24.6563,4.84272,2.33021,0.565865,-1,3.72862
4,147115,366523212,61.4504,2.95284,-14.6227,-59.6121,14.9179,-2.09375,-1.37154,-1,13.8871,-0.277757,-2.4256,-13.6708,2.44145,-2.4237,-1.68481,-1,2.74718


In [12]:
# Original number of rows and columns, before any cleaning step is applied
data.shape

(100000, 19)

In [14]:
# Per-column non-null counts (to spot missing values) and column data types
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 19 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Run     100000 non-null  int64  
 1   Event   100000 non-null  int64  
 2   E1      100000 non-null  float64
 3   px1     100000 non-null  float64
 4   py1     100000 non-null  float64
 5   pz1     100000 non-null  float64
 6   pt1     100000 non-null  float64
 7   eta1    100000 non-null  float64
 8   phi1    100000 non-null  float64
 9   Q1      100000 non-null  int64  
 10  E2      100000 non-null  float64
 11  px2     100000 non-null  float64
 12  py2     100000 non-null  float64
 13  pz2     100000 non-null  float64
 14  pt2     100000 non-null  float64
 15  eta2    100000 non-null  float64
 16  phi2    100000 non-null  float64
 17  Q2      100000 non-null  int64  
 18  M       99915 non-null   float64
dtypes: float64(15), int64(4)
memory usage: 14.5 MB


## Data Cleaning

In [15]:
# Remove the second, third, etc. occurrences of each duplicated row,
# keeping only the first occurrence
data = data.drop_duplicates(keep="first")
# Shape after de-duplication
data.shape

(99977, 19)

In [16]:
# Remove rows that have a missing value in any of their columns, then renumber
# the index 0..n-1. Without the reset, the labels of the rows removed here and
# by drop_duplicates would be missing from the index, and a later label lookup
# such as data.loc[i, "E1"] would raise KeyError for those i.
data = data.dropna(axis=0).reset_index(drop=True)

In [17]:
# Number of rows and columns after data cleaning
# (duplicated rows and rows with missing values removed)
data.shape

(99892, 19)

In [18]:
# Preview the first rows of the cleaned data
data.head()

Unnamed: 0,Run,Event,E1,px1,py1,pz1,pt1,eta1,phi1,Q1,E2,px2,py2,pz2,pt2,eta2,phi2,Q2,M
0,147115,366639895,58.7141,-7.31132,10.531,-57.2974,12.8202,-2.20267,2.17766,1,11.2836,-1.03234,-1.88066,-11.0778,2.14537,-2.34403,-2.07281,-1,8.94841
1,147115,366704169,6.61188,-4.15213,-0.579855,-5.11278,4.19242,-1.02842,-3.00284,-1,17.1492,-11.7135,5.04474,11.4647,12.7536,0.808077,2.73492,1,15.893
2,147115,367112316,25.5419,-11.4809,2.04168,22.7246,11.661,1.42048,2.9656,1,15.8203,-1.4728,2.25895,-15.5888,2.69667,-2.45508,2.14857,1,38.3877
3,147115,366952149,65.3959,7.51214,11.8871,63.8662,14.0619,2.21838,1.00721,1,25.1273,4.08786,2.59641,24.6563,4.84272,2.33021,0.565865,-1,3.72862
4,147115,366523212,61.4504,2.95284,-14.6227,-59.6121,14.9179,-2.09375,-1.37154,-1,13.8871,-0.277757,-2.4256,-13.6708,2.44145,-2.4237,-1.68481,-1,2.74718


In [26]:
# Normalize the column label "px1 " (note the trailing space) to "px1" so the
# column can be addressed by its clean name in the cells below.
# Reassign the result instead of using inplace=True, so the frame is not
# mutated in place and the cell is safe to re-run.
data = data.rename(columns={"px1 ": "px1"})

# Calculating Invariant Mass (M)
Let's check whether we obtain the same value for the invariant mass (M) of the system using the formula (in natural units, where $c = 1$):
$M = \sqrt{(E_1 + E_2)^2 - |\vec{p_1} + \vec{p_2}|^2}$. Let's try it for the first example (row) of the dataset.

In [46]:
# Energies of the two particles in the row with index label 0
e_1 = data.at[0, "E1"]
e_2 = data.at[0, "E2"]
# Total energy of the two-particle system
total_e = e_1 + e_2
total_e

np.float64(69.99770000000001)

The magnitude of the total momentum is obtained by adding the two momentum vectors component by component:
$|\vec{p_1} + \vec{p_2}| = \sqrt{(p_{x1} + p_{x2})^2 + (p_{y1} + p_{y2})^2 + (p_{z1} + p_{z2})^2}$

In [47]:
# Cartesian momentum components of the first particle (row with index label 0)
p_x1, p_y1, p_z1 = (data.at[0, column] for column in ("px1", "py1", "pz1"))
p_x1, p_y1, p_z1

(np.float64(-7.31132), np.float64(10.531), np.float64(-57.2974))

In [48]:
# Cartesian momentum components of the second particle (row with index label 0)
p_x2, p_y2, p_z2 = (data.at[0, column] for column in ("px2", "py2", "pz2"))
p_x2, p_y2, p_z2

(np.float64(-1.03234), np.float64(-1.88066), np.float64(-11.0778))

In [49]:
# Add the two momentum vectors component by component ...
sum_px = p_x1 + p_x2
sum_py = p_y1 + p_y2
sum_pz = p_z1 + p_z2
# ... and take the squared magnitude of the resulting vector
total_pSquared = sum_px ** 2 + sum_py ** 2 + sum_pz ** 2
total_pSquared

np.float64(4819.613019351201)

For the invariant mass we have:

In [50]:
# Invariant mass: square root of (total energy squared - total momentum squared)
mass_squared = total_e ** 2 - total_pSquared
m = np.sqrt(mass_squared)

# Note: the same check can be done for any other example by changing the row
# label 0 in the lookups of the preceding cells.

# This value differs slightly from the M stored in the dataset for this row,
# but the small difference does not matter here.
round(m, 6)

np.float64(8.947904)

# Transverse momentum (p<sub>t</sub>)

The colliding beams travel along the z-axis, so the momentum components perpendicular to it (x and y) form the transverse momentum. It is calculated using this formula: $p_t=\sqrt{p_x^2 + p_y^2}$.

In [54]:
# Transverse momentum of each particle from its x and y momentum components
p_t1, p_t2 = (
    np.sqrt(p_x ** 2 + p_y ** 2)
    for p_x, p_y in ((p_x1, p_y1), (p_x2, p_y2))
)
# Again the values match the dataset's pt1 / pt2 columns only up to rounding,
# which is good enough for this check.
round(p_t1, 6), round(p_t2, 6)

(np.float64(12.820193), np.float64(2.145369))

# Adding new features to the dataset

Let's add the total energy of the system (`T_E`), the momentum magnitude of each particle (`T_p1`, `T_p2`), and the magnitude of the total momentum of the two-particle system (`T_p`).

In [71]:
def _magnitude(p_x, p_y, p_z):
    """Return sqrt(p_x^2 + p_y^2 + p_z^2), computed element-wise."""
    return np.sqrt(p_x ** 2 + p_y ** 2 + p_z ** 2)


# Total energy of the two-particle system
data["T_E"] = data["E1"] + data["E2"]

# Momentum magnitude of each individual particle
data["T_p1"] = _magnitude(data["px1"], data["py1"], data["pz1"])
data["T_p2"] = _magnitude(data["px2"], data["py2"], data["pz2"])

# Magnitude of the vector sum of the two momenta
data["T_p"] = _magnitude(
    data["px1"] + data["px2"],
    data["py1"] + data["py2"],
    data["pz1"] + data["pz2"],
)

In [72]:
# Preview the data with the new columns (T_E, T_p1, T_p2, T_p) appended
data.head()

Unnamed: 0,Run,Event,E1,px1,py1,pz1,pt1,eta1,phi1,Q1,...,pz2,pt2,eta2,phi2,Q2,M,T_E,T_p1,T_p2,T_p
0,147115,366639895,58.7141,-7.31132,10.531,-57.2974,12.8202,-2.20267,2.17766,1,...,-11.0778,2.14537,-2.34403,-2.07281,-1,8.94841,69.9977,58.714133,11.283628,69.423433
1,147115,366704169,6.61188,-4.15213,-0.579855,-5.11278,4.19242,-1.02842,-3.00284,-1,...,11.4647,12.7536,0.808077,2.73492,1,15.893,23.76108,6.611878,17.149193,17.66353
2,147115,367112316,25.5419,-11.4809,2.04168,22.7246,11.661,1.42048,2.9656,1,...,-15.5888,2.69667,-2.45508,2.14857,1,38.3877,41.3622,25.541867,15.820325,15.401734
3,147115,366952149,65.3959,7.51214,11.8871,63.8662,14.0619,2.21838,1.00721,1,...,24.6563,4.84272,2.33021,0.565865,-1,3.72862,90.5232,65.395924,25.127377,90.446476
4,147115,366523212,61.4504,2.95284,-14.6227,-59.6121,14.9179,-2.09375,-1.37154,-1,...,-13.6708,2.44145,-2.4237,-1.68481,-1,2.74718,75.3375,61.450347,13.887097,75.287343


In [73]:
# Write the modified data to a new CSV file called "cdata.csv";
# index=False leaves the DataFrame index out of the file
cleaned_data_filepath = "../data/cdata.csv"
data.to_csv(path_or_buf=cleaned_data_filepath, index=False)