In [24]:
import pandas as pd
import numpy as np

In [25]:
# Relative path of the input CSV file.
data_filepath = "../data/cdata.csv"
# Load the CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=data_filepath)

In [26]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Run,Event,E1,px1,py1,pz1,pt1,eta1,phi1,Q1,...,pz2,pt2,eta2,phi2,Q2,M,T_E,T_p1,T_p2,T_p
0,147115,366639895,58.7141,-7.31132,10.531,-57.2974,12.8202,-2.20267,2.17766,1,...,-11.0778,2.14537,-2.34403,-2.07281,-1,8.94841,69.9977,58.714133,11.283628,69.423433
1,147115,366704169,6.61188,-4.15213,-0.579855,-5.11278,4.19242,-1.02842,-3.00284,-1,...,11.4647,12.7536,0.808077,2.73492,1,15.893,23.76108,6.611878,17.149193,17.66353
2,147115,367112316,25.5419,-11.4809,2.04168,22.7246,11.661,1.42048,2.9656,1,...,-15.5888,2.69667,-2.45508,2.14857,1,38.3877,41.3622,25.541867,15.820325,15.401734
3,147115,366952149,65.3959,7.51214,11.8871,63.8662,14.0619,2.21838,1.00721,1,...,24.6563,4.84272,2.33021,0.565865,-1,3.72862,90.5232,65.395924,25.127377,90.446476
4,147115,366523212,61.4504,2.95284,-14.6227,-59.6121,14.9179,-2.09375,-1.37154,-1,...,-13.6708,2.44145,-2.4237,-1.68481,-1,2.74718,75.3375,61.450347,13.887097,75.287343


## Feature Engineering

Here, we add two additional features to the dataset: the square of the system's total energy and the square of its total momentum. These are the terms of the invariant-mass relation $M^2 = E^2 - p^2$, which a linear model cannot build from the unsquared columns on its own. The goal is to reduce the errors we got in the previous notebooks. We use scikit-learn to implement the linear regression model.

In [27]:
# Derived features: element-wise squares of the total-energy and
# total-momentum columns, stored as new columns on the same frame.
data["T_E_squared"] = data["T_E"].pow(2)
data["T_p_squared"] = data["T_p"].pow(2)
# Disabled experiments, kept for reference:
#data["E1_squared"] = data["E1"] ** 2
#data["E2_squared"] = data["E2"] ** 2
#data["E1 times E2"] = data["E1"] * data["E2"]
#data["physics_formula"] = np.sqrt(data["T_E_squared"] - data["T_p_squared"])

In [28]:
# Preview the first five rows again, now including the two squared columns.
data.head(n=5)

Unnamed: 0,Run,Event,E1,px1,py1,pz1,pt1,eta1,phi1,Q1,...,eta2,phi2,Q2,M,T_E,T_p1,T_p2,T_p,T_E_squared,T_p_squared
0,147115,366639895,58.7141,-7.31132,10.531,-57.2974,12.8202,-2.20267,2.17766,1,...,-2.34403,-2.07281,-1,8.94841,69.9977,58.714133,11.283628,69.423433,4899.678005,4819.613019
1,147115,366704169,6.61188,-4.15213,-0.579855,-5.11278,4.19242,-1.02842,-3.00284,-1,...,0.808077,2.73492,1,15.893,23.76108,6.611878,17.149193,17.66353,564.588923,312.000301
2,147115,367112316,25.5419,-11.4809,2.04168,22.7246,11.661,1.42048,2.9656,1,...,-2.45508,2.14857,1,38.3877,41.3622,25.541867,15.820325,15.401734,1710.831589,237.213404
3,147115,366952149,65.3959,7.51214,11.8871,63.8662,14.0619,2.21838,1.00721,1,...,2.33021,0.565865,-1,3.72862,90.5232,65.395924,25.127377,90.446476,8194.449738,8180.565068
4,147115,366523212,61.4504,2.95284,-14.6227,-59.6121,14.9179,-2.09375,-1.37154,-1,...,-2.4237,-1.68481,-1,2.74718,75.3375,61.450347,13.887097,75.287343,5675.738906,5668.184034


In [29]:
# Split the frame into the regression target and the predictor matrix.
y = data["M"]  # target column
# `columns=` already selects the column axis, so the redundant `axis=1` is dropped.
# "Run" and "Event" are excluded from the predictors together with the target.
# NOTE(review): the "Unnamed: 0" column shown by data.head() is still kept as a
# predictor; it looks like a saved row index - confirm whether it should be dropped too.
x = data.drop(columns=["M", "Run", "Event"])

In [30]:
# Hold out 30% of the rows as a test set; random_state is fixed at 42.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [31]:
# Fit a linear regression on the training split, then predict the target
# for the held-out rows.
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(x_train, y_train)
yPredictions = model.predict(x_test)

In [32]:
# First five predicted values.
yPredictions[0:5]

array([26.36841347, 24.25614274, 18.93690793, 29.1742714 , 12.06428913])

In [33]:
# First five true target values of the test split, selected by position.
y_test.iloc[:5]

89877    32.16470
63939     6.79365
97180    22.24300
19919    35.56790
82715     4.14619
Name: M, dtype: float64

In [34]:
# Evaluate the model on the held-out split.
from sklearn.metrics import mean_absolute_error, mean_squared_error

# scikit-learn's metric signature is (y_true, y_pred): ground truth first.
# Both metrics used here are symmetric, so the values are unchanged, but the
# conventional order keeps the calls correct if an asymmetric metric is added.
mae = mean_absolute_error(y_test, yPredictions)
mse = mean_squared_error(y_test, yPredictions)
rmse = np.sqrt(mse)  # root of the MSE

In [35]:
# Report the three error metrics, one per line.
report = f"Mean squared error: {mse};\nRoot Mean Squared Error: {rmse};\nMean Absolute Error: {mae}."
print(report)

Mean squared error: 38.02626711394391;
Root Mean Squared Error: 6.166544179193393;
Mean Absolute Error: 4.932157706198696.


We did reduce the errors: the RMSE dropped from 9.25 to 6.17, an improvement of around 33%.

In [37]:
# Let's export the data, yPredictions, and y_test to CSV files in order to plot them in a new notebook.

# index=False keeps the row index out of the file; writing it would add another
# unnamed index column on top of the "Unnamed: 0" one the input already carries.
data.to_csv("../data/final_data.csv", index=False)
# One value per row; y_test's row labels are not written.
np.savetxt("../data/yPredictions.csv", yPredictions, delimiter=",")
np.savetxt("../data/y_test.csv", y_test, delimiter=",")

In [4]:
# Relative RMSE improvement in percent, from the two rounded RMSE values.
previous_rmse = 9.25
current_rmse = 6.17
((previous_rmse - current_rmse) / previous_rmse) * 100

33.2972972972973