In [83]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
##########################################################################
# Important Note: This notebook only uses one csv file to test the model #
##########################################################################

In [7]:
# assign processed data to data_files
cwd = os.getcwd()
data_dir = os.path.join(cwd, "data/")
data_files = [f for f in os.listdir(str(data_dir)) if f.endswith('csv')]

data = pd.read_csv(os.path.join(data_dir,data_files[0]))
data

Unnamed: 0,id,tdrift,tdrift50,tdrift10,rea,dcr,peakindex,peakvalue,tailslope,currentamp,lfpr,lq80,areagrowthrate,inflection point,risingedgeslope,energylabel,highavse,lowavse,truedcr,lq
0,2720098,72.927,36.5,7.3,-0.076426,6.881667e+05,1040,1557.0,-0.133265,0.004676,0.019066,185989.0,-649642.5,335,25.145317,584.125035,True,True,True,False
1,2720099,87.912,44.0,8.8,0.859192,2.534012e+06,1058,5468.0,-0.515260,0.006334,0.019731,246783.0,-2508675.0,334,73.832579,2141.979713,True,True,True,False
2,2720100,100.899,50.5,10.1,0.687139,2.643202e+05,1043,635.0,-0.051026,0.006306,0.019065,112261.0,-244314.0,333,6.920641,218.964101,True,True,True,True
3,2720101,107.892,54.0,10.8,1.122271,2.577911e+06,1078,5645.0,-0.533495,0.007364,0.018889,255408.0,-2588360.5,359,57.156349,2269.136956,True,True,True,True
4,2720102,126.873,63.5,12.7,1.073198,3.399610e+05,1068,805.0,-0.066449,0.005428,0.019221,134760.0,-307752.5,274,6.153707,261.949510,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64995,2785093,110.889,55.5,11.1,0.491273,2.915576e+05,1051,653.0,-0.055625,0.005289,0.018264,81304.0,-265233.0,288,7.327922,238.472881,True,True,True,False
64996,2785094,92.907,46.5,9.3,0.517614,5.559312e+05,1045,1190.0,-0.108918,0.007148,0.017511,96335.0,-513901.0,362,16.491451,452.840234,True,True,True,True
64997,2785095,124.875,62.5,12.5,0.589107,4.068374e+05,1073,933.0,-0.079957,0.005937,0.017913,131195.0,-373728.0,272,8.956922,344.740556,True,True,True,True
64998,2785096,115.884,58.0,11.6,1.558933,2.052404e+05,1041,457.0,-0.038741,0.006517,0.024807,52616.0,-184091.5,277,3.217168,163.807547,True,True,True,True


In [31]:
# Drop columns needed for classification group
boolean_col = ['highavse','lowavse','truedcr','lq']
data_filtered = data.drop(columns=boolean_col)

X = data.drop(columns=['id','energylabel']+boolean_col)
y = data['energylabel']
y

0         584.125035
1        2141.979713
2         218.964101
3        2269.136956
4         261.949510
            ...     
64995     238.472881
64996     452.840234
64997     344.740556
64998     163.807547
64999     213.868023
Name: energylabel, Length: 65000, dtype: float64

In [93]:
# Train test split + standardization
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.transform(X_test)

# GridCV to find best lambda
alpha_range = np.logspace(-5, 2.5, 100)
param_grid = {"alpha": alpha_range}
ridge = Ridge()
grid_search = GridSearchCV(estimator=ridge, param_grid={"alpha": alpha_range}, cv=5, scoring='neg_mean_squared_error')

# Fit the model
grid_search.fit(X_train_standardized, y_train)

best_alpha = grid_search.best_params_['alpha']
print(f"lambda: {best_alpha}")

# Train the Ridge Regression model with the best alpha
ridge_reg = Ridge(alpha=best_alpha)
ridge_reg.fit(X_train_standardized,y_train)
y_pred = ridge_reg.predict(X_test_standardized)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2= r2_score(y_test, y_pred)

lambda: 6.812920690579622


In [94]:
mse,r2

(4616.175743128149, 0.9896050423106761)

In [78]:
np.mean(abs(y_pred - y_test))

18.33795123925416

In [89]:
np.logspace(-5, 2.5, 100)

array([1.00000000e-05, 1.19057724e-05, 1.41747416e-05, 1.68761248e-05,
       2.00923300e-05, 2.39214708e-05, 2.84803587e-05, 3.39080668e-05,
       4.03701726e-05, 4.80638086e-05, 5.72236766e-05, 6.81292069e-05,
       8.11130831e-05, 9.65713905e-05, 1.14975700e-04, 1.36887451e-04,
       1.62975083e-04, 1.94034425e-04, 2.31012970e-04, 2.75038784e-04,
       3.27454916e-04, 3.89860370e-04, 4.64158883e-04, 5.52617002e-04,
       6.57933225e-04, 7.83320322e-04, 9.32603347e-04, 1.11033632e-03,
       1.32194115e-03, 1.57387304e-03, 1.87381742e-03, 2.23092437e-03,
       2.65608778e-03, 3.16227766e-03, 3.76493581e-03, 4.48244688e-03,
       5.33669923e-03, 6.35375264e-03, 7.56463328e-03, 9.00628020e-03,
       1.07226722e-02, 1.27661695e-02, 1.51991108e-02, 1.80957154e-02,
       2.15443469e-02, 2.56502091e-02, 3.05385551e-02, 3.63585086e-02,
       4.32876128e-02, 5.15372466e-02, 6.13590727e-02, 7.30527154e-02,
       8.69749003e-02, 1.03550337e-01, 1.23284674e-01, 1.46779927e-01,
      