In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
###########################################################################
# Important Note: this notebook uses only one TRAIN and one TEST CSV file #
# (the first match of each in data/) to fit and test the model.           #
###########################################################################

In [32]:
# Locate the processed CSV files under ./data (relative to the working directory).
cwd = os.getcwd()
data_dir = os.path.join(cwd, "data/")

# os.listdir returns entries in arbitrary order; sort the names so that the
# file selected by index 0 below is the same on every machine and every run.
data_files = sorted(f for f in os.listdir(str(data_dir)) if f.endswith('csv'))

data_train_name = [f for f in data_files if 'TRAIN' in f]
data_test_name = [f for f in data_files if 'TEST' in f]

# Fail with an actionable message instead of a bare IndexError when either
# half of the TRAIN/TEST pair is missing.
if not data_train_name or not data_test_name:
    raise FileNotFoundError(
        f"Expected at least one 'TRAIN' and one 'TEST' csv file in {data_dir}; "
        f"found train={data_train_name}, test={data_test_name}"
    )

# Only the first TRAIN file and the first TEST file are loaded.
data_train = pd.read_csv(os.path.join(data_dir, data_train_name[0]))
data_test = pd.read_csv(os.path.join(data_dir, data_test_name[0]))


In [33]:

# Columns reserved for the classification group; they are removed together
# with the 'id' column before regression.
boolean_col = ['highavse','lowavse','truedcr','lq']

# Same three steps for both frames, in this order:
#   1. drop the classification columns and 'id'
#   2. drop every row that still contains a missing value
#   3. drop 'tdrift50' and 'tdrift10' (described by the original author as
#      irrelevant / perfectly collinear features)
data_train_filtered = (
    data_train
    .drop(columns=boolean_col + ['id'])
    .dropna()
    .drop(columns=['tdrift50', 'tdrift10'])
)
data_test_filtered = (
    data_test
    .drop(columns=boolean_col + ['id'])
    .dropna()
    .drop(columns=['tdrift50', 'tdrift10'])
)



In [34]:
# Separate the features from the regression target ('energylabel').
# No random split happens here: the train/test partition is the one given by
# the separate TRAIN and TEST frames.
X_train = data_train_filtered.drop(columns=['energylabel'])
X_test = data_test_filtered.drop(columns=['energylabel'])
y_train = data_train_filtered['energylabel']
y_test = data_test_filtered['energylabel']

# Standardization for the final model: fit on the training set only, then
# apply the same transform to the test set.
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.transform(X_test)

# Grid search (5-fold CV, negative MSE) for the Ridge penalty strength.
# The scaler is placed inside the searched pipeline and the search is run on
# the raw features, so within each fold the scaler is fit on that fold's
# training part only. Searching on pre-standardized data would let every
# fold's validation rows influence the scaling statistics.
alpha_range = np.logspace(-5, 2.5, 100)
param_grid = {"ridge__alpha": alpha_range}
ridge = Ridge()
cv_pipeline = Pipeline([("scaler", StandardScaler()), ("ridge", ridge)])
grid_search = GridSearchCV(estimator=cv_pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the search
grid_search.fit(X_train, y_train)

best_alpha = grid_search.best_params_['ridge__alpha']
print(f"lambda: {best_alpha}")
# NOTE(review): if the selected value sits on an edge of alpha_range, the
# optimum may lie outside the searched interval - consider widening the grid.

# Train the Ridge Regression model with the best alpha on the full training set
ridge_reg = Ridge(alpha=best_alpha)
ridge_reg.fit(X_train_standardized, y_train)
y_pred = ridge_reg.predict(X_test_standardized)

# Evaluate the model on the held-out test frame
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

lambda: 1e-05


In [35]:
# Display the test-set mean squared error and R^2 score as a tuple.
(mse, r2)

(4706.421982582751, 0.9894859088286423)

In [36]:
# Mean absolute error between the test targets and the predictions.
np.abs(y_test - y_pred).mean()

16.143720399693166

In [16]:
# Pairwise correlation matrix of the filtered features and the target.
# NOTE(review): this cell originally referenced `data_filtered`, which no
# visible cell defines, so it raised NameError on a fresh kernel. It is assumed
# to have meant the filtered training frame - confirm.
data_train_filtered.corr()

Unnamed: 0,tdrift,rea,dcr,peakindex,peakvalue,tailslope,currentamp,lfpr,lq80,areagrowthrate,inflection point,risingedgeslope,energylabel
tdrift,1.0,-0.353646,-0.119906,0.966816,-0.104132,0.112163,-0.12991,-0.287289,0.20041,0.119038,0.131842,-0.207463,-0.104336
rea,-0.353646,1.0,0.028737,-0.431403,0.029845,-0.025223,0.245087,0.649042,-0.261714,-0.038562,-0.289455,-0.069715,0.028187
dcr,-0.119906,0.028737,1.0,0.027008,0.997981,-0.996803,0.213819,0.089385,0.684834,-0.998767,-0.040027,0.917027,0.992463
peakindex,0.966816,-0.431403,0.027008,1.0,0.042621,-0.033552,-0.106819,-0.278013,0.317134,-0.027973,0.147367,-0.058978,0.042785
peakvalue,-0.104132,0.029845,0.997981,0.042621,1.0,-0.996964,0.198594,0.091994,0.711452,-0.999103,-0.045428,0.905183,0.994342
tailslope,0.112163,-0.025223,-0.996803,-0.033552,-0.996964,1.0,-0.201432,-0.0878,-0.704431,0.996307,0.03837,-0.902061,-0.991746
currentamp,-0.12991,0.245087,0.213819,-0.106819,0.198594,-0.201432,1.0,0.093334,-0.110651,-0.214353,-0.02607,0.244163,0.202547
lfpr,-0.287289,0.649042,0.089385,-0.278013,0.091994,-0.0878,0.093334,1.0,-0.078781,-0.094215,-0.240215,0.024977,0.092354
lq80,0.20041,-0.261714,0.684834,0.317134,0.711452,-0.704431,-0.110651,-0.078781,1.0,-0.683208,0.038102,0.554716,0.696343
areagrowthrate,0.119038,-0.038562,-0.998767,-0.027973,-0.999103,0.996307,-0.214353,-0.094215,-0.683208,1.0,0.046493,-0.913401,-0.994058
