In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import os

# Load the dataset.
# The location can be overridden with the ENB2012_DATA_PATH environment variable so the
# notebook is runnable on machines other than the one it was written on. When the
# variable is not set, the original path is used unchanged.
DEFAULT_DATA_PATH = "C:\\Users\\ACDC\\Desktop\\techem\\assignment2\\energy+efficiency\\ENB2012_data.xlsx"
file_path = os.environ.get("ENB2012_DATA_PATH", DEFAULT_DATA_PATH)
data = pd.read_excel(file_path)

# Rename columns for easier access.
# The assignment is positional: the sheet must contain exactly these 10 columns in this
# order, otherwise pandas raises on the length mismatch.
data.columns = ['Relative_Compactness', 'Surface_Area', 'Wall_Area', 'Roof_Area', 'Overall_Height',
                'Orientation', 'Glazing_Area', 'Glazing_Area_Distribution', 'Heating_Load', 'Cooling_Load']

# Features and target.
# Both load columns are excluded from the features by name (rather than by position),
# and only Heating_Load is modelled here.
TARGET_COLUMNS = ['Heating_Load', 'Cooling_Load']
X = data.drop(columns=TARGET_COLUMNS)  # the 8 building-geometry / glazing descriptors
y = data['Heating_Load']  # Target variable

# Split the data: 60% train, then the remaining 40% is halved into validation and test
# (20% each). The same seed is used for both splits so the partition is reproducible.
SEED = 42
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=SEED)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=SEED)

# Normalize the data.
# The scaler is fitted on the training split only and then applied to validation/test,
# so no statistics from held-out data enter the preprocessing.
# Note: after this step X_train / X_val / X_test are NumPy arrays, not DataFrames;
# X keeps the column names.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)


In [9]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error

# Candidate regularization strengths to compare
alphas = [0.001, 0.01, 0.1, 1, 10, 100, 200, 300, 400, 500, 600, 700]

# Running record of the best candidate seen so far (nothing selected yet)
best_alpha, best_r2, best_model = None, -float('inf'), None

# Fit one Ridge model per alpha on the training split and score it on the validation split
for alpha in alphas:
    model = Ridge(alpha=alpha).fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    r2 = r2_score(y_val, y_val_pred)

    # Strict '>' means the earliest alpha wins when validation scores tie
    if r2 > best_r2:
        best_r2, best_alpha, best_model = r2, alpha, model

print('\n'.join([
    f'Best alpha: {best_alpha}',
    f'Best R² on validation set: {best_r2}',
]))


Best alpha: 0.001
Best R² on validation set: 0.9050265244755583


In [5]:
# Score the validation-selected model on the held-out test split
y_test_pred = best_model.predict(X_test)
test_r2, test_mse = (
    r2_score(y_test, y_test_pred),
    mean_squared_error(y_test, y_test_pred),
)

# Assemble the report first, then emit it in one go (one line per entry)
test_report = [
    f'R² on test set: {test_r2}',
    f'Mean Squared Error on test set: {test_mse}',
    f'Coefficients: {best_model.coef_}',
    f'Intercept: {best_model.intercept_}',
]
print('\n'.join(test_report))


R² on test set: 0.9196159466847389
Mean Squared Error on test set: 7.899248058517873
Coefficients: [-6.12498964 -3.32218905  0.86842943 -3.63565183  7.32739826 -0.01991889
  2.70237342  0.37642244]
Intercept: 22.079978260869552


In [7]:
import numpy as np

# Number of strongest predictors kept in the reduced model.
N_TOP_FEATURES = 5
# A value of 0 would make the [-0:] slice below silently select *every* feature.
assert 1 <= N_TOP_FEATURES <= best_model.coef_.size, "N_TOP_FEATURES out of range"

# Get the top features based on absolute coefficient values of the full model.
# np.argsort sorts ascending, so the selected indices run from the weakest to the
# strongest of the kept features.
top_features_indices = np.argsort(np.abs(best_model.coef_))[-N_TOP_FEATURES:]
top_features = X.columns[top_features_indices]

# Report which columns were kept: the reduced model's coefficients printed below follow
# this same order, not the original column order.
print(f'Top {N_TOP_FEATURES} features (ascending |coefficient|): {list(top_features)}')

# Train a new model using only the top features.
# X_train / X_val / X_test are the scaled NumPy arrays, hence positional column indexing.
X_train_top = X_train[:, top_features_indices]
X_val_top = X_val[:, top_features_indices]
X_test_top = X_test[:, top_features_indices]

# The alpha chosen for the full model is reused as-is; it is not re-tuned for the
# reduced feature set.
best_model_top = Ridge(alpha=best_alpha)
best_model_top.fit(X_train_top, y_train)

# Evaluate the new model on the test split
y_test_pred_top = best_model_top.predict(X_test_top)
test_r2_top = r2_score(y_test, y_test_pred_top)
test_mse_top = mean_squared_error(y_test, y_test_pred_top)

print(f'R² on test set with top features: {test_r2_top}')
print(f'Mean Squared Error on test set with top features: {test_mse_top}')
print(f'Coefficients with top features: {best_model_top.coef_}')
print(f'Intercept with top features: {best_model_top.intercept_}')


R² on test set with top features: 0.9194835828284909
Mean Squared Error on test set with top features: 7.912255301762805
Coefficients with top features: [ 2.77232889 -1.61237938 -5.42091749 -6.14063946  7.32163951]
Intercept with top features: 22.079978260869552
