**Data Preparation**

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = "C:\\Users\\ACDC\\Desktop\\techem\\assignment2\\energy+efficiency\\ENB2012_data.xlsx"
data = pd.read_excel(file_path)

data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


In [17]:
data.columns= ["Relative Compactness", "Surface Area", "Wall Area", "Roof Area", "Overall Height", "Orientation", 
               "Glazing Area", "Glazing Area Distribution", "Heating Load", "Cooling Load"]
data.columns

Index(['Relative Compactness', 'Surface Area', 'Wall Area', 'Roof Area',
       'Overall Height', 'Orientation', 'Glazing Area',
       'Glazing Area Distribution', 'Heating Load', 'Cooling Load'],
      dtype='object')

In [19]:
data.head()

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


In [34]:
# Define features and target
X = data.iloc[:, :-2]  # Features (X1 to X8)
y = data.iloc[:, -2]   # Target (Y1 - Heating Load)

# Split the data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

**Model Training**

In [36]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

# Ridge regression with different regularization parameters
alphas = [0.1, 1, 10, 100, 200, 300, 400, 500, 600, 700]
best_alpha = None
best_r2 = -float('inf')

for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    y_val_pred = ridge.predict(X_val)
    r2 = r2_score(y_val, y_val_pred)
    if r2 > best_r2:
        best_r2 = r2
        best_alpha = alpha

print(f"Best alpha: {best_alpha}")


Best alpha: 0.1


**Model Evaluation**

In [38]:
# Final model evaluation
ridge = Ridge(alpha=best_alpha)
ridge.fit(X_train, y_train)
y_train_pred = ridge.predict(X_train)
y_test_pred = ridge.predict(X_test)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Training R2: {train_r2}")
print(f"Test R2: {test_r2}")
print(f"Coefficients: {ridge.coef_}")
print(f"Intercept: {ridge.intercept_}")


Training R2: 0.9181823495656721
Test R2: 0.9196159466847389
Coefficients: [-6.12498964 -3.32218905  0.86842943 -3.63565183  7.32739826 -0.01991889
  2.70237342  0.37642244]
Intercept: 22.079978260869552


**Feature Analysis**

In [42]:
# Feature importance
feature_importance = pd.Series(ridge.coef_, index=data.columns[:-2]).sort_values(ascending=False)
print("Top 5 features:")
print(feature_importance.head(5))

# Get the indices of the top 5 features
top_features = feature_importance.head(5).index
top_feature_indices = [data.columns.get_loc(f) for f in top_features]

# Select top 5 features from the training, validation, and test sets
X_train_top = X_train[:, top_feature_indices]
X_val_top = X_val[:, top_feature_indices]
X_test_top = X_test[:, top_feature_indices]

# Train and evaluate with top 5 features
ridge_top = Ridge(alpha=best_alpha)
ridge_top.fit(X_train_top, y_train)
y_train_pred_top = ridge_top.predict(X_train_top)
y_test_pred_top = ridge_top.predict(X_test_top)

train_r2_top = r2_score(y_train, y_train_pred_top)
test_r2_top = r2_score(y_test, y_test_pred_top)

print(f"Training R2 with top 5 features: {train_r2_top}")
print(f"Test R2 with top 5 features: {test_r2_top}")

Top 5 features:
X5    7.327398
X7    2.702373
X3    0.868429
X8    0.376422
X6   -0.019919
dtype: float64
Training R2 with top 5 features: 0.9135146946652771
Test R2 with top 5 features: 0.9169417857482781
