# **Load and preprocess the data**
**1. Split the samples into 60% training, 20% validation, and 20% testing data at random**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the ENB2012 workbook on the local machine
file_path = "C:\\Users\\ACDC\\Desktop\\techem\\assignment2\\energy+efficiency\\ENB2012_data.xlsx"

# Load the workbook into a DataFrame
df = pd.read_excel(file_path)

# Inputs are the eight X columns; the regression target is Y1
feature_columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']
X = df[feature_columns]
y = df['Y1']

# Stage 1: hold out 40% of the rows, leaving 60% for training
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Stage 2: halve the hold-out into validation and test sets
# (each ends up with 20% of all rows)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

**2. Using Ridge regularization, set your own 10 different choices of regularization parameters and find the best choice, i.e. the one that gives the highest accuracy on the validation data (based on $R^2$).**

In [22]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

# Candidate regularization strengths (alpha), one per decade from 0.01 to 1e7
alpha_values = [0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]

# Best alpha found so far and its validation R2.
# R2 has no lower bound, so the running maximum starts at -inf rather than -1;
# with -1, a search in which every candidate scores <= -1 would leave
# best_alpha as None.
best_alpha = None
best_r2 = float("-inf")

# Fit one Ridge model per alpha on the training split and score it on validation
for alpha in alpha_values:
    model = Ridge(alpha=alpha)
    model.fit(X_train, y_train)

    # Validation predictions and their R2 for this alpha
    y_val_pred = model.predict(X_val)
    r2 = r2_score(y_val, y_val_pred)

    # Strict '>' keeps the first (smallest) alpha when two candidates tie
    if r2 > best_r2:
        best_alpha = alpha
        best_r2 = r2

# Report the winning alpha and its validation R2
print("Best alpha:", best_alpha)
print("Best R2:", best_r2)

Best alpha: 0.01
Best R2: 0.9043592533272623


**3. Make one final evaluation on the test data**

In [26]:
import numpy as np
# Instantiate the final estimator with the alpha chosen on the validation set,
# so this cell runs on a fresh kernel instead of relying on leftover state.
best_model = Ridge(alpha=best_alpha)

# Merge the training and validation rows for the final fit.
# pd.concat (unlike np.vstack) keeps the DataFrame column names, so the model
# is fitted with the same feature names that X_test carries at predict time.
combined_data = pd.concat([X_train, X_val])
combined_target = pd.concat([y_train, y_val])

# Train the model on the combined data
best_model.fit(combined_data, combined_target)

In [30]:
# Score the refitted model once on the held-out test split
y_test_pred = best_model.predict(X_test)

# R2 of the test predictions against the true test targets
test_r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

# Report the test-set R2
print("Test R2:", test_r2)

Test R2: 0.9191767580009695


**4. What are the best model’s coefficients, intercept, and its training, validation, and test accuracy?**

In [38]:
# Pull the fitted linear parameters off the final model:
# one coefficient per input column, plus the intercept
coefficients, intercept = best_model.coef_, best_model.intercept_

# Show both
print("Coefficients:", coefficients)
print("Intercept:", intercept)

Coefficients: [-5.78627250e+01 -5.16282621e-02  3.32642807e-02 -4.24463080e-02
  4.51399134e+00 -5.16521530e-03  2.00450831e+01  2.20995684e-01]
Intercept: 69.07137182022353


In [40]:
from sklearn.metrics import mean_squared_error

def _mse_and_rmse(features, target):
    """Return (MSE, RMSE) of best_model's predictions for one data split."""
    mse = mean_squared_error(target, best_model.predict(features))
    return mse, np.sqrt(mse)


# Error metrics for each split, computed in the order train -> validation -> test
train_mse, train_rmse = _mse_and_rmse(X_train, y_train)
val_mse, val_rmse = _mse_and_rmse(X_val, y_val)
test_mse, test_rmse = _mse_and_rmse(X_test, y_test)

# Print one line per metric
for label, value in (
    ("Training MSE:", train_mse),
    ("Training RMSE:", train_rmse),
    ("Validation MSE:", val_mse),
    ("Validation RMSE:", val_rmse),
    ("Test MSE:", test_mse),
    ("Test RMSE:", test_rmse),
):
    print(label, value)

Training MSE: 8.28207197201746
Training RMSE: 2.8778589214931056
Validation MSE: 9.884963771198715
Validation RMSE: 3.1440362229463443
Test MSE: 7.9424066231150245
Test RMSE: 2.81822756765933


**5. What are the top 5 features among X1 to X8?**

In [45]:
# Rank features by the magnitude of their fitted coefficient.
# NOTE(review): the inputs are not standardized anywhere in this notebook, so
# raw coefficient size also reflects each feature's scale — treat the ranking
# with care.
abs_coefficients = np.abs(coefficients)

# Positions of the five largest magnitudes, largest first
top_5_indices = abs_coefficients.argsort()[-5:][::-1]

# Map positions to names via X, the feature matrix the coefficients belong to.
# df also holds the target column, so indexing df.columns would only line up
# while the features happen to be the leading columns of the sheet.
top_5_features = X.columns[top_5_indices]

# Print the top 5 features
print("Top 5 features:", top_5_features)

Top 5 features: Index(['X1', 'X7', 'X5', 'X8', 'X2'], dtype='object')


**6. If you repeat the procedure above using only the 5 top features, what are the results?**

In [56]:
# Restrict every split to the five selected feature columns
X_train_top_5 = X_train[top_5_features]
X_val_top_5 = X_val[top_5_features]
X_test_top_5 = X_test[top_5_features]

# Repeat the validation search over the same alpha grid, now on the reduced
# feature set. Start from -inf because R2 has no lower bound.
best_alpha_top_5 = None
best_val_r2_top_5 = float("-inf")
for candidate_alpha in alpha_values:
    candidate_model = Ridge(alpha=candidate_alpha)
    candidate_model.fit(X_train_top_5, y_train)
    candidate_r2 = r2_score(y_val, candidate_model.predict(X_val_top_5))
    if candidate_r2 > best_val_r2_top_5:
        best_alpha_top_5 = candidate_alpha
        best_val_r2_top_5 = candidate_r2

# Final estimator for the reduced feature set, created here so the cell does
# not depend on leftover kernel state
best_model_top_5 = Ridge(alpha=best_alpha_top_5)

# Refit on training + validation rows before the single test evaluation
combined_data_top_5 = pd.concat([X_train_top_5, X_val_top_5], ignore_index=True)
combined_target_top_5 = pd.concat([y_train, y_val], ignore_index=True)

best_model_top_5.fit(combined_data_top_5, combined_target_top_5)

# Make predictions on the test data
y_test_pred_top_5 = best_model_top_5.predict(X_test_top_5)

# Calculate the R2 score on the test data
test_r2_top_5 = r2_score(y_test, y_test_pred_top_5)

# Print the test R2 score
print("Test R2 with top 5 features:", test_r2_top_5)

Test R2 with top 5 features: 0.9066612953612943
