# Model Development

This notebook trains and compares several machine-learning regression models for predicting energy consumption.

In [None]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# The project package lives one directory up; extend the path before importing it
sys.path.append('../')
from src.data_processing import load_data, process_data

In [None]:
# Load the raw CSV and pass it through the project's processing step
DATA_PATH = '../data/energy_consumption.csv'
raw_data = load_data(DATA_PATH)
df = process_data(raw_data)
print(f"Processed data shape: {df.shape}")
# Last expression of the cell, so the notebook renders the first rows
df.head()

In [None]:
# Create features for modeling
def create_features(df):
    """Return a copy of ``df`` extended with rolling, lag and season features.

    Every consumption-derived feature of a row is built only from *earlier*
    rows, so no feature contains the row's own ``Consumption`` value (the
    prediction target in this notebook).

    Columns added:
      * ``ma_{w}`` / ``std_{w}`` for w in (3, 6): mean and sample standard
        deviation of the ``w`` rows preceding the current row.
      * ``lag_{k}`` for k in (1, 2, 3): ``Consumption`` from ``k`` rows back.
      * ``is_winter`` / ``is_summer``: 1 when ``Month`` is in (12, 1, 2) /
        (6, 7, 8) respectively, otherwise 0.

    A rolling or lag column is created only when ``df`` has more rows than
    the window/lag, i.e. when at least one row can hold a non-NaN value.
    Rows without enough history hold NaN in those columns; callers are
    expected to drop or impute them.

    NOTE(review): the shift/rolling logic is positional, so it assumes the
    rows are already in chronological order -- confirm against the loader.

    Args:
        df: DataFrame providing ``Consumption`` and ``Month`` columns.
            It is not modified.

    Returns:
        A new DataFrame with the original columns plus the features above.

    Raises:
        KeyError: if ``Consumption`` or ``Month`` is missing from ``df``.
    """
    features_df = df.copy()
    n_rows = len(df)

    # Shift by one row so every rolling window ends at the previous
    # observation; rolling over the unshifted series would fold the current
    # target value into its own features.
    past_consumption = features_df['Consumption'].shift(1)

    # Moving averages / volatility over the preceding rows
    for window in [3, 6]:
        if n_rows > window:
            trailing = past_consumption.rolling(window=window)
            features_df[f'ma_{window}'] = trailing.mean()
            features_df[f'std_{window}'] = trailing.std()

    # Lag features
    for lag in [1, 2, 3]:
        if n_rows > lag:
            features_df[f'lag_{lag}'] = features_df['Consumption'].shift(lag)

    # Season dummies
    features_df['is_winter'] = features_df['Month'].isin([12, 1, 2]).astype(int)
    features_df['is_summer'] = features_df['Month'].isin([6, 7, 8]).astype(int)

    return features_df

# Build the feature table, then discard every row that still contains a NaN
features_df = create_features(df).dropna()
print(f"Features shape after dropna: {features_df.shape}")

In [None]:
# Prepare training data
# Columns kept out of the model inputs; 'Consumption' is the target itself
exclude_cols = ['Date', 'Consumption', 'Reading', 'Cost']

# Every remaining column, in its original order, becomes a feature
is_feature = ~features_df.columns.isin(exclude_cols)
feature_cols = features_df.columns[is_feature].tolist()
print(f"Feature columns: {feature_cols}")

# Plain NumPy arrays for scikit-learn
X = features_df.loc[:, feature_cols].values
y = features_df.loc[:, 'Consumption'].values

print(f"X shape: {X.shape}, y shape: {y.shape}")

In [None]:
# Split data
# shuffle=False keeps the row order: leading rows train, trailing rows test
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)

# Scale features: learn mean/variance on the training rows only, then apply
# the same transformation to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Train set: {X_train_scaled.shape}, Test set: {X_test_scaled.shape}")

In [None]:
# Test different models
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'Linear Regression': LinearRegression()
}

# Forward-chaining folds: every validation fold lies strictly after the rows
# it is trained on. Plain K-fold (cv=3) would validate the early folds with
# models trained on later rows, which is inconsistent with the chronological
# (shuffle=False) hold-out split and flatters the CV score.
# TimeSeriesSplit needs more training rows than n_splits.
cv_splitter = TimeSeriesSplit(n_splits=3)

results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Train on the full training partition
    model.fit(X_train_scaled, y_train)

    # Predictions on the held-out rows
    y_pred = model.predict(X_test_scaled)

    # Hold-out metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    # Cross validation on the training partition only; cross_val_score fits
    # clones of the estimator, so `model` keeps the fit from above
    cv_scores = cross_val_score(
        model, X_train_scaled, y_train, cv=cv_splitter, scoring='r2'
    )
    cv_r2 = cv_scores.mean()

    results[name] = {
        'R²': r2,
        'RMSE': rmse,
        'MAE': mae,
        'CV R²': cv_r2
    }

    print(f"R²: {r2:.3f}, RMSE: {rmse:.2f}, MAE: {mae:.2f}, CV R²: {cv_r2:.3f}")

# Results summary: one row per model, one column per metric
results_df = pd.DataFrame(results).T
print("\nModel comparison:")
print(results_df.round(3))

In [None]:
# Plot model comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One panel per metric: (results column, title, y-axis label, extra bar kwargs)
panels = [
    ('R²', 'R² Score Comparison', 'R² Score', {}),
    ('RMSE', 'RMSE Comparison', 'RMSE (kWh)', {'color': 'orange'}),
]

for ax, (metric, title, ylabel, bar_kwargs) in zip(axes, panels):
    ax.bar(results_df.index, results_df[metric], **bar_kwargs)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    # Rotate the model names on the x axis by 45 degrees
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Feature importance for best tree-based model
# Overall winner by cross-validated R²
best_model_name = results_df['CV R²'].idxmax()
best_model = models[best_model_name]

print(f"Best model: {best_model_name}")

# The overall winner may not expose importances (LinearRegression has no
# feature_importances_), so rank only the fitted models that do. When the
# overall winner is one of them, it is also the one selected here.
importance_candidates = [
    name for name in results_df.index
    if hasattr(models[name], 'feature_importances_')
]

if importance_candidates:
    importance_model_name = results_df.loc[importance_candidates, 'CV R²'].idxmax()
    importance_model = models[importance_model_name]

    importance_df = pd.DataFrame({
        'feature': feature_cols,
        'importance': importance_model.feature_importances_
    }).sort_values('importance', ascending=False)
    top_features = importance_df.head(10)

    plt.figure(figsize=(10, 6))
    plt.barh(top_features['feature'], top_features['importance'])
    plt.title(f'Top 10 Feature Importances - {importance_model_name}')
    plt.xlabel('Importance')
    # Largest importance at the top of the chart
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    print("\nTop features:")
    print(top_features)
else:
    # Say so explicitly instead of leaving the cell output empty
    print("\nNo fitted model exposes feature_importances_; skipping importance plot.")

In [None]:
# Test predictions vs actual
# Fit the selected model on the training partition and score the held-out rows
best_model.fit(X_train_scaled, y_train)
y_pred_best = best_model.predict(X_test_scaled)

# End points of the "perfect prediction" diagonal
actual_min = y_test.min()
actual_max = y_test.max()

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_best, alpha=0.7)
plt.plot([actual_min, actual_max], [actual_min, actual_max], 'r--', lw=2)
plt.xlabel('Actual Consumption (kWh)')
plt.ylabel('Predicted Consumption (kWh)')
plt.title(f'Predictions vs Actual - {best_model_name}')
plt.show()

final_r2 = r2_score(y_test, y_pred_best)
print(f"Best model final R²: {final_r2:.3f}")