
---
## 1  Import Dependencies

In [2]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
import joblib, os

## 2  Load and Merge Data

In [3]:
# Use relative paths so it works in Kaggle environment or locally
data_path = "../data/"
train       = pd.read_csv(os.path.join(data_path, "train.csv"))
test        = pd.read_csv(os.path.join(data_path, "test.csv"))
building    = pd.read_csv(os.path.join(data_path, "building_metadata.csv"))
weather_tr  = pd.read_csv(os.path.join(data_path, "weather_train.csv"))
weather_te  = pd.read_csv(os.path.join(data_path, "weather_test.csv"))

# Keep electricity only
train = train[train['meter'] == 0]

In [4]:
def merge_weather(df, weather):
    df = df.merge(building, on="building_id", how="left")
    df = df.merge(weather, on=["site_id", "timestamp"], how="left")
    return df

train_full = merge_weather(train, weather_tr)
test_full  = merge_weather(test,  weather_te)

## 3  Feature Engineering & Cleaning

In [5]:
for df in [train_full, test_full]:
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['month']       = df['timestamp'].dt.month
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['is_weekend']  = df['day_of_week'].isin([5,6]).astype(int)
    # Clean weather anomalies
    df['precip_depth_1_hr'] = df['precip_depth_1_hr'].clip(lower=0).fillna(0)
    df['air_temperature']   = df['air_temperature'].fillna(df['air_temperature'].median())
    df['dew_temperature']   = df['dew_temperature'].fillna(df['dew_temperature'].median())

## 4  Aggregate to Daily (level matching training)

In [8]:
def aggregate_daily(df):
    df['date'] = df['timestamp'].dt.date

    # Base aggregation for weather variables
    agg_dict = {
        'air_temperature': 'mean',
        'dew_temperature': 'mean',
        'wind_speed': 'mean',
        'cloud_coverage': 'mean',
        'precip_depth_1_hr': 'sum',
        'sea_level_pressure': 'mean'
    }

    # Add target only if present (training set)
    if 'meter_reading' in df.columns:
        agg_dict['meter_reading'] = 'mean'

    daily = (
        df.groupby(['building_id', 'site_id', 'primary_use',
                    'square_feet', 'year_built', 'floor_count',
                    'month', 'day_of_week', 'is_weekend', 'date'])
          .agg(agg_dict)
          .reset_index()
    )
    return daily

In [9]:
train_d = aggregate_daily(train_full)
test_d  = aggregate_daily(test_full)

## 5  Feature Lists and Preprocessor

In [10]:
num_features = ['square_feet','year_built','floor_count','air_temperature',
                'dew_temperature','wind_speed','cloud_coverage',
                'precip_depth_1_hr','sea_level_pressure',
                'month','day_of_week','is_weekend']
cat_features = ['primary_use','site_id']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features)
])

In [12]:
# === FINAL CLEANING SAFETY NET ===
import numpy as np

def clean_numeric(df):
    num_cols = ['square_feet','year_built','floor_count',
                'air_temperature','dew_temperature',
                'wind_speed','cloud_coverage',
                'precip_depth_1_hr','sea_level_pressure',
                'month','day_of_week','is_weekend']
    
    # Replace non‑finite values with NaN
    df[num_cols] = df[num_cols].replace([np.inf, -np.inf], np.nan)

    # Specific fixes
    df['precip_depth_1_hr'] = df['precip_depth_1_hr'].clip(lower=0)
    df['air_temperature']   = df['air_temperature'].clip(lower=-50, upper=60)
    df['dew_temperature']   = df['dew_temperature'].clip(lower=-50, upper=60)
    df['sea_level_pressure']= df['sea_level_pressure'].clip(lower=800, upper=1100)
    df['wind_speed']        = df['wind_speed'].clip(lower=0, upper=100)
    df['cloud_coverage']    = df['cloud_coverage'].clip(lower=0, upper=10)
    
    # Fill residual NaNs with column median
    df[num_cols] = df[num_cols].apply(lambda c: c.fillna(c.median()))

    return df

train_d = clean_numeric(train_d)
test_d  = clean_numeric(test_d)

print("NaN counts after cleaning:")
print(train_d.isna().sum()[train_d.isna().sum() > 0])

NaN counts after cleaning:
Series([], dtype: int64)


## 6  Train Final Model (using Cleaned Training Data)

In [14]:
import os, joblib

os.makedirs("../outputs", exist_ok=True)   # create folder if it doesn't exist
joblib.dump(poly_enet, "../outputs/final_elasticnet_poly.pkl")
print("Model trained and saved ✓ → ../outputs/final_elasticnet_poly.pkl")

Model trained and saved ✓ → ../outputs/final_elasticnet_poly.pkl


In [15]:
X_train = train_d.drop(columns=['meter_reading','date'])
y_train = np.log1p(train_d['meter_reading'].clip(lower=0))  # log1p stabilization

poly_enet = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('model', ElasticNet(alpha=0.001, l1_ratio=0.2, max_iter=20000, random_state=42))
])

poly_enet.fit(X_train, y_train)
joblib.dump(poly_enet, "../outputs/final_elasticnet_poly.pkl")
print("Model trained and saved ✓")

Model trained and saved ✓


## 7  Inference on Kaggle Test Set

In [16]:
X_test = test_d.drop(columns=['date'])
pred_log = poly_enet.predict(X_test)
pred_kwh = np.expm1(pred_log)
pred_kwh = np.clip(pred_kwh, 0, None)

## 8  Building Submission File

In [17]:
submission = pd.DataFrame({
    "row_id": test_d.index,
    "meter_reading": pred_kwh
})
submission.to_csv("../outputs/submission.csv", index=False)
print("submission.csv created:", submission.shape)

submission.csv created: (168630, 2)
