<a href="https://colab.research.google.com/github/carlos-alves-one/-Energy-Comp/blob/main/enefit_project_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Libraries and Packages

In [1]:
import os
import gc
import pickle

import numpy as np
import pandas as pd
import polars as pl

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  # or RandomForestClassifier for classification
from sklearn.metrics import mean_absolute_error     # Import MAE instead of MSE
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb

!pip install optuna

import optuna


Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.0-py3-none-any.whl (230 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.6/230.6 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.13.0 colorlog-6.8.0 optuna-3.5.0


# Mount Google Drive

In [2]:
# Imports the 'drive' module from 'google.colab' and mounts the Google Drive to
# the '/content/drive' directory in the Colab environment.
from google.colab import drive

# This function mounts Google Drive
def mount_google_drive():
    drive.mount('/content/drive')

# Call the function to mount Google Drive
mount_google_drive()


Mounted at /content/drive


# Load the Data

In [3]:
# root = "/kaggle/input/predict-energy-behavior-of-prosumers"
root = "/content/drive/MyDrive/project_energy"

data_cols        = ['target', 'county', 'is_business', 'product_type', 'is_consumption', 'datetime', 'row_id']
client_cols      = ['product_type', 'county', 'eic_count', 'installed_capacity', 'is_business', 'date']
gas_cols         = ['forecast_date', 'lowest_price_per_mwh', 'highest_price_per_mwh']
electricity_cols = ['forecast_date', 'euros_per_mwh']
forecast_cols    = ['latitude', 'longitude', 'hours_ahead', 'temperature', 'dewpoint', 'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total', '10_metre_u_wind_component', '10_metre_v_wind_component', 'forecast_datetime', 'direct_solar_radiation', 'surface_solar_radiation_downwards', 'snowfall', 'total_precipitation']
historical_cols  = ['datetime', 'temperature', 'dewpoint', 'rain', 'snowfall', 'surface_pressure','cloudcover_total','cloudcover_low','cloudcover_mid','cloudcover_high','windspeed_10m','winddirection_10m','shortwave_radiation','direct_solar_radiation','diffuse_radiation','latitude','longitude']
location_cols    = ['longitude', 'latitude', 'county']
target_cols      = ['target', 'county', 'is_business', 'product_type', 'is_consumption', 'datetime']

save_path = None
load_path = None

df_data        = pl.read_csv(os.path.join(root, "train.csv"), columns=data_cols, try_parse_dates=True)
df_client      = pl.read_csv(os.path.join(root, "client.csv"), columns=client_cols, try_parse_dates=True)
df_gas         = pl.read_csv(os.path.join(root, "gas_prices.csv"), columns=gas_cols, try_parse_dates=True)
df_electricity = pl.read_csv(os.path.join(root, "electricity_prices.csv"), columns=electricity_cols, try_parse_dates=True)
df_forecast    = pl.read_csv(os.path.join(root, "forecast_weather.csv"), columns=forecast_cols, try_parse_dates=True)
df_historical  = pl.read_csv(os.path.join(root, "historical_weather.csv"), columns=historical_cols, try_parse_dates=True)
df_location    = pl.read_csv(os.path.join(root, "weather_station_to_county_mapping.csv"), columns=location_cols, try_parse_dates=True)
df_target      = df_data.select(target_cols)

schema_data        = df_data.schema
schema_client      = df_client.schema
schema_gas         = df_gas.schema
schema_electricity = df_electricity.schema
schema_forecast    = df_forecast.schema
schema_historical  = df_historical.schema
schema_target      = df_target.schema


#Feature Engineering


In [4]:
# Columns specifications
data_cols = ['target', 'county', 'is_business', 'product_type', 'is_consumption', 'datetime', 'row_id']
client_cols = ['product_type', 'county', 'eic_count', 'installed_capacity', 'is_business', 'date']
gas_cols = ['forecast_date', 'lowest_price_per_mwh', 'highest_price_per_mwh']
electricity_cols = ['forecast_date', 'euros_per_mwh']
forecast_cols = ['latitude', 'longitude', 'hours_ahead', 'temperature', 'dewpoint', 'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total', '10_metre_u_wind_component', '10_metre_v_wind_component', 'forecast_datetime', 'direct_solar_radiation', 'surface_solar_radiation_downwards', 'snowfall', 'total_precipitation']
historical_cols = ['datetime', 'temperature', 'dewpoint', 'rain', 'snowfall', 'surface_pressure', 'cloudcover_total', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_high', 'windspeed_10m', 'winddirection_10m', 'shortwave_radiation', 'direct_solar_radiation', 'diffuse_radiation', 'latitude', 'longitude']
location_cols = ['longitude', 'latitude', 'county']

# Helper function to read large CSV files directly into Polars DataFrames
def read_csv_polars(file_path, columns, batch_size=5 * 10 ** 5):  # Adjust batch_size as needed
    return pl.read_csv(file_path, columns=columns, batch_size=batch_size)

# Reading CSV files directly into Polars DataFrames
df_data = read_csv_polars(os.path.join(root, "train.csv"), data_cols)
df_client = read_csv_polars(os.path.join(root, "client.csv"), client_cols)
df_gas = read_csv_polars(os.path.join(root, "gas_prices.csv"), gas_cols)
df_electricity = read_csv_polars(os.path.join(root, "electricity_prices.csv"), electricity_cols)
df_forecast = read_csv_polars(os.path.join(root, "forecast_weather.csv"), forecast_cols)
df_historical = read_csv_polars(os.path.join(root, "historical_weather.csv"), historical_cols)
df_location = read_csv_polars(os.path.join(root, "weather_station_to_county_mapping.csv"), location_cols)

# Define a date format string compatible with your data
date_format = "%Y-%m-%d %H:%M:%S"  # Adjust the format according to your data

# Convert 'datetime' columns to datetime format in all dataframes
df_data = df_data.with_columns([pl.col('datetime').str.strptime(pl.Datetime, date_format, strict=False)])
df_client = df_client.with_columns([pl.col('date').str.strptime(pl.Datetime, date_format, strict=False)])
df_gas = df_gas.with_columns([pl.col('forecast_date').str.strptime(pl.Datetime, date_format, strict=False)])
df_electricity = df_electricity.with_columns([pl.col('forecast_date').str.strptime(pl.Datetime, date_format, strict=False)])
df_forecast = df_forecast.with_columns([pl.col('forecast_datetime').str.strptime(pl.Datetime, date_format, strict=False)])
df_historical = df_historical.with_columns([pl.col('datetime').str.strptime(pl.Datetime, date_format, strict=False)])

# Standardize datetime precision to microseconds
# This function is simplified assuming your datetime data is already timezone-naive
def standardize_datetime_precision(df, datetime_columns):
    for col in datetime_columns:
        df = df.with_columns([pl.col(col).cast(pl.Datetime)])
    return df

datetime_columns_data = ['datetime']
datetime_columns_client = ['date']
datetime_columns_gas = ['forecast_date']
datetime_columns_electricity = ['forecast_date']
datetime_columns_forecast = ['forecast_datetime']
datetime_columns_historical = ['datetime']

df_data = standardize_datetime_precision(df_data, datetime_columns_data)
df_client = standardize_datetime_precision(df_client, datetime_columns_client)
df_gas = standardize_datetime_precision(df_gas, datetime_columns_gas)
df_electricity = standardize_datetime_precision(df_electricity, datetime_columns_electricity)
df_forecast = standardize_datetime_precision(df_forecast, datetime_columns_forecast)
df_historical = standardize_datetime_precision(df_historical, datetime_columns_historical)

# Filtering data for years greater than 2021
df_data = df_data.filter(pl.col('datetime').dt.year() > 2021)

# Function to convert data into a pandas DataFrame
def to_pandas(X, y=None):
    cat_cols = ["county", "is_business", "product_type", "is_consumption", "category_1"]

    if y is not None:
        df = pd.concat([X.to_pandas(), y.to_pandas()], axis=1)
    else:
        df = X.to_pandas()

    df = df.set_index("row_id")
    df[cat_cols] = df[cat_cols].astype("category")

    df["target_mean"] = df[[f"target_{i}" for i in range(1, 7)]].mean(1)
    df["target_std"] = df[[f"target_{i}" for i in range(1, 7)]].std(1)
    df["target_ratio"] = df["target_6"] / (df["target_7"] + 1e-3)

    return df

# Define the feature engineering function
def feature_eng(df):
    # Example: Creating a new feature by combining existing ones
    if 'installed_capacity' in df.columns and 'eic_count' in df.columns:
        df = df.with_columns([(pl.col('installed_capacity') * pl.col('eic_count')).alias('capacity_eic_product')])

    # Example: Transforming a feature (e.g., logarithmic transformation)
    if 'euros_per_mwh' in df.columns:
        df = df.with_columns([pl.col('euros_per_mwh').log().alias('log_euros_per_mwh')])

    # Example: Encoding categorical variables
    if 'county' in df.columns:
        df = df.join(df.groupby('county').count(), on='county', how='left').rename({'count': 'county_frequency'})

    # Additional feature engineering steps...
    # TO DO...

    return df

# Feature engineering and data preparation in batches
number_of_batches = 100  # Adjust based on your memory capacity
for batch in np.array_split(df_data.to_pandas(), number_of_batches):
    X = pl.DataFrame(batch.drop(columns=["target"]))
    y = pl.DataFrame(batch[["target"]])

    # Apply feature engineering
    X = feature_eng(X)

# Garbage collection to free up memory
gc.collect()


0

In [6]:
# Assuming X is a Polars DataFrame, convert it to a Pandas DataFrame
X_df = X.to_pandas()

# Convert the 'datetime' column to a numeric format (e.g., seconds since epoch)
X_df['datetime'] = X_df['datetime'].apply(pd.Timestamp.timestamp)

# Data Preprocessing
# Standardize features by removing the mean and scaling to unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_df)

# Splitting Data
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y.to_pandas(), test_size=0.2, random_state=42)

# Convert y_train and y_test to 1D array
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Model Training
# Initialize and train the model
model = RandomForestRegressor(random_state=42)  # Use RandomForestClassifier for classification tasks
model.fit(X_train, y_train)

# Model Evaluation
# Predict on the test set
y_pred = model.predict(X_test)

# Calculate the mean absolute error
mae = mean_absolute_error(y_test, y_pred)  # Calculate MAE
print(f"\n>> Mean Absolute Error: {mae}")

# Hyperparameter Tuning (Optional)
# This can be done using GridSearchCV or RandomizedSearchCV from sklearn.model_selection

# Prediction
# Use model.predict(new_data) to make predictions on new, unseen data



>> Mean Absolute Error: 59.44933381845689
