<a href="https://colab.research.google.com/github/carlos-alves-one/-Energy-Comp/blob/main/enefit_project_comp_KV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

/kaggle/input/predict-energy-behavior-of-prosumers/client.csv
/kaggle/input/predict-energy-behavior-of-prosumers/gas_prices.csv
/kaggle/input/predict-energy-behavior-of-prosumers/electricity_prices.csv
/kaggle/input/predict-energy-behavior-of-prosumers/weather_station_to_county_mapping.csv
/kaggle/input/predict-energy-behavior-of-prosumers/public_timeseries_testing_util.py
/kaggle/input/predict-energy-behavior-of-prosumers/historical_weather.csv
/kaggle/input/predict-energy-behavior-of-prosumers/county_id_to_name_map.json
/kaggle/input/predict-energy-behavior-of-prosumers/train.csv
/kaggle/input/predict-energy-behavior-of-prosumers/forecast_weather.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/sample_submission.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/client.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/gas_prices.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/electricity

In [None]:
!pip install --upgrade polars



# Import Libraries and Packages

In [None]:
import os
import gc
import pickle

import numpy as np
import pandas as pd
import polars as pl

import enefit

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingRegressor

import pickle
import lightgbm as lgb

import optuna

# Load the Data

In [None]:
# Directory that holds the competition files.
root = "/kaggle/input/predict-energy-behavior-of-prosumers"

# Columns kept from each competition file.
data_cols = [
    'target', 'county', 'is_business', 'product_type', 'is_consumption',
    'datetime', 'row_id',
]
client_cols = [
    'product_type', 'county', 'eic_count', 'installed_capacity',
    'is_business', 'date',
]
gas_cols = ['forecast_date', 'lowest_price_per_mwh', 'highest_price_per_mwh']
electricity_cols = ['forecast_date', 'euros_per_mwh']
forecast_cols = [
    'latitude', 'longitude', 'hours_ahead', 'temperature', 'dewpoint',
    'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total',
    '10_metre_u_wind_component', '10_metre_v_wind_component',
    'forecast_datetime', 'direct_solar_radiation',
    'surface_solar_radiation_downwards', 'snowfall', 'total_precipitation',
]
historical_cols = [
    'datetime', 'temperature', 'dewpoint', 'rain', 'snowfall',
    'surface_pressure', 'cloudcover_total', 'cloudcover_low',
    'cloudcover_mid', 'cloudcover_high', 'windspeed_10m',
    'winddirection_10m', 'shortwave_radiation', 'direct_solar_radiation',
    'diffuse_radiation', 'latitude', 'longitude',
]
location_cols = ['longitude', 'latitude', 'county']
target_cols = [
    'target', 'county', 'is_business', 'product_type', 'is_consumption',
    'datetime',
]

# Not used anywhere in the visible cells.
save_path = None
load_path = None


def _load_table(file_name, columns):
    """Read one CSV located under ``root``, keeping only ``columns`` and letting Polars parse dates."""
    return pl.read_csv(os.path.join(root, file_name), columns=columns, try_parse_dates=True)


df_data = _load_table("train.csv", data_cols)
df_client = _load_table("client.csv", client_cols)
df_gas = _load_table("gas_prices.csv", gas_cols)
df_electricity = _load_table("electricity_prices.csv", electricity_cols)
df_forecast = _load_table("forecast_weather.csv", forecast_cols)
df_historical = _load_table("historical_weather.csv", historical_cols)
df_location = _load_table("weather_station_to_county_mapping.csv", location_cols)

# The target table is a column subset of the training data.
df_target = df_data.select(target_cols)

# Keep the schema of each loaded frame (the location frame has none recorded).
schema_data = df_data.schema
schema_client = df_client.schema
schema_gas = df_gas.schema
schema_electricity = df_electricity.schema
schema_forecast = df_forecast.schema
schema_historical = df_historical.schema
schema_target = df_target.schema


# Feature Engineering

In [None]:
import os
import polars as pl
import pandas as pd
import numpy as np
import gc

# Columns specifications
data_cols = [
    'target',
    'county',
    'is_business',
    'product_type',
    'is_consumption',
    'datetime',
    'row_id',
]
client_cols = [
    'product_type',
    'county',
    'eic_count',
    'installed_capacity',
    'is_business',
    'date',
]
gas_cols = [
    'forecast_date',
    'lowest_price_per_mwh',
    'highest_price_per_mwh',
]
electricity_cols = [
    'forecast_date',
    'euros_per_mwh',
]
forecast_cols = [
    'latitude',
    'longitude',
    'hours_ahead',
    'temperature',
    'dewpoint',
    'cloudcover_high',
    'cloudcover_low',
    'cloudcover_mid',
    'cloudcover_total',
    '10_metre_u_wind_component',
    '10_metre_v_wind_component',
    'forecast_datetime',
    'direct_solar_radiation',
    'surface_solar_radiation_downwards',
    'snowfall',
    'total_precipitation',
]
historical_cols = [
    'datetime',
    'temperature',
    'dewpoint',
    'rain',
    'snowfall',
    'surface_pressure',
    'cloudcover_total',
    'cloudcover_low',
    'cloudcover_mid',
    'cloudcover_high',
    'windspeed_10m',
    'winddirection_10m',
    'shortwave_radiation',
    'direct_solar_radiation',
    'diffuse_radiation',
    'latitude',
    'longitude',
]
location_cols = [
    'longitude',
    'latitude',
    'county',
]

# Helper function to read large CSV files directly into Polars DataFrames
def read_csv_polars(file_path, columns, batch_size=5 * 10 ** 5):
    """Load the selected columns of a CSV file into a Polars DataFrame.

    Args:
        file_path: Location of the CSV file.
        columns: Names of the columns to keep.
        batch_size: Forwarded unchanged to ``pl.read_csv``; adjust as needed.

    Returns:
        Whatever ``pl.read_csv`` returns for these arguments.
    """
    read_options = {"columns": columns, "batch_size": batch_size}
    return pl.read_csv(file_path, **read_options)

# Reading CSV files directly into Polars DataFrames
def _input_path(file_name):
    """Return the full path of ``file_name`` inside ``root``."""
    return os.path.join(root, file_name)


df_data = read_csv_polars(_input_path("train.csv"), data_cols)
df_client = read_csv_polars(_input_path("client.csv"), client_cols)
df_gas = read_csv_polars(_input_path("gas_prices.csv"), gas_cols)
df_electricity = read_csv_polars(_input_path("electricity_prices.csv"), electricity_cols)
df_forecast = read_csv_polars(_input_path("forecast_weather.csv"), forecast_cols)
df_historical = read_csv_polars(_input_path("historical_weather.csv"), historical_cols)
df_location = read_csv_polars(_input_path("weather_station_to_county_mapping.csv"), location_cols)

# Define a date format string compatible with your data
date_format = "%Y-%m-%d %H:%M:%S"  # Adjust the format according to the data


def _parse_datetime_column(frame, column):
    """Replace the string column ``column`` of ``frame`` by its ``date_format`` parse.

    NOTE(review): parsing is non-strict, so values that do not match
    ``date_format`` presumably become null instead of raising -- confirm that
    every file really uses this exact format (date-only or timezone-suffixed
    values would not match).
    """
    parsed = pl.col(column).str.strptime(pl.Datetime, date_format, strict=False)
    return frame.with_columns([parsed])


# Convert 'datetime' columns to datetime format in all dataframes
df_data = _parse_datetime_column(df_data, 'datetime')
df_client = _parse_datetime_column(df_client, 'date')
df_gas = _parse_datetime_column(df_gas, 'forecast_date')
df_electricity = _parse_datetime_column(df_electricity, 'forecast_date')
df_forecast = _parse_datetime_column(df_forecast, 'forecast_datetime')
df_historical = _parse_datetime_column(df_historical, 'datetime')

# Standardize datetime precision to microseconds
# This function is simplified assuming the datetime data is already timezone-naive
def standardize_datetime_precision(df, datetime_columns):
    """Cast every column named in ``datetime_columns`` to ``pl.Datetime``.

    Args:
        df: Polars DataFrame holding the columns.
        datetime_columns: Iterable of column names to cast.

    Returns:
        The frame with the listed columns cast; ``df`` itself when no column
        is listed.
    """
    # Casting is idempotent, so a name listed twice only needs one expression.
    casts = [pl.col(name).cast(pl.Datetime) for name in dict.fromkeys(datetime_columns)]
    if not casts:
        return df
    return df.with_columns(casts)

# For each table: name its datetime column(s), then cast them right away.
datetime_columns_data = ['datetime']
df_data = standardize_datetime_precision(df_data, datetime_columns_data)

datetime_columns_client = ['date']
df_client = standardize_datetime_precision(df_client, datetime_columns_client)

datetime_columns_gas = ['forecast_date']
df_gas = standardize_datetime_precision(df_gas, datetime_columns_gas)

datetime_columns_electricity = ['forecast_date']
df_electricity = standardize_datetime_precision(df_electricity, datetime_columns_electricity)

datetime_columns_forecast = ['forecast_datetime']
df_forecast = standardize_datetime_precision(df_forecast, datetime_columns_forecast)

datetime_columns_historical = ['datetime']
df_historical = standardize_datetime_precision(df_historical, datetime_columns_historical)

# Keep only rows whose year is after 2021 (i.e. 2022 onwards)
is_after_2021 = pl.col('datetime').dt.year() >= 2022
df_data = df_data.filter(is_after_2021)

# Function to convert data into a pandas DataFrame
def to_pandas(X, y=None):
    """Build the pandas frame used for modelling from feature (and target) tables.

    Args:
        X: Object exposing ``to_pandas()``; the result must contain ``row_id``,
            the categorical columns listed below and ``target_1`` .. ``target_7``.
        y: Optional object exposing ``to_pandas()``; when given, its columns are
            appended to the right of the features.

    Returns:
        A pandas DataFrame indexed by ``row_id`` with the categorical columns
        cast to ``category`` and the extra columns ``target_mean``,
        ``target_std`` and ``target_ratio``.
    """
    features = X.to_pandas()
    if y is None:
        df = features
    else:
        df = pd.concat([features, y.to_pandas()], axis=1)
    df = df.set_index("row_id")

    categorical = ["county", "is_business", "product_type", "is_consumption", "category_1"]
    df[categorical] = df[categorical].astype("category")

    # Row-wise statistics over target_1 .. target_6.
    first_six = df[[f"target_{i}" for i in range(1, 7)]]
    df["target_mean"] = first_six.mean(axis=1)
    df["target_std"] = first_six.std(axis=1)

    # The small constant keeps the denominator away from zero.
    denominator = df["target_7"] + 1e-3
    df["target_ratio"] = df["target_6"] / denominator

    return df

# Define the feature engineering function
def feature_eng(df):
    """Add derived feature columns to a Polars DataFrame.

    Each feature is only added when the columns it needs are present:

    * ``capacity_eic_product``: ``installed_capacity`` times ``eic_count``.
    * ``log_euros_per_mwh``: logarithm of ``euros_per_mwh``.
    * ``county_frequency``: number of rows of ``df`` sharing the row's ``county``.

    Args:
        df: Polars DataFrame to extend.

    Returns:
        The frame with the applicable feature columns added.
    """
    # None of the steps below creates a column that a later check looks for,
    # so the set of input columns can be captured once.
    available = set(df.columns)

    if 'installed_capacity' in available and 'eic_count' in available:
        capacity_product = pl.col('installed_capacity') * pl.col('eic_count')
        df = df.with_columns([capacity_product.alias('capacity_eic_product')])

    if 'euros_per_mwh' in available:
        # NOTE(review): the logarithm is undefined for non-positive prices --
        # confirm the price column is strictly positive.
        log_price = pl.col('euros_per_mwh').log()
        df = df.with_columns([log_price.alias('log_euros_per_mwh')])

    if 'county' in available:
        # Frequency encoding: count the rows per county, then attach the count
        # to every row of that county.
        rows_per_county = df.group_by('county').agg(pl.count().alias('county_frequency'))
        df = df.join(rows_per_county, on='county', how='left')

    # Additional feature engineering steps...
    # TO DO...

    return df

# Feature engineering and data preparation in batches
number_of_batches = 100  # Adjust based on the memory capacity

# Rows without a target cannot be used for supervised training, and the
# regressor fitted later rejects missing target values.
df_labelled = df_data.filter(pl.col("target").is_not_null())

# Batch sizes differ by at most one row; the first `extra_rows` batches get
# the extra row.
base_rows, extra_rows = divmod(df_labelled.height, number_of_batches)

# Keep the result of every batch. Rebinding X and y inside the loop would
# leave only the last batch available for training.
X_batches = []
y_batches = []
offset = 0
for batch_index in range(number_of_batches):
    n_rows = base_rows + (1 if batch_index < extra_rows else 0)
    if n_rows == 0:
        # Fewer rows than batches: nothing left to process in this slot.
        continue

    # Slice natively in Polars; no round trip through pandas is needed.
    batch = df_labelled.slice(offset, n_rows)
    offset += n_rows

    # Apply feature engineering to the features of this batch only
    X_batches.append(feature_eng(batch.drop("target")))
    y_batches.append(batch.select("target"))

if X_batches:
    X = pl.concat(X_batches)
    y = pl.concat(y_batches)
else:
    # No labelled rows at all: still expose X and y with the expected columns.
    X = feature_eng(df_labelled.drop("target"))
    y = df_labelled.select("target")

# Drop the per-batch pieces before collecting garbage to free up memory
del X_batches, y_batches
gc.collect()


0

# Training

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Assuming X is a Polars DataFrame, convert it to a Pandas DataFrame
X_df = X.to_pandas()

# Ensure the 'datetime' column is in datetime format
X_df['datetime'] = pd.to_datetime(X_df['datetime'])

# Convert the 'datetime' column to seconds since the Unix epoch. The values
# are timezone-naive, so subtracting a naive epoch gives the same number as
# Timestamp.timestamp() without a Python-level call per row.
X_df['datetime'] = (X_df['datetime'] - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)

# Splitting Data
# Split BEFORE scaling so that no statistic of the held-out rows leaks into
# the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_df, y.to_pandas(), test_size=0.2, random_state=42
)

# Data Preprocessing
# Standardize features by removing the mean and scaling to unit variance;
# the statistics come from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled view of the full feature table, kept under its previous name in case
# another cell reads it.
X_scaled = scaler.transform(X_df)

# Convert y_train and y_test to 1D array
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Model Training
# Initialize and train the model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Model Evaluation
# Predict on the test set
y_pred = model.predict(X_test)

# Calculate the mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print(f"\n-> Mean Absolute Error: {mae}")

# Hyperparameter Tuning (Optional)
# This can be done using GridSearchCV or RandomizedSearchCV from sklearn.model_selection

# Prediction
# Use model.predict(new_data) to make predictions on new, unseen data



-> Mean Absolute Error: 59.44933381845689


In [None]:
from sklearn.impute import SimpleImputer
import numpy as np

# Initialize environment
if 'env' not in globals():
    env = enefit.make_env()

# The model was fitted on the scaled training features, so every test batch
# has to be presented with the same columns, in the same order, after the
# same scaling.
train_feature_names = list(X_df.columns)
unix_epoch = pd.Timestamp("1970-01-01")

# Initialize empty DataFrames; the data revealed at each iteration is
# appended to them below.
df_forecast = pl.DataFrame({col: [] for col in forecast_cols})
df_historical = pl.DataFrame({col: [] for col in historical_cols})
df_target = pl.DataFrame({col: [] for col in target_cols})

# Iterating over the test set provided by the environment
iter_test = env.iter_test()

for (test, revealed_targets, client, historical_weather, forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:
    # Data type conversion and renaming
    common_dtype = 'datetime64[ns]'
    test = test.rename(columns={"prediction_datetime": "datetime"})
    test['datetime'] = test['datetime'].astype(common_dtype)
    historical_weather['datetime'] = historical_weather['datetime'].astype(common_dtype)

    # Bring forecast timestamps to timezone-naive UTC before the dtype cast
    if forecast_weather['forecast_datetime'].dt.tz is None:
        forecast_weather['forecast_datetime'] = forecast_weather['forecast_datetime'].dt.tz_localize('UTC')
    forecast_weather['forecast_datetime'] = forecast_weather['forecast_datetime'].dt.tz_convert(None)
    forecast_weather['forecast_datetime'] = forecast_weather['forecast_datetime'].astype(common_dtype)

    revealed_targets['datetime'] = revealed_targets['datetime'].astype(common_dtype)
    client['date'] = client['date'].astype(common_dtype)

    # Accumulate the newly revealed data, dropping duplicate rows. These
    # tables are not model inputs: the model was trained without them.
    df_new_forecast = pl.from_pandas(forecast_weather[forecast_cols])
    df_new_historical = pl.from_pandas(historical_weather[historical_cols])
    df_new_target = pl.from_pandas(revealed_targets[target_cols])

    df_forecast = df_forecast.cast(df_new_forecast.schema)
    df_historical = df_historical.cast(df_new_historical.schema)
    df_target = df_target.cast(df_new_target.schema)

    df_forecast = pl.concat([df_forecast, df_new_forecast]).unique()
    df_historical = pl.concat([df_historical, df_new_historical]).unique()
    df_target = pl.concat([df_target, df_new_target]).unique()

    # Build the features exactly as for training: same feature engineering,
    # one output row per test row (no joins that could multiply rows).
    test_pl = pl.from_pandas(test)
    X_test = feature_eng(test_pl)
    X_test_pandas = X_test.to_pandas()

    # Encode 'datetime' as seconds since the Unix epoch, as in training.
    # Feeding raw datetime columns to a numeric transformer is what raised
    # "Cannot cast DatetimeArray to dtype float64".
    X_test_pandas['datetime'] = (
        pd.to_datetime(X_test_pandas['datetime']) - unix_epoch
    ) / pd.Timedelta(seconds=1)

    # Keep only the training columns, in training order; a column that is
    # absent from the test data becomes all-NaN and is imputed below.
    X_test_pandas = X_test_pandas.reindex(columns=train_feature_names)

    # Reuse the scaler fitted on the training data; never refit on test data.
    X_test_for_model = scaler.transform(X_test_pandas)

    # After standardization 0.0 is the training mean of each feature, so this
    # is mean imputation with training statistics.
    X_test_for_model = np.nan_to_num(X_test_for_model, nan=0.0)

    # Making predictions
    predictions = model.predict(X_test_for_model)
    if len(predictions) != len(sample_prediction):
        print(f"Warning: Mismatch in prediction length. Model returned {len(predictions)} predictions, expected {len(sample_prediction)}.")

    # Negative predictions are clipped to zero before submission
    sample_prediction["target"] = predictions.clip(0)

    # Submitting predictions
    env.predict(sample_prediction)


This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


TypeError: Cannot cast DatetimeArray to dtype float64