<a href="https://colab.research.google.com/github/carlos-alves-one/-Energy-Forecast/blob/main/datasets_merge_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive

In [1]:
# Import the 'drive' module from 'google.colab'; it is used below to mount
# Google Drive at the '/content/drive' directory of the Colab environment.
from google.colab import drive

def mount_google_drive(mount_point='/content/drive'):
    """Mount Google Drive inside the Colab runtime.

    Args:
        mount_point: Directory at which the Drive is mounted. Defaults to
            '/content/drive', the prefix used by the data paths in this
            notebook.
    """
    drive.mount(mount_point)

# Call the function to mount Google Drive at the default location
mount_google_drive()


Mounted at /content/drive


# Importing Necessary Libraries and Packages

In [2]:
import pandas as pd              # Import pandas for data manipulation and analysis


# Input Data Files

In [3]:
# All input CSV files are read from one Drive folder; keeping that folder in a
# single constant means a relocated dataset needs only one edit.
DATA_DIR = '/content/drive/MyDrive/project_energy'

test        = pd.read_csv(DATA_DIR + '/test.csv')
targets     = pd.read_csv(DATA_DIR + '/revealed_targets.csv')
gas         = pd.read_csv(DATA_DIR + '/gas_prices.csv')
electricity = pd.read_csv(DATA_DIR + '/electricity_prices.csv')
client      = pd.read_csv(DATA_DIR + '/client.csv')
forecast    = pd.read_csv(DATA_DIR + '/forecast_weather.csv')
historical  = pd.read_csv(DATA_DIR + '/historical_weather.csv')


# Merge Test and Targets Datasets

In [4]:
# Join on the shared identifying columns only; every other column that exists
# in both frames is kept twice and receives pandas' default _x / _y suffix.
merge_keys = ["county", "is_business", "row_id"]

# Inner join of test.csv with revealed_targets.csv, then give the two
# targets-only columns names that make their origin explicit.
merged_df = (
    test
    .merge(targets, how='inner', on=merge_keys)
    .rename(columns={'datetime': 'target_datetime', 'target': 'actual_target'})
)

# Preview the first three rows, transposed so each column reads as a row
merged_df.head(3).T


Unnamed: 0,0,1,2
county,0,0,0
is_business,0,0,0
product_type_x,1,1,2
is_consumption_x,0,1,0
prediction_datetime,2023-05-28 00:00:00,2023-05-28 00:00:00,2023-05-28 00:00:00
data_block_id_x,634,634,634
row_id,2005872,2005873,2005874
prediction_unit_id_x,0,0,1
currently_scored,False,False,False
product_type_y,1,1,2


In [5]:
# (rows, columns) of the test set before merging
test.shape

(12480, 9)

In [6]:
# (rows, columns) of the revealed targets before merging
targets.shape

(12576, 9)

In [7]:
# (rows, columns) after the inner join: only rows whose merge keys occur in
# both inputs survive, so the result can be smaller than either source frame
merged_df.shape

(6240, 15)

In [8]:
# Count missing values per column; as the cell's final expression the
# resulting Series is displayed by the notebook without needing print()
merged_df.isnull().sum()

county                  0
is_business             0
product_type_x          0
is_consumption_x        0
prediction_datetime     0
data_block_id_x         0
row_id                  0
prediction_unit_id_x    0
currently_scored        0
product_type_y          0
actual_target           0
is_consumption_y        0
target_datetime         0
data_block_id_y         0
prediction_unit_id_y    0
dtype: int64


In [10]:
# Save the merged dataset to Drive as CSV
output_file_path = "/content/drive/MyDrive/project_energy/merged_data.csv"
merged_df.to_csv(output_file_path, index=False)  # index=False omits the DataFrame index from the file

In [11]:
# Reload the merged dataset from the path it was just written to (reusing
# output_file_path keeps the save and load steps pointing at the same file)
# and display the first 3 records, transposed
data = pd.read_csv(output_file_path)
data.head(3).T

Unnamed: 0,0,1,2
county,0,0,0
is_business,0,0,0
product_type_x,1,1,2
is_consumption_x,0,1,0
prediction_datetime,2023-05-28 00:00:00,2023-05-28 00:00:00,2023-05-28 00:00:00
data_block_id_x,634,634,634
row_id,2005872,2005873,2005874
prediction_unit_id_x,0,0,1
currently_scored,False,False,False
product_type_y,1,1,2


In [12]:
# (rows, columns) of the historical weather table
historical.shape

(10752, 18)

In [13]:
# (rows, columns) of the forecast weather table
forecast.shape

(21504, 18)

In [14]:
# Preview the first three historical weather rows, transposed so each column
# reads as a row
historical.head(3).T

Unnamed: 0,0,1,2
datetime,2023-05-26 11:00:00,2023-05-26 11:00:00,2023-05-26 11:00:00
temperature,13.5,13.4,16.4
dewpoint,9.0,8.9,7.8
rain,0.0,0.2,0.2
snowfall,0.0,0.0,0.0
surface_pressure,1018.5,1013.2,1017.7
cloudcover_total,30,47,60
cloudcover_low,31,31,21
cloudcover_mid,3,32,69
cloudcover_high,0,0,0


In [15]:
# Preview the first three forecast weather rows, transposed so each column
# reads as a row
forecast.head(3).T

Unnamed: 0,0,1,2
latitude,57.6,57.6,57.6
longitude,21.7,22.2,22.7
origin_datetime,2023-05-27 02:00:00,2023-05-27 02:00:00,2023-05-27 02:00:00
hours_ahead,1,1,1
temperature,9.859155,5.916284,9.111963
dewpoint,5.508813,4.613428,6.878442
cloudcover_high,0.0,0.0,0.0
cloudcover_low,0.0,0.0,0.0
cloudcover_mid,0.026901,0.0,0.0
cloudcover_total,0.026901,0.0,0.0
