In [29]:
# Libraries
import pandas as pd
import numpy as np
import os # Library for interacting with the operating system

# Visualizations
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

## 1. Loading Data

In [30]:
# Upload files from H1 to H10
files = [f'./Plots/H{i}_Wh.csv' for i in range(1, 11)]
dfs = [pd.read_csv(f) for f in files]

# Unite all energy data
energy_df = pd.concat(dfs, ignore_index=True)

## 2. Data Preprocessing and Cleaning

In [31]:
# show general information about the dataset
energy_info = energy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5270330 entries, 0 to 5270329
Data columns (total 8 columns):
 #   Column               Dtype  
---  ------               -----  
 0   date                 object 
 1    Discharge(Wh)       float64
 2    Charge(Wh)          float64
 3    Production(Wh)      float64
 4    Consumption(Wh)     float64
 5    Feed-in(Wh)         float64
 6    From grid(Wh)       float64
 7    State of Charge(%)  float64
dtypes: float64(7), object(1)
memory usage: 321.7+ MB


In [32]:
print(f"Dataset contains {energy_df.shape[0]} rows and {energy_df.shape[1]} columns.")
energy_df.head()

Dataset contains 5270330 rows and 8 columns.


Unnamed: 0,date,Discharge(Wh),Charge(Wh),Production(Wh),Consumption(Wh),Feed-in(Wh),From grid(Wh),State of Charge(%)
0,2020-01-01 01:01:00,0.0,0.13,0.0,1.58,0.0,1.71,0.0
1,2020-01-01 01:02:00,0.0,0.13,0.0,1.58,0.0,1.71,0.0
2,2020-01-01 01:03:00,0.0,0.13,0.0,1.57,0.0,1.7,0.0
3,2020-01-01 01:04:00,0.0,0.13,0.0,1.55,0.0,1.68,0.0
4,2020-01-01 01:05:00,0.0,0.13,0.0,1.55,0.0,1.68,0.0


In [33]:
# Check unique values for categorical columns in client data
for col in energy_df.select_dtypes(include=["object", "category"]).columns:
    print(f"{col} has {energy_df[col].nunique()} unique values: {energy_df[col].unique()[:5]}", '\n')

date has 527039 unique values: ['2020-01-01 01:01:00' '2020-01-01 01:02:00' '2020-01-01 01:03:00'
 '2020-01-01 01:04:00' '2020-01-01 01:05:00'] 



---
### 2.1. Handling Missing Data

In [34]:
# check for missing data
print(energy_df.isnull().sum())

date                   0
 Discharge(Wh)         0
 Charge(Wh)            0
 Production(Wh)        0
 Consumption(Wh)       0
 Feed-in(Wh)           0
 From grid(Wh)         0
 State of Charge(%)    0
dtype: int64


### 2.2 Feature Engeneering and Transformation

`Transform date columns to datetime type`

In [35]:
# Standardize columns and dates
for df in dfs:
    df.columns = [col.strip() for col in df.columns]
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

`Energy Feature Engeneering`

In [36]:
# Group by date and add production and consumption
energy_df.columns = [col.strip() for col in energy_df.columns]
total_energy = energy_df.groupby('date')[['Production(Wh)', 'Consumption(Wh)']].sum().reset_index()

# Rename columns
total_energy.rename(columns={
    'Production(Wh)': 'Total Production (Wh)',
    'Consumption(Wh)': 'Total Consumption(Wh)'
}, inplace=True)

total_energy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 527039 entries, 0 to 527038
Data columns (total 3 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   date                   527039 non-null  object 
 1   Total Production (Wh)  527039 non-null  float64
 2   Total Consumption(Wh)  527039 non-null  float64
dtypes: float64(2), object(1)
memory usage: 12.1+ MB


In [37]:
# Saving cleaned dataset
directory_name = 'dataset'
output_filename = 'energy_ie_clean.csv'

# Build the full path
output_path = os.path.join(directory_name, output_filename)

# Save the DataFrame to a CSV file
total_energy.to_csv(output_path, index=False)