In [20]:
import os

# Raw input files live in a "dataset" directory under the current working directory
dataset_folder = os.path.join(os.getcwd(), 'dataset')

# Tally only regular files, so any sub-directories in the folder are not counted
num_files = sum(
    os.path.isfile(os.path.join(dataset_folder, entry))
    for entry in os.listdir(dataset_folder)
)
print(f'Number of files in dataset folder: {num_files}')

Number of files in dataset folder: 155


In [21]:
import pandas as pd
import os

# Accumulates one cleaned dataframe per source Excel file
dataframes = []
save_path = os.path.join(os.getcwd(), 'dataset_merge')

# Numeric forecast/measurement columns, in the order they follow the timestamp column
power_columns = [
    "Most_Recent_Forecast_MW", "Day_Ahead_Forecast_MW", "Week_Ahead_Forecast_MW",
    "Real_Time_Measurement_MW", "Corrected_Measurement_MW", "Monitored_Capacity_MWp",
    "Day_Ahead_11h00_MW",
]

# Iterate in sorted order: os.listdir() returns entries in arbitrary order, which
# would make the row order of the merged dataset differ between machines and runs
for file in sorted(os.listdir(dataset_folder)):
    if file.endswith('.xls'):
        file_path = os.path.join(dataset_folder, file)

        # Load the Excel file
        df = pd.read_excel(file_path)

        # Rename columns for easier access; power_columns is reused so the
        # column names and the numeric-conversion list cannot drift apart
        df.columns = ["DateTime"] + power_columns

        # Drop the first three rows which contain metadata
        df = df.iloc[3:].reset_index(drop=True)

        # Parse timestamps (day/month/year hour:minute); non-matching values become NaT
        df["DateTime"] = pd.to_datetime(df["DateTime"], format="%d/%m/%Y %H:%M", errors='coerce')

        # Convert power columns to numeric values; unparseable cells become NaN
        df[power_columns] = df[power_columns].apply(pd.to_numeric, errors="coerce")

        # Append the cleaned dataframe to the list
        dataframes.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder holds no .xls files
if not dataframes:
    raise ValueError(f"No .xls files found in {dataset_folder}")

# Concatenate all dataframes into a single dataframe
merged_df = pd.concat(dataframes, ignore_index=True)

# to_excel does not create missing parent directories, so make sure the
# output folder exists before writing
os.makedirs(save_path, exist_ok=True)
merged_df.to_excel(os.path.join(save_path, 'dataset_merge.xlsx'), index=False)

# Display the merged dataframe
import ace_tools_open as tools
tools.display_dataframe_to_user(name="Merged Solar Power Output Data", dataframe=merged_df)

merged_df.head(10)

Merged Solar Power Output Data


DateTime,Most_Recent_Forecast_MW,Day_Ahead_Forecast_MW,Week_Ahead_Forecast_MW,Real_Time_Measurement_MW,Corrected_Measurement_MW,Monitored_Capacity_MWp,Day_Ahead_11h00_MW
Loading ITables v2.2.4 from the internet... (need help?),,,,,,,


Unnamed: 0,DateTime,Most_Recent_Forecast_MW,Day_Ahead_Forecast_MW,Week_Ahead_Forecast_MW,Real_Time_Measurement_MW,Corrected_Measurement_MW,Monitored_Capacity_MWp,Day_Ahead_11h00_MW
0,2012-02-01 00:00:00,0.0,0.0,0.0,0.0,0.0,400.02,0.0
1,2012-02-01 00:15:00,0.0,0.0,0.0,0.0,0.0,400.02,0.0
2,2012-02-01 00:30:00,0.0,0.0,0.0,0.0,0.0,400.02,0.0
3,2012-02-01 00:45:00,0.0,0.0,0.0,0.0,0.0,400.02,0.0
4,2012-02-01 01:00:00,0.0,0.0,0.0,0.0,0.0,400.02,0.0
5,2012-02-01 01:15:00,0.0,0.0,0.0,0.0,0.0,400.02,0.0
6,2012-02-01 01:30:00,0.0,0.0,0.0,0.0,0.0,400.02,0.0
7,2012-02-01 01:45:00,0.0,0.0,0.0,0.0,0.0,400.02,0.0
8,2012-02-01 02:00:00,0.0,0.0,0.0,0.0,0.0,400.02,0.0
9,2012-02-01 02:15:00,0.0,0.0,0.0,0.0,0.0,400.02,0.0


In [23]:
# Row count of the merged dataset (length of its index)
num_rows = len(merged_df)
print(f'Number of rows in the merged dataset: {num_rows}')

Number of rows in the merged dataset: 452928


In [22]:
# Check for missing values: count the NaN/NaT entries in each column
missing_values = merged_df.isna().sum()
print(f'Missing values in the merged dataset: \n{missing_values}')

Missing values in the merged dataset: 
DateTime                         0
Most_Recent_Forecast_MW          0
Day_Ahead_Forecast_MW       109728
Week_Ahead_Forecast_MW      159655
Real_Time_Measurement_MW         0
Corrected_Measurement_MW         0
Monitored_Capacity_MWp           0
Day_Ahead_11h00_MW            1152
dtype: int64
