In [None]:
import pandas as pd
import os
import numpy as np
from datetime import datetime

-----------

In [None]:
# Root folder that stores the historical crawl data. `sum_daily_energy` lists
# its sub-folders and keeps those whose names parse as dates.
# NOTE(review): 'xxx' looks like a placeholder — set the real path before running.
crawl_data_folder = r'xxx'

In [None]:
# Define a function to calculate daily energy generation for PV panels in a single station

def calculate_daily_energy(file_path, samples_per_hour=4, watts_per_kilowatt=1000):
    """Compute one energy total per module from a station's daily CSV file.

    The CSV is read with its first column parsed as dates and used as the
    index; every remaining column is treated as one PV module. Each column
    is summed, then divided by ``samples_per_hour`` and by
    ``watts_per_kilowatt``.

    NOTE(review): the defaults reproduce the divisors that used to be
    hard-coded (4 and 1000). They presumably correspond to 15-minute power
    readings expressed in watts — confirm against the crawler's output.

    Parameters
    ----------
    file_path : str or path-like
        Daily power generation CSV file for a single PV station.
    samples_per_hour : int or float, default 4
        First divisor applied to the column sums.
    watts_per_kilowatt : int or float, default 1000
        Second divisor applied to the column sums.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Module'`` (CSV column name) and ``'PV generation (kWh)'``
        (scaled column sum), with a 0-based integer index.

    Raises
    ------
    ValueError
        If either divisor is not strictly positive.
    """
    if samples_per_hour <= 0 or watts_per_kilowatt <= 0:
        raise ValueError("samples_per_hour and watts_per_kilowatt must be positive")

    readings = pd.read_csv(file_path, parse_dates=[0], index_col=[0])

    # Same operation order as the original `sum / 4 / 1000`, so default
    # results are numerically unchanged.
    daily_energy = readings.sum() / samples_per_hour / watts_per_kilowatt

    # Take the module names from the summed Series itself so names and values
    # can never get out of step; building from plain arrays also yields the
    # 0-based index directly.
    return pd.DataFrame({
        'Module': daily_energy.index,
        'PV generation (kWh)': daily_energy.to_numpy(),
    })

In [None]:
def sum_daily_energy(target_station, start_date, end_date, data_folder=None):
    """Sum per-module daily energy for one station over a date range.

    Sub-folders of the data folder whose names parse as dates inside
    ``[start_date, end_date]`` (both ends inclusive) are scanned for ``.csv``
    files whose name contains ``target_station``. Each match is passed to
    ``calculate_daily_energy`` and the results are summed per module.

    Parameters
    ----------
    target_station : str
        Text that must appear in a CSV file name for the file to be used.
        NOTE(review): this is a substring match, so a station name that is a
        prefix of another (e.g. "A1" vs "A10") would match both — verify
        against the file naming scheme.
    start_date, end_date : anything accepted by ``pd.to_datetime``
        Inclusive bounds of the period.
    data_folder : str or path-like, optional
        Folder containing the date-named sub-folders. Defaults to the
        notebook-level ``crawl_data_folder``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Module'`` and ``'PV generation (kWh)'``, one row per
        module. Empty (same columns) when no matching file is found.
    """
    root = crawl_data_folder if data_folder is None else data_folder
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    # Collect the date-named folders that fall inside the requested period.
    # Listings are sorted so the processing order does not depend on the OS.
    date_folders = []
    for folder_name in sorted(os.listdir(root)):
        if not os.path.isdir(os.path.join(root, folder_name)):
            continue
        try:
            folder_date = pd.to_datetime(folder_name)
        except ValueError:
            # Folder name is not a date: deliberately ignored.
            continue
        if start_date <= folder_date <= end_date:
            date_folders.append(folder_name)

    # Compute the daily energy of every CSV belonging to the target station.
    dfs = []
    for date_folder in date_folders:
        folder_path = os.path.join(root, date_folder)
        for file_name in sorted(os.listdir(folder_path)):
            if file_name.endswith('.csv') and target_station in file_name:
                dfs.append(calculate_daily_energy(os.path.join(folder_path, file_name)))

    # pd.concat raises ValueError on an empty list; return an empty frame with
    # the expected columns instead so callers looping over many stations are
    # not aborted by one station without data.
    if not dfs:
        return pd.DataFrame({
            'Module': pd.Series(dtype='object'),
            'PV generation (kWh)': pd.Series(dtype='float64'),
        })

    # Sum the daily energy by module.
    return pd.concat(dfs).groupby('Module').sum().reset_index()

In [None]:
# Define a function to find the PV panels with low energy
def find_low_energy_pv(target_station, start_date, end_date, threshold=2):
    """Report the PV modules of a station whose total energy is unusually low.

    The per-module totals come from ``sum_daily_energy``. A module is flagged
    when its total is strictly below ``mean - threshold * std`` computed over
    all modules of the station (``std`` is pandas' default sample standard
    deviation, ddof=1).

    Parameters
    ----------
    target_station : str
        Station identifier forwarded to ``sum_daily_energy``.
    start_date, end_date
        Period bounds forwarded to ``sum_daily_energy``.
    threshold : int or float, default 2
        Number of standard deviations below the mean used as the cut-off.

    Returns
    -------
    list
        Names of the flagged modules (also printed). The list is empty when
        no module falls below the cut-off, including when the standard
        deviation is NaN (fewer than two modules).
    """
    result_df = sum_daily_energy(target_station, start_date, end_date)

    energy = result_df['PV generation (kWh)']
    # Cut-off is `threshold` standard deviations below the mean across modules.
    cutoff = energy.mean() - threshold * energy.std()

    # Select rows and column in one .loc call instead of chained indexing.
    low_energy_pv = result_df.loc[energy < cutoff, 'Module'].tolist()

    # Output the list of PV panels with low energy
    print(f"The following PV panels at {target_station} generated less energy than {threshold} standard deviations below the mean:\n{low_energy_pv}")
    return low_energy_pv

------------

#### Task 2: PV panel fault detection for all power stations
- Customizable start and end dates, and threshold

In [None]:
# Load the site table from a local Excel file and keep its 'Site Name' column.
# NOTE(review): 'xxx' looks like a placeholder path — replace it before running.
site_info = pd.read_excel(r'xxx') # Local file contains all sites' names
site_list= site_info['Site Name']

In [None]:
# Run the low-energy check for every site; one report is printed per station.
# NOTE(review): start_date, end_date and threshold are not assigned in any
# visible cell of this notebook. They must be defined before this cell runs,
# otherwise a fresh "Restart & Run All" raises NameError.
for target_station in site_list:
    find_low_energy_pv(target_station, start_date, end_date,threshold)