## This script is a simplified version for you to learn the calculation process of 95PPU.
## An encapsulated version will be shared through the ZOOM channel afterwards.

In [None]:
# Import the libraries; install them first if they are not yet available.
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Block 1: we walk through the 95PPU (95% Prediction Uncertainty) calculation step by step:

- 1.1. [calculate 95PPU (95% Prediction Uncertainty)](#step1.1)
    - calculate_prediction_interval
- 1.2. [read in data](#step1.2)
    - read_data
- 1.3. [rearrange the data](#step1.3)
    - rearrange_data
- 1.4. [apply the calculate_prediction_interval to the df data](#step1.4)
- 1.5. [plot the ppu curve](#step1.5)
    - plot_ppu
- 1.6. [preparation for batch processing](#step1.6)
    - get_files

### 1.1. calculate 95PPU (95% Prediction Uncertainty) <a name="step1.1"></a>
- calculate_prediction_interval

In [None]:
def calculate_prediction_interval(data, z_score=1.96):
    """Return the ``(lower, upper)`` bounds of ``mean ± z_score * SEM``.

    Parameters
    ----------
    data : list, numpy array or pandas Series
        The data points of one sample, e.g. one row of the DataFrame
        when used with ``df.apply(..., axis=1)``.
    z_score : float, optional
        Multiplier applied to the standard error of the mean. The default
        1.96 is the two-sided 95 % quantile of the standard normal
        distribution.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``.

    Notes
    -----
    NOTE(review): ``mean ± z * SEM`` is a confidence interval for the
    *mean* and narrows as ``len(data)`` grows. A 95PPU band is commonly
    taken as the 2.5th/97.5th percentiles of the simulated values --
    confirm which definition is intended before using this for analysis.
    """
    mean = np.mean(data)
    std_dev = np.std(data)  # numpy default settings
    n = len(data)
    sem = std_dev / np.sqrt(n)  # Standard error of the mean (SEM)
    margin = z_score * sem  # half-width of the interval
    return mean - margin, mean + margin

# Example: a small hand-made sample to try the function out.
data_row = [10, 12, 14, 15, 16, 18, 20]
bounds = calculate_prediction_interval(data_row)
lower_bound, upper_bound = bounds  # unpack the (lower, upper) pair
print(f"95% Prediction Interval: [{lower_bound}, {upper_bound}]")


### 1.2. read in data <a name="step1.2"></a>
- read_data

In [None]:
def read_data(folder_path, file):
    """Read the text file *file* located in *folder_path*.

    Returns the content as a list of lines exactly as produced by
    ``readlines()`` (line endings are kept).
    """
    full_path = os.path.join(folder_path, file)
    # A separate name for the handle keeps the `file` argument untouched.
    with open(full_path, 'r') as handle:
        lines = handle.readlines()
    return lines

# Example: replace the folder below with the location of your own data.
# The path is a raw string (r'...') so that the backslashes of the Windows
# path are taken literally instead of being parsed as escape sequences.
folder = r'D:\Python\Code\Lecture\WFM_PRE\AnalysisSWAT\CUP95'
file = 'SW_1.txt'
data = read_data(folder, file)
print(data)

### 1.3. rearrange the data  <a name="step1.3"></a>
- rearrange_data: rearrange the data into a DataFrame\
please open the 'SW_1.txt' with a text editor.
    - what is the data structure?
    - what is your target?
    - what do you want to keep?
- Do you have an idea to rearrange or extract the data?

In [None]:
def rearrange_data(data):
    """Rearrange the raw lines of one file into a DataFrame (exercise stub).

    Parameters
    ----------
    data : list of str
        The lines of the file, e.g. as returned by ``read_data``.

    Returns
    -------
    pandas.DataFrame
        Built from the collected groups and transposed, so every group
        becomes one column. Until the exercise part is filled in, no group
        is collected and the DataFrame is empty.
    """
    rearranged_data = []  # one list of values per collected group
    group = []            # bound up front so the tail below never hits an unbound name
    record_group = False  # presumably True while a group is being collected

    # TODO: insert your idea here. One solution is shared during the class.

    # Store the group that is still open once all lines have been processed.
    if record_group:
        rearranged_data.append(group)

    df = pd.DataFrame(rearranged_data).T
    return df

# Example: we work on the data read in step 1.2.
df = rearrange_data(data)
print(df)

### 1.4. apply calculate_prediction_interval to the df data to calculate the 95PPU for each time step <a name="step1.4"></a>

In [None]:
# Example: we work on the df from step 1.3.
# Apply the function row by row; each result is a (lower, upper) tuple.
interval_pairs = df.apply(calculate_prediction_interval, axis=1)
# Expand the tuples into a two-column DataFrame.
ppu = pd.DataFrame(interval_pairs.tolist(), columns=['Lower Bound', 'Upper Bound'])
print(ppu)

### 1.5. plot the ppu from 1.4 to visualize the uncertainty band. <a name="step1.5"></a>

In [None]:
def plot_ppu(ppu_df):
    """Draw the columns of *ppu_df* as lines on a 10 x 6 figure and show it."""
    plt.figure(figsize=(10, 6))

    # Plot the prediction intervals; lineplot draws on the current axes
    # and hands them back.
    axes = sns.lineplot(data=ppu_df, dashes=False)

    # Axis labels
    axes.set_xlabel('Time')
    axes.set_ylabel('Value')
    plt.show()


# Example: use the ppu from step 1.4.
plot_ppu(ppu)

### 1.6. preparation for batch processing  <a name="step1.6"></a>
- get_files: get the list of the files in a folder

In [None]:
def get_files(folder_path):
    """Return the names of the entries in *folder_path* that end with '.txt'.

    The names come back in the order delivered by ``os.listdir``.
    """
    txt_names = []
    for entry in os.listdir(folder_path):
        if entry.endswith('.txt'):
            txt_names.append(entry)
    return txt_names

# Please define your folder path.
# Example (raw string, so the backslashes of the Windows path are taken
# literally instead of being parsed as escape sequences):
folder = r'D:\Python\Code\Lecture\WFM_PRE\AnalysisSWAT\CUP95'
files = get_files(folder)
print(files)

## 2. Block 2: now that you understand every step, we assemble all the functions for batch processing.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def _file_number_key(name):
    """Sort key: names ending in '_<integer>' first (by number), all others after (by name)."""
    stem = os.path.splitext(name)[0]
    tail = stem.rsplit('_', 1)[-1]  # text after the last underscore (whole stem if none)
    try:
        return (0, int(tail), name)
    except ValueError:
        # No usable number in the name: keep the file instead of failing.
        return (1, 0, name)


def get_files(folder_path):
    """Return the '.txt' file names in *folder_path*, sorted numerically.

    Files are ordered by the integer after the last underscore of the name
    ('SW_2.txt' before 'SW_10.txt', 'FLOW_OUT_3.txt' counts as 3). Names
    without such a number no longer raise; they are placed at the end in
    alphabetical order.
    """
    files = [name for name in os.listdir(folder_path) if name.endswith('.txt')]
    return sorted(files, key=_file_number_key)

def read_data(folder_path, file):
    """Return the lines of *file* in *folder_path* as given by ``readlines()``."""
    target = os.path.join(folder_path, file)
    # The handle gets its own name so the `file` argument is not rebound.
    with open(target, 'r') as stream:
        content = stream.readlines()
    return content

def rearrange_data(data):
    """Rearrange the raw lines of one file into a DataFrame (exercise stub).

    *data* is the list of lines of one file, e.g. as returned by
    ``read_data``. The collected groups are put into a DataFrame and
    transposed, so every group becomes one column. Until the exercise part
    is filled in, no group is collected and the DataFrame is empty.
    """
    rearranged_data = []  # one list of values per collected group
    group = []            # bound up front so the tail below never hits an unbound name
    record_group = False  # presumably True while a group is being collected

    # TODO: insert your idea here. One solution is shared during the class.

    # Store the group that is still open once all lines have been processed.
    if record_group:
        rearranged_data.append(group)

    df = pd.DataFrame(rearranged_data).T
    return df

def calculate_prediction_interval(row, z_score=1.96):
    """Return the ``(lower, upper)`` bounds of ``mean ± z_score * SEM`` for *row*.

    Parameters
    ----------
    row : list, numpy array or pandas Series
        The values of one sample, e.g. one DataFrame row when called
        through ``df.apply(..., axis=1)``.
    z_score : float, optional
        Multiplier applied to the standard error of the mean. The default
        1.96 is the two-sided 95 % quantile of the standard normal
        distribution.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``.

    Notes
    -----
    NOTE(review): ``mean ± z * SEM`` is a confidence interval for the
    *mean* and narrows as ``len(row)`` grows. A 95PPU band is commonly
    taken as the 2.5th/97.5th percentiles of the simulated values --
    confirm which definition is intended before using this for analysis.
    """
    mean = np.mean(row)
    std_dev = np.std(row)  # numpy default settings
    n = len(row)
    sem = std_dev / np.sqrt(n)  # Standard error of the mean (SEM)
    margin = z_score * sem  # half-width of the interval
    return mean - margin, mean + margin

def prediction_intervals(folder_path, files):
    """Compute the interval table for every file in *files*.

    For each file name the file is read from *folder_path*, rearranged
    into a DataFrame, and ``calculate_prediction_interval`` is applied to
    every row. Returns a list with one DataFrame per file, in the order of
    *files*, with the columns 'Lower Bound' and 'Upper Bound'.
    """
    results = []
    for file_name in files:
        # Read and reshape the current file in one go.
        frame = rearrange_data(read_data(folder_path, file_name))
        # One (lower, upper) tuple per row of the frame.
        bounds = frame.apply(calculate_prediction_interval, axis=1)
        # Expand the tuples into two named columns and collect the table.
        results.append(
            pd.DataFrame(bounds.tolist(), columns=['Lower Bound', 'Upper Bound'])
        )
    return results

# Example: try the functions out.
# The path is a raw string (r'...') so that the backslashes of the Windows
# path are taken literally instead of being parsed as escape sequences.
folder = r'D:\Python\Code\Lecture\WFM_PRE\AnalysisSWAT\CUP95'
files = get_files(folder)
ppu_all = prediction_intervals(folder,files)
print(ppu_all)

## More tasks: How about the plots for batch processed outputs?
- Run and check the final encapsulated version shared via ZOOM channel.
- I believe you are now able to adapt the final version to different cases. 
- Try it with, e.g., the simulated **streamflow** outputs. **Show me your test results via the ZOOM channel**.