## **Estimation of freshwater lens thickness (_FWL_) using three different methods**

> **Objective:** Apply three different methods to estimate the depth of the freshwater lens. Based on this estimation, extract the corresponding array for the freshwater zone, calculate basic statistics, and plot the boxplots for each profile using the three methods.


---

### Import Libraries

In [5]:
import sys
import os

root = os.path.abspath('..')  
sys.path.append(root)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import modules.sec_fwl_estimation as sec_fwl

from modules import processing, load, plots, analysis

---

### Load data

1. Load the data for the first method

---

### **Method 1:** _Intuitive criterion_

## !!!!! This is only a test, do not pay too much attention to it

In [12]:
import os
import random
from typing import Dict, List, Optional, Any

import numpy as np
import pandas as pd
import plotly.graph_objects as go

# =============================================================================
# Utility functions for file name handling and data simulation
# =============================================================================
def get_file_suffix(subfolder: str) -> str:
    """
    Return the file-name suffix associated with a data subfolder.

    The subfolder name is matched case-insensitively after removing any
    leading or trailing '/' characters, so 'rawdy', 'rawdy/' and '/RAWDY/'
    are all treated alike.

    Args:
        subfolder (str): Subfolder name: 'rawdy', 'processed' or 'raw',
            optionally wrapped in slashes.

    Returns:
        str: Suffix to be appended to the base file name.

    Raises:
        ValueError: If the normalized name is not a known subfolder.
    """
    normalized = subfolder.lower().strip('/')

    if normalized == 'rawdy':
        return '_rowdy.csv'
    if normalized == 'processed':
        return '_processed.csv'
    if normalized == 'raw':
        return '.cd'

    # The message reports the name exactly as the caller passed it.
    raise ValueError(f"Invalid subfolder name: {subfolder}")


def simulate_read_data(well_id: str, subfolder: str) -> pd.DataFrame:
    """
    Simulate reading a CSV file from the given subfolder for a specific well.

    No file is actually opened; `subfolder` is accepted only so the signature
    matches a real reader. The simulated DataFrame contains:
        - 'Vertical Position [m]': 100 points uniformly spaced between 5 and 20.
        - 'Corrected sp Cond [uS/cm]': 100 draws from a normal distribution
          with mean 100 and standard deviation 10.

    The random draws are seeded from a CRC32 checksum of the well ID, so the
    same well always yields the same data, in every Python session. (The
    built-in `hash()` of a string is salted per process and therefore is not
    reproducible between runs.) A private random generator is used so the
    global NumPy random state is left untouched.

    Args:
        well_id (str): The base name (ID) of the well.
        subfolder (str): The subfolder from which the file would be read
            (unused by the simulation).

    Returns:
        pd.DataFrame: DataFrame with the two simulated columns.
    """
    # Local import: zlib is not part of the notebook's import cell.
    import zlib

    n_points = 100

    # crc32 returns an unsigned 32-bit integer, which is a valid RandomState seed.
    seed = zlib.crc32(str(well_id).encode("utf-8"))
    rng = np.random.RandomState(seed)

    vertical_positions = np.linspace(5, 20, n_points)
    # Simulate conductivity data (normal distribution around 100)
    conductivity = rng.normal(loc=100, scale=10, size=n_points)

    return pd.DataFrame({
        "Vertical Position [m]": vertical_positions,
        "Corrected sp Cond [uS/cm]": conductivity
    })


# =============================================================================
# Data Filtering Module
# =============================================================================
def filter_well_data(well_data: pd.DataFrame, filter_value: float) -> pd.DataFrame:
    """
    Keep the part of a well profile at or above a vertical-position threshold.

    Rows are retained when their 'Vertical Position [m]' value is less than or
    equal to `filter_value`. The input DataFrame is not modified; an
    independent copy of the selected rows is returned.

    Args:
        well_data (pd.DataFrame): DataFrame with well data.
        filter_value (float): Largest vertical position to keep (inclusive).

    Returns:
        pd.DataFrame: Copy of the rows that satisfy the criterion.

    Raises:
        KeyError: If the 'Vertical Position [m]' column is missing.
    """
    depth_column = "Vertical Position [m]"
    if depth_column not in well_data.columns:
        raise KeyError("Column 'Vertical Position [m]' not found in the data.")

    keep_mask = well_data[depth_column].le(filter_value)
    selected = well_data[keep_mask]
    return selected.copy()


def load_and_filter_data(
    file_info_df: pd.DataFrame, 
    subfolder: str
) -> Dict[str, Dict[str, pd.DataFrame]]:
    """
    Build, for every well, one filtered profile per filtering method.

    Each row of `file_info_df` describes one well. Its data are obtained with
    `simulate_read_data` and then cut with `filter_well_data` once per method,
    using the threshold stored in that method's column:
        - 'vp_ic'  -> method 'IC'
        - 'vp_bic' -> method 'BIC'
        - 'vp_dgh' -> method 'DGH'

    A well whose data cannot be obtained is reported with `print` and skipped,
    so it is absent from the result.

    Args:
        file_info_df (pd.DataFrame): One row per well with the columns 'ID',
            'vp_dgh', 'vp_bic' and 'vp_ic'.
        subfolder (str): The subfolder name (e.g. 'rawdy', 'processed', or 'raw'),
            forwarded to the reader.

    Returns:
        Dict[str, Dict[str, pd.DataFrame]]: Nested dictionary keyed first by
        well ID and then by method name ('IC', 'BIC', 'DGH'); each value is the
        filtered DataFrame.

    Raises:
        KeyError: If a row lacks one of the threshold columns.
    """
    # (method name, threshold column) pairs, in the order the methods are stored.
    threshold_columns = (
        ('IC', 'vp_ic'),
        ('BIC', 'vp_bic'),
        ('DGH', 'vp_dgh'),
    )

    filtered_data: Dict[str, Dict[str, pd.DataFrame]] = {}

    for _, well_row in file_info_df.iterrows():
        well_id = well_row['ID']

        # Obtain the profile (simulated read based on the subfolder)
        try:
            profile = simulate_read_data(well_id, subfolder)
        except Exception as e:
            print(f"Error reading data for {well_id}: {e}")
            continue

        per_method: Dict[str, pd.DataFrame] = {}
        for method, col_name in threshold_columns:
            if col_name not in well_row:
                raise KeyError(f"Filtering column '{col_name}' not found in file_info_df.")
            per_method[method] = filter_well_data(profile, well_row[col_name])

        filtered_data[well_id] = per_method

    return filtered_data


# =============================================================================
# Boxplot Generation Module
# =============================================================================
def calculate_outliers(data: np.ndarray) -> int:
    """
    Count the outliers in the data using the 1.5 * IQR rule.

    A value is an outlier when it lies below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR, where Q1 and Q3 are the 25th and 75th percentiles.

    NaN values are ignored: a single NaN would otherwise turn both percentiles
    into NaN and make every comparison False, silently reporting 0 outliers.
    Any array-like input (NumPy array, list, pandas Series) is accepted.

    Args:
        data (np.ndarray): Array-like of numerical values.

    Returns:
        int: Number of outliers; 0 when there are no non-NaN values.
    """
    values = np.asarray(data, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        return 0

    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    is_outlier = (values < lower_bound) | (values > upper_bound)
    return int(np.count_nonzero(is_outlier))


def generate_boxplots(
    filtered_data: Dict[str, Dict[str, pd.DataFrame]],
    variable: str,
    show_outliers: bool = True,
    order: Optional[List[str]] = None
) -> go.Figure:
    """
    Generate horizontal boxplots with Plotly for each well and each filtering method.

    One box is drawn per (well, method) pair, labelled "{well} - {method}".
    To the right of the plotting area each box is annotated with its number of
    points and its number of IQR outliers (as counted by `calculate_outliers`).

    Args:
        filtered_data (Dict[str, Dict[str, pd.DataFrame]]): Nested dictionary with filtered data
            (well ID -> method -> DataFrame).
        variable (str): The column name to analyze ('Corrected sp Cond [uS/cm]' or 'Vertical Position [m]').
        show_outliers (bool): Whether to display outliers in the boxplots.
        order (Optional[List[str]]): Optional list to define the order of boxplots on the y-axis.
            Each entry should have the format "{well} - {method}". Entries that do not
            match any group are skipped; groups that are not listed are not drawn.

    Returns:
        go.Figure: Plotly figure object with the boxplots and annotations.

    Raises:
        KeyError: If `variable` is not a column of one of the DataFrames.
    """
    # Collect one label ("well - method") and one value array per group.
    group_labels: List[str] = []
    group_data: List[np.ndarray] = []
    for well, methods in filtered_data.items():
        for method, df in methods.items():
            group_labels.append(f"{well} - {method}")
            group_data.append(df[variable].values)

    # If an order is provided, reorder the groups accordingly.
    if order:
        label_to_data = dict(zip(group_labels, group_data))
        # Filter the labels FIRST and derive the data from the filtered labels,
        # so labels and data stay aligned even when `order` contains names
        # that do not exist in `filtered_data`.
        group_labels = [lbl for lbl in order if lbl in label_to_data]
        group_data = [label_to_data[lbl] for lbl in group_labels]

    # Set the parameter for showing outliers in Plotly
    boxpoints_setting: Any = "outliers" if show_outliers else False

    box_traces = []
    annotations = []
    for label, data in zip(group_labels, group_data):
        box_traces.append(
            go.Box(
                x=data,
                name=label,
                boxpoints=boxpoints_setting,
                orientation='h'
            )
        )

        # The y-coordinate is the box's own category; xref='paper' with x > 1
        # places the text outside the plotting area, in the right margin.
        annotations.append(
            dict(
                x=1.05,
                y=label,
                xref="paper",
                yref="y",
                text=f"n={len(data)}, out={calculate_outliers(data)}",
                showarrow=False,
                font=dict(size=10),
                align="left"
            )
        )

    fig = go.Figure(data=box_traces)
    fig.update_layout(
        yaxis=dict(
            title="Well - Filtering Method",
            categoryorder="array",
            categoryarray=group_labels
        ),
        xaxis=dict(title=variable),
        margin=dict(r=150),  # Extra margin on the right for annotations
        annotations=annotations,
        template="plotly_white",
        title=f"Boxplots of {variable} by Well and Filtering Method"
    )

    return fig


In [13]:

# Demonstration driver: builds a table of random filtering depths for 25 wells,
# produces the per-well / per-method filtered profiles and plots them.

# List of 25 well IDs as provided
well_ids = [
    "AW1D_YSI_20230826", "AW2D_YSI_20230815", "AW5D_YSI_20230824", "AW6D_YSI_20230815",
    "AW7D_YSI_20230814", "BW1D_YSI_20230824", "BW2D_YSI_20230819", "BW3D_YSI_20230818",
    "BW4D_YSI_20230816", "BW5D_YSI_20230822", "BW6D_YSI_20230826", "BW7D_YSI_20230826",
    "BW8D_YSI_20230823", "BW9D_YSI_20230823", "BW10D_YSI_20230825", "BW11D_YSI_20230823",
    "LRS33D_YSI_20230822", "LRS65D_YSI_20230827", "LRS69D_YSI_20230818", "LRS70D_YSI_20230822",
    "LRS75D_YSI_20230819", "LRS79D_YSI_20230827", "LRS81D_YSI_20230823", "LRS89D_YSI_20230825",
    "LRS90D_YSI_20230827"
]

# Create a DataFrame with filtering points for each well.
# For demonstration, generate random filtering points in the range [10, 15] for each method,
# rounded to two decimals. The fixed seed makes the table reproducible.
# NOTE: the draws are consumed column by column (all 'vp_dgh' values first, then
# 'vp_bic', then 'vp_ic'), so reordering these entries changes every value.
random.seed(42)
file_info = {
    "ID": well_ids,
    "vp_dgh": [round(random.uniform(10, 15), 2) for _ in well_ids],
    "vp_bic": [round(random.uniform(10, 15), 2) for _ in well_ids],
    "vp_ic":  [round(random.uniform(10, 15), 2) for _ in well_ids]
}
file_info_df = pd.DataFrame(file_info)

# Build the nested {well ID: {method: DataFrame}} structure for the 'rawdy' subfolder.
# The profiles come from simulate_read_data, i.e. they are simulated, not read from disk.
filtered_data = load_and_filter_data(file_info_df, subfolder="rawdy")

# Generate a boxplot for the variable 'Corrected sp Cond [uS/cm]'
# (one horizontal box per well and filtering method, outlier points shown).
fig = generate_boxplots(filtered_data, variable="Corrected sp Cond [uS/cm]", show_outliers=True)

# Render the figure in the notebook output.
fig.show()

In [14]:
# Bare expression: as the last statement of a notebook cell it displays the
# table of per-well filtering depths as the cell output.
file_info_df

Unnamed: 0,ID,vp_dgh,vp_bic,vp_ic
0,AW1D_YSI_20230826,13.2,11.68,11.85
1,AW2D_YSI_20230815,10.13,10.46,11.05
2,AW5D_YSI_20230824,11.38,10.48,11.33
3,AW6D_YSI_20230815,11.12,14.24,14.68
4,AW7D_YSI_20230814,13.68,13.02,13.24
5,BW1D_YSI_20230824,13.38,14.04,13.05
6,BW2D_YSI_20230819,14.46,13.65,10.86
7,BW3D_YSI_20230818,10.43,12.68,13.65
8,BW4D_YSI_20230816,12.11,14.87,10.82
9,BW5D_YSI_20230822,10.15,11.89,11.9
