# Resample data


---

### Import libraries

In [1]:
# Notebook setup: make the project root importable, load third-party
# libraries, and configure pandas display limits.
import sys
import os


# Absolute path two directory levels above the current working directory.
# NOTE(review): presumably the repository root that contains `modules/` and
# `data/` — this depends on where the notebook kernel is started; confirm.
root = os.path.abspath('../..')  
# Appended so that `from modules import ...` below can be resolved.
sys.path.append(root)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings
# Silences every warning for the rest of the session (including deprecation
# warnings from pandas/plotly).
warnings.filterwarnings('ignore')

# Project-local package; must come after the sys.path.append(root) call above.
from modules import processing, load, plots, analysis

# Show up to 100 rows and 25 columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 25)

---

### Import data

In [2]:
# Identifier of the profile to load; it is also the CSV file stem under
# data/raw/ and is reused in the plot title further down.
well_name = 'BW5D_YSI_20230822'

# Read the raw profile and keep only depth and corrected specific
# conductivity, in that column order.
df_well = pd.read_csv(f'{root}/data/raw/{well_name}.csv')[
    ['Vertical Position m', 'Corrected sp Cond [µS/cm]']
]

# Display the loaded frame as the cell output.
df_well

Unnamed: 0,Vertical Position m,Corrected sp Cond [µS/cm]
0,-0.015,1765.7
1,-0.012,1765.6
2,-0.003,1765.4
3,-0.001,1764.9
4,0.001,1764.3
...,...,...
12183,34.231,55098.6
12184,34.242,55098.6
12185,34.243,55098.6
12186,34.233,55098.6


---

### Function to resample data and plot

> Data resampling is performed through:

- Binning of measurements into fixed-width intervals centered on multiples of the sampling interval
- Averaging of values within each bin (mean)
- Default sampling interval of 5 cm (0.05 m)
- Original units are preserved:
  * Vertical position in meters (m)
  * Specific conductivity in microSiemens/cm (µS/cm)




In [3]:
def resample_conductivity_data(df: pd.DataFrame, sampling_interval: float = 0.05) -> pd.DataFrame:
    """
    Resample conductivity data at uniform vertical position intervals.

    Each measurement is assigned to the bin whose center is the nearest
    multiple of ``sampling_interval``; a bin covers the half-open range
    ``[center - sampling_interval/2, center + sampling_interval/2)``. The
    conductivity values of each bin are averaged (NaN values are skipped by
    the mean). Bins that contain no measurement are omitted from the result.

    Args:
        df: DataFrame containing conductivity measurements with columns
            'Vertical Position m' and 'Corrected sp Cond [µS/cm]'. Rows whose
            position is NaN or infinite are ignored. The input is not modified.
        sampling_interval: Interval in meters for resampling (default: 0.05m or 5cm)

    Returns:
        DataFrame with columns 'Vertical Position m' (bin centers, ascending)
        and 'Corrected sp Cond [µS/cm]' (bin means). The two columns are
        always present, even when there is nothing to resample.

    Raises:
        ValueError: If ``sampling_interval`` is not a positive, finite number.
    """
    position_col = 'Vertical Position m'
    conductivity_col = 'Corrected sp Cond [µS/cm]'

    if not np.isfinite(sampling_interval) or sampling_interval <= 0:
        raise ValueError(
            f"sampling_interval must be a positive, finite number, got {sampling_interval!r}"
        )

    positions = df[position_col].to_numpy(dtype=float)
    usable = np.isfinite(positions)

    # Nothing to bin: return an empty frame that still has the expected
    # columns so that downstream column lookups keep working.
    if not usable.any():
        return pd.DataFrame({
            position_col: pd.Series(dtype=float),
            conductivity_col: pd.Series(dtype=float),
        })

    # Integer index of the nearest interval center. floor(x + 0.5) keeps the
    # half-open [center - w/2, center + w/2) convention and, unlike deriving
    # the centers from ceil(min)/floor(max), never drops the measurements that
    # fall in the outermost half-bins. Working with integer indices also avoids
    # the float-step pitfalls of np.arange.
    bin_index = np.floor(positions[usable] / sampling_interval + 0.5).astype(np.int64)

    # groupby sorts the integer keys, so the result is ordered by position and
    # only bins that actually contain data appear.
    bin_means = df.loc[usable, conductivity_col].groupby(bin_index).mean()

    # Rounding removes binary float noise (e.g. 3 * 0.05 -> 0.15000000000000002)
    # so centers compare and print cleanly.
    centers = np.round(bin_means.index.to_numpy() * sampling_interval, 12)

    return pd.DataFrame({
        position_col: centers,
        conductivity_col: bin_means.to_numpy(),
    })


def create_comparison_plot(original_df: pd.DataFrame, resampled_df: pd.DataFrame,
                          sampling_interval: float, x_min: float, num_intervals: int,
                          title: "str | None" = None) -> go.Figure:
    """
    Create a comparison plot between original and resampled conductivity data.

    The main axes show every original measurement (small blue markers) and the
    resampled values (large red markers). A zoomed inset in the upper-left
    corner shows the window ``[x_min, x_min + num_intervals * sampling_interval]``
    with dotted vertical lines at ``x_min + i * sampling_interval`` for
    ``i = 1 .. num_intervals - 1``.

    Args:
        original_df: DataFrame with original conductivity data
        resampled_df: DataFrame with resampled conductivity data
        sampling_interval: Interval used for resampling (in meters)
        x_min: Minimum x-value for inset plot
        num_intervals: Number of intervals to show in inset plot
        title: Figure title. When omitted, the historical title is used, which
            appends the module-level ``well_name`` if one is defined.

    Returns:
        Plotly figure object with main plot and inset. If the inset window
        contains no usable original measurement, the inset is skipped and only
        the main plot is returned.
    """
    position_col = 'Vertical Position m'
    conductivity_col = 'Corrected sp Cond [µS/cm]'
    # Inset placement in paper coordinates (fractions of the plotting area).
    inset_x_domain = [0.05, 0.35]
    inset_y_domain = [0.65, 0.95]

    if title is None:
        # Backward compatible default: earlier versions read the notebook
        # global `well_name` directly; fall back to no suffix when undefined.
        well_label = globals().get('well_name', '')
        title = f'Comparison of original vs resampled data {well_label}'.rstrip()

    def _marker_trace(frame, size, color, **scatter_kwargs):
        # Build one markers-only scatter trace for the given frame.
        return go.Scatter(
            x=frame[position_col],
            y=frame[conductivity_col],
            mode='markers',
            marker=dict(size=size, color=color),
            **scatter_kwargs
        )

    # Create main plot with subplots
    fig = make_subplots(specs=[[{"secondary_y": False}]])

    # Main axes: original measurements, then resampled values on top
    fig.add_trace(
        _marker_trace(original_df, 4, 'blue',
                      name=f'Original data n = {len(original_df)}')
    )
    fig.add_trace(
        _marker_trace(resampled_df, 10, 'red',
                      name=f'Resampled ({sampling_interval*100:.0f} cm) n = {len(resampled_df)}')
    )

    # Configure main layout
    fig.update_layout(
        title=title,
        xaxis_title='Vertical Position (m)',
        yaxis_title='Corrected Specific Conductivity (µS/cm)',
        legend=dict(x=0.75, y=0.1),
        width=1000,
        height=600,
        hovermode='closest'
    )

    # Select the data shown in the inset window
    x_max = x_min + num_intervals * sampling_interval
    original_window = original_df.loc[
        (original_df[position_col] >= x_min) & (original_df[position_col] <= x_max)
    ]
    resampled_window = resampled_df.loc[
        (resampled_df[position_col] >= x_min) & (resampled_df[position_col] <= x_max)
    ]

    y_min = original_window[conductivity_col].min()
    y_max = original_window[conductivity_col].max()

    # With no usable data in the window, min/max are NaN and the inset axes
    # would receive NaN ranges; return the main plot without an inset instead.
    if pd.isna(y_min) or pd.isna(y_max):
        return fig

    margin = (y_max - y_min) * 0.1
    if margin == 0:
        # Flat data in the window: pad so the inset y-range has non-zero height.
        margin = abs(y_max) * 0.01 or 1.0

    # Semi-transparent white background rectangle behind the inset
    fig.add_shape(
        type="rect",
        x0=inset_x_domain[0], y0=inset_y_domain[0],
        x1=inset_x_domain[1], y1=inset_y_domain[1],
        xref="paper", yref="paper",
        line=dict(color="black", width=1),
        fillcolor="white",
        opacity=0.7
    )

    # Inset traces are bound to the secondary axes and hidden from the legend
    fig.add_trace(
        _marker_trace(original_window, 4, 'blue',
                      showlegend=False, xaxis="x2", yaxis="y2")
    )
    fig.add_trace(
        _marker_trace(resampled_window, 10, 'red',
                      showlegend=False, xaxis="x2", yaxis="y2")
    )

    # Dotted vertical lines every sampling_interval, strictly inside the window
    for i in range(1, num_intervals):
        x_line = x_min + i * sampling_interval
        fig.add_shape(
            type="line",
            x0=x_line, y0=y_min - margin,
            x1=x_line, y1=y_max + margin,
            xref="x2", yref="y2",
            line=dict(color="gray", width=1, dash="dot")
        )

    # Configure inset axes
    fig.update_layout(
        xaxis2=dict(
            domain=inset_x_domain,
            anchor="y2",
            range=[x_min, x_max]
        ),
        yaxis2=dict(
            domain=inset_y_domain,
            anchor="x2",
            range=[y_min - margin, y_max + margin],
            showticklabels=False
        )
    )

    return fig


def main(df_example: pd.DataFrame, sampling_interval: float = 0.05, 
         x_min: float = 10, num_intervals: int = 4) -> None:
    """
    Resample a conductivity profile and display the comparison figure.

    The input is binned with ``resample_conductivity_data``, the figure is
    built with ``create_comparison_plot``, and the result is shown via
    ``fig.show()``. Nothing is returned.

    Args:
        df_example: DataFrame with conductivity data
        sampling_interval: Sampling interval in meters (default: 0.05m or 5cm)
        x_min: Minimum x-value for inset plot (default: 10)
        num_intervals: Number of intervals to show in inset plot (default: 4)
    """
    create_comparison_plot(
        df_example,
        resample_conductivity_data(df_example, sampling_interval),
        sampling_interval,
        x_min,
        num_intervals,
    ).show()

---

### Main function:

sampling_interval: Distance between sampling points in meters (default 0.05m)
  - Controls resolution of resampled data
  - Smaller values = higher resolution but more data points
  - Larger values = lower resolution but smoother data

x_min: Starting depth in meters for zoomed inset plot (default 10m)
  - Determines which section of data is shown in detail
  - Should be within range of available depth measurements

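num_intervals: Number of sampling intervals to display in inset (default 4)
  - Sets the width of the zoomed view: num_intervals × sampling_interval
  - Dotted vertical lines are drawn every sampling_interval inside the view (num_intervals − 1 lines)
  - More intervals = wider zoomed view covering more resampled points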



In [4]:
# Resample the loaded well profile and show the comparison figure.
main(df_example=df_well, 
     sampling_interval=0.05,  # bin width in meters (5 cm)
     x_min=10,  # start of the inset window, in meters
     num_intervals=6  # inset spans 6 * sampling_interval = 0.30 m
     )