In [1]:
import pandas as pd

### Exploratory analysis

In [None]:
priority_wells = [ 'AW5D', 'AW5O', 'AW5S','AW6D', 'AW6O', 'AW6S', 'LRS69D', 'LRS69O', 'LRS69S', 'LRS70D', 'LRS70S']

#### Caliper

In [4]:
# path = r'notebooks\sandbox\concatenate_caliper_all.csv'
caliper= pd.read_csv('concatenate_caliper_all.csv')
caliper.sample(5)

Unnamed: 0.1,Unnamed: 0,Depth_ft,Caliper_in,source_file,Diameter_auger_in,calibrated_in,calibrated_cm,Well_Diameter_Deviation_Percentage,Depth [m]
13193,13193,77.2746,5.61202,BW3D_caliper_20210910.LAS,6,7.000235,17.780597,16.670582,-23.553298
6460,6460,11.5803,3.09552,AW6O_caliper_20210910.LAS,4,4.36994,11.099648,9.248506,-3.529675
24755,24755,7.18162,4.87133,LRS75S_caliper_20210910.LAS,6,6.226051,15.81417,3.767523,-2.188958
25536,25536,28.7188,5.21497,LRS79D_caliper_20210910.LAS,6,6.585231,16.726486,9.753843,-8.75349
13698,13698,26.9215,6.63316,BW3D_caliper_20210910.LAS,6,8.06755,20.491578,34.459172,-8.205673


In [None]:
# Retain only the wells defined as priority for the current analysis
# Extract well name from 'source_file' by taking characters before the first underscore
caliper['Well'] = caliper['source_file'].str.extract(r'^([^_]+)')
caliper_priority = caliper[caliper['Well'].isin(priority_wells)].copy()
caliper_priority.sample(5)


Unnamed: 0.1,Unnamed: 0,Depth_ft,Caliper_in,source_file,Diameter_auger_in,calibrated_in,calibrated_cm,Well_Diameter_Deviation_Percentage,Depth [m],Well
4252,4252,22.6365,5.44314,AW5D_caliper_20210910.LAS,6,6.823718,17.332244,13.728638,-6.899605,AW5D
4515,4515,31.2128,2.92105,AW5O_caliper_20210910.LAS,4,4.187581,10.636455,4.689521,-9.513661,AW5O
22332,22332,107.589,7.80473,LRS70D_caliper_20210910.LAS,8,9.292098,23.601929,16.151225,-32.793127,LRS70D
5280,5280,53.347,4.99296,AW6D_caliper_20210910.LAS,6,6.353181,16.137081,5.886357,-16.260166,AW6D
22852,22852,55.74,8.57031,LRS70D_caliper_20210910.LAS,8,10.092297,25.634435,26.153714,-16.989552,LRS70D


In [6]:
print(f"Row count for each well:\n{caliper_priority['Well'].value_counts()}\n\n"
      f"Total priority wells defined: {len(priority_wells)}\n"
      f"Total priority wells after filtering: {caliper_priority['Well'].nunique()}")


Row count for each well:
Well
LRS70D    1160
AW5D       900
AW6D       898
LRS69D     865
AW6O       753
AW5O       343
LRS69S     245
LRS70S     203
AW6S       147
AW5S        95
Name: count, dtype: int64

Total priority wells defined: 11
Total priority wells after filtering: 10


In [7]:
# It seems LRS69O is missing from the caliper data — let's check which LRS69 wells are actually present
# caliper[caliper['source_file'].str.contains(r'^LRS69', regex=True)].sample(20)
caliper[caliper['source_file'].str.startswith('LRS69')]['Well'].unique()


array(['LRS69D', 'LRS69S'], dtype=object)

#### Conductivity

In [12]:
from pathlib import Path
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

In [37]:
## Standarizing column names across csvs
from pathlib import Path
import pandas as pd

# Set to collect column names
column_names = set()

# Path to CSV folder
csv_folder = Path("../../data/raw")
csv_paths = list(csv_folder.glob("*.csv"))

# Loop through only priority well files
for path in csv_paths:
    well_id = path.stem.split("_")[0]  # Get well ID from filename
    if well_id in priority_wells:
        try:
            df = pd.read_csv(path, nrows=1)  # Read only the header
            print(f"{path.name}: {len(df.columns)} columns")
            column_names.update(df.columns)
        except Exception as e:
            print(f"{path.name}: ❌ Error - {e}")


AW5D_YSI_20250225.csv: 20 columns
AW5O_YSI_20250225.csv: 20 columns
AW5S_YSI_20250225.csv: 23 columns
AW6D_YSI_20250226.csv: 20 columns
LRS69D_YSI_20250222R.csv: 21 columns
LRS69O_YSI_20250222R.csv: 21 columns
LRS69S_YSI_20250222R.csv: 27 columns
LRS70D_YSI_20250317.csv: 21 columns
LRS70S_YSI_20250317.csv: 21 columns


In [39]:
rename_dict = {
    # Conductividad específica
    'SpCond_muS/cm': 'SpCond µS/cm',
    'nLF_Cond_muS/cm': 'nLF Cond µS/cm',
    'Cond_muS/cm': 'Cond µS/cm',
    #'Corrected sp Cond [µS/cm]': 'SpCond_muS/cm',
   
    # Temperatura
    'Temp °C': 'Temperature_C',
    'Temp_Celcius': 'Temperature_C',

    # Profundidad
    #'Depth from GL (m)': 'Depth_m',
    #'Corrected Depth': 'Depth_m',

    # Tiempo
    'Time (HH:MM:SS)': 'Time (HH:mm:ss)',
    'Date (MM/DD/YYYY)': 'Date',


}


In [42]:
dfs=[]
for path in csv_paths:
    well_id = path.stem.split("_")[0]
    if well_id in priority_wells:
        df = pd.read_csv(path)
        df["Well"] = well_id
        df.rename(columns=rename_dict, inplace=True)
        df.drop(columns=[col for col in df.columns if col.startswith('Unnamed')], inplace=True)
        dfs.append(df)

# Combine all into a single DataFrame
if dfs:
    conductivity_df = pd.concat(dfs, ignore_index=True)
    
else:
    print("⚠️ No matching CSV files from priority wells were found or loaded.")

conductivity_df.sample(6)

Unnamed: 0,Date,Time (HH:mm:ss),Time (Fract. Sec),Site Name,Cond µS/cm,Depth m,ODO % sat,ODO mg/L,ORP mV,Pressure psi a,...,Temperature_C,Vertical Position m,Battery V,Cable Pwr V,Resistivity ohms-cm,Well,Depth m.1,nLF Cond µS/cm,ODO % CB,SpCond_muS/cm.1
43202,22/02/2025,13:10:04,0.75,Default Site,1157.0,3.517,54.9,4.43,379.0,4.988,...,26.084,3.513,2.77,1.1,,LRS69D,,1131.1,54.9,
101434,26/02/2025,11:28:51,0.0,Default Site,5052.0,5.396,1.2,0.1,378.0,7.665,...,25.939,5.395,2.55,1.1,,LRS70D,,4953.7,1.2,
95976,22/02/2025,10:01:15,0.5,Default Site,933.9,1.753,78.1,6.34,373.5,2.486,...,25.884,1.753,2.87,1.1,,LRS69S,1.753,916.8,78.1,918.4
61714,22/02/2025,14:27:12,0.75,Default Site,10711.2,10.544,1.9,0.15,-142.5,15.017,...,25.702,10.543,2.74,1.1,,LRS69D,,10554.8,1.9,
71469,22/02/2025,10:31:21,0.5,Default Site,1012.3,0.676,53.7,4.36,394.8,0.959,...,25.899,0.676,2.85,1.1,,LRS69O,,993.4,53.7,
43059,22/02/2025,13:09:29,0.0,Default Site,1157.0,3.517,54.9,4.43,378.5,4.988,...,26.086,3.519,2.77,1.1,,LRS69D,,1131.0,54.9,


#### Ploting 

In [None]:
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

# Create dropdown menu with the wells
dropdown = widgets.Dropdown(
    options=sorted(conductivity_df['Well'].unique()),
    description='Well:',
    style={'description_width': 'initial'}
)

# Define the interactive plotting function
def plot_conductivity(well):
    subset = conductivity_df[conductivity_df['Well'] == well]
    fig = px.line(
        subset,
        x='SpCond µS/cm',
        y='Vertical Position m',
        title=f"Specific Conductivity Profile - {well}",
        labels={
            'SpCond µS/cm': 'Specific Conductivity (µS/cm)',
            'Vertical Position m': 'Depth (m)'
        }
    )
    fig.update_yaxes(autorange="reversed")  # Depth increases downward
    fig.show()

# Display the widget
widgets.interact(plot_conductivity, well=dropdown)


interactive(children=(Dropdown(description='Well:', options=('AW5D', 'AW5O', 'AW5S', 'AW6D', 'LRS69D', 'LRS69O…

<function __main__.plot_conductivity(well)>

### Checking 

In [50]:
total_rows_csv = 0

print("Row counts per file:")
for path in csv_paths:
    well_id = path.stem.split("_")[0]
    if well_id in priority_wells:
        try:
            df = pd.read_csv(path)
            row_count = len(df)
            total_rows_csv += row_count
            print(f"{path.name}: {row_count} rows")
        except Exception as e:
            print(f"{path.name}: ❌ Error - {e}")


print("\n")
for well in priority_wells:
    rows = len(conductivity_df[conductivity_df["Well"] == well])
    print(f"{well}: {rows}")

Row counts per file:
AW5D_YSI_20250225.csv: 13688 rows
AW5O_YSI_20250225.csv: 5144 rows
AW5S_YSI_20250225.csv: 1665 rows
AW6D_YSI_20250226.csv: 11612 rows
LRS69D_YSI_20250222R.csv: 37858 rows
LRS69O_YSI_20250222R.csv: 23636 rows
LRS69S_YSI_20250222R.csv: 3010 rows
LRS70D_YSI_20250317.csv: 16980 rows
LRS70S_YSI_20250317.csv: 455 rows


AW5D: 13688
AW5O: 5144
AW5S: 1665
AW6D: 11612
AW6O: 0
AW6S: 0
LRS69D: 37858
LRS69O: 23636
LRS69S: 3010
LRS70D: 16980
LRS70S: 455
