In [8]:
import os
import pandas as pd
import plotly.express as px
from glob import glob
from collections import defaultdict # to check for common headers

In [9]:
# Base directory that holds the CSV files; the cells below search it recursively.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
base_path = r'C:\Users\Mariana\Documents\freshwater_lens\data\raw_2025_with_issues\SEC_split_profiles_cut'
# File name handed to fig.write_html() when the interactive plot is saved.
html_filename = '2025_YSI_profiles_cut_by_TOM_fake_dates.html'

## Funciones para checar headers

In [10]:
# ============================================================================ #
# Solve merging issues with headers
# ============================================================================ #


def find_common_headers(folder_path):
    """
    Print the headers shared by every readable CSV file under a folder.

    The folder is searched recursively and only the top of each file is read
    (header row plus at most one data row).

    Parameters:
    - folder_path (str): The path to the folder containing CSV files.

    Returns:
    - None. The result is printed.

    Note:
    - If no CSV files are found, the function prints a message and returns.
    - If a file is empty, it is skipped.
    - Any errors encountered while reading files are printed.
    - If none of the files could be read, the function prints a message and
      returns instead of raising.
    """

    # Get all CSV files in the folder and subfolders
    csv_files = glob(os.path.join(folder_path, '**/*.csv'), recursive=True)

    if not csv_files:
        print("No CSV files found in the specified folder and subfolders.")
        return

    # Maps each successfully read CSV path to the set of its column names
    headers_dict = {}

    for csv_file in csv_files:
        try:
            # nrows=1 keeps the read cheap: only the header is needed
            headers = pd.read_csv(csv_file, nrows=1).columns.tolist()
            headers_dict[csv_file] = set(headers)
        except pd.errors.EmptyDataError:
            print(f"Skipping empty file: {csv_file}")
        except Exception as e:
            print(f"Error reading file {csv_file}: {str(e)}")

    # set.intersection(*[]) raises TypeError, so bail out explicitly when every
    # file was empty or unreadable.
    if not headers_dict:
        print("No readable CSV headers found in the specified folder and subfolders.")
        return

    # Headers present in every file that could be read
    common_headers = set.intersection(*headers_dict.values())

    # Print results
    print("Common headers in ", ' ', folder_path,':')
    print(common_headers)


def find_non_common_headers(folder_path):
    """
    Print every header that is NOT shared by all readable CSV files under a
    folder, together with the names of the immediate parent folders of the
    files that contain it.

    The folder is searched recursively and only the top of each file is read
    (header row plus at most one data row).

    Parameters:
    - folder_path (str): The path to the folder containing CSV files.

    Returns:
    - None. The result is printed.

    Note:
    - If no CSV files are found, the function prints a message and returns.
    - If a file is empty, it is skipped.
    - Any errors encountered while reading files are printed.
    - If none of the files could be read, the function prints a message and
      returns instead of raising.
    """

    # Get all CSV files in the folder and subfolders; sorted so the report
    # order does not depend on the file system's listing order
    csv_files = sorted(glob(os.path.join(folder_path, '**/*.csv'), recursive=True))

    if not csv_files:
        print("No CSV files found in the specified folder and subfolders.")
        return

    # Maps each successfully read CSV path to its list of column names
    headers_dict = {}

    for csv_file in csv_files:
        try:
            # nrows=1 keeps the read cheap: only the header is needed
            headers = pd.read_csv(csv_file, nrows=1).columns.tolist()
            headers_dict[csv_file] = headers
        except pd.errors.EmptyDataError:
            print(f"Skipping empty file: {csv_file}")
        except Exception as e:
            print(f"Error reading file {csv_file}: {str(e)}")

    # set.intersection(*[]) raises TypeError, so bail out explicitly when every
    # file was empty or unreadable.
    if not headers_dict:
        print("No readable CSV headers found in the specified folder and subfolders.")
        return

    # Headers present in every file that could be read
    common_headers = set.intersection(*map(set, headers_dict.values()))

    # Maps each non-common header to the set of folder names where it occurs
    non_common_headers_dict = defaultdict(set)

    for csv_file, headers in headers_dict.items():
        folder_name = os.path.basename(os.path.dirname(csv_file))
        # Walk the headers in file order (rather than iterating a set) so the
        # report is reproducible between runs
        for header in headers:
            if header not in common_headers:
                non_common_headers_dict[header].add(folder_name)

    # Print results
    print("Non-common headers and unique folders:")
    for header, folders in non_common_headers_dict.items():
        print(f"\nHeader: {header}")
        print(f"Unique Folders: {sorted(folders)}")

def find_files_without_header(folder_path, target_header):
    """
    List the CSV files under a folder (searched recursively) whose header row
    lacks a given column name.

    Parameters:
    - folder_path (str): The path to the folder containing CSV files.
    - target_header (str): The header to check for in each CSV file.

    Note:
    - If no CSV files are found, the function prints a message and returns.
    - If a file is empty, it is skipped.
    - Any errors encountered while reading files are printed.
    """

    # Recursive search pattern covering the folder and all of its subfolders
    search_pattern = os.path.join(folder_path, '**/*.csv')
    candidates = glob(search_pattern, recursive=True)

    if len(candidates) == 0:
        print("No CSV files found in the specified folder and subfolders.")
        return

    # Paths of the files whose header row does not include target_header
    missing = []

    for path in candidates:
        try:
            # Only the top of the file is read; the column names are all we need
            has_target = target_header in pd.read_csv(path, nrows=1).columns.tolist()
        except pd.errors.EmptyDataError:
            print(f"Skipping empty file: {path}")
            continue
        except Exception as err:
            print(f"Error reading file {path}: {str(err)}")
            continue

        if not has_target:
            missing.append(path)

    # Report: heading first, then one path per line
    print(f"Files without the '{target_header}' header:")
    if missing:
        print("\n".join(missing))

In [11]:
# Report which headers are shared by every CSV under base_path and which are not.
find_common_headers(base_path )
find_non_common_headers(base_path )

Common headers in    C:\Users\Mariana\Documents\freshwater_lens\data\raw_2025_with_issues\SEC_split_profiles_cut :
{'SEC (mS)', 'Depth (m)'}
Non-common headers and unique folders:


## Plot Interactivo de todos los archivos 2025 enviados por Tom

In [12]:



# One DataFrame per CSV file found under base_path
dfs = []

# Walk base_path and every subfolder
for root, dirs, files in os.walk(base_path):
    # Sorting in place makes the traversal order (and with it the legend order)
    # reproducible; os.walk honours in-place changes to `dirs` when walking
    # top-down, which is its default.
    dirs.sort()
    for file in sorted(files):
        # Case-insensitive match so '.CSV' files are not silently ignored
        if file.lower().endswith('.csv'):
            file_path = os.path.join(root, file)
            try:
                # Read the whole file; its first line is used as the header row
                df = pd.read_csv(file_path)
                # Strip header whitespace per file BEFORE merging. Doing it
                # only after pd.concat would leave ' Depth (m)' and 'Depth (m)'
                # as two separate columns that end up sharing the same name.
                df.columns = [col.strip() for col in df.columns]
                # Tag each row with its source file (used to colour the plot).
                # NOTE(review): only the bare file name is stored, so files with
                # the same name in different subfolders share one legend entry.
                df['file_name'] = file
                dfs.append(df)
            except Exception as e:
                print(f"Error leyendo {file_path}: {e}")

# pd.concat([]) fails with an unhelpful "No objects to concatenate"; raise the
# same exception type with a message that points at the actual problem.
if not dfs:
    raise ValueError(f"No readable CSV files found under: {base_path}")

# Merge all per-file frames into one long table
merged_df = pd.concat(dfs, ignore_index=True)

# Scatter plot of every profile, one colour per source file
fig = px.scatter(
    merged_df,
    x='SEC (mS)',
    y='Depth (m)',
    color='file_name',
    title='Tom YSI 2025',
    labels={'SEC (mS)': 'SEC (mS)', 'Depth (m)': 'Depth (m)'}
)

# Reverse the Y axis so that depth 0 is at the top
fig.update_yaxes(autorange='reversed')

# Show the figure
fig.show()

# Save as HTML when a file name is configured.
# (PNG export was previously sketched here with
# fig.write_image(..., engine='kaleido') but is not enabled.)
if html_filename:
    fig.write_html(html_filename)
    print(f"Gráfico guardado como HTML en: {html_filename}")

Gráfico guardado como HTML en: 2025_YSI_profiles_cut_by_TOM_fake_dates.html
