In [1]:
import pandas as pd
import numpy as np
from enum import Enum
from typing import Dict
FLAG_VALUE = -9.99e-29
import seabirdfilehandler as fh
from scipy.stats import binned_statistic

In [2]:
def read_cnv_file(filepath='seabird_example_data/cnv/basic_emb.cnv'):
    """Liest CNV-Datei ein und gibt DataFrame zur端ck"""
    with open(filepath, 'r') as f:
        lines = f.readlines()
    
    # Header-Ende finden
    data_start = 0
    for i, line in enumerate(lines):
        if '*END*' in line:
            data_start = i + 1
            break
    
    # Spaltennamen extrahieren
    column_names = []
    for line in lines:
        if '# name' in line:
            parts = line.split('=')
            if len(parts) > 1:
                col_name = parts[1].split(':')[0].strip()
                column_names.append(col_name)
    
    # Daten einlesen
    data_lines = lines[data_start:]
    data = []
    for line in data_lines:
        if line.strip() and not line.startswith('*'):
            values = line.split()
            if len(values) == len(column_names):
                try:
                    data.append([float(x) for x in values])
                except ValueError:
                    continue
    
    return pd.DataFrame(data, columns=column_names)

In [3]:
class CastType(Enum):
    """The subsection of data to use when splitting by upcast and/or
    downcast
    """

    BOTH = 0
    DOWNCAST = 1
    UPCAST = 2
    NA = 3

def bin_average(
    dataset: pd.DataFrame,
    bin_variable: str,
    bin_size: float,
    include_scan_count: bool = True,
    min_scans: int = 1,
    max_scans: int = 999999,
    exclude_bad_scans: bool = True,
    flag_value=FLAG_VALUE,
) -> pd.DataFrame:

    _dataset = dataset.copy()

    # pd series containing the variable we want to bin for, converted to ndarray
    control = _dataset[bin_variable].to_numpy()

    bin_min = bin_size / 2.0  # min value of first bin

    control_max = np.amax(control)
    bin_max = control_max - ((bin_min + control_max) % bin_size) + bin_size

    # split into descending and ascending, including peak in both
    peak_index = np.argmax(control)
    control_desc = control[: peak_index + 1]
    control_asc = control[peak_index:]

    # create the bins to sort into
    desc_bin_edges = np.arange(start=bin_min, stop=bin_max + bin_size, step=bin_size)
    asc_bin_adges = np.arange(start=bin_max, stop=bin_min - bin_size, step=-bin_size)

    # setup bins to indicate where each index should be sorted into
    desc_bins = np.digitize(x=control_desc, bins=desc_bin_edges)
    asc_bins = np.digitize(x=control_asc, bins=asc_bin_adges, right=True)
    asc_bins += np.amax(desc_bins) - 1
    _dataset["bin_number"] = np.concat((desc_bins[:-1], asc_bins))

    # get the number of scans in each bin
    scans_per_bin = np.bincount(_dataset["bin_number"])
    _dataset["nbin"] = _dataset["bin_number"].map(lambda x: scans_per_bin[x])

    if exclude_bad_scans:
        _dataset = _dataset.groupby("bin_number", as_index=False).mean()
    else:
        # need to handle the flag column differently
        not_flag = _dataset[_dataset.columns.difference(["flag"])].groupby("bin_number").mean()
        # if all the values in a group are the flag value the assign the
        # flag value to the group, otherwise 0
        flag = _dataset[["bin_number", "flag"]].groupby("bin_number").mean()
        flag.loc[flag["flag"] != flag_value] = 0
        _dataset = pd.concat([not_flag, flag], axis=1).reset_index()

    _dataset = _dataset.drop(_dataset[_dataset["nbin"] < min_scans].index)
    _dataset = _dataset.drop(_dataset[_dataset["nbin"] > max_scans].index)

    _dataset = _dataset.drop("bin_number", axis=1)

    if not include_scan_count:
        _dataset = _dataset.drop("nbin", axis=1)

    return _dataset

In [4]:
# CNV einlesen
cnv_data = read_cnv_file()

# Flag-Spalte hinzuf端gen falls nicht vorhanden
if 'flag' not in cnv_data.columns:
    cnv_data['flag'] = 0.0

# Bin-Average ausf端hren
result = bin_average(
    dataset=cnv_data,
    bin_variable='prDM',
    bin_size=1.0,
    cast_type=CastType.BOTH,
    include_scan_count=True
)

# Ergebnis anzeigen
print(result.head())


       prDM     t090C     t190C    c0mS/cm    c1mS/cm  sbeox0ML/L  sbeox1ML/L  \
0  1.191238  3.561413  3.561083  15.329939  15.329996    8.128437    8.128355   
1  1.982106  3.559567  3.559714  15.328845  15.329168    8.128128    8.125038   
2  2.965558  3.558415  3.558420  15.327312  15.327569    8.120852    8.124836   
3  3.980170  3.559311  3.559184  15.327844  15.328040    8.128944    8.127231   
4  5.009978  3.562772  3.562648  15.331412  15.332043    8.127640    8.128198   

   sbox0Mm/Kg  sbox1Mm/Kg  flECO-AFL  turbWETntu0       par        spar  \
0  358.580232  358.576611   1.354166     0.572110  8.704192  409.452625   
1  358.566847  358.430282   1.251336     0.549480  5.837184  480.982941   
2  358.246128  358.421628   1.263487     0.564210  4.199303  497.104884   
3  358.603352  358.527705   1.260914     0.568548  2.952731  490.847045   
4  358.545130  358.569652   1.257283     0.569900  2.077398  479.589783   

       timeS    dz/dtM      sal00      sal11  flag    nbin  
0

In [5]:
cnv = fh.CnvFile('seabird_example_data/cnv/basic_emb.cnv')
cnv.parameters['prDM'].data


array([ 1.149,  1.197,  1.197, ..., 23.563, 23.46 , 23.563], shape=(3067,))

In [6]:
cnv

C:\Users\lokadmin\Desktop\Aaron\SeabirdFileHandler\seabird_example_data\cnv\basic_emb.cnv

In [7]:
cnv.parameters['prDM'].data

array([ 1.149,  1.197,  1.197, ..., 23.563, 23.46 , 23.563], shape=(3067,))

In [8]:
def bin_average_array_test(
    cnv_file,
    bin_variable: str,
    bin_size: float
):
    control_data = cnv_file.parameters[bin_variable].data
    
    bin_min = np.floor(control_data.min() / bin_size) * bin_size
    bin_max = np.ceil(control_data.max() / bin_size) * bin_size
    bin_edges = np.arange(bin_min, bin_max + bin_size, bin_size)

    
    cnv.parameters['prDM'].data = binned_statistic( cnv.parameters['prDM'].data, cnv.parameters['t090C'].data, statistic='mean', bins=bin_edges)

In [9]:
bin_average_array_test(
    cnv_file=cnv,
    bin_variable = 'prDM',
    bin_size = 1
)
cnv.parameters['prDM'].data

BinnedStatisticResult(statistic=array([3.56140046, 3.55803111, 3.55902381, 3.56051512, 3.56439759,
       3.56647848, 3.56856301, 3.57177733, 3.58487808, 3.598688  ,
       3.63202394, 3.63989733, 3.65021053, 3.65282297, 3.67169863,
       3.67660667, 3.68735205, 3.71530267, 3.74810526, 3.75141781,
       3.75088421, 3.74999571, 3.74980582]), bin_edges=array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.]), binnumber=array([ 1,  1,  1, ..., 23, 23, 23], shape=(3067,)))

In [10]:
def bin_average_array(
    cnv_file,
    bin_variable: str,
    bin_size: float
) -> Dict[str, np.ndarray]:
    """
    Binning direkt mit CnvFile-Objekt
    """
    

    control_data = cnv_file.parameters[bin_variable].data
    
    # Bin-Edges
    bin_min = np.floor(control_data.min() / bin_size) * bin_size
    bin_max = np.ceil(control_data.max() / bin_size) * bin_size
    bin_edges = np.arange(bin_min, bin_max + bin_size, bin_size)
    
    result = {}
    
    # Alle verf端gbaren Variablen verarbeiten
    variables = []
    if hasattr(cnv_file, 'parameters'):
        variables = list(cnv_file.parameters.keys())
    elif hasattr(cnv_file, '_fields'):
        variables = cnv_file._fields
    
    for var_name in variables:
        try:
            # Daten holen (gleiche Methode wie oben)
            if hasattr(cnv_file, 'parameters') and hasattr(cnv_file.parameters[var_name], 'data'):
                data = cnv_file.parameters[var_name].data
            else:
                continue
                
            statistic, _, _ = binned_statistic(
                control_data, data, statistic='mean', bins=bin_edges
            )
            result[var_name] = statistic
        except (AttributeError, KeyError):
            continue
    
    # Bin-Mittelpunkte
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    result[bin_variable] = bin_centers
    
    return result

In [11]:
result = bin_average_array(
    cnv_file=cnv,
    bin_variable = 'prDM',
    bin_size = 1
)
result

AttributeError: 'BinnedStatisticResult' object has no attribute 'min'