In [3]:
from pathlib import Path

import pandas as pd

In [59]:
sample_file = Path('../data/sample-data.txt')
df = pd.read_csv(sample_file, sep='\t', index_col='Selection', skiprows=lambda x: x % 2 !=0)
print(df.columns)

Index(['View', 'Channel', 'Low Freq (Hz)', 'High Freq (Hz)', 'Begin File',
       'Delta Time (s)', 'File Offset (s)', 'call variant', 'level',
       'calls overlap', 'call cutoff @ end?', 'Notes'],
      dtype='object')


In [123]:
sample_file = Path('../data/sample-data.txt')

# unable to use "usecols" kwarg for read_csv
drop_cols = [
    'Selection',  # file order into Raven software
    'View',  # spectrogram or wave
    'Channel',  # only use 1 channel
    'Notes',  # skipping for now
    'Low Freq (Hz)', # 0
#    'High Freq (Hz)'  # need for spectograms
]


rename_cols = {
    'Begin File': 'file',
    'Delta Time (s)': 'call_length',
    'File Offset (s)': 'start_time',
    'call variant': 'call_variant',
    'level': 'signal_level',
    'calls overlap': 'call_overlap',
    'call cutoff @ end?': 'call_cutoff'
}

# should be bool, but i don't know what the number mean
# can't use in read_csv because there are NaNs
dtypes = {
    'calls overlap': int,
    'call cutoff': int
}


df = (pd.read_csv(
    sample_file,
    sep='\t',
    skiprows=lambda x: x % 2 != 0,
)
    .drop(columns=drop_cols)
    .rename(columns=rename_cols)
    .dropna()
)
df['call_overlap'] = df['call_overlap'].astype(int)
df['call_cutoff'] = df['call_cutoff'].astype(int)
df

Unnamed: 0,High Freq (Hz),file,call_length,start_time,call_variant,signal_level,call_overlap,call_cutoff
0,238.7,0902.DSG_RAWD_HMS_15_ 0_ 0__DMY_ 9_ 8_15.wav,0.2304,7.0268,3,1.0,1,2
1,182.3,0903.DSG_RAWD_HMS_15_ 5_ 0__DMY_ 9_ 8_15.wav,6.6072,1.2648,1,2.0,2,2
2,171.4,0904.DSG_RAWD_HMS_15_10_ 0__DMY_ 9_ 8_15.wav,1.2507,0.0117,1,1.0,2,1
3,156.2,0904.DSG_RAWD_HMS_15_10_ 0__DMY_ 9_ 8_15.wav,3.3077,6.2898,2,2.0,2,2
4,247.4,0905.DSG_RAWD_HMS_15_15_ 0__DMY_ 9_ 8_15.wav,0.1810,5.1520,3,1.0,1,2
...,...,...,...,...,...,...,...,...
180,253.6,1064.DSG_RAWD_HMS_ 4_30_ 0__DMY_10_ 8_15.wav,2.4611,5.7677,1,1.0,2,2
181,218.9,1065.DSG_RAWD_HMS_ 4_35_ 0__DMY_10_ 8_15.wav,0.2618,6.3764,3,1.0,1,2
182,202.3,1065.DSG_RAWD_HMS_ 4_35_ 0__DMY_10_ 8_15.wav,2.8509,5.1429,1,3.0,1,2
183,194.7,1066.DSG_RAWD_HMS_ 4_40_ 0__DMY_10_ 8_15.wav,2.2342,0.0206,1,1.0,2,2


In [128]:
def read_annotation_file(annotation: Path) -> pd.DataFrame:
    """Given path to BlackGrouper annotation file, return annotations as a DataFrame.
    
    
    Parameters
    ----------
    annotation_file: Path
        Path to annotation file
        
    Returns
    -------
    annotation: pd.DataFrame
        DataFrame of selected annotation data
        
    Notes
    -----
    - This function drops columns of annotated data not required for development of Black Grouper classifier.
    - Bool values for `calls overlap` and `call cutoff @ end?` determined from metadata describedin 
      https://docs.google.com/spreadsheets/d/16NzrodSu2MhiBPzKBrQKCi923lWDKvzT/edit#gid=616126176
        - Call variants: 6 variants
        - Level: 3 levels of relative amplitude (1: high, 2: medium, 3: low)
        - Overlap: Do calls overlap (1: yes, 2: no)
        - Cutoff: Are calls cutoff at end (1: yes, 2: no)
    """
    # unable to use "usecols" kwarg for read_csv
    drop_cols = [
        'Selection',
        'View',
        'Channel',
        'Notes',
        'Low Freq (Hz)',
    ]


    rename_cols = {
        'Begin File': 'file',
        'Delta Time (s)': 'call_length',
        'File Offset (s)': 'start_time',
        'call variant': 'call_variant',
        'level': 'signal_level',
        'calls overlap': 'call_overlap',
        'call cutoff @ end?': 'call_cutoff',
        'High Freq (Hz)': 'high_freq'
    }


    df = (pd.read_csv(
        sample_file,
        sep='\t',
        skiprows=lambda x: x % 2 != 0,
    )
        .drop(columns=drop_cols)
        .rename(columns=rename_cols)
        .dropna()
    )
    df['signal_level'] = df['signal_level'].astype(int)
    # 1 == True
    # 2 == False
    df['call_overlap'] = df['call_overlap'].apply(lambda x: bool(x % 2))
    df['call_cutoff'] = df['call_cutoff'].apply(lambda x: bool(x % 2))
    
    return df

In [129]:
read_annotation_file(sample_file)

Unnamed: 0,high_freq,file,call_length,start_time,call_variant,signal_level,call_overlap,call_cutoff
0,238.7,0902.DSG_RAWD_HMS_15_ 0_ 0__DMY_ 9_ 8_15.wav,0.2304,7.0268,3,1,True,False
1,182.3,0903.DSG_RAWD_HMS_15_ 5_ 0__DMY_ 9_ 8_15.wav,6.6072,1.2648,1,2,False,False
2,171.4,0904.DSG_RAWD_HMS_15_10_ 0__DMY_ 9_ 8_15.wav,1.2507,0.0117,1,1,False,True
3,156.2,0904.DSG_RAWD_HMS_15_10_ 0__DMY_ 9_ 8_15.wav,3.3077,6.2898,2,2,False,False
4,247.4,0905.DSG_RAWD_HMS_15_15_ 0__DMY_ 9_ 8_15.wav,0.1810,5.1520,3,1,True,False
...,...,...,...,...,...,...,...,...
180,253.6,1064.DSG_RAWD_HMS_ 4_30_ 0__DMY_10_ 8_15.wav,2.4611,5.7677,1,1,False,False
181,218.9,1065.DSG_RAWD_HMS_ 4_35_ 0__DMY_10_ 8_15.wav,0.2618,6.3764,3,1,True,False
182,202.3,1065.DSG_RAWD_HMS_ 4_35_ 0__DMY_10_ 8_15.wav,2.8509,5.1429,1,3,True,False
183,194.7,1066.DSG_RAWD_HMS_ 4_40_ 0__DMY_10_ 8_15.wav,2.2342,0.0206,1,1,False,False


In [130]:
bermuda_files = list(Path('/Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015').glob('*.txt'))
bermuda_files.sort()

In [131]:
dfs = []
for f in bermuda_files:
    print(f'Processing {f}')
    dfs.append(read_annotation_file(f))


Processing /Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015/Bermuda000-1551-1600.DSG_RAWD_HMS_21_5_0__DMY_11_8_15.Table.1.selections.txt
Processing /Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015/Bermuda000-1601-1650.DSG_RAWD_HMS_21_5_0__DMY_11_8_15.Table.1.selections.txt
Processing /Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015/Bermuda000-1651-1700.DSG_RAWD_HMS_21_5_0__DMY_11_8_15.Table.1.selections.txt
Processing /Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015/Bermuda000-1701-1750.DSG_RAWD_HMS_21_5_0__DMY_11_8_15.Table.1.selections.txt
Processing /Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015/Bermuda001-0101-0150.DSG_RAWD_HMS_21_5_0__DMY_11_8_15.Table.1.selections.txt
Processing /Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015/Bermuda001-0151-0200.DSG_RAWD_HMS_21_5_0__DMY_11_8_15.Table.1.selections.txt
Processing /Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015/Bermuda001-02

Processing /Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015/Bermuda008-2001-2250.DSG_RAWD_HMS_21_5_0__DMY_11_8_15.Table.1.selections.txt
Processing /Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015/Bermuda009-0251-0500.DSG_RAWD_HMS_21_5_0__DMY_11_8_15.Table.1.selections.txt
Processing /Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015/Bermuda009-0501-0750.DSG_RAWD_HMS_21_5_0__DMY_11_8_15.Table.1.selections.txt
Processing /Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015/Bermuda009-0751-1000.DSG_RAWD_HMS_21_5_0__DMY_11_8_15.Table.1.selections.txt
Processing /Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015/Bermuda009-1001-1250.DSG_RAWD_HMS_21_5_0__DMY_11_8_15.Table.1.selections.txt
Processing /Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015/Bermuda009-1751-2000.DSG_RAWD_HMS_21_5_0__DMY_11_8_15.Table.1.selections.txt
Processing /Users/jesse/axiom/src/black-grouper-analysis/data/bermuda-2015/Bermuda009-35

In [133]:
annotations = pd.concat(dfs)
annotations

Unnamed: 0,high_freq,file,call_length,start_time,call_variant,signal_level,call_overlap,call_cutoff
0,238.7,0902.DSG_RAWD_HMS_15_ 0_ 0__DMY_ 9_ 8_15.wav,0.2304,7.0268,3,1,True,False
1,182.3,0903.DSG_RAWD_HMS_15_ 5_ 0__DMY_ 9_ 8_15.wav,6.6072,1.2648,1,2,False,False
2,171.4,0904.DSG_RAWD_HMS_15_10_ 0__DMY_ 9_ 8_15.wav,1.2507,0.0117,1,1,False,True
3,156.2,0904.DSG_RAWD_HMS_15_10_ 0__DMY_ 9_ 8_15.wav,3.3077,6.2898,2,2,False,False
4,247.4,0905.DSG_RAWD_HMS_15_15_ 0__DMY_ 9_ 8_15.wav,0.1810,5.1520,3,1,True,False
...,...,...,...,...,...,...,...,...
180,253.6,1064.DSG_RAWD_HMS_ 4_30_ 0__DMY_10_ 8_15.wav,2.4611,5.7677,1,1,False,False
181,218.9,1065.DSG_RAWD_HMS_ 4_35_ 0__DMY_10_ 8_15.wav,0.2618,6.3764,3,1,True,False
182,202.3,1065.DSG_RAWD_HMS_ 4_35_ 0__DMY_10_ 8_15.wav,2.8509,5.1429,1,3,True,False
183,194.7,1066.DSG_RAWD_HMS_ 4_40_ 0__DMY_10_ 8_15.wav,2.2342,0.0206,1,1,False,False


In [134]:
annotations.high_freq.max()

402.0

In [135]:
annotations.to_csv(
    'annotations.csv',
    columns=[
        'file',
        'call_length',
        'start_time',
        'call_variant',
        'signal_level',
        'call_overlap',
        'call_cutoff'
    ],
    index=False,
)

In [136]:
!ls

annotations.csv  dev_parser.ipynb


In [137]:
df_test = pd.read_csv('annotations.csv')

In [138]:
df_test

Unnamed: 0,file,call_length,start_time,call_variant,signal_level,call_overlap,call_cutoff
0,0902.DSG_RAWD_HMS_15_ 0_ 0__DMY_ 9_ 8_15.wav,0.2304,7.0268,3,1,True,False
1,0903.DSG_RAWD_HMS_15_ 5_ 0__DMY_ 9_ 8_15.wav,6.6072,1.2648,1,2,False,False
2,0904.DSG_RAWD_HMS_15_10_ 0__DMY_ 9_ 8_15.wav,1.2507,0.0117,1,1,False,True
3,0904.DSG_RAWD_HMS_15_10_ 0__DMY_ 9_ 8_15.wav,3.3077,6.2898,2,2,False,False
4,0905.DSG_RAWD_HMS_15_15_ 0__DMY_ 9_ 8_15.wav,0.1810,5.1520,3,1,True,False
...,...,...,...,...,...,...,...
15635,1064.DSG_RAWD_HMS_ 4_30_ 0__DMY_10_ 8_15.wav,2.4611,5.7677,1,1,False,False
15636,1065.DSG_RAWD_HMS_ 4_35_ 0__DMY_10_ 8_15.wav,0.2618,6.3764,3,1,True,False
15637,1065.DSG_RAWD_HMS_ 4_35_ 0__DMY_10_ 8_15.wav,2.8509,5.1429,1,3,True,False
15638,1066.DSG_RAWD_HMS_ 4_40_ 0__DMY_10_ 8_15.wav,2.2342,0.0206,1,1,False,False


In [141]:
# 6205: Clean calls
df_test.query('call_overlap==False and call_cutoff==False')

Unnamed: 0,file,call_length,start_time,call_variant,signal_level,call_overlap,call_cutoff
1,0903.DSG_RAWD_HMS_15_ 5_ 0__DMY_ 9_ 8_15.wav,6.6072,1.2648,1,2,False,False
3,0904.DSG_RAWD_HMS_15_10_ 0__DMY_ 9_ 8_15.wav,3.3077,6.2898,2,2,False,False
5,0907.DSG_RAWD_HMS_15_25_ 0__DMY_ 9_ 8_15.wav,2.4684,1.7079,1,2,False,False
6,0907.DSG_RAWD_HMS_15_25_ 0__DMY_ 9_ 8_15.wav,0.5266,10.2075,4,1,False,False
7,0909.DSG_RAWD_HMS_15_35_ 0__DMY_ 9_ 8_15.wav,2.7729,2.9620,1,1,False,False
...,...,...,...,...,...,...,...
15633,1063.DSG_RAWD_HMS_ 4_25_ 0__DMY_10_ 8_15.wav,0.2385,7.4049,3,1,False,False
15634,1064.DSG_RAWD_HMS_ 4_30_ 0__DMY_10_ 8_15.wav,0.2560,5.2790,3,1,False,False
15635,1064.DSG_RAWD_HMS_ 4_30_ 0__DMY_10_ 8_15.wav,2.4611,5.7677,1,1,False,False
15638,1066.DSG_RAWD_HMS_ 4_40_ 0__DMY_10_ 8_15.wav,2.2342,0.0206,1,1,False,False


In [142]:
df_test.call_variant.unique()

array([3, 1, 2, 4, 5, 6])

In [144]:
for call in range(1, 7):
    call_df = df_test.query(f'call_overlap==False and call_cutoff==False and call_variant=={call}')
    print(f'Call {call} has {len(call_df)} clean samples')

Call 1 has 2465 clean samples
Call 2 has 1360 clean samples
Call 3 has 2210 clean samples
Call 4 has 170 clean samples
Call 5 has 0 clean samples
Call 6 has 0 clean samples
