# Data loading and structuring

The code in this notebook loads all the recordings from the Jarinos folder and saves all vocalizations separately in the vocalizations folder, organized per seal.

It also creates a DataFrame and a CSV file (`vocalizations_info.csv`) containing the metadata of each vocalization.

(Code by Marianne, July 2022)

In [None]:
import parselmouth
import tgt
import glob
import pandas as pd
import numpy as np
import os


In [None]:
# Directory the notebook is run from; used at the end of the notebook to build
# the location of the final CSV file.
# (The function is os.getcwd(); os.get_cwd() does not exist and raises AttributeError.)
project_folder = os.getcwd()
print(project_folder)
# Input folder: holds the Excel info sheet and the month folders with the recordings.
raw_data_folder = '../Jarinos/'
# Output folder: receives one sub-folder per seal with the extracted vocalizations.
processed_data_folder = '../vocalizations/'
# Months (as integers) whose recordings are processed.
months = [6,7]
# If True, intervals annotated "p" are extracted in addition to those annotated "u".
include_p_vocalizations = True

In [None]:
# Read the recording overview spreadsheet; its first column becomes the index.
info_sheet = pd.read_excel(f"{raw_data_folder}Seal_recs_summer17.xlsx", index_col=0)
# Collect the entries of the raw data folder whose name starts with "0" or "1"
# (the month folders).
month_folders = glob.glob(f"{raw_data_folder}/[01]*")

In [10]:
# Remove the empty columns (their names start with "Unnamed"). The explicit
# .copy() makes the result an independent frame, so the column assignment below
# never writes into a view of the original sheet.
info_sheet = info_sheet.loc[:, ~info_sheet.columns.str.contains('^Unnamed')].copy()
# Add the month column. The Date column is datetime-typed (the extraction loop
# further down also relies on its .dt accessor), so the month can be read for
# the whole column at once instead of row by row.
info_sheet['Month'] = info_sheet['Date'].dt.month

In [11]:
# Inspect the cleaned info sheet, which now includes the Month column.
info_sheet

Unnamed: 0,Folder,Filename,Date,Time,Human,Animal,Partners,Context,GoodExcerpt,D_0_min,...,D_6_min,D_9_min,LastKnownLocation,Species,Comments,Annotator,Annotation,Comments Praat (NNCO = Names Not Called Out),Multichannel,Month
3.0,29.0,ZOOM0002a,2017-05-29,7.0,Koen,Info,,,,,...,,,Q1,PV,Ella ID r17-192,K,-,,,5
3.0,29.0,ZOOM0003a,2017-05-29,7.0,Koen,r17-192,,,,,...,,,Q1,PV,,K,97.0,,,5
3.0,29.0,ZOOM0004a,2017-05-29,7.0,Koen,r17-192,,,Very clear,,...,,,Q1,PV,,K,5.0,,,5
3.0,29.0,ZOOM0006a,2017-05-29,7.0,Koen,r17-192,,,,,...,,,Q1,PV,,K,69.0,,,5
3.0,29.0,ZOOM0007a,2017-05-29,7.0,Koen,r17-192,,,,,...,,,Q1,PV,,K,37.0,,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63.0,28.0,ZOOM0007,2017-07-28,7.0,Andrea,r17-294,r17-295,,,3.0,...,4.0,4.0,J2,PV,,K,2018-11-25 00:00:00,,,7
63.0,28.0,ZOOM0008,2017-07-28,7.0,Andrea,r17-302,r17-303,,,1.0,...,2.0,2.0,J3,PV,Only one voc from 303,K,2017-01-03 00:00:00,,,7
63.0,28.0,ZOOM0024,2017-07-28,11.0,Marga,r17-288,r17-289,,,3.0,...,?,?,J8,PV,"Ra wasn't looking good, and did not vocalize",K,13-53,,,7
63.0,28.0,ZOOM0025,2017-07-28,11.0,Marga,r17-297,r17-299,,,4.0,...,4.0,4.0,J1,PV,Only 299 vocalized; one on the plateau; the ot...,K,1-95,Gain too high,,7


In [12]:
# Keep only the recordings that have an annotator entry and were made in one
# of the configured months.
has_annotator = info_sheet['Annotator'].notna()
in_selected_months = info_sheet['Month'].isin(months)
rec_selection = info_sheet[has_annotator & in_selected_months]

In [13]:
def get_month_folder(month_int, month_folders):
    """
    Return the one folder from month_folders that belongs to the given month.

    A folder belongs to a month when its name (the last path component) starts
    with the zero-padded month number, e.g. "06" for month 6. Matching on the
    folder name instead of searching the digit anywhere in the path keeps
    month 1 from also matching "10_...", "11_..." and "12_...", and keeps
    digits in parent folders from producing false matches.

    Parameters:
        month_int: month number as an integer.
        month_folders: iterable of folder paths to search.

    Returns:
        The matching folder path, exactly as it appears in month_folders.

    Raises:
        AssertionError: if no folder or more than one folder matches.
    """
    month_prefix = f"{month_int:02d}"
    # normpath drops a trailing separator so basename is never empty
    month_list = [f for f in month_folders
                  if os.path.basename(os.path.normpath(f)).startswith(month_prefix)]
    assert len(month_list) == 1, f"Missing or multiple folders for this month: {month_int}"
    return month_list[0]

In [22]:
# Scratch check: the 02d format pads to at least two digits and leaves longer numbers untouched.
f"{100:02d}"

'100'

In [14]:
def get_day_folder(day_int, month_folder, ignore_missing_folders=False):
    """
    Find the folder of a given day inside a month folder.

    The day is formatted as a two-digit string and compared with the last two
    characters of every entry found directly inside month_folder.

    Returns the matching path, or None when nothing matches and
    ignore_missing_folders is True. An AssertionError is raised when nothing
    matches and ignore_missing_folders is False, or when several entries match.
    """
    day_str = f"{day_int:02d}"
    matches = []
    for candidate in glob.glob(month_folder + '/*'):
        if candidate[-2:] == day_str:
            matches.append(candidate)

    # a missing folder is only tolerated when the caller asked for it
    if ignore_missing_folders and not matches:
        return None
    assert matches, f"Missing folder for this day: {month_folder}/{day_str}"
    assert len(matches) <= 1, f"Multiple folders for this day: {month_folder}/{day_str}"
    return matches[0]

In [15]:
def get_rec_folder(rec_str, day_folder, ignore_missing_folders=False):
    """
    Find the folder of a given recording inside a day folder.

    An entry found directly inside day_folder matches when its path ends with
    rec_str.

    Returns the matching path, or None when nothing matches and
    ignore_missing_folders is True. An AssertionError is raised when nothing
    matches and ignore_missing_folders is False, or when several entries match.
    """
    tail_length = len(rec_str)
    matches = []
    for candidate in glob.glob(day_folder + '/*'):
        if candidate[-tail_length:] == rec_str:
            matches.append(candidate)

    # a missing folder is only tolerated when the caller asked for it
    if ignore_missing_folders and not matches:
        return None
    assert matches, f"Missing folder for this recording: {day_folder}/{rec_str}"
    assert len(matches) <= 1, f"Multiple folders for this recording: {day_folder}/{rec_str}"
    return matches[0]

In [16]:
def get_rec_wav(rec_folder):
    """
    Return the path of the WAV file that belongs to a recording folder.

    Files ending in ".WAV" or ".wav" directly inside rec_folder are considered.
    If there are several, only those with "_transcoded" in their file name are
    kept.

    Raises:
        AssertionError: if this does not leave exactly one file.
    """
    found = glob.glob(rec_folder + '/*.WAV') + glob.glob(rec_folder + '/*.wav')
    # Where glob matches case-insensitively (e.g. on Windows) both patterns
    # return the same file; dict.fromkeys removes the duplicates and keeps order.
    wav_files = list(dict.fromkeys(found))
    if len(wav_files) > 1:
        # if there is a transcoded wav file, use that one; only the file name is
        # checked so that a parent folder containing "_transcoded" cannot match
        wav_files = [w for w in wav_files if '_transcoded' in os.path.basename(w)]
    assert len(wav_files) == 1, f"Missing or multiple WAV files for this recording: {rec_folder}"
    return wav_files[0]

In [17]:
def get_rec_textgrid(rec_folder):
    """
    Return the path of the TextGrid file that belongs to a recording folder.

    Files ending in ".TextGrid" directly inside rec_folder are considered. If
    there are several, only those with "_koen" in their file name are kept.

    Raises:
        AssertionError: if this does not leave exactly one file.
    """
    textgrid_files = glob.glob(rec_folder + '/*.TextGrid')
    if len(textgrid_files) > 1:
        # if there are multiple textgrid files, use the one named koen; only the
        # file name is checked so that a parent folder containing "_koen" cannot
        # make every file match
        textgrid_files = [t for t in textgrid_files if '_koen' in os.path.basename(t)]
    assert len(textgrid_files) == 1, f"Missing or multiple TextGrid files for this recording: {rec_folder}"
    return textgrid_files[0]

In [18]:
def check_folder(folder_path):
    """
    Create folder if it does not exist, return path.

    Missing parent folders are created as well, and an already existing folder
    is left untouched. If folder_path exists but is not a directory, the
    underlying OSError (FileExistsError) is raised.
    """
    # makedirs with exist_ok=True replaces the separate isdir check followed by
    # mkdir: it also handles nested paths whose parents do not exist yet.
    os.makedirs(folder_path, exist_ok=True)
    return folder_path

In [23]:
%%time
# looping over all data and structuring information
# this cell takes a while (maybe a few mins) and produces a *lot* of warnings about clipping

# Make sure the output root exists before anything is written into it.
processed_data_folder = check_folder(processed_data_folder)

# Column-wise metadata store: every saved vocalization adds one value per key.
vocalizations_dict = {
    column: []
    for column in ('SealID', 'Filepath', 'Duration', 'VocQuality', 'SourceRecPath',
                   'StartTime', 'EndTime', 'RecDate', 'RecTime', 'RecDuration',
                   'RecHuman', 'RecAnimal', 'RecPartners', 'RecLastKnownLocation',
                   'RecSpecies')
}
# Info-sheet entries for which no folder exists in the data set.
missing_days, missing_recordings = [], []

for month in months:

    # folder of this month in the data set
    month_folder = get_month_folder(month, month_folders)
    in_month = rec_selection['Month'] == month

    # days of this month according to the info sheet
    for day in np.unique(rec_selection[in_month]['Folder']):

        # days listed in the info sheet but absent from the data set are
        # recorded and skipped
        day = int(day)
        day_folder = get_day_folder(day, month_folder, ignore_missing_folders=True)
        if not day_folder:
            missing_days.append(day)
            continue

        on_day = in_month & (rec_selection['Folder'] == day)

        # recording names of this day according to the info sheet
        for rec in np.unique(rec_selection[on_day]['Filename']):

            # skip recordings marked as lost or deleted
            if any(marker in rec for marker in ('lost', 'deleted')):
                continue

            # recordings listed in the info sheet but absent from the data set
            # are recorded and skipped
            rec_folder = get_rec_folder(rec, day_folder, ignore_missing_folders=True)
            if not rec_folder:
                missing_recordings.append(rec)
                continue

            # load the audio and the annotation tiers of this recording
            rec_wav = get_rec_wav(rec_folder)
            rec_sound = parselmouth.Sound(rec_wav)
            textgrid_path = get_rec_textgrid(rec_folder)
            rec_tg = tgt.io.read_textgrid(textgrid_path, encoding='utf-8')

            # the info sheet must describe this recording in exactly one row
            rec_info = rec_selection[on_day & (rec_selection['Filename'] == rec)]
            assert len(rec_info) == 1, f"Multiple rows in info sheet for recording {rec} on {day}/{month}"

            # metadata shared by all vocalizations of this recording
            rec_metadata = {
                'RecDate': str(rec_info['Date'].dt.strftime('%Y%m%d').values[0]),
                'RecTime': rec_info['Time'].values[0],
                'RecDuration': rec_sound.get_total_duration(),
                'RecHuman': rec_info['Human'].values[0],
                'RecAnimal': rec_info['Animal'].values[0],
                'RecPartners': rec_info['Partners'].values[0],
                'RecLastKnownLocation': rec_info['LastKnownLocation'].values[0],
                'RecSpecies': rec_info['Species'].values[0],
            }

            # each TextGrid tier is named after a seal
            for tier in rec_tg.tiers:
                seal = tier.name

                # output folder of this seal and its annotated intervals
                seal_folder = check_folder(processed_data_folder + seal + '/')
                u_annotations = tier.get_annotations_with_text(pattern="u")
                p_annotations = tier.get_annotations_with_text(pattern="p")
                annotation_groups = ([u_annotations, p_annotations]
                                     if include_p_vocalizations else [u_annotations])

                for intervals in annotation_groups:

                    # the running index restarts for every annotation group
                    for intv_idx, intv in enumerate(intervals):
                        start, end = intv.start_time, intv.end_time
                        voc_quality = intv.text
                        voc_sound = rec_sound.extract_part(from_time=start, to_time=end)

                        # write the excerpt to its own WAV file
                        file_stem = '_'.join([voc_quality, seal, rec_metadata['RecDate'],
                                              rec, f"{intv_idx:03d}"])
                        voc_filepath = seal_folder + file_stem + '.WAV'
                        voc_sound.save(voc_filepath, 'WAV')

                        # append one value to every metadata column
                        voc_metadata = {
                            'SealID': seal,
                            'Filepath': voc_filepath,
                            'Duration': voc_sound.get_total_duration(),
                            'VocQuality': voc_quality,
                            'SourceRecPath': rec_wav,
                            'StartTime': start,
                            'EndTime': end,
                            **rec_metadata,
                        }
                        for column, value in voc_metadata.items():
                            vocalizations_dict[column].append(value)

Advice: you could scale the amplitudes or write to a binary file.
Advice: you could scale the amplitudes or write to a binary file.
Advice: you could scale the amplitudes or write to a binary file.
Advice: you could scale the amplitudes or write to a binary file.
Advice: you could scale the amplitudes or write to a binary file.
Advice: you could scale the amplitudes or write to a binary file.
Advice: you could scale the amplitudes or write to a binary file.
Advice: you could scale the amplitudes or write to a binary file.
Advice: you could scale the amplitudes or write to a binary file.
Advice: you could scale the amplitudes or write to a binary file.
Advice: you could scale the amplitudes or write to a binary file.
Advice: you could scale the amplitudes or write to a binary file.
Advice: you could scale the amplitudes or write to a binary file.
Advice: you could scale the amplitudes or write to a binary file.
Advice: you could scale the amplitudes or write to a binary file.
Advice: yo

KeyboardInterrupt: 

In [None]:
# Turn the column-wise metadata into a table with one row per saved vocalization.
vocalizations_df = pd.DataFrame(vocalizations_dict)
# Report how much was extracted and how many info-sheet entries had no folder.
print(
    f'Saved {len(vocalizations_df)} vocalizations.',
    f'(Skipped {len(missing_days)} days and {len(missing_recordings)} recordings from the info sheet)',
    sep='\n',
)

Saved 24756 vocalizations.
(Skipped 10 days and 370 recordings from the info sheet)


In [15]:
# Show the metadata table (one row per saved vocalization).
vocalizations_df

Unnamed: 0,SealID,Filepath,Duration,VocQuality,SourceRecPath,StartTime,EndTime,RecDate,RecTime,RecDuration,RecHuman,RecAnimal,RecPartners,RecLastKnownLocation,RecSpecies
0,r17-207,../data/vocalizations/r17-207/u_r17-207_201706...,0.892997,u,../data/Jarinos/06_june/11/ZOOM0002a/ZOOM0002_...,218.598254,219.491251,20170611,21,909.989333,Andrea,r17-207,r17-208,Q7,PV
1,r17-207,../data/vocalizations/r17-207/u_r17-207_201706...,0.485665,u,../data/Jarinos/06_june/11/ZOOM0002a/ZOOM0002_...,224.817901,225.303566,20170611,21,909.989333,Andrea,r17-207,r17-208,Q7,PV
2,r17-207,../data/vocalizations/r17-207/p_r17-207_201706...,0.541039,p,../data/Jarinos/06_june/11/ZOOM0002a/ZOOM0002_...,155.432636,155.973676,20170611,21,909.989333,Andrea,r17-207,r17-208,Q7,PV
3,r17-207,../data/vocalizations/r17-207/p_r17-207_201706...,0.459883,p,../data/Jarinos/06_june/11/ZOOM0002a/ZOOM0002_...,157.044934,157.504817,20170611,21,909.989333,Andrea,r17-207,r17-208,Q7,PV
4,r17-207,../data/vocalizations/r17-207/p_r17-207_201706...,0.811677,p,../data/Jarinos/06_june/11/ZOOM0002a/ZOOM0002_...,192.323886,193.135563,20170611,21,909.989333,Andrea,r17-207,r17-208,Q7,PV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24751,r17-289,../data/vocalizations/r17-289/p_r17-289_201707...,0.523914,p,../data/Jarinos/07_july/28/ZOOM0024/ZOOM0024_T...,536.225646,536.749560,20170728,11,648.888000,Marga,r17-288,r17-289,J8,PV
24752,r17-289,../data/vocalizations/r17-289/p_r17-289_201707...,0.567574,p,../data/Jarinos/07_july/28/ZOOM0024/ZOOM0024_T...,544.564613,545.132187,20170728,11,648.888000,Marga,r17-288,r17-289,J8,PV
24753,r17-289,../data/vocalizations/r17-289/p_r17-289_201707...,0.622148,p,../data/Jarinos/07_july/28/ZOOM0024/ZOOM0024_T...,546.823993,547.446141,20170728,11,648.888000,Marga,r17-288,r17-289,J8,PV
24754,r17-289,../data/vocalizations/r17-289/p_r17-289_201707...,0.729156,p,../data/Jarinos/07_july/28/ZOOM0024/ZOOM0024_T...,580.645500,581.374657,20170728,11,648.888000,Marga,r17-288,r17-289,J8,PV


In [16]:
# Write the metadata table next to the extracted vocalizations.
vocalizations_df.to_csv(f"{processed_data_folder}vocalizations_info.csv")

In [None]:
# NOTE: `data` is a second name for vocalizations_df (no copy is made), so the
# path columns of vocalizations_df are rewritten as well.
data = vocalizations_df


def _strip_parent_prefix(path, prefix='../'):
    """Return path without a leading '../'; other paths are returned unchanged."""
    return path[len(prefix):] if path.startswith(prefix) else path


# Drop the leading '../' from the stored paths. Checking for the prefix (instead
# of always cutting three characters) keeps the paths intact when this cell is
# run more than once.
data['Filepath'] = [_strip_parent_prefix(line) for line in data['Filepath']]
data['SourceRecPath'] = [_strip_parent_prefix(line) for line in data['SourceRecPath']]

# From here on the output folder is located below the project folder.
# NOTE(review): this folder is not created here — confirm it exists before running.
processed_data_folder = os.path.join(project_folder,'data/vocalizations/')

data.to_csv(processed_data_folder + 'vocalizations_info.csv')