# Data Fetching Code

Helper code that automates downloading archived recordings from LiveATC (they come as 1-hour recordings) and splitting them into shorter clips suitable for training by removing silence.

In [None]:
import os
import requests
from datetime import datetime

def download_liveatc_audio(feed, date, time, directory='audio/raw', timeout=30):
    """
    Download one archived recording from LiveATC and save it locally.

    The remote file is requested from
    ``https://archive.liveatc.net/{feed}-{date}-{time}.mp3`` and stored in
    `directory` under the last path component of that name.

    Args:
    feed (str): The feed identifier, optionally prefixed with the archive
        folder (e.g., 'katl/KATL-Twr-10-28').
    date (str or datetime): The date exactly as it appears in the archive
        file name (e.g., 'Feb-12-2024'), or a datetime object, which is
        formatted with '%b-%d-%Y'.
    time (str): The time part of the archive file name (e.g., '0000Z' for midnight).
    directory (str): The directory to save the audio file to.
    timeout (float): Seconds to wait for the server before giving up.

    Returns:
    str or None: Path of the saved file, or None when the download failed
    (the error is printed rather than raised).
    """
    # If the date is a datetime object, format it as a string.
    # The month abbreviation keeps its capital letter because the archive
    # names used elsewhere in this file look like '...-Feb-12-2024-1400Z'.
    # NOTE: '%b' follows the current locale; an English locale is assumed.
    if isinstance(date, datetime):
        date = date.strftime('%b-%d-%Y')

    # Create the URL for the audio file
    remote_name = f"{feed}-{date}-{time}.mp3"
    url = f"https://archive.liveatc.net/{remote_name}"

    # Create the directory if it does not exist
    os.makedirs(directory, exist_ok=True)

    # `feed` may carry a folder prefix ('katl/...'); keep only the file name
    # so the download lands directly in `directory`.
    local_filename = os.path.join(directory, remote_name.rsplit('/', 1)[-1])

    # Stream into a temporary name first so an interrupted transfer never
    # leaves a truncated file under the final name.
    partial_filename = local_filename + '.part'

    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_filename, local_filename)
        print(f"Downloaded {local_filename}")
        return local_filename
    except requests.exceptions.HTTPError as err:
        print(f"HTTP Error: {err}")
    except Exception as e:
        # Best-effort helper: report the problem instead of raising.
        print(f"Error: {e}")

    # Only reached on failure: discard whatever was partially written.
    try:
        os.remove(partial_filename)
    except OSError:
        pass
    return None

In [8]:
import requests
# download_liveatc_audio('katl/KATL-Twr-10-28', datetime(2024, 2, 9), '0000Z')
# Download a single recording directly from its archive URL.
url = "https://archive.liveatc.net/katl/KATL-Twr-10-28-Feb-12-2024-1400Z.mp3"

# Name the local copy after the remote file so the date in the file name
# always matches the recording that was actually downloaded, and build the
# path with os.path.join so it works on every platform.
raw_dir = os.path.join('audio', 'raw')
os.makedirs(raw_dir, exist_ok=True)
local_filename = os.path.join(raw_dir, url.rsplit('/', 1)[-1])

# Stream the response to disk in 8 KiB chunks; the timeout keeps the cell
# from hanging forever if the server stops responding.
with requests.get(url, stream=True, timeout=30) as r:
    r.raise_for_status()
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)

In [41]:
from pydub import AudioSegment, silence

def preprocess_and_save_segments(file_path, output_dir, silence_thresh=-40, min_silence_len=500, keep_silence=100, min_segment_len=2000):
    """
    Split a raw MP3 recording on silence and save the long-enough pieces as WAV files.

    Each saved piece is written to ``<output_dir>/segment_<i>.wav`` where
    ``i`` is the index of the chunk among all detected chunks, so the
    numbering has gaps wherever a chunk was too short to be saved.

    Args:
    file_path (str): Path to the raw MP3 audio file.
    output_dir (str): Directory where the segments will be saved; created if missing.
    silence_thresh (int): The threshold (in dBFS) below which audio is considered silent.
    min_silence_len (int): The minimum length (in ms) for a silence to be considered.
    keep_silence (int): Amount of silence (in ms) to leave at the beginning and end of each non-silent chunk.
    min_segment_len (int): The minimum length (in ms) for a segment to be saved as a separate file.

    Returns:
    int: The number of non-silent chunks detected. This counts every chunk,
    including those shorter than `min_segment_len` that were not saved.
    """
    # Decode the MP3 recording into a pydub AudioSegment
    audio_segment = AudioSegment.from_mp3(file_path)

    # Detect non-silent chunks
    non_silence_chunks = silence.split_on_silence(
        audio_segment,
        silence_thresh=silence_thresh,
        min_silence_len=min_silence_len,
        keep_silence=keep_silence
    )

    # Create the output directory up front so exporting does not fail on a
    # missing folder (an empty `output_dir` means the current directory).
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save non-silent chunks that meet the minimum length requirement
    for i, chunk in enumerate(non_silence_chunks):
        if len(chunk) >= min_segment_len:
            segment_path = os.path.join(output_dir, f"segment_{i}.wav")
            chunk.export(segment_path, format="wav")
    return len(non_silence_chunks)

# Example usage: split one previously downloaded recording into segments.
# Point `local_filename` at the raw ATC recording to process; the segments
# are written to a folder named after that recording.
local_filename = os.path.join(
    'audio',
    'raw',
    'KATL-Twr-All-Feb-16-2024-1430Z.mp3',
)
output_dir = os.path.join(
    'audio',
    'segments',
    'KATL-Twr-All-Feb-16-2024-1430Z',
)
# Left as a bare expression so the notebook displays the returned chunk count.
preprocess_and_save_segments(file_path=local_filename, output_dir=output_dir)

237

In [23]:
import requests

def download_and_process(airport, target, timeout=30):
    """
    Download one archived LiveATC recording and split it into segments.

    The recording is fetched from
    ``https://archive.liveatc.net/{airport}/{target}.mp3``, stored as
    ``audio/raw/{target}.mp3`` and then passed to
    `preprocess_and_save_segments`, which writes its output to
    ``audio/segments/{target}``.

    Args:
    airport (str): Archive folder of the airport (e.g., 'katl').
    target (str): Recording name without extension
        (e.g., 'KATL-Twr-10-28-Feb-12-2024-1400Z').
    timeout (float): Seconds to wait for the server before giving up.

    Returns:
    The value returned by `preprocess_and_save_segments` for this recording.

    Raises:
    requests.exceptions.HTTPError: If the server answers with an error status.
    """
    url = f"https://archive.liveatc.net/{airport}/{target}.mp3"

    # Make sure the folder for raw downloads exists before writing into it.
    raw_dir = os.path.join('audio', 'raw')
    os.makedirs(raw_dir, exist_ok=True)
    local_filename = os.path.join(raw_dir, f'{target}.mp3')

    # Stream the recording to disk in 8 KiB chunks.
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

    # exist_ok=True lets the same target be processed again; without it a
    # re-run would download the whole file and only then fail on the
    # already existing segment folder.
    segment_dir = os.path.join('audio', 'segments', target)
    os.makedirs(segment_dir, exist_ok=True)
    return preprocess_and_save_segments(local_filename, segment_dir)

In [32]:
# Single-recording alternative:
# download_and_process('katl', 'KATL-Twr-10-28-Feb-16-2024-1400Z')

# Build one target per half hour for the hours 13 and 14.
# The hour is zero-padded so the pattern stays valid for hours below 10,
# and the loop variables avoid shadowing the builtin `min`.
targets = [
    f'KATL-Twr-All-Feb-16-2024-{hour:02d}{minute}Z'
    for hour in range(13, 15)
    for minute in ('00', '30')
]

# Download and segment each recording, reporting the count returned for it.
for t in targets:
    n = download_and_process('katl', t)
    print(f'{t}: {n}')

KATL-Twr-All-Feb-16-2024-1300Z: 173
KATL-Twr-All-Feb-16-2024-1330Z: 230
KATL-Twr-All-Feb-16-2024-1400Z: 271


  y, sr = librosa.load(file_path, sr=None)


NoBackendError: 

In [48]:
import pathlib
# Collect every exported segment below audio/segments.
path = pathlib.Path('audio') / 'segments'
# rglob yields entries in no guaranteed order; sorting makes `files[0]`
# refer to the same file on every run.
files = sorted(path.rglob('*.wav'))
print(len(files))

# Load the first segment at its native sampling rate as a quick check.
# NOTE(review): librosa is not imported in the visible cells — confirm it
# is imported before this cell runs.
audio, _ = librosa.load(files[0], sr=None)

1166
