In [1]:
import requests
import re
import os
import math
import aiohttp
import asyncio
from tqdm.notebook import tqdm


# Sources used
GBIF.org (11 May 2023) GBIF Occurrence Download https://doi.org/10.15468/dl.m42ea5

In [2]:
def Find(string):
    """Return every URL-like substring of ``string``, in order of appearance.

    The pattern accepts ``http(s)://`` URLs, ``www.`` hosts and bare
    ``domain.tld/`` forms, allows balanced parentheses inside the URL and
    refuses to end a match on trailing punctuation.

    Returns a list of strings; the list is empty when nothing matches.
    """
    # Capture group 1 wraps the complete URL; the inner groups only exist to
    # express the nested-parentheses alternatives.
    url_pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    return [match.group(1) for match in re.finditer(url_pattern, string)]

In [3]:
async def download_and_save_audio(audio_url,filepath,session):
    """Fetch ``audio_url`` through ``session`` and store the body at ``filepath``.

    Parameters:
        audio_url: URL of the file to download.
        filepath:  destination path; when it already exists nothing is fetched.
        session:   object whose ``get(url, ssl=False)`` returns an async context
                   manager yielding a response with ``raise_for_status()`` and
                   an awaitable ``read()`` (e.g. an ``aiohttp.ClientSession``).

    Returns:
        A status string: "Path Already Exist...", "finished download for..."
        or "got error from downloading...". Exceptions are never propagated;
        they are folded into the returned message.

    The body is read completely and written to ``filepath + ".part"`` before
    being moved into place, so a failed or interrupted download never leaves a
    truncated file that a later run would mistake for a finished one.
    """
    if os.path.exists(filepath):
        return "Path Already Exist\t " + filepath
    partial_path = filepath + ".part"
    try:
        # NOTE(review): ssl=False turns off TLS certificate verification. It is
        # kept because the original relied on it; confirm it is really needed.
        async with session.get(audio_url,ssl=False) as resp:
            # Reject 4xx/5xx answers instead of saving an error page as audio.
            resp.raise_for_status()
            payload = await resp.read()
        with open(partial_path, "wb") as out:
            out.write(payload)
        # Publish the finished file in one step.
        os.replace(partial_path, filepath)
        return "finished download for\t" + audio_url
    except Exception as e:
        # Best-effort cleanup of a half-written temporary file.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
        return "got error from downloading: \t" + audio_url + "\n error: " + str(e)
    

In [4]:
async def download(lines,folder):
    """Download every audio file referenced in ``lines`` into ``folder + '/geluiden/'``.

    Parameters:
        lines:  iterable of text lines; each is scanned with ``Find`` and only
                the first URL found on a line is used.
        folder: directory under which the ``geluiden`` sub-directory receives
                the files (created when at least one download is queued).

    For URLs not containing 'biocase.zfmk.de' the query string is dropped and
    the file name is the last path segment; for biocase URLs the file name is
    whatever follows the last ``guid=``. Only mp3/wav/ogg/flac/m4a files
    (extension compared case-insensitively) are downloaded. URLs without any
    extension are printed as "failed url: ...". Download errors reported by
    ``download_and_save_audio`` are written next to the progress bar.

    Returns None.
    """
    audio_types = {"mp3", "wav", "ogg", "flac", "m4a"}
    target_dir = folder + '/geluiden/'
    downloads = []
    # At most 5 simultaneous connections per host.
    # NOTE(review): every request is queued at once; aiohttp's default total
    # timeout may also cover time spent waiting for a free connection -
    # confirm against the aiohttp docs if late downloads time out.
    connector = aiohttp.TCPConnector(limit_per_host=5)
    async with aiohttp.ClientSession(connector=connector) as session:
        for line in lines:
            url_list = Find(line)
            if not url_list:
                continue
            audio_url = url_list[0]
            if 'biocase.zfmk.de' not in audio_url:
                # remove any query parameters, then keep the last path segment
                audio_url = audio_url.split('?', 1)[0]
                resource = audio_url.rsplit('/', 1)[-1]
            else:
                # biocase URLs carry the file name in the guid query parameter
                resource = audio_url.rsplit('guid=', 1)[-1]

            # Split on the LAST dot so names such as "XC1.v2.mp3" keep the
            # real extension instead of being rejected as type "v2.mp3".
            file_array = resource.rsplit('.', 1)
            if len(file_array) < 2:
                print("failed url: " + audio_url)
                continue

            file_name, file_type = file_array
            if file_type.lower() not in audio_types:
                continue
            # The on-disk name keeps the URL's original spelling so files
            # fetched by earlier runs are still recognised as present.
            audio_file_path = target_dir + file_name + '.' + file_type
            downloads.append(download_and_save_audio(audio_url,audio_file_path,session))

        if downloads:
            # Without the directory every single download would fail.
            os.makedirs(target_dir, exist_ok=True)

        for f in tqdm(asyncio.as_completed(downloads), total=len(downloads)):
            result = await f
            # Surface failures instead of silently discarding them.
            if result.startswith("got error"):
                tqdm.write(result)
            

In [5]:
folder_path = "/home/birdo/MachineLearning/Vogelgeluiden/"
subfolders = [ f.path for f in os.scandir(folder_path) if f.is_dir() ]
# subfolders = ['/home/birdo/MachineLearning/Vogelgeluiden/Fringilla Coelebs Linnaeus']
# remove 'no bird' from the subfolders
subfolders = [x for x in subfolders if not x.endswith('no bird')]

print(subfolders)

fulldatasetpath = ''

for folder in subfolders:
# Iterate through each sound file and extract the features 
#     print(folder)
    downloadsource = folder + '/multimedia.txt'
    file = open(downloadsource, encoding="utf8")
    lines = file.readlines()
    tqdm.write(folder)
    await download(lines,folder)
    

['/home/birdo/MachineLearning/Vogelgeluiden/Turdus Merula Linnaeus', '/home/birdo/MachineLearning/Vogelgeluiden/Anas Platyrhynchos Linnaeus', '/home/birdo/MachineLearning/Vogelgeluiden/Phylloscopus Trochilus', '/home/birdo/MachineLearning/Vogelgeluiden/Sturnus Vulgaris Linnaeus', '/home/birdo/MachineLearning/Vogelgeluiden/Phylloscopus Collybita', '/home/birdo/MachineLearning/Vogelgeluiden/Parus Major Linnaeus', '/home/birdo/MachineLearning/Vogelgeluiden/Columba Palumbus Linnaeus', '/home/birdo/MachineLearning/Vogelgeluiden/Troglodytes Troglodytes', '/home/birdo/MachineLearning/Vogelgeluiden/Passer Domesticus', '/home/birdo/MachineLearning/Vogelgeluiden/Fringilla Coelebs Linnaeus']
/home/birdo/MachineLearning/Vogelgeluiden/Turdus Merula Linnaeus


  0%|          | 0/2549 [00:00<?, ?it/s]

/home/birdo/MachineLearning/Vogelgeluiden/Anas Platyrhynchos Linnaeus


  0%|          | 0/1415 [00:00<?, ?it/s]

/home/birdo/MachineLearning/Vogelgeluiden/Phylloscopus Trochilus


  0%|          | 0/2167 [00:00<?, ?it/s]

/home/birdo/MachineLearning/Vogelgeluiden/Sturnus Vulgaris Linnaeus


  0%|          | 0/2159 [00:00<?, ?it/s]

/home/birdo/MachineLearning/Vogelgeluiden/Phylloscopus Collybita


  0%|          | 0/2219 [00:00<?, ?it/s]

/home/birdo/MachineLearning/Vogelgeluiden/Parus Major Linnaeus


  0%|          | 0/2075 [00:00<?, ?it/s]

/home/birdo/MachineLearning/Vogelgeluiden/Columba Palumbus Linnaeus
failed url: https://xeno-canto.org/sounds/uploaded/VHVQLSCSXO/XC798562-R%C3%B6st_282-1-1


  0%|          | 0/889 [00:00<?, ?it/s]

/home/birdo/MachineLearning/Vogelgeluiden/Troglodytes Troglodytes


  0%|          | 0/2301 [00:00<?, ?it/s]

/home/birdo/MachineLearning/Vogelgeluiden/Passer Domesticus
failed url: http://arctos.database.museum/media/10305516
failed url: http://arctos.database.museum/media/10305512
failed url: http://arctos.database.museum/media/10305463
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: h

  0%|          | 0/4964 [00:00<?, ?it/s]

/home/birdo/MachineLearning/Vogelgeluiden/Fringilla Coelebs Linnaeus
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds
failed url: http://biocase.zfmk.de/sounds


  0%|          | 0/9549 [00:00<?, ?it/s]