In [1]:
import urllib.request, json
import requests
import sys
import os

# Root directory (relative to the notebook's working directory) under which
# save_json creates one subdirectory per bird and, inside it, one per country.
savePath="../data/xeno-canto-dataset-full-all-Countries/"

## Performing web scraping with the xeno-canto API

In [2]:
def save_json(searchTerms, birdName, country):
    """Query the xeno-canto API and save every result page as a JSON file.

    Pages are written to ``savePath + <birdName without ':'> + "/" + country``
    as ``jsondata_p<page>.json``; the directory is created when it is missing.

    Args:
        searchTerms: xeno-canto query string,
            e.g. ``'Apus apus cnt:Poland type:song'``.
        birdName: name used for the bird's subdirectory (any ':' is removed).
        country: name used for the country subdirectory.

    Returns:
        The directory path (str) that the JSON pages were written to.
    """
    # local import: the file itself only imports urllib.request
    from urllib.parse import quote

    # create a path to save json files and recordings
    path = savePath + birdName.replace(':', '') + "/" + country
    if not os.path.exists(path):
        print("Creating subdirectory " + path + " for downloaded files...")
        os.makedirs(path)

    # Percent-encode the whole query, not only the spaces. ':' stays literal
    # so tags such as "cnt:Poland" yield the same URL as a plain space->%20
    # substitution would.
    query = quote(searchTerms, safe=':')

    numPages = 1
    page = 1
    numRecordings = 0
    # download a json file for every page found in a query
    while page < numPages + 1:
        print("Loading page " + str(page) + "...")
        url = 'https://www.xeno-canto.org/api/2/recordings?query={0}&page={1}'.format(query, page)
        print(url)
        # the context manager closes the HTTP response even if decoding fails
        with urllib.request.urlopen(url) as jsonPage:
            jsondata = json.loads(jsonPage.read().decode('utf-8'))
        filename = path + "/jsondata_p" + str(page) + ".json"
        with open(filename, 'w') as outfile:
            json.dump(jsondata, outfile)
        # the total number of pages is only known once a page has been fetched
        numPages = jsondata['numPages']
        # Count what was actually saved instead of assuming full pages; this
        # also stays correct (0) when the query matches nothing.
        numRecordings += len(jsondata['recordings'])
        page = page + 1
    print("Found ", numPages, " pages in total.")
    print("Saved json for ", numRecordings, " files")
    return path

# Reads the saved JSON pages and returns the list of values for the selected JSON field
# (e.g. "id" - the ID number, "type" - the type of bird sound, such as call or song)
# for all xeno-canto files found with the given search terms.
def read_data(searchTerm, path):
    """Collect one field from every recording in the saved JSON pages.

    Pages are read from ``path + "/jsondata_p<N>.json"`` starting at page 1;
    each page's ``numPages`` entry decides whether another page follows.

    Args:
        searchTerm: key to extract from each entry of a page's "recordings".
        path: directory that holds the ``jsondata_p<N>.json`` files.

    Returns:
        A list with the ``searchTerm`` value of every recording, in page order.
    """
    values = []
    current, last = 1, 1
    while current < last + 1:
        page_file = path + "/jsondata_p" + str(current) + ".json"
        with open(page_file, 'r') as handle:
            payload = json.load(handle)
        # every page states the total page count; refresh the loop bound from it
        last = payload['numPages']
        values.extend(entry[searchTerm] for entry in payload["recordings"])
        current += 1
    return values

# Downloads all sound files found with the search terms into the xeno-canto dataset directory,
# into a subdirectory named after the bird (e.g. Apus apus).
# Each filename has two parts: the Latin name of the bird and the recording ID number.
def download(searchTerms, birdName, country):
    """Download every recording matching ``searchTerms`` as an mp3 file.

    The query result pages are first saved with ``save_json``; the recording
    IDs and download addresses are then read back from those pages. Each
    recording is written to
    ``savePath + <bird> + "/" + <bird> + <recording id> + ".mp3"``, where
    ``<bird>`` is ``birdName`` with any ':' removed.

    Args:
        searchTerms: xeno-canto query string passed on to ``save_json``.
        birdName: name used for the bird's directory and as the filename prefix.
        country: name of the per-country subdirectory holding the JSON pages.

    Returns:
        None. Recordings without a download address, or whose download does
        not return an OK HTTP status, are reported and skipped.
    """
    # save the query result pages and get the directory they were written to
    path = save_json(searchTerms, birdName, country)
    # recording IDs and download addresses, in matching order
    filenamesID = read_data('id', path)
    fileaddress = read_data('file', path)
    numfiles = len(filenamesID)
    print("A total of ", numfiles, " files will be downloaded")
    # Use the same root as save_json so the target directory is the one that
    # save_json has just created (the parent of the per-country directory).
    birdDir = birdName.replace(':', '')
    filePrefix = savePath + birdDir + "/" + birdDir
    for i, (recordingID, url) in enumerate(zip(filenamesID, fileaddress)):
        file_path = filePrefix + recordingID + ".mp3"
        print("Saving file ", i + 1, "/", numfiles, file_path)
        if not url:
            # an empty address would make requests.get raise
            print("Skipping recording ", recordingID, ": no download address")
            continue
        if url.startswith('//'):
            # protocol-relative address: requests needs an explicit scheme
            url = 'https:' + url
        song = requests.get(url)
        if not song.ok:
            # do not store an HTTP error page under an .mp3 name
            print("Skipping recording ", recordingID, ": HTTP status ", song.status_code)
            continue
        with open(file_path, "wb") as f:
            f.write(song.content)

### List of all birds and countries for which recordings are downloaded

In [None]:
# Countries whose recordings are requested (used as the xeno-canto "cnt:" tag).
# Every entry needs its own trailing comma: adjacent string literals without
# one are silently concatenated into a single (wrong) country name.
countries = [
    'Poland',
    'Germany',
    'Slovakia',
    'Czech',
    'Lithuania',
    'brazil',
    'Spain',
]

# These are scientific names of birds

birds = [
    'Dendrocopos major',
    'Chloris chloris',
    'Corvus frugilegus',
    'Coccothraustes coccothraustes',
    'Columba palumbus',
    'Delichon urbicum',
    'Apus apus',
    'Sitta europaea',
    'Corvus monedula',
    'Phoenicurus ochruros',
    'Turdus merula',
    'Turdus pilaris',
    'Passer montanus',
    'Phylloscopus trochilus',
    'Phylloscopus collybita',
    'Phoenicurus phoenicurus',
    'Motacilla alba',
    'Erithacus rubecula',
    'Streptopelia decaocto',
    'Parus major',
    'Parus caeruleus',  # --
    'Alauda arvensis',
    'Luscinia luscinia',
    'Garrulus glandarius',
    'Turdus philomelos',
    'Pica pica',
    'Troglodytes troglodytes',
    'Carduelis carduelis',
    'Sturnus vulgaris',
    'Emberiza citrinella',
    'Passer domesticus',
    'Corvus corone',
    'Fringilla coelebs',  # complete
    'Cyanocitta cristata',
    'Coloeus monedula',
    'Corvus brachyrhynchos',
    'Turnix velox',
    'Charadrius leschenaulti',
    'Struthio camelus',
    'Rhea americana',
    'Apteryx mantelli',
    'Cyanopica cooki',
]

# One query per (country, bird) pair, restricted to recordings of type "song".
# The bird's directory / filename prefix is the scientific name without spaces.
for country in countries:
    for bird in birds:
        download(bird + ' cnt:' + country + ' type:song', bird.replace(' ', ''), country)
