# Download more data from xeno-canto.org
Download more bird songs for our current categories from xeno-canto, using the API to find the recordings and direct downloads to fetch the audio files.

In [1]:
import requests
import requests_random_user_agent

import warnings
from tqdm import tqdm

import pandas as pd
import time

import json
import os
import numpy as np

In [2]:
def get_xc_page(query, page=False, timeout=30):
    '''Make a single API query to xeno-canto and return the decoded JSON.

    Parameters
    ----------
    query : str or iterable of str
        Search term(s). Each term is URL-quoted and the terms are combined
        into the single `query` URL parameter. A bare string is treated as
        one term.
    page : int or False, optional
        Result page to request. When falsy, no `page` parameter is sent.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    The JSON-decoded response body.

    Raises
    ------
    requests.HTTPError
        If the response status code indicates an error.
    '''

    url = 'https://www.xeno-canto.org/api/2/recordings?query='

    # a bare string would otherwise be iterated character by character
    if isinstance(query, str):
        query = [query]

    # Terms belong together inside the one `query` parameter, separated by
    # an encoded space. Joining them with '&' would end the parameter after
    # the first term and turn the remaining terms into stray parameters.
    url += '%20'.join(requests.utils.quote(q) for q in query)

    if page:
        url += f'&page={page}'

    r = requests.get(url, timeout=timeout)

    # raise an error if the status code is not ok
    r.raise_for_status()

    return r.json()

In [3]:
def search_xc(query):
    '''Get all pages of a xeno-canto query.

    Requests the first page with `get_xc_page`, then keeps requesting the
    next page until the response's `page` reaches its `numPages`.

    Parameters
    ----------
    query
        Passed unchanged to `get_xc_page`.

    Returns
    -------
    list
        The `recordings` entries of every page, concatenated in page order.

    Warns
    -----
    UserWarning
        If the number of collected recordings differs from the
        `numRecordings` value reported by the last page fetched.
    '''

    r = get_xc_page(query)

    # copy, so extending the result does not also mutate the response dict
    recordings = list(r['recordings'])

    # get all the pages
    while r['page'] < r['numPages']:
        r = get_xc_page(query, page=r['page'] + 1)
        recordings.extend(r['recordings'])

    # verify that we have all the recordings
    if int(r['numRecordings']) != len(recordings):
        warnings.warn(f'''The number of downloaded records {len(recordings)}
does not match the number of indicated recordings {r['numRecordings']}''')

    return recordings

## Use current dataset csv to limit query to those species

In [4]:
# load the current dataset csv
df = pd.read_csv('data/train.csv')
# preview the first two rows
df.head(2)

Unnamed: 0,rating,playback_used,ebird_code,channels,date,pitch,duration,filename,speed,species,...,xc_id,url,country,author,primary_label,longitude,length,time,recordist,license
0,3.5,no,aldfly,1 (mono),2013-05-25,Not specified,25,XC134874.mp3,Not specified,Alder Flycatcher,...,134874,https://www.xeno-canto.org/134874,United States,Jonathon Jongsma,Empidonax alnorum_Alder Flycatcher,-92.962,Not specified,8:00,Jonathon Jongsma,Creative Commons Attribution-ShareAlike 3.0
1,4.0,no,aldfly,2 (stereo),2013-05-27,both,36,XC135454.mp3,both,Alder Flycatcher,...,135454,https://www.xeno-canto.org/135454,United States,Mike Nelson,Empidonax alnorum_Alder Flycatcher,-82.1106,0-3(s),08:30,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...


In [5]:
# list every column available in the current dataset
df.columns

Index(['rating', 'playback_used', 'ebird_code', 'channels', 'date', 'pitch',
       'duration', 'filename', 'speed', 'species', 'number_of_notes', 'title',
       'secondary_labels', 'bird_seen', 'sci_name', 'location', 'latitude',
       'sampling_rate', 'type', 'elevation', 'description', 'bitrate_of_mp3',
       'file_type', 'volume', 'background', 'xc_id', 'url', 'country',
       'author', 'primary_label', 'longitude', 'length', 'time', 'recordist',
       'license'],
      dtype='object')

In [6]:
# preview only the columns that matter for matching against xeno-canto
columns_of_interest = [
    'ebird_code', 'filename', 'title', 'secondary_labels', 'sci_name',
    'location', 'xc_id', 'primary_label',
]
df[columns_of_interest].head()

Unnamed: 0,ebird_code,filename,title,secondary_labels,sci_name,location,xc_id,primary_label
0,aldfly,XC134874.mp3,XC134874 Alder Flycatcher (Empidonax alnorum),"['Empidonax minimus_Least Flycatcher', 'Leioth...",Empidonax alnorum,"Grey Cloud Dunes SNA, Washington, Minnesota",134874,Empidonax alnorum_Alder Flycatcher
1,aldfly,XC135454.mp3,XC135454 Alder Flycatcher (Empidonax alnorum),[],Empidonax alnorum,"Carver's Gap Parking area, Roan Mountain Stat...",135454,Empidonax alnorum_Alder Flycatcher
2,aldfly,XC135455.mp3,XC135455 Alder Flycatcher (Empidonax alnorum),[],Empidonax alnorum,"Carver's Gap Parking area, Roan Mountain Stat...",135455,Empidonax alnorum_Alder Flycatcher
3,aldfly,XC135456.mp3,XC135456 Alder Flycatcher (Empidonax alnorum),"['Dumetella carolinensis_Gray Catbird', 'Bomby...",Empidonax alnorum,"Carver's Gap Parking area, Roan Mountain Stat...",135456,Empidonax alnorum_Alder Flycatcher
4,aldfly,XC135457.mp3,XC135457 Alder Flycatcher (Empidonax alnorum),[],Empidonax alnorum,"Carver's Gap Parking area, Roan Mountain Stat...",135457,Empidonax alnorum_Alder Flycatcher


In [7]:
# collect the unique species (by scientific name) in this set ...
species = df['sci_name'].unique()
# ... and show how many there are
len(species)

264

# Get all of the xeno-canto records for the species in this dataset.

In [8]:
# query xeno-canto once per species and pool every returned record
search_results = []
for sci_name in tqdm(species):
    search_results.extend(search_xc([sci_name]))

    # pause between species so we do not hammer the server
    time.sleep(1)

print('Collected', len(search_results), 'records')

100%|██████████| 264/264 [19:46<00:00,  4.49s/it]

Collected 62503 records





In [9]:
# confirm the number of collected records
len(search_results)

62503

In [10]:
# write the query results to disk so the search does not have to be repeated
payload = {'results': search_results}
with open('data/xc.json', 'w') as out_file:
    json.dump(payload, out_file, indent=4)

In [8]:
# when restarting, restore search_results from the saved file
with open('data/xc.json', 'r') as in_file:
    search_results = json.load(in_file)['results']

In [9]:
# create a data frame from the query results (one row per entry of search_results)
sr_df = pd.DataFrame(search_results)
# preview the first rows
sr_df.head()

Unnamed: 0,id,gen,sp,ssp,en,rec,cnt,loc,lat,lng,...,lic,q,length,time,date,uploaded,also,rmk,bird-seen,playback-used
0,554809,Empidonax,alnorum,,Alder Flycatcher,Ron Overholtz,United States,"Wasilla, Matanuska-Susitna, Alaska",61.6841,-149.9757,...,//creativecommons.org/licenses/by-nc-sa/4.0/,no score,0:49,07:00,2019-06-11,2020-05-07,[],,yes,no
1,552408,Empidonax,alnorum,,Alder Flycatcher,Jerome Fischer,Colombia,"Mana Dulce, Cundinamarca",4.351,-74.652,...,//creativecommons.org/licenses/by-nc-sa/4.0/,no score,0:21,17:00,2020-04-14,2020-04-30,[],,yes,no
2,544552,Empidonax,alnorum,,Alder Flycatcher,Jerome Fischer,Colombia,"Mana Dulce, Cundinamarca",4.351,-74.652,...,//creativecommons.org/licenses/by-nc-sa/4.0/,no score,0:35,07:00,2020-04-05,2020-04-12,[],same ind. as XC544550,yes,yes
3,544551,Empidonax,alnorum,,Alder Flycatcher,Jerome Fischer,Colombia,"Mana Dulce, Cundinamarca",4.351,-74.652,...,//creativecommons.org/licenses/by-nc-sa/4.0/,no score,0:12,07:00,2020-04-05,2020-04-12,[],same ind. as XC544550,yes,yes
4,544550,Empidonax,alnorum,,Alder Flycatcher,Jerome Fischer,Colombia,"Mana Dulce, Cundinamarca",4.351,-74.652,...,//creativecommons.org/licenses/by-nc-sa/4.0/,no score,0:38,07:00,2020-04-05,2020-04-12,[],,yes,yes


In [10]:
# number of rows in sr_df, the data frame built from the query results
len(sr_df)

62503

In [11]:
# cast id as int so it can be compared against df['xc_id']
# NOTE(review): presumably the API delivers ids as strings — confirm
sr_df['id'] = sr_df['id'].astype('int')

In [12]:
# rows of the search results whose id already appears in the current dataset
in_current_dataset = sr_df['id'].isin(df['xc_id'])
overlap_df = sr_df[in_current_dataset]
len(overlap_df)

21338

In [13]:
# recall the number of rows in the original dataset, to compare with len(overlap_df)
len(df)

21375

Only 21,338 of the 21,375 rows in the initial dataset were matched, so there are a few records in the initial dataset that are not included in this search.

In [14]:
# count the rows whose id occurs more than once (every occurrence is flagged)
duplicate_mask = sr_df.duplicated(subset='id', keep=False)
len(sr_df[duplicate_mask])

0

There are no duplicates.  Find the records that are unique to the new data.

In [15]:
# keep only the search results whose id is absent from our current dataset
already_have = sr_df['id'].isin(df['xc_id'])
new_records_df = sr_df[~already_have]
len(new_records_df)

41165

# Download the new files

In [19]:
def download_file(url, outpath, timeout=60):
    '''Download `url` and save the response body to `outpath`.

    A 404 or 500 response is reported with a printed message and nothing is
    written, so a batch of downloads can continue past a missing or broken
    file. Any other error status raises.

    Parameters
    ----------
    url : str
        Address to fetch; redirects are followed.
    outpath : str or path-like
        Destination file. It only appears once the whole body has been
        written.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        For error status codes other than 404 and 500.
    '''
    r = requests.get(url, allow_redirects=True, timeout=timeout)

    if r.status_code == 404:
        print('404 URL not found:', url)
    elif r.status_code == 500:
        print('500 Server Error:', url)
    else:
        r.raise_for_status()

        # Write under a temporary name and rename when done. A caller that
        # skips files which already exist would otherwise mistake a
        # truncated file from an interrupted run for a finished download.
        tmp_path = f'{outpath}.part'
        with open(tmp_path, 'wb') as f:
            f.write(r.content)
        os.replace(tmp_path, outpath)

In [21]:
output_dir = 'data/xeno-canto'
os.makedirs(output_dir, exist_ok=True)

# Some ids (e.g. 154082, 199642, 375263, 375264) fail on download with a
# 404/500 response; download_file prints a message for those.
for idx, row in tqdm(new_records_df.iterrows(), total=len(new_records_df)):
    # name the local copy after the xeno-canto id, keeping the extension
    # of the uploaded file
    extension = os.path.splitext(row['file-name'])[1]
    filename = f"XC{row['id']}{extension}"
    outpath = os.path.join(output_dir, filename)

    # nothing to do when the file is already present locally
    if os.path.exists(outpath):
        continue

    url = 'http:' + row['file']
    download_file(url, outpath)

    # wait a random amount of time (between 0 and 4 seconds)
    time.sleep(np.random.uniform(0, 4))

  7%|▋         | 2950/41165 [00:00<00:06, 5490.42it/s]

500 Server Error: http://www.xeno-canto.org/154082/download


 34%|███▍      | 14135/41165 [00:07<00:04, 5913.05it/s]

404 URL not found: http://www.xeno-canto.org/199642/download


 42%|████▏     | 17258/41165 [00:10<00:11, 2161.88it/s]

404 URL not found: http://www.xeno-canto.org/375263/download


 55%|█████▍    | 22607/41165 [00:20<00:33, 557.50it/s] 

404 URL not found: http://www.xeno-canto.org/375264/download


 67%|██████▋   | 27618/41165 [00:23<00:04, 3200.33it/s]

404 URL not found: http://www.xeno-canto.org/507640/download


 69%|██████▊   | 28199/41165 [00:26<00:25, 506.25it/s] 

404 URL not found: http://www.xeno-canto.org/385654/download


 83%|████████▎ | 34040/41165 [1:10:03<7:15:23,  3.67s/it] 

404 URL not found: http://www.xeno-canto.org/375269/download


100%|██████████| 41165/41165 [9:16:38<00:00,  1.23it/s]   
