## Code to search for and download sounds from `freesound.org` using the Freesound APIv2

In [5]:
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import json

import os
import math
import pathlib
from tqdm import tqdm
from humanfriendly import format_size, parse_size

# Freesound.org APIv2 key.
# Set the FREESOUND_API_TOKEN environment variable to supply your own key
# without editing this file.
# SECURITY NOTE: the literal below is a credential committed to source; it is
# kept only as a backward-compatible fallback and should be rotated/removed.
token = os.environ.get('FREESOUND_API_TOKEN',
                       'dedOJD0nnsIebO5RcNQHqt9fLMElgDWccZi5rqcn')


def get_ids_freesound(query, page_num=1, page_size=15, save=True):
    """Collect the IDs of all Freesound sounds matching a text query.

    Pages through the ``/apiv2/search/text/`` endpoint starting at
    ``page_num`` and gathers the ``id`` of every result until the last page
    is reached or the server stops returning usable pages.

    Args:
        query: Search text sent as the ``query`` parameter; also used to
            build the output file name.
        page_num: First page to request (converted with ``int``).
        page_size: Number of results requested per page (converted with
            ``int``).
        save: When true, write the IDs, one per line, to
            ``../_data/<query>_ids.txt``. An existing file is never
            overwritten.

    Returns:
        The list of collected IDs. The same list is also published as the
        module-level ``file_ids``.

    Raises:
        requests.RequestException: If a request fails at the network level.
    """
    # Kept as a module-level global for backward compatibility: notebook
    # cells can still read the partial list if the loop is interrupted.
    global file_ids

    page_num = int(page_num)
    page_size = int(page_size)
    first_page = page_num

    print("Query: {}\nStarting page: {}\nPage size: {}\nSave: {}"
          .format(query, page_num, page_size, save))

    file_ids = []

    while True:
        res = requests.get('https://freesound.org/apiv2/search/text/',
                           params={'token': token,
                                   'query': query,
                                   'page': page_num,
                                   'page_size': page_size})

        # A non-200 answer (e.g. 429 when throttled, or a page past the end)
        # cannot contain results; say why we stop instead of hiding it.
        if res.status_code != 200:
            print("\nStopping at page {}: HTTP {}".format(page_num, res.status_code))
            break

        try:
            json_res = res.json()
            page_ids = [item['id'] for item in json_res['results']]
        except (ValueError, KeyError, TypeError):
            print("\nStopping at page {}: unexpected response body".format(page_num))
            break

        # Print the summary on the first page actually fetched, whatever its
        # number, so the "expected pages" arithmetic is meaningful.
        if page_num == first_page and 'count' in json_res:
            print(json_res['count'], "results")
            print("Expected number of pages: {}"
                  .format(math.ceil(json_res['count'] / page_size) - page_num + 1))

        file_ids.extend(page_ids)
        print(page_num, end=' ')

        # Stop without probing a non-existent page when the response says
        # there is no next page, or when the page came back empty.
        if not page_ids or ('next' in json_res and json_res['next'] is None):
            break
        page_num += 1

    if save:
        out_path = '../_data/{}_ids.txt'.format(query)
        # Make sure the target directory exists so the fetched IDs are not
        # lost to a FileNotFoundError at the very end.
        pathlib.Path('../_data').mkdir(parents=True, exist_ok=True)
        try:
            # 'x' mode: refuse to clobber a previously saved list.
            with open(out_path, 'x') as out_text:
                print("\nSaving to {}".format(out_path))
                for item in file_ids:
                    out_text.write("{}\n".format(item))
        except FileExistsError:
            print("File already exists, skipping save.")

    return file_ids


def load_ids(query):
    """Read a previously saved ID list for ``query``.

    Args:
        query: Query name used to build ``../_data/<query>_ids.txt``.

    Returns:
        A list with one string per line of the file (IDs are returned as
        strings, exactly as stored).

    Raises:
        FileNotFoundError: If no ID list has been saved for ``query``.
    """
    # Plain read-only mode: the file is never modified here, so it must also
    # load when it is write-protected.
    with open('../_data/{}_ids.txt'.format(query), 'r') as read_text:
        return read_text.read().splitlines()


def save_mp3s(id_list, name='unknown', start_enum=1, size_threshold='100MB', overwrite=False):
    """Download the high-quality MP3 preview of each sound in ``id_list``.

    Files are written to
    ``../_data/mp3s/<name>/<NNNN>_<name>_<id>.mp3`` where ``NNNN`` is a
    zero-padded running number starting at ``start_enum``.

    Args:
        id_list: Iterable of Freesound sound IDs.
        name: Sub-directory and file-name label (normally the query).
        start_enum: Number given to the first item; the first result is
            numbered 1, not 0.
        size_threshold: Human-readable size (parsed with ``parse_size``).
            Sounds whose reported ``filesize`` exceeds it are skipped.
        overwrite: When false, an existing target file is left alone and no
            request is made for it. When true, it is replaced.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the metadata or preview request returns an
            error status (for example 429 when throttled).
        requests.RequestException: On network-level failures.
    """
    print("Query: {}".format(name))
    pathlib.Path('../_data/mp3s/{}'.format(name)).mkdir(parents=True, exist_ok=True)

    for enum, file_id in enumerate(id_list, start=start_enum):
        file_path = "../_data/mp3s/{0}/{2:04}_{0}_{1}.mp3".format(name, file_id, enum)

        print("======== Result no: {} | File id: {} ========".format(enum, file_id))
        print("Saving to {}".format(file_path))

        # Decide about existing files before touching the network, so reruns
        # do not spend API requests on sounds that are already on disk.
        if not overwrite and os.path.exists(file_path):
            print("File already exists, skipping.")
            continue

        res = requests.get('https://freesound.org/apiv2/sounds/{}/'.format(file_id),
                           params={'token': token})
        res.raise_for_status()
        json_res = res.json()
        filesize = json_res['filesize']
        print("File size is {}".format(format_size(filesize)))

        # Check the size BEFORE opening the output file: an oversized sound
        # must neither truncate an existing download nor leave an empty file.
        max_bytes = parse_size(size_threshold)
        if filesize > max_bytes:
            print("File size greater than {}, skipping."
                  .format(format_size(max_bytes)))
            continue

        response = requests.get(json_res['previews']['preview-hq-mp3'], stream=True)
        response.raise_for_status()

        try:
            # 'xb' still guards against a file that appeared since the check.
            handle = open(file_path, "wb" if overwrite else "xb")
        except FileExistsError:
            print("File already exists, skipping.")
            continue

        try:
            with handle:
                # Stream in sizeable chunks; the default chunk size of
                # iter_content() is a single byte.
                for data in tqdm(response.iter_content(chunk_size=8192)):
                    handle.write(data)
        except BaseException:
            # Do not leave a truncated MP3 behind (also on Ctrl-C); the
            # original error is re-raised unchanged.
            os.remove(file_path)
            raise
    return
        
        
def fetch_mp3s(query, startindex=0, endindex=1, size_threshold='100MB', overwrite=False):
    """Download previews for a slice of a saved ID list.

    Loads the IDs stored for ``query`` with ``load_ids`` and passes
    ``ids[startindex:endindex]`` to ``save_mp3s``; files are numbered from
    ``startindex + 1`` so numbering matches the position in the full list.

    Args:
        query: Query name whose saved ID list is used; also the label
            passed to ``save_mp3s``.
        startindex: Zero-based start of the slice (converted with ``int``).
        endindex: Exclusive end of the slice (converted with ``int``).
        size_threshold: Forwarded to ``save_mp3s``.
        overwrite: Forwarded to ``save_mp3s``.

    Returns:
        None. Prints "ID list not found" and returns early when no saved
        list exists for ``query``.
    """
    startindex, endindex = int(startindex), int(endindex)
    try:
        query_all = load_ids(query)
    except FileNotFoundError:
        # Only a missing list is reported this way; other errors (permission
        # problems, Ctrl-C, ...) propagate instead of being mislabelled.
        print("ID list not found")
        return
    save_mp3s(query_all[startindex:endindex], query, start_enum=startindex + 1,
              size_threshold=size_threshold, overwrite=overwrite)
    return


In [3]:
%%time
# Collect the IDs of all 'laughing' results, 150 per request; save defaults to
# True, so the list is also written to ../_data/laughing_ids.txt.
id_list = get_ids_freesound('laughing', page_num=1, page_size=150)

Query: laughing
Starting page: 1
Page size: 150
1923 results
Expected number of pages: 13
1 2 3 4 5 6 7 8 9 10 11 12 13 
Saving to ../_data/laughing_ids.txt
CPU times: user 580 ms, sys: 48 ms, total: 628 ms
Wall time: 49.3 s


In [13]:
%%time
# Same collection run for 'crying'; an already existing
# ../_data/crying_ids.txt is left untouched (see the output below).
id_list = get_ids_freesound('crying', page_num=1, page_size=150)

Query: crying
Starting page: 1
Page size: 150
Save: True
864 results
Expected number of pages: 6
1 2 3 4 5 6 File already exists, skipping save.
CPU times: user 260 ms, sys: 24 ms, total: 284 ms
Wall time: 22.9 s


In [17]:
# Try to download the first three saved 'laughing' IDs, skipping any sound
# whose reported file size exceeds 1.28MB.
fetch_mp3s('laughing', startindex=0, endindex=3, size_threshold='1.28MB', overwrite=True)

Query: laughing
Saving to ../_data/mp3s/laughing/0001_laughing_361282.mp3
File size is 3.27 MB
File size greater than 1.28 MB, skipping.
Saving to ../_data/mp3s/laughing/0002_laughing_361288.mp3
File size is 2.77 MB
File size greater than 1.28 MB, skipping.
Saving to ../_data/mp3s/laughing/0003_laughing_361289.mp3
File size is 3.66 MB
File size greater than 1.28 MB, skipping.


In [71]:
import IPython.display as ipd

In [72]:
# NOTE(review): this path does not follow the '<NNNN>_<name>_<id>.mp3' pattern
# that save_mp3s writes -- presumably left over from an earlier naming scheme;
# confirm the file exists before running this cell.
ipd.Audio(filename='../_data/mp3s/unknown/unknown_244526.mp3')

In [6]:
# Fetch the metadata of a single sound directly to inspect the raw response;
# the result displayed below is HTTP 429 (Too Many Requests).
res = requests.get('https://freesound.org/apiv2/sounds/{}/'.format(371399),
                                   params={'token' : token,
                                           })

In [7]:
res

<Response [429]>