# Getting Data

#### Imports

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import os
import subprocess

#### Get species list
To use the selenium function below, I first needed to build a list of species for it to search with. Since there are several hundred species in North America, I went to [Cornell Lab of Ornithology](https://birdsna.org/Species-Account/bna/species) to scrape all of these species names. In addition to this, I gathered the family name for each species to have a filtering option.

In [None]:
# Scrape every species (scientific name) together with its family from the
# Birds of North America species index, then save the pairs to a csv.
base_url = "https://birdsna.org/Species-Account/bna/species"
res = requests.get(base_url)
# fail loudly on an HTTP error instead of trying to parse an error page
res.raise_for_status()
# name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed
soup = BeautifulSoup(res.content, 'html.parser')

# the tag name and the attribute filter are two separate arguments; wrapped
# together in one tuple the class filter is not applied as an attribute filter
table = soup.find('ul', {'class': 'ListGrid-list'})
if table is None:
    raise ValueError(f'could not find the species list on {base_url}')

# collect [species, family] pairs
both = []
for f in table.find_all('h3', {'class': 'ListGrid-key notranslate'}):
    # look up the <li> whose id equals the text of this family heading
    family_table = table.find('li', {'id': f.text})
    if family_table is None:
        # report and skip rather than crash on one unmatched heading
        print(f'no species list found for family {f.text!r}')
        continue
    for s in family_table.find_all('em', {'class': 'Sci TextLight notranslate'}):
        both.append([s.text, f.text])

# split the pairs into the two columns of the dataframe
family = [pair[1] for pair in both]
species = [pair[0] for pair in both]

dct = {'family': family, 'species': species}

df = pd.DataFrame(dct)
df.to_csv('family_species.csv')

In [None]:
# how many species names were collected
len(species)

In [None]:
# number of rows (one per species) in each family
df['family'].value_counts()

#### Selenium setup
Additional steps that were taken outside of this notebook:
- created the folder for the files to land in called `mp3_downloads`
- downloaded [chromedriver](https://chromedriver.chromium.org/) and copied the file path of that download into the code below

In [None]:
# folder that Chrome should save the downloaded files into
download_dir = 'FILE_PATH/mp3_downloads//'

# Chrome preferences: use that folder and never prompt before a download
preferences = {
    'download.default_directory': download_dir,
    'download.prompt_for_download': False,
}

# attach the preferences to a fresh set of Chrome options
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', preferences)

# start Chrome through the local chromedriver binary
driver = webdriver.Chrome(
    executable_path='FILE_PATH/chromedriver',
    options=options,
)

# implicit wait of 3 so element lookups do not overwhelm the website/computer
driver.implicitly_wait(3)

# open the page the recordings are searched from
driver.get('https://www.xeno-canto.org/explore')


#### Selenium
- **Step 1. Move through list of species**
    - selenium has to go to the search bar and enter in the species name and click submit to bring up the page with the audio files for that species.
    
    
- **Step 2. Download each audio file for each page in the search results**
    - selenium needs to click on each download button on the page, then click next. This process is repeated until the 'Next' button no longer exists.
    

- **Note**
    - this code could be improved by accounting for breaking. Sometimes there are no recordings for some species. Sometimes the website breaks. Adding in measures like `time.sleep()` or more `try/except` statements could help to counteract this.
    - My solution was to index species to start after the code broke.
        - If the code broke with the counter at 103
        - I would change the for loop to start with `species_list = species[103:]`

In [None]:
## Step One: Move through list of species ##
# imported here so this cell brings its own exception names into scope
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException

SEARCH_XPATH = "//input[@placeholder='Search recordings...']"
DOWNLOAD_XPATH = "//*[contains(@src, '/static/img/download.png')]"
NEXT_XPATH = "//*[contains(text(), 'Next')]"


def _download_current_page():
    """Click every download button on the page the driver is showing."""
    for button in driver.find_elements_by_xpath(DOWNLOAD_XPATH):
        button.click()


species_list = species
counter = 0

# the loop variable must not be called `species`: rebinding that name would
# replace the full list with a single string, so restarting from a slice
# such as `species[103:]` would no longer work
for species_name in species_list:
    search_bar = driver.find_element_by_xpath(SEARCH_XPATH)  # find searchbar
    search_bar.clear()  # make sure it's clear
    search_bar.send_keys(species_name)  # input species name
    search_bar.submit()  # click submit
    counter += 1

    ## Step Two: Move through each page and grab everything ##

    # download first page of results
    _download_current_page()

    # keep paging for as long as there is a 'Next' button
    while True:
        try:
            driver.find_element_by_xpath(NEXT_XPATH).click()
            _download_current_page()
        except NoSuchElementException:
            # no 'Next' button left: this species is finished
            break
        except WebDriverException as err:
            # any other browser error still moves on to the next species,
            # but is reported instead of being silently swallowed
            print(f'Stopped early on {species_name} (species {counter}): {err}')
            break

    # progress report every 100 species
    if counter % 100 == 0:
        print(f'{counter} species processed')

print(f'Done! Reached the end for all {counter} species')

#### Convert to wav and save as csv
- The filenames are not modified at all coming in, so they read in a bit messy. Most look something like this: `XC18350 - Northern Mockingbird - Mimus polyglottos`


- For the purposes of reading these files and getting them ready for modeling, it is not required to have these filenames cleaned. `EDA.ipynb` simply needs the filenames to be callable, so they just need to be in a dataframe along with their class (e.g. `Northern Mockingbird`)

In [None]:
# Convert every downloaded mp3 to wav with ffmpeg and save a csv that pairs
# each wav filename with the bird name taken from the original filename.
mp3_dir = 'FILE_PATH/mp3_downloads'
wav_dir = 'FILE_PATH/wav_downloads'

# make sure the output folder exists before ffmpeg writes into it
os.makedirs(wav_dir, exist_ok=True)

# Create empty lists to append to
wav = []
bird_names = []

# go through each filename in the mp3_downloads folder
# (sorted so the csv rows come out in the same order on every run)
for filename in sorted(os.listdir(mp3_dir)):
    stem, extension = os.path.splitext(filename)

    # check the real extension: a substring test would also accept names
    # such as 'x.mp3.crdownload' and then cut the wrong four characters off
    if extension.lower() != '.mp3':
        # print the non-mp3 files for reference
        print(filename, 'whoops')
        continue

    # expected stem: '<id> - <bird name> - <scientific name>'
    parts = stem.split(' - ')
    if len(parts) < 2:
        # without a bird name the recording cannot be labelled, so skip it
        print(filename, 'unexpected filename format, skipped')
        continue

    # define file paths
    wav_name = f'{stem}.wav'
    src = f'{mp3_dir}/{filename}'
    dst = f'{wav_dir}/{wav_name}'

    # convert to wav and save in correct folder
    subprocess.call(['ffmpeg', '-i', src, dst])

    # only list recordings whose wav file actually exists
    if not os.path.isfile(dst):
        print(filename, 'conversion failed, skipped')
        continue

    # append names to the lists
    wav.append(wav_name)
    bird_names.append(parts[1])

# build the dataframe from the two columns
dct = {'wav': wav, 'bird_names': bird_names}
df = pd.DataFrame(dct)

# save df to csv
df.to_csv('FILE_PATH/wav_species.csv', index=False)
