## NOAH Downloads Scraper
### About
This Notebook is used for scraping/downloading the datasets in the Project NOAH [website](http://noah.up.edu.ph/) and serves as a basic exercise in scraping using Beautiful Soup and Python 3.

### RUNNING THE TOOL
_**REQUIREMENTS**_
* Python 3.6.1
* jupyter
* Beautiful Soup

They are also found in the __requirements.txt__ file.  
You can install these requirements using __pip__ (__sudo pip install -r requirements.txt__ _or_ __pip install -r requirements.txt__)  

_**PROCEDURE**_
1. Choose the URLs to be scraped by adding them to the URLS list.
2. Add the proxy in PROXY (if any).
3. Select the root directory to save the scraped files to (ROOT_SAVEDIR).
4. Add a QUERY list to limit the download (e.g. ["Albay", "Abra"]; only files whose names contain at least one of the strings in QUERY will be downloaded). Leave QUERY empty to download all files.
5. Run All.

### LICENSE  
_Copyright (C) 2017 Ben Hur S. Pintor (bhs.pintor@gmail.com)_ [[website](https://benhur07b.github.io)]

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [None]:
from bs4 import BeautifulSoup
import requests
import shutil
import re
import sys
import os
from datetime import datetime

# Keep Python from writing .pyc files while this runs.
sys.dont_write_bytecode = True

# Base address of the downloads area; the dataset paths below are joined to it.
NOAH_URL = "http://noah.up.edu.ph/downloads"

# Landslide hazard datasets (raster and shapefile).
LANDSLIDE_RAS_URL = "LANDSLIDE/RASTER/HAZARDS"
LANDSLIDE_SHP_URL = "LANDSLIDE/SHAPEFILES/HAZARDS"

# Storm surge rasters, SSA1 through SSA4.
STORMSURGE_RAS_SSA1_URL = "STORMSURGE/RASTER/SSA1"
STORMSURGE_RAS_SSA2_URL = "STORMSURGE/RASTER/SSA2"
STORMSURGE_RAS_SSA3_URL = "STORMSURGE/RASTER/SSA3"
STORMSURGE_RAS_SSA4_URL = "STORMSURGE/RASTER/SSA4"

# Storm surge shapefiles, SSA1 through SSA4.
STORMSURGE_SHP_SSA1_URL = "STORMSURGE/SHAPEFILES/SSA1"
STORMSURGE_SHP_SSA2_URL = "STORMSURGE/SHAPEFILES/SSA2"
STORMSURGE_SHP_SSA3_URL = "STORMSURGE/SHAPEFILES/SSA3"
STORMSURGE_SHP_SSA4_URL = "STORMSURGE/SHAPEFILES/SSA4"

# --- User settings -----------------------------------------------------------

# Dataset paths to scrape; list any of the *_URL constants defined above.
URLS = [
    STORMSURGE_SHP_SSA1_URL,
    STORMSURGE_SHP_SSA2_URL,
    STORMSURGE_SHP_SSA3_URL,
    STORMSURGE_SHP_SSA4_URL,
]

# Proxy address to route requests through (leave as "" when there is none).
PROXY = ""

# Proxy mapping in the form handed to requests; only the http scheme is set.
PROXIES = dict(http=PROXY)

# Directory under which scraped files are saved (absolute path).
ROOT_SAVEDIR = ""

# Strings used to limit the download; an empty list downloads ALL data.
QUERY = []

In [None]:
def get_links(r, query):
    """Return the link targets found in a fetched page, optionally filtered.

    Keyword arguments:
    r -- the requests response whose ``text`` attribute holds the page HTML
    query -- list of strings; when non-empty, only links containing at least
             one of these strings are returned

    Returns a list of href strings. When query is empty, every link except
    the first anchor on the page is returned. Anchors without an href (or
    with an empty one) are never returned.
    """

    # Name the parser explicitly: without it Beautiful Soup uses whichever
    # parser happens to be installed (and warns), so results can differ
    # between machines.
    soup = BeautifulSoup(r.text, "html.parser")

    # link.get('href') is None for anchors that have no href attribute.
    hrefs = [anchor.get('href') for anchor in soup.find_all('a')]

    if query:
        # Keep only the links that contain any of the query items. The
        # truthiness check skips missing hrefs, which would otherwise make
        # the substring test raise TypeError.
        return [h for h in hrefs if h and any(q in h for q in query)]

    # No query: drop the first anchor by position, then discard anchors that
    # carry no usable href.
    return [h for h in hrefs[1:] if h]

    
def save_to_file(url, name, proxies=None):
    """Download the resource at url and save it to the file called name.

    Keyword arguments:
    url -- the url of the file to download
    name -- the name (path) of the file to save
    proxies -- the proxy settings, if any (default = None)

    If the server answers with an HTTP error status, nothing is written and
    a message is printed instead, so an error page is never saved as data.
    """

    # An empty or missing proxy mapping is passed as None, i.e. no proxies.
    r = requests.get(url, stream=True, proxies=proxies or None)

    try:
        if not r.ok:
            print("{} skipped (HTTP {}).".format(name, r.status_code))
            return

        # Download into a temporary name and rename when complete, so an
        # interrupted transfer does not leave a truncated file under the
        # final name.
        partial = "{}.part".format(name)
        with open(partial, 'wb') as outfile:
            # iter_content streams the body in chunks and decodes any
            # gzip/deflate transfer encoding, unlike reading r.raw directly.
            for chunk in r.iter_content(chunk_size=65536):
                outfile.write(chunk)
        os.replace(partial, name)
    finally:
        # Release the connection even when the download fails part-way.
        r.close()

    print("{} saved.".format(name))


# Resolve the save root to an absolute path once, before any chdir below.
# An empty ROOT_SAVEDIR resolves to the current working directory instead of
# the filesystem root, and a relative root no longer nests one dataset
# directory inside the previous one.
_base_savedir = os.path.abspath(ROOT_SAVEDIR)

for url in URLS:

    # Address of the directory listing page for this dataset.
    s_url = "{}/{}/".format(NOAH_URL, url)

    # Each dataset gets a directory under the save root that mirrors its path
    # on the website.
    SAVEDIR = os.path.join(_base_savedir, url)
    os.makedirs(SAVEDIR, exist_ok=True)

    # Files are saved under their bare link names, relative to this directory.
    os.chdir(SAVEDIR)

    r = requests.get(s_url, proxies=PROXIES)
    if not r.ok:
        # Do not scrape links out of an error page.
        print("Could not list {} (HTTP {}); skipping.".format(s_url, r.status_code))
        continue

    for link in get_links(r, QUERY):
        if not link:
            # Anchor without a usable href; there is nothing to download.
            continue

        # Files already present from an earlier run are not downloaded again.
        if not os.path.exists(link):
            save_to_file("{}{}".format(s_url, link), link, PROXIES)