In [1]:
import requests
from bs4 import BeautifulSoup
import re
import datetime
from tqdm.notebook import tqdm
import pandas as pd
import functools

## Notes
Need to scrape both old and new data  

https://api.avalanche.org/v2/public/products?avalanche_center_id=SAC&date_start=2021-09-01&date_end=2022-08-31  
https://api.avalanche.org/v2/public/avalanche-center/SAC  
The API only goes back to ~2018

In [2]:
# Site root; the site-relative hrefs found while scraping are appended to it.
BASE_URL = "https://www.sierraavalanchecenter.org"
# Site-relative path of the first archive page (the path name suggests it
# covers forecasts prior to Jan 6, 2021).
URL = "/prior-jan-6-2021/archive"

# Captures a dash-separated numeric date (e.g. "2021-01-08") at the very end
# of a string.
prog = re.compile(r"(\d+\-\d+\-\d+)$")
# Captures the problem name that follows "Avalanche Problem <digit>: ".
# Written as a raw string so that \d, \w and \s reach the regex engine as
# written instead of being treated as (invalid) string escape sequences.
problem_filter = re.compile(r"Avalanche Problem \d: ([\w\s]*)")

In [25]:
# https://www.sierraavalanchecenter.org//sites/all/themes/responsive_sac/img/rating-icons/6.png
def get_rating(img):
    """Translate a danger-rating icon tag into a rating label.

    The label is looked up from the file name at the end of the tag's
    ``src`` attribute, e.g. ``".../rating-icons/2.png"`` -> ``"moderate"``.

    Args:
        img: An object exposing ``.get('src')`` (such as a BeautifulSoup
            ``<img>`` tag), or ``None`` when no icon was found.

    Returns:
        The rating label as a string, or ``None`` when ``img`` is ``None``,
        has no ``src``, or the icon file name is not a known rating icon.
    """
    ratings = {
        '1.png': 'low',
        '2.png': 'moderate',
        '3.png': 'considerable',
        '4.png': 'high',
        '5.png': 'extreme',
        '0.png': 'na',
        '6.png': 'na',
    }
    # A missing tag or a tag without a src is treated like an unknown icon
    # instead of raising AttributeError.
    if img is None:
        return None
    src = img.get('src')
    if not src:
        return None
    icon_name = src.split('/')[-1]
    return ratings.get(icon_name)

In [26]:
@functools.lru_cache
def scrape_problems(url, timeout=30):
    """Download one forecast page and extract its avalanche problem names.

    Results are memoized per argument set, so a report is downloaded at most
    once per kernel session. A call that raises is not cached and can be
    retried.

    Args:
        url: Absolute URL of the forecast page.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive server cannot hang the scrape indefinitely.

    Returns:
        A tuple with the text captured by ``problem_filter`` for every text
        node on the page that matches it, in document order. Empty when the
        page has no matching text.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status. Raising keeps an error page from being parsed
            and cached as "no problems".
    """
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    problem_soup = BeautifulSoup(page.content, "html.parser")
    # `string=` is the current name of the keyword formerly spelled `text=`.
    headings = problem_soup.find_all(string=problem_filter)
    # Every node returned above matched the pattern, so search() cannot
    # return None here; only the first match per node is kept.
    return tuple(problem_filter.search(heading).group(1) for heading in headings)

In [27]:
# Fetch the first page of the archive listing. The timeout keeps the cell
# from hanging forever, and raise_for_status() makes a failed fetch fail here
# rather than when the response is parsed later.
url = BASE_URL + URL
print(url)
page = requests.get(url, timeout=30)
page.raise_for_status()

https://www.sierraavalanchecenter.org/prior-jan-6-2021/archive


In [28]:
# Parse the downloaded page and collect the <tr> rows of its first <tbody>.
soup = BeautifulSoup(page.content, "html.parser")
archive_body = soup.find('tbody')
rows = archive_body.find_all("tr")

In [29]:
# Run the parsing helpers on the first table row only, printing each
# intermediate value so the extraction can be checked by eye.
row = rows[0]
td = row.find('td')
img = td.find("img")
report = td.find('a').get('href')
date_str = prog.findall(td.find('strong').text)[0]
for value in (img, report, date_str):
    print(value)

rating = get_rating(img)
problem_url = BASE_URL + report
print(problem_url)

problems = scrape_problems(problem_url)
print(rating, problems, sep="\n")

<img src="/sites/all/themes/responsive_sac/img/rating-icons/2.png"/>
/advisory/2021/jan/8/2021-01-08-065831-avalanche-forecast
2021-01-08
https://www.sierraavalanchecenter.org/advisory/2021/jan/8/2021-01-08-065831-avalanche-forecast
moderate
('Wind Slab', 'Persistent Slab')


In [30]:
def scrape_url(data, url, timeout=30):
    """Scrape one archive listing page and append its forecasts to ``data``.

    For each ``<tr>`` in the page's first ``<tbody>``, the first cell supplies
    the report link, the forecast date (parsed from the ``<strong>`` text)
    and the danger-rating icon. Rows whose icon does not map to a rating are
    skipped.

    Args:
        data: List that is extended in place with
            ``[date, rating, problem_url, problems]`` records.
        url: Absolute URL of the archive page to scrape.
        timeout: Seconds to wait for the archive page before giving up.

    Returns:
        A ``(data, next_url)`` pair. ``data`` is the same list that was
        passed in; ``next_url`` is the href of the "pager-next" link, or
        ``False`` when the page has no such link.

    Raises:
        requests.RequestException: If the archive page itself cannot be
            fetched or returns an HTTP error status.
    """
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    rows = soup.find('tbody').find_all("tr")
    for row in rows:
        td = row.find('td')
        report = td.find('a').get('href')
        date_str = prog.findall(td.find('strong').text)[0]
        date = datetime.datetime.strptime(date_str, '%Y-%m-%d')
        img = td.find("img")
        # A row without an icon is treated like a row with an unknown icon.
        rating = get_rating(img) if img is not None else None
        if not rating:
            # The row would be discarded anyway, so skip downloading its
            # report page.
            continue
        problem_url = BASE_URL + report
        try:
            problems = scrape_problems(problem_url)
        except Exception:
            # Best effort: one unreadable report must not abort the whole
            # scrape. `Exception` (not a bare except) keeps Ctrl-C / kernel
            # interrupt working during the long run.
            # NOTE(review): this placeholder is a list while successful
            # results are tuples; kept as-is for downstream compatibility.
            problems = ['NA']
        data.append([date, rating, problem_url, problems])
    pager = soup.find('li', class_='pager-next')
    link = pager.find('a') if pager is not None else None
    next_url = link.get('href') if link is not None else False
    return data, next_url

In [31]:
# Follow the "next" links from the first archive page until scrape_url
# reports no further page, accumulating every record in `data`.
data = []
url = URL
with tqdm(total=22) as pbar:
    while url:
        page_url = BASE_URL + url
        data, url = scrape_url(data, page_url)
        pbar.update(1)

  0%|          | 0/22 [00:00<?, ?it/s]

In [32]:
# One row per scraped forecast; column order matches the records built by scrape_url.
df = pd.DataFrame(
    data,
    columns=['date', 'rating', 'report', 'problems'],
)

In [33]:
# Preview the first five rows of the scraped table.
df.head(5)

Unnamed: 0,date,rating,report,problems
0,2021-01-08,moderate,https://www.sierraavalanchecenter.org/advisory...,"(Wind Slab, Persistent Slab)"
1,2021-01-06,moderate,https://www.sierraavalanchecenter.org/advisory...,"(Persistent Slab, Wind Slab)"
2,2021-01-05,considerable,https://www.sierraavalanchecenter.org/advisory...,"(Persistent Slab, Wind Slab)"
3,2021-01-04,high,https://www.sierraavalanchecenter.org/advisory...,"(Persistent Slab, Wind Slab, Storm Slab)"
4,2021-01-03,moderate,https://www.sierraavalanchecenter.org/advisory...,"(Persistent Slab,)"


In [34]:
# Preview the last five rows of the scraped table.
df.tail(5)

Unnamed: 0,date,rating,report,problems
1841,2008-12-21,low,https://www.sierraavalanchecenter.org/content/...,()
1842,2008-12-20,low,https://www.sierraavalanchecenter.org/content/...,()
1843,2008-12-19,considerable,https://www.sierraavalanchecenter.org/content/...,()
1844,2008-12-18,low,https://www.sierraavalanchecenter.org/content/...,()
1845,2008-12-17,low,https://www.sierraavalanchecenter.org/content/...,()


In [35]:
# Number of forecasts per danger rating, most frequent first.
df['rating'].value_counts()

moderate        928
low             539
considerable    259
high             62
na               56
extreme           2
Name: rating, dtype: int64

0.5027085590465872

In [36]:
# Show the first five rows once more before saving.
df.head(5)

Unnamed: 0,date,rating,report,problems
0,2021-01-08,moderate,https://www.sierraavalanchecenter.org/advisory...,"(Wind Slab, Persistent Slab)"
1,2021-01-06,moderate,https://www.sierraavalanchecenter.org/advisory...,"(Persistent Slab, Wind Slab)"
2,2021-01-05,considerable,https://www.sierraavalanchecenter.org/advisory...,"(Persistent Slab, Wind Slab)"
3,2021-01-04,high,https://www.sierraavalanchecenter.org/advisory...,"(Persistent Slab, Wind Slab, Storm Slab)"
4,2021-01-03,moderate,https://www.sierraavalanchecenter.org/advisory...,"(Persistent Slab,)"


In [38]:
# df.to_pickle(f'./data/avy_data_old.pkl')