In [8]:
import requests
from bs4 import BeautifulSoup
import re
import datetime
from tqdm.notebook import tqdm
import pandas as pd

Could just use this:
https://api.avalanche.org/v2/public/products?avalanche_center_id=SAC&date_start=2021-09-01&date_end=2022-08-31
https://api.avalanche.org/v2/public/avalanche-center/SAC

In [2]:
# Site root; relative hrefs scraped from the pages are appended to this.
BASE_URL = "https://www.sierraavalanchecenter.org"
# Path of the first archive listing page (relative to BASE_URL).
URL = "/prior-jan-6-2021/archive"

# Captures a digits-digits-digits token anchored at the end of the text; it is
# applied to the link text of each archive row to pull out the forecast date.
prog = re.compile(r"(\d+\-\d+\-\d+)$")
# Captures the problem name following an "Avalanche Problem <digit>: " heading.
# Raw string: "\d", "\w" and "\s" are regex escapes, not string escapes, and
# writing them in a plain string literal triggers an invalid-escape warning.
problem_filter = re.compile(r"Avalanche Problem \d: ([\w\s]*)")

In [3]:
# https://www.sierraavalanchecenter.org//sites/all/themes/responsive_sac/img/rating-icons/6.png
def get_rating(img):
    """Translate a danger-rating icon into its textual rating label.

    The rating is encoded in the icon's file name, i.e. the last path segment
    of the element's ``src`` attribute (``.../rating-icons/3.png`` ->
    ``'considerable'``).

    Args:
        img: The icon element of an archive row (any object exposing
            ``.get('src')``), or ``None`` when the row has no icon.

    Returns:
        The rating label, or ``None`` when there is no icon, no ``src``, or
        the file name is not a known rating (e.g. ``'.png'``).
    """
    ratings = {
        '1.png': 'low',
        '2.png': 'moderate',
        '3.png': 'considerable',
        '4.png': 'high',
        # 'extream' (sic): spelling kept so labels already produced with this
        # function do not change.
        '5.png': 'extream',
        '0.png': 'na',
        '6.png': 'na',
    }
    if img is None:
        return None
    # A missing src collapses to an empty file name, which is simply unknown.
    icon_name = (img.get('src') or '').split('/')[-1]
    rating = ratings.get(icon_name)
    if rating is None:
        # Diagnostic: show the markup two levels up from the icon when it is
        # available, otherwise fall back to the icon itself.
        container = getattr(getattr(img, 'parent', None), 'parent', None)
        print(container if container is not None else img)
    return rating

In [4]:
def scrape_problems(url, timeout=30):
    """Fetch one forecast page and list the avalanche problem types on it.

    Args:
        url: Absolute URL of the forecast page.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the whole scrape.

    Returns:
        A list with one problem name per text node that matches
        ``problem_filter`` (the first match in each node), in document order.
        Empty when the page has no matching text.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were forecasts.
    page.raise_for_status()
    problem_soup = BeautifulSoup(page.content, "html.parser")
    # `string=` is the current name of the deprecated `text=` argument.
    problems = problem_soup.find_all(string=problem_filter)
    # Every node was selected by this same pattern, so the search cannot miss.
    return [problem_filter.search(problem).group(1) for problem in problems]

In [5]:
def scrape_url(data, url, timeout=30):
    """Scrape one archive listing page and append its forecasts to ``data``.

    For each table row the forecast link, the date at the end of the link
    text, and the danger rating icon are read. Rows whose rating cannot be
    determined are skipped; for the others the linked forecast page is fetched
    to collect its avalanche problems.

    Args:
        data: List that is extended in place with
            ``[date, rating, problem_url, problems]`` entries.
        url: Absolute URL of the archive listing page.
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        ``(data, next_url)`` where ``next_url`` is the href of the "next"
        pager link, or ``False`` when the page has no such link.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.
    """
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    table_body = soup.find('tbody')
    # A page without a table simply contributes no rows.
    rows = table_body.find_all("tr") if table_body is not None else []
    for row in rows:
        td = row.find('td')
        report = td.find('a').get('href')
        date_str = prog.findall(td.find('strong').text)[0]
        date = datetime.datetime.strptime(date_str, '%Y-%m-%d')
        img = td.find("img")
        rating = get_rating(img) if img is not None else None
        if not rating:
            # The row would be dropped anyway; skip the extra page download.
            continue
        problem_url = BASE_URL + report
        try:
            problems = scrape_problems(problem_url)
        except Exception:
            # Best effort: keep the row even if its forecast page fails.
            # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt through.
            problems = ['NA']
        data.append([date, rating, problem_url, problems])
    pager = soup.find('li', class_='pager-next')
    pager_link = pager.find('a') if pager is not None else None
    next_url = pager_link.get('href') if pager_link is not None else False
    return data, next_url

In [6]:
# Follow the "next" pager link from the first archive page until there is none,
# accumulating every kept forecast row in `data`.
data = []
url = URL
# total=22 is hard-coded; presumably the number of archive pages when this was
# written -- TODO confirm.
with tqdm(total=22) as progress:
    while url:
        listing_url = BASE_URL + url
        data, url = scrape_url(data, listing_url)
        progress.update()

  0%|          | 0/22 [00:00<?, ?it/s]

<td class="views-field views-field-title active">
<a href="/content/2012-04-29-070032"><strong>Click here to see the full forecast for 2012-04-29</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2011-11-18-023128"><strong>Click here to see the full forecast for 2011-11-18</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2011-11-10-094107"><strong>Click here to see the full forecast for 2011-11-10</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2011-11-

<td class="views-field views-field-title active">
<a href="/content/2008-04-08-120000"><strong>Click here to see the full forecast for 2008-04-08</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2008-04-07-120000"><strong>Click here to see the full forecast for 2008-04-07</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2008-04-06-120000"><strong>Click here to see the full forecast for 2008-04-06</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2008-04-

<td class="views-field views-field-title active">
<a href="/content/2008-03-12-120000"><strong>Click here to see the full forecast for 2008-03-12</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2008-03-11-120000"><strong>Click here to see the full forecast for 2008-03-11</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2008-03-10-120000"><strong>Click here to see the full forecast for 2008-03-10</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2008-03-

<td class="views-field views-field-title active">
<a href="/content/2008-02-14-010000"><strong>Click here to see the full forecast for 2008-02-14</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2008-02-13-010000"><strong>Click here to see the full forecast for 2008-02-13</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2008-02-12-010000"><strong>Click here to see the full forecast for 2008-02-12</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2008-02-

<td class="views-field views-field-title active">
<a href="/content/2008-01-18-010000"><strong>Click here to see the full forecast for 2008-01-18</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2008-01-17-010000"><strong>Click here to see the full forecast for 2008-01-17</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2008-01-16-010000"><strong>Click here to see the full forecast for 2008-01-16</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2008-01-

<td class="views-field views-field-title active">
<a href="/content/2007-12-19-010000"><strong>Click here to see the full forecast for 2007-12-19</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-12-18-010000"><strong>Click here to see the full forecast for 2007-12-18</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-12-17-010000"><strong>Click here to see the full forecast for 2007-12-17</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-12-

<td class="views-field views-field-title active">
<a href="/content/2007-04-07-120000"><strong>Click here to see the full forecast for 2007-04-07</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-04-06-120000"><strong>Click here to see the full forecast for 2007-04-06</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-04-05-120000"><strong>Click here to see the full forecast for 2007-04-05</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-04-

<td class="views-field views-field-title active">
<a href="/content/2007-03-11-010000"><strong>Click here to see the full forecast for 2007-03-11</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-03-10-010000"><strong>Click here to see the full forecast for 2007-03-10</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-03-09-010000"><strong>Click here to see the full forecast for 2007-03-09</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-03-

<td class="views-field views-field-title active">
<a href="/content/2007-02-10-010000"><strong>Click here to see the full forecast for 2007-02-10</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-02-09-010000"><strong>Click here to see the full forecast for 2007-02-09</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-02-08-010000"><strong>Click here to see the full forecast for 2007-02-08</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-02-

<td class="views-field views-field-title active">
<a href="/content/2007-01-12-010000"><strong>Click here to see the full forecast for 2007-01-12</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-01-11-010000"><strong>Click here to see the full forecast for 2007-01-11</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-01-10-010000"><strong>Click here to see the full forecast for 2007-01-10</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2007-01-

<td class="views-field views-field-title active">
<a href="/content/2006-12-15-010000"><strong>Click here to see the full forecast for 2006-12-15</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2006-12-14-010000"><strong>Click here to see the full forecast for 2006-12-14</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2006-12-13-010000"><strong>Click here to see the full forecast for 2006-12-13</strong></a><br/><a href="http://www.avalanche.org/danger_card.php" target="_blank"><img src="/sites/all/themes/responsive_sac/img/rating-icons/.png"/></a> </td>
<td class="views-field views-field-title active">
<a href="/content/2006-12-

In [9]:
# Tabulate the scraped rows, indexed by forecast date.
df = pd.DataFrame(
    data,
    columns=['date', 'rating', 'report', 'problems'],
).set_index('date')

In [10]:
# Preview the first rows of the scraped table.
df.head()

Unnamed: 0_level_0,rating,report,problems
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-01-08,moderate,https://www.sierraavalanchecenter.org/advisory...,"[Wind Slab, Persistent Slab]"
2021-01-06,moderate,https://www.sierraavalanchecenter.org/advisory...,"[Persistent Slab, Wind Slab]"
2021-01-05,considerable,https://www.sierraavalanchecenter.org/advisory...,"[Persistent Slab, Wind Slab]"
2021-01-04,high,https://www.sierraavalanchecenter.org/advisory...,"[Persistent Slab, Wind Slab, Storm Slab]"
2021-01-03,moderate,https://www.sierraavalanchecenter.org/advisory...,[Persistent Slab]


In [11]:
# How many forecasts fall under each rating label.
df['rating'].value_counts()

moderate        928
low             539
considerable    259
high             62
na               56
extream           2
Name: rating, dtype: int64

In [None]:
# values = [0, 1, 2, 3, 4, 5]
# conditions = (
#     df['rating'] == 'na',
#     df['rating'] == 'low',
#     df['rating'] == 'moderate',
#     df['rating'] == 'considerable',
#     df['rating'] == 'high',
#     df['rating'] == 'extream',
# )
# df['num_rating'] = np.select(conditions, values)

In [None]:
# df.head()

In [None]:
# df.num_rating.plot.line(figsize=(20,5))

In [12]:
# Write the table (date index included) to a CSV under ./data.
df.to_csv('./data/avy_rating.csv')

In [None]:
# NOTE(review): presumably a deliberate stop so "Run All" halts before the
# scratch cells below -- confirm. Asserts are skipped when Python runs with -O.
assert False

In [None]:
# Scratch cell: parse only the first archive page, without following the pager
# and without fetching the per-forecast problem pages.
test_data = []
url = BASE_URL + URL
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
rows = soup.find('tbody').find_all("tr")
for row in rows:
    td = row.find('td')
    # Relative link to the full forecast.
    report = td.find('a').get('href')
    # The date is the trailing token of the link text.
    date_str = prog.findall(td.find('strong').text)[0]
    date = datetime.datetime.strptime(date_str, '%Y-%m-%d')
    img = td.find("img")
    rating = get_rating(img)
    # Rows whose icon is not a known rating come back as None and are dropped.
    if rating:
        test_data.append([date, rating, BASE_URL + report])

In [None]:
# Display the rows parsed from the first archive page.
test_data

In [None]:
# Display the complete scraped list (one entry per kept forecast row).
data