In [12]:
import asyncio
import aiohttp
from aiohttp.resolver import AsyncResolver
from collections import namedtuple, OrderedDict
from datetime import datetime
from dateutil.parser import parse

import re
import pyquery
import pendulum
import pytz
import vobject

from html2text import html2text
from recurrent import RecurringEvent
from pyquery import PyQuery as pq
from tqdm import tqdm_notebook

In [None]:
# Example of the paged event-list URL that the cells below generate and scrape:
# http://dbbs.wustl.edu/Lists/Events/MyItems.aspx?Paged=Next&p_StartTimeUTC=20191101T000000Z&View={860A9BEF-2D95-4524-8732-DB3328186BCB}

In [22]:
# Build one "first day of the month" timestamp per month to scrape:
# the current month (shift 0) and the following month (shift 1).
now = pendulum.now('America/Chicago')
month_shifts = [0, 1]
# Raw string: the backslash escapes the literal ``Z`` for pendulum's formatter.
# In a normal string ``'\Z'`` is an invalid Python escape sequence, which emits
# a DeprecationWarning (SyntaxWarning on newer interpreters). The resulting
# format text is unchanged.
timestamps = [
    now.add(months=mo_shift).format(r'YYYYMM01T000000\Z')
    for mo_shift in month_shifts
]
timestamps

['20191001T000000Z', '20191101T000000Z']

In [23]:
# One list-view URL per month-start timestamp; the page is selected through
# the ``p_StartTimeUTC`` query parameter.
event_list_urls = []
for timestamp in timestamps:
    event_list_urls.append(
        'http://dbbs.wustl.edu/Lists/Events/MyItems.aspx?Paged=Next'
        f'&p_StartTimeUTC={timestamp}'
        '&View={860A9BEF-2D95-4524-8732-DB3328186BCB}'
    )
event_list_urls

['http://dbbs.wustl.edu/Lists/Events/MyItems.aspx?Paged=Next&p_StartTimeUTC=20191001T000000Z&View={860A9BEF-2D95-4524-8732-DB3328186BCB}',
 'http://dbbs.wustl.edu/Lists/Events/MyItems.aspx?Paged=Next&p_StartTimeUTC=20191101T000000Z&View={860A9BEF-2D95-4524-8732-DB3328186BCB}']

In [40]:
async def parse_event_id(session, url):
    """Download one event list page and return the event IDs linked from it.

    Parameters
    ----------
    session : aiohttp.ClientSession
        Open session used to issue the GET request.
    url : str
        URL of an event list page.

    Returns
    -------
    list of str
        The digits captured from ``&ID=<digits>`` in the ``href`` of every
        anchor matched by ``.ms-vb.itx a``, in document order. Anchors whose
        ``href`` is missing or carries no such parameter are skipped.
    """
    # ``async with`` guarantees the response is released even if reading or
    # decoding the body raises.
    async with session.get(url) as r:
        d = pq(await r.text())

    event_ids = []
    for a in d('.ms-vb.itx a'):
        # ``.get`` instead of ``['href']``: an anchor without an href would
        # otherwise abort the whole page with a KeyError.
        m = re.search(r'&ID=(\d+)', a.attrib.get('href', ''))
        if m:
            event_ids.append(m.group(1))
    return event_ids
        
async def retrieve_all_event_ids(event_list_urls, conn):
    """Scrape all event list pages concurrently and collect their event IDs.

    Parameters
    ----------
    event_list_urls : list of str
        URLs of the list pages to scrape.
    conn : aiohttp.BaseConnector
        Connector handed to the ``aiohttp.ClientSession`` created here.
        NOTE(review): the session is created with default ownership, so the
        connector is presumably closed when this function returns; the
        notebook builds a fresh one before the next batch of requests.

    Returns
    -------
    list of str
        Event IDs ordered by page (following ``event_list_urls``) and then by
        position on the page, with repeated IDs removed.
    """
    loop = asyncio.get_event_loop()
    async with aiohttp.ClientSession(connector=conn) as session:
        tasks = [loop.create_task(parse_event_id(session, url)) for url in event_list_urls]
        # as_completed drives the progress bar; results are read from the
        # tasks afterwards so that they stay in page order.
        for f in tqdm_notebook(asyncio.as_completed(tasks), total=len(tasks)):
            await f
        event_ids = [event_id for t in tasks for event_id in t.result()]
    # Consecutive list pages can show the same event. Keeping a repeated ID
    # would fetch that event twice and later produce two VEVENTs with the
    # same UID. OrderedDict.fromkeys keeps the first occurrence of each ID.
    return list(OrderedDict.fromkeys(event_ids))

In [41]:
# Resolve host names through the public DNS servers 8.8.8.8 / 8.8.4.4
# instead of the system resolver.
resolver = AsyncResolver(nameservers=["8.8.8.8", "8.8.4.4"])
# Cap the number of simultaneous connections opened against the site at 5.
conn = aiohttp.TCPConnector(limit=5, resolver=resolver)

# Top-level ``await``: this runs on the event loop the notebook kernel
# already provides, so no explicit loop handling is needed here.
event_ids = await retrieve_all_event_ids(event_list_urls, conn)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




In [43]:
# Timezone attached to the naive start/end times scraped from the event pages.
cst = pytz.timezone('America/Chicago')

In [46]:
# Immutable record holding everything scraped from a single event page.
Event = namedtuple(
    'Event',
    [
        'title',
        'location',
        'start',
        'end',
        'speaker',
        'description',
        'last_modified',
        'recurrence',
        'event_id',
        'link',
    ],
)

async def fetch_event(session, event_id):
    """Fetch one event's display form and parse it into an ``Event``.

    Parameters
    ----------
    session : aiohttp.ClientSession
        Open session used to issue the GET request.
    event_id : str
        Event ID as scraped from the event list pages.

    Returns
    -------
    Event
        ``start`` and ``end`` are timezone-aware datetimes localized with
        ``cst``. ``speaker`` holds one line per speaker, formatted
        "name, affiliation" when an affiliation is present. ``last_modified``
        is whatever ``dateutil`` parses from the page footer, or ``None`` when
        the footer is absent or its date cannot be parsed.

    Raises
    ------
    KeyError
        If the form lacks one of the mandatory rows: 'Title', 'Comments',
        'Recurrence', 'Start Time', 'End Time' or 'Location'.
    """
    # ``async with`` guarantees the response is released even if reading or
    # decoding the body raises.
    async with session.get(f'http://dbbs.wustl.edu/Lists/Events/DispForm.aspx?ID={event_id}') as r:
        d = pq(await r.text())

    # Map every form row's label (first cell) to its value (second cell).
    table_rows = OrderedDict()
    for tr in d('.ms-formtable tr'):
        cells = [pq(elem).text() for elem in pq(tr)('td')]
        if len(cells) < 2:
            # A row with fewer than two cells has no label/value pair;
            # unpacking it would raise ValueError and lose the whole event.
            continue
        table_rows[cells[0]] = cells[1]

    title = table_rows['Title']
    desc = table_rows['Comments'].rstrip()
    if desc:
        desc = html2text(desc).rstrip()
    recurrence = table_rows['Recurrence']
    # The page's times are parsed month-first, any timezone text is ignored,
    # and the result is interpreted in the ``cst`` timezone.
    start, end = [
        cst.localize(parse(time_str, dayfirst=False, ignoretz=True))
        for time_str in (table_rows['Start Time'], table_rows['End Time'])
    ]
    location = table_rows['Location']

    # Up to four speakers; a missing or empty slot is skipped rather than
    # raising KeyError.
    speaker_info = []
    for speaker_order in ['', '2nd ', '3rd ', '4th ']:
        name = table_rows.get(f'{speaker_order}Speaker/Honorific')
        if not name:
            continue
        affiliation = table_rows.get(f'{speaker_order}Speaker Affiliation')
        speaker_info.append(f'{name}, {affiliation}' if affiliation else name)
    speaker = '\n'.join(speaker_info)

    # The modification time is optional metadata: a missing footer or an
    # unparseable date yields None instead of discarding the event.
    last_modified = None
    m = re.match(r'^Last modified at (.*) by ', d('#onetidinfoblock2').text())
    if m:
        try:
            last_modified = parse(m.group(1))
        except (ValueError, OverflowError):
            last_modified = None

    link = f'http://dbbs.wustl.edu/Resources/Pages/calendar_event.aspx?EvID={event_id}'
    return Event(title, location, start, end, speaker, desc, last_modified, recurrence, event_id, link)
        
async def fetch_all_events(event_ids, conn):
    """Fetch and parse every event concurrently.

    Parameters
    ----------
    event_ids : list of str
        IDs of the events to download.
    conn : aiohttp.BaseConnector
        Connector handed to the ``aiohttp.ClientSession`` created here.

    Returns
    -------
    list of Event
        One parsed event per ID, in the same order as ``event_ids``.
    """
    event_loop = asyncio.get_event_loop()
    async with aiohttp.ClientSession(connector=conn) as session:
        pending = []
        for event_id in event_ids:
            pending.append(event_loop.create_task(fetch_event(session, event_id)))

        # Await in completion order purely to advance the progress bar.
        progress = tqdm_notebook(asyncio.as_completed(pending), total=len(pending))
        for finished in progress:
            await finished

        # Read the results back in submission order.
        events = []
        for task in pending:
            events.append(task.result())
        return events

In [47]:
# A new resolver/connector pair is created for this batch of requests; the
# connector passed to the earlier session is not reused.
resolver = AsyncResolver(nameservers=["8.8.8.8", "8.8.4.4"])
# Cap the number of simultaneous connections opened against the site at 5.
conn = aiohttp.TCPConnector(limit=5, resolver=resolver)

# Download and parse the detail page of every collected event ID.
event_infos = await fetch_all_events(event_ids, conn)

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [48]:
# Calendar container that the following cell fills with one VEVENT per event.
cal = vobject.iCalendar()
# NOTE(review): this adds a calendar-level ``TZID`` property. RFC 5545 defines
# TZID as a VTIMEZONE property / date-time parameter, and calendar-wide
# timezone hints are usually spelled ``X-WR-TIMEZONE`` — confirm what the
# consuming calendar clients expect before changing it.
cal.add('TZID').value = 'America/Chicago'

In [54]:
# Make this cell idempotent: re-running it used to append a second copy of
# every event to ``cal``. Discard VEVENTs left over from a previous run.
cal.contents.pop('vevent', None)

for e in event_infos:
    event = cal.add('vevent')
    event.add('summary').value = e.title

    # Compose the event detail: optional speakers and description, then the link.
    event_description = []
    if e.speaker:
        event_description.append(f'Speakers:\n{e.speaker}')
    if e.description:
        event_description.append(f'Description:\n{e.description}')
    event_description.append(f'Link: {e.link}')
    event.add('description').value = '\n\n'.join(event_description)

    event.add('location').value = e.location
    event.add('dtstart').value = e.start
    if e.recurrence:
        # For a recurring event the scraped end time bounds the whole series
        # (UNTIL), while each occurrence ends on the start date at the end's
        # time of day.
        rre = RecurringEvent(now_date=e.start)
        rre.parse(e.recurrence)
        rre.until = e.end
        # The generated text can carry a ``DTSTART:`` line before the rule, so
        # take what follows the 'RRULE:' marker rather than slicing off a
        # fixed-length prefix.
        event.add('rrule').value = rre.get_RFC_rrule().split('RRULE:', 1)[-1]
        event.add('dtend').value = cst.localize(datetime.combine(e.start.date(), e.end.time()))
    else:
        event.add('dtend').value = e.end

    if e.last_modified is not None:
        stamp = e.last_modified
        if stamp.tzinfo is None:
            # DTSTAMP has to be an absolute (UTC) time; a naive datetime would
            # be written as a floating time.
            # NOTE(review): assumes the page shows the modification time in
            # America/Chicago, like the start/end times — confirm.
            stamp = cst.localize(stamp)
        event.add('dtstamp').value = stamp
    event.add('uid').value = f'{e.event_id}@events.dbbs.wustl'

In [55]:
# Write the calendar as UTF-8 regardless of the platform's default encoding.
# ``newline=''`` disables newline translation so the line endings produced by
# ``serialize()`` reach the file unchanged (no ``\r\r\n`` on Windows).
with open('../output/DBBS.ics', 'w', encoding='utf-8', newline='') as f:
    f.write(cal.serialize())