In [1]:
import asyncio
import aiohttp
from aiohttp.resolver import AsyncResolver
from collections import namedtuple, OrderedDict
from datetime import datetime
from dateutil.parser import parse

import re
import pyquery
import pendulum
import pytz
import vobject

from html2text import html2text
from recurrent import RecurringEvent
from pyquery import PyQuery as pq
from tqdm import tqdm_notebook

In [2]:
# strftime-style pattern that yields an already URL-encoded "Month, Year":
# "%%2C" renders as the literal "%2C" (comma) and "%%20" as "%20" (space).
MONTH_QUERY_FORMAT = '%B%%2C%%20%Y'

# The listing is fetched for the current and the following month.
now = pendulum.now('America/Chicago')
next_month = now.add(months=1)
month_reprs = [
    moment.format(MONTH_QUERY_FORMAT, locale='en')
    for moment in (now, next_month)
]
month_reprs

['January%2C%202017', 'February%2C%202017']

In [3]:
def _event_list_url(month_repr):
    """Return the printable event-list URL for one URL-encoded month label."""
    return f'http://dbbs.wustl.edu/Pages/Print-Events.aspx?dt={month_repr}'


# One listing page per month label, in the same order as `month_reprs`.
event_list_urls = list(map(_event_list_url, month_reprs))
event_list_urls

['http://dbbs.wustl.edu/Pages/Print-Events.aspx?dt=January%2C%202017',
 'http://dbbs.wustl.edu/Pages/Print-Events.aspx?dt=February%2C%202017']

In [4]:
async def parse_event_id(session, url):
    """Download one event-list page and return the event IDs it links to.

    Args:
        session: an aiohttp client session (anything whose ``get(url)``
            returns an async context manager with an awaitable ``text()``).
        url: address of the printable event-list page.

    Returns:
        list[str]: the digits captured from ``?EvID=<digits>`` in each
        matching anchor's ``href``, in document order.  Anchors without an
        ``href`` or without an ``EvID`` query parameter are skipped instead
        of aborting the whole page.
    """
    # `async with` releases the connection back to the pool even when
    # reading or parsing the body raises.
    async with session.get(url) as r:
        d = pq(await r.text())

    event_ids = []
    for a in d('#calendar_list li > h1 a'):
        match = re.search(r'\?EvID=(\d+)', a.attrib.get('href', ''))
        if match is not None:
            event_ids.append(match.group(1))
    return event_ids
        
async def retrieve_all_event_ids(event_list_urls, conn):
    """Scrape every listing page concurrently and pool the event IDs.

    Args:
        event_list_urls: iterable of event-list page URLs.
        conn: aiohttp connector handed to the ``ClientSession``.

    Returns:
        list: all IDs returned by ``parse_event_id``, concatenated in the
        order of ``event_list_urls`` (not in completion order).
    """
    loop = asyncio.get_event_loop()
    async with aiohttp.ClientSession(connector=conn) as session:
        pending = []
        for list_url in event_list_urls:
            pending.append(loop.create_task(parse_event_id(session, list_url)))

        # Awaiting in completion order only drives the progress bar; the
        # results themselves are read from the tasks afterwards.
        progress = tqdm_notebook(asyncio.as_completed(pending), total=len(pending))
        for finished in progress:
            await finished

        collected = [
            event_id
            for task in pending
            for event_id in task.result()
        ]
    return collected

In [5]:
%%time
# Collect the event IDs from every listing page in `event_list_urls`.
# DNS lookups go through aiohttp's AsyncResolver with an explicit nameserver
# list instead of the default resolver.
resolver = AsyncResolver(nameservers=["8.8.8.8", "8.8.4.4"])
# limit=5 is the connector's connection limit — presumably caps the scrape at
# five simultaneous connections; confirm against the aiohttp version in use.
conn = aiohttp.TCPConnector(limit=5, resolver=resolver)

# Drive the coroutine to completion from synchronous notebook code.
# NOTE(review): run_until_complete raises RuntimeError if this loop is already
# running (as on newer Jupyter kernels) — confirm against the target kernel.
loop = asyncio.get_event_loop()
event_ids = loop.run_until_complete(retrieve_all_event_ids(event_list_urls, conn))


CPU times: user 97 ms, sys: 19 ms, total: 116 ms
Wall time: 1.25 s


In [6]:
# pytz zone used to localize the naive start/end times scraped from the site.
# Despite the name it is the full America/Chicago zone, not a fixed CST offset.
cst = pytz.timezone('America/Chicago')

In [7]:
# Flat record for one scraped calendar event.  The field order below is the
# positional order used when an Event is constructed.
Event = namedtuple(
    'Event',
    [
        'title', 'location', 'start', 'end',
        'speaker', 'description', 'last_modified', 'recurrence',
        'event_id', 'link',
    ],
)

async def fetch_event(session, event_id):
    """Fetch one event's detail form and convert it into an ``Event``.

    The detail page renders the event as a two-column form table
    (``.ms-formtable``): field label in the first cell, value in the second.

    Args:
        session: an aiohttp client session.
        event_id: the event's numeric ID (as scraped from the listing page).

    Returns:
        Event: ``start``/``end`` are parsed month-first, any timezone in the
        text is ignored, and the result is localized with the module-level
        ``cst`` zone.  ``speaker`` is a newline-joined list of
        "name, affiliation" entries for up to four speakers.
        ``last_modified`` is whatever ``dateutil`` parses from the page
        footer; no timezone is attached here.

    Raises:
        KeyError: if a required field (Title, Comments, Recurrence,
            Start Time, End Time, Location) is absent from the form.
        AttributeError: if the "Last modified at ... by" footer is not found.
    """
    # `async with` releases the connection back to the pool even when
    # reading or parsing the body raises.
    async with session.get(
        f'http://dbbs.wustl.edu/Lists/Events/DispForm.aspx?ID={event_id}'
    ) as r:
        d = pq(await r.text())

    table_rows = OrderedDict()
    for tr in d('.ms-formtable tr'):
        cells = [pq(elem).text() for elem in pq(tr)('td')]
        # Rows without a label/value pair (e.g. spacer rows or rows of a
        # table nested inside a rich-text value) carry no field; skip them
        # rather than failing the whole event.
        if len(cells) < 2:
            continue
        table_rows[cells[0]] = cells[1]

    title = table_rows['Title']
    desc = table_rows['Comments']
    if desc:
        desc = html2text(desc)
    recurrence = table_rows['Recurrence']
    start, end = [
        cst.localize(parse(time_str, dayfirst=False, ignoretz=True))
        for time_str in (table_rows['Start Time'], table_rows['End Time'])
    ]
    location = table_rows['Location']

    # Speakers: the form has one name column and one affiliation column per
    # speaker slot.  Slots are optional, so a missing column is treated the
    # same as an empty one.
    speaker_info = []
    for speaker_order in ['', '2nd ', '3rd ', '4th ']:
        name = table_rows.get(f'{speaker_order}Speaker/Honorific')
        if not name:
            continue
        affiliation = table_rows.get(f'{speaker_order}Speaker Affiliation')
        speaker_info.append(f'{name}, {affiliation}' if affiliation else name)
    speaker = '\n'.join(speaker_info)

    last_modified = parse(
        re.match(r'^Last modified at (.*) by ', d('#onetidinfoblock2').text()).group(1)
    )
    link = f'http://dbbs.wustl.edu/Resources/Pages/calendar_event.aspx?EvID={event_id}'
    return Event(title, location, start, end, speaker, desc, last_modified, recurrence, event_id, link)
        
async def fetch_all_events(event_ids, conn):
    """Fetch the detail page of every event concurrently.

    Args:
        event_ids: iterable of event IDs to fetch.
        conn: aiohttp connector handed to the ``ClientSession``.

    Returns:
        list: one ``fetch_event`` result per ID, in the order of
        ``event_ids`` (not in completion order).
    """
    loop = asyncio.get_event_loop()
    async with aiohttp.ClientSession(connector=conn) as session:
        pending = []
        for event_id in event_ids:
            pending.append(loop.create_task(fetch_event(session, event_id)))

        # Awaiting in completion order only drives the progress bar; the
        # results themselves are read from the tasks afterwards.
        progress = tqdm_notebook(asyncio.as_completed(pending), total=len(pending))
        for finished in progress:
            await finished

        events = [task.result() for task in pending]
    return events

In [8]:
%%time
# Download and parse the detail page of every event in `event_ids`.
# A fresh resolver/connector pair is built here rather than reusing the one
# from the ID-scraping cell.
# NOTE(review): presumably because the earlier ClientSession closed its
# connector on exit — confirm against the aiohttp version in use.
resolver = AsyncResolver(nameservers=["8.8.8.8", "8.8.4.4"])
conn = aiohttp.TCPConnector(limit=5, resolver=resolver)

# Drive the coroutine to completion from synchronous notebook code.
# NOTE(review): run_until_complete raises RuntimeError if this loop is already
# running (as on newer Jupyter kernels) — confirm against the target kernel.
loop = asyncio.get_event_loop()
event_infos = loop.run_until_complete(fetch_all_events(event_ids, conn))


CPU times: user 2.77 s, sys: 71.3 ms, total: 2.85 s
Wall time: 7.02 s


In [9]:
# Empty iCalendar container; the export loop adds one VEVENT per scraped event.
cal = vobject.iCalendar()

# Adds a calendar-level "TZID" content line with the Central-time zone name.
# NOTE(review): TZID is normally a property parameter / VTIMEZONE property,
# not a VCALENDAR property; if the intent is a default calendar timezone,
# "X-WR-TIMEZONE" may be what clients actually read — confirm before changing.
cal.add('TZID').value = 'America/Chicago'

In [10]:
# Convert every scraped Event into a VEVENT on `cal`.
# Each pass appends to `cal`, so re-running this cell without recreating `cal`
# adds every event a second time.
for e in event_infos:
    event = cal.add('vevent')
    event.add('summary').value = e.title
    event.add('description').value = f'''\
Speakers:
{e.speaker}

Description:
{e.description}

Link: {e.link}'''
    event.add('location').value = e.location
    event.add('dtstart').value = e.start
    if e.recurrence:
        # For a recurring series the scraped end time is used as the end of
        # the whole series (UNTIL), so a single occurrence ends on the start
        # date at the end's time of day.
        rre = RecurringEvent(now_date=e.start)
        rre.parse(e.recurrence)
        rre.until = e.end
        event.add('rrule').value = rre.get_RFC_rrule()[len('RRULE:'):]
        event.add('dtend').value = cst.localize(datetime.combine(e.start.date(), e.end.time()))
    else:
        event.add('dtend').value = e.end
    # DTSTAMP must be a UTC date-time (RFC 5545).  A naive datetime would be
    # serialized as a floating time, so attach a zone and let vobject convert
    # it to UTC.  Assumes the site's "Last modified" text is in Central time,
    # like its start/end times — TODO confirm.
    stamp = e.last_modified
    if stamp.tzinfo is None:
        stamp = cst.localize(stamp)
    event.add('dtstamp').value = stamp
    event.add('uid').value = f'WUSTL DBBS Event Calendar - Event ID {e.event_id}'

In [11]:
# Write the calendar to disk.
# encoding='utf-8': iCalendar files default to UTF-8; without it, the platform
#   locale encoding is used and non-ASCII titles/descriptions can fail or be
#   mis-encoded.
# newline='': the serialized text already uses CRLF line endings; disabling
#   newline translation stops Windows from turning them into CR CR LF.
with open('out.ics', 'w', encoding='utf-8', newline='') as f:
    f.write(cal.serialize())