In [1]:
import re
import feedparser
from bs4 import BeautifulSoup

In [2]:
# Address of the DBBS RSS feed, written as base path plus one query
# parameter per line (adjacent literals are joined into a single string).
RSS_LINK = (
    'http://dbbs.wustl.edu/_layouts/feed.aspx'
    '?xsl=1'
    '&web=%2F'
    '&page=af99a055-81ee-4d9e-922b-16659022a22a'
    '&wp=0b079397-29e7-45a3-bc9a-1936ceac62e9'
    '&pageurl=%2FPages%2FRssFeed%2Easpx'
)

In [3]:
# Parse the RSS feed at RSS_LINK; the per-event items are read from
# feed['entries'] in the next cell.
feed = feedparser.parse(RSS_LINK)

In [4]:
# Turn every feed entry into a flat dict with the keys
# link, event_id, title, published, summary, start_date and location.

# Patterns are loop-invariant, so define them once.
event_id_parser = r'\?EvID=(\d+)'
# `\s*` also swallows the line break after the label, so the captured value
# is the rest of the following line (the summary puts values on their own line).
start_parser = r'Start Event:\s*(?P<start>.*)'
location_parser = r'Location:\s*(?P<location>.*)'

parsed_entries = []
for entry in feed['entries']:
    # The event ID is required later to fetch the event's detail page. A link
    # without one would make `.group(1)` fail on None and abort the whole
    # loop, so report the entry and move on instead.
    m = re.search(event_id_parser, entry['link'])
    if m is None:
        print(f"Skipping entry without an EvID in its link: {entry['link']}")
        continue

    parsed_entry = {
        'link': entry['link'],
        'event_id': m.group(1),
        'title': entry['title'],
        'published': entry['published'],
    }

    # parse summary: strip the HTML, trim spaces/tabs from each line and
    # replace non-breaking spaces with ordinary ones
    soup = BeautifulSoup(entry['summary'], "lxml")
    parsed_summary = soup.get_text()
    clean_summary = '\n'.join(
        line.strip(' \t').replace('\xa0', ' ')
        for line in parsed_summary.splitlines()
    )
    parsed_entry['summary'] = clean_summary

    # extract start date and location; fall back to the publication
    # timestamp / an empty string when the summary does not carry them
    m = re.search(start_parser, clean_summary)
    parsed_entry['start_date'] = m.group('start') if m else entry['published']

    m = re.search(location_parser, clean_summary)
    parsed_entry['location'] = m.group('location') if m else ''

    parsed_entries.append(parsed_entry)

In [5]:
# Preview the first three parsed entries.
parsed_entries[:3]

[{'event_id': '5086',
  'link': 'http://dbbs.wustl.edu/Resources/Pages/calendar_event.aspx?EvID=5086',
  'location': 'Danforth Campus, Danforth University Center, room 300 (Liberman Graduate Center)',
  'published': 'Mon, 09 Jan 2017 21:15:36 GMT',
  'start_date': '2017-03-21 16:00:00',
  'summary': 'Start Event:\n2017-03-21 16:00:00\nLocation:\nDanforth Campus, Danforth University Center, room 300 (Liberman Graduate Center)\nDescription:\n \nIncreasing Student Engagement Among PeersTuesday, March 21, 2017\n4 pm, Liberman Graduate Center (Danforth Campus)This hour long seminar will feature two guests from Student Affairs.  Vice Chancellor Lori White and Associate Dean James Parker will provide insight for graduate students that are participating in group and choosing leadership rolls to enhance peer engagement and opportunity within their environment.  Please RSVP to attend this event.\n \nRSVP and more information: https://gradcenter.wustl.edu/programming/leadership-seminar-seriesThis

In [6]:
from datetime import datetime, timedelta
from dateutil.parser import parse
import pytz
import vobject

In [7]:
# Grab the first parsed entry as a sample. Note that the name `e` is rebound
# by the `for e, ...` loop in the calendar-building cell.
e = parsed_entries[0]

In [8]:
# Time zone objects from pytz; `cst` is what the naive event times get
# localized to. The pendulum equivalents are kept here for reference:
# cst = pendulum.timezone('US/Central')
# utc = pendulum.timezone('UTC')
cst = pytz.timezone('America/Chicago')  # ('US/Central')
utc = pytz.utc

### Playing with pendulum

https://pendulum.eustace.io/

In [9]:
import pendulum

In [10]:
# Current time in the US/Central zone, as returned by pendulum.
now = pendulum.now('US/Central')

In [11]:
# Render the timestamp as an ISO 8601 string (the output includes the UTC offset).
now.to_iso8601_string()

'2017-01-09T18:46:24-06:00'

In [12]:
import asyncio

In [13]:
import aiohttp
from aiohttp.resolver import AsyncResolver

from html2text import html2text
from datetime import datetime
from recurrent import RecurringEvent

In [14]:
from pyquery import PyQuery as pq
from tqdm import tqdm_notebook

In [20]:
async def fetch_event(session, event_id):
    """Fetch one event's detail page and extract the fields the feed lacks.

    Parameters
    ----------
    session
        Open HTTP client session used to issue the GET request.
    event_id
        Event ID interpolated into the ``DispForm.aspx?ID=...`` URL.

    Returns
    -------
    tuple
        ``(desc, end, recurrence, d)``: the description converted from HTML
        with ``html2text`` (``''`` when the page has none), the end time
        localized to ``cst``, the text of the recurrence field, and the
        parsed page itself for further inspection.

    Raises
    ------
    ValueError
        If the page does not contain exactly two ``#SPFieldDateTime``
        elements (the unpacking below fails).
    """
    url = f'http://dbbs.wustl.edu/Lists/Events/DispForm.aspx?ID={event_id}'
    # Using the response as a context manager hands the connection back to
    # the pool even when reading the body raises.
    async with session.get(url) as r:
        d = pq(await r.text())

    desc_html = d('.ms-rtestate-field > div').html()
    desc = html2text(desc_html) if desc_html else ''
    recurrence = d('#SPFieldRecurrence').text()

    # Two date cells are expected, start first and end second; only the end
    # is needed here because the start time already comes from the feed.
    _start, end = [elem.text_content().strip() for elem in d('#SPFieldDateTime')]
    # The page shows month-first wall-clock times without a zone, so parse
    # them as naive values and attach the local zone afterwards.
    end = cst.localize(parse(end, dayfirst=False, ignoretz=True))
    return (desc, end, recurrence, d)
        
async def fetch_all_events(event_ids, conn):
    """Fetch all events concurrently over one session built on ``conn``.

    One task per ID is scheduled up front; a notebook progress bar advances
    as the tasks finish. The returned list holds each task's
    ``fetch_event`` result in the same order as ``event_ids``.
    """
    event_loop = asyncio.get_event_loop()
    async with aiohttp.ClientSession(connector=conn) as session:
        scheduled = []
        for event_id in event_ids:
            scheduled.append(event_loop.create_task(fetch_event(session, event_id)))

        # as_completed yields in finishing order, which is what drives the
        # progress bar; the results themselves are collected afterwards.
        progress = tqdm_notebook(asyncio.as_completed(scheduled), total=len(scheduled))
        for finished in progress:
            await finished

        return [task.result() for task in scheduled]

In [21]:
%%time
resolver = AsyncResolver(nameservers=["8.8.8.8", "8.8.4.4"])
conn = aiohttp.TCPConnector(limit=5, resolver=resolver)

loop = asyncio.get_event_loop()
results = loop.run_until_complete(fetch_all_events([e['event_id'] for e in parsed_entries[:10]], conn))


CPU times: user 316 ms, sys: 20.8 ms, total: 337 ms
Wall time: 5.64 s


In [38]:
# Each result tuple ends with the parsed detail page; take the first event's
# page for interactive inspection.
d = results[0][-1]

In [56]:
# Find the "Location" bookmark anchor, climb to its enclosing table cell and
# read the text of the cells that follow it.
d('a[name="SPBookmark_Location0"]').closest('td').nextAll('td').text()

'Danforth Campus, Danforth University Center, room 300 (Liberman Graduate Center)'

In [80]:
from collections import OrderedDict

In [81]:
# Collect the detail form into an ordered label -> value mapping, one table
# row at a time.
table_rows = OrderedDict()
for row in d('.ms-formtable tr'):
    cell_texts = [pq(cell).text() for cell in pq(row)('td')]
    # Only the first two cells are used: the field label and its value.
    label, value = cell_texts[:2]
    table_rows[label] = value

In [82]:
# Show the collected label -> value pairs.
table_rows

OrderedDict([('Requestor', 'Jessica Hutchins'),
             ('Requestor Department', 'DBBS'),
             ('Requestor Phone', '314-747-0876'),
             ('Requestor Email', 'jhutchins@wustl.edu'),
             ('Type of Event', 'Non-Scientific Seminar'),
             ('Type of Lecture', 'Named Lecture'),
             ('Start Time', '3/21/2017 4:00 PM'),
             ('End Time', '3/21/2017 5:00 PM'),
             ('Recurrence', ''),
             ('All Day Event', ''),
             ('Show RSVP', 'No'),
             ('Sponsoring Department/Group', 'Liberman Graduate Center'),
             ('Speaker/Honorific', 'Lori White'),
             ('Speaker Affiliation', 'Vice Chancellor for Student Affairs'),
             ('Title', 'Increasing Student Engagment Among Peers'),
             ('Add More Speakers (Optional)', 'No'),
             ('2nd Speaker/Honorific', ''),
             ('2nd Speaker Affiliation', ''),
             ('2nd Title', ''),
             ('3rd Speaker/Honorific', ''),


In [17]:
# Start an empty calendar and attach its two calendar-level properties:
# the display name and the time zone identifier.
cal = vobject.iCalendar()

calname_prop = cal.add('X-WR-CALNAME')
calname_prop.value = 'DBBS'

tzid_prop = cal.add('TZID')
tzid_prop.value = 'America/Chicago'

In [18]:
# Add one VEVENT per fetched event. zip() stops at the shorter sequence, so
# only entries that have a fetched result are written.
for e, (desc, dt_end, recurrence, *_) in zip(parsed_entries, results):
    # start_date is normally a naive local timestamp, but it falls back to
    # the feed's 'published' value, which carries its own zone (GMT).
    # localize() raises ValueError on datetimes that already have tzinfo,
    # so aware values are converted to the local zone instead.
    dt_start = parse(e['start_date'])
    if dt_start.tzinfo is None:
        dt_start = cst.localize(dt_start)
    else:
        dt_start = dt_start.astimezone(cst)

    event = cal.add('vevent')
    event.add('summary').value = e['title']
    event.add('description').value = f'''\
{desc}

Link: {e['link']}
    '''
    event.add('location').value = e['location']
    event.add('dtstart').value = dt_start
    if recurrence:
        # Let `recurrent` turn the recurrence text into an RRULE, with the
        # fetched end time as the UNTIL bound of the series.
        rre = RecurringEvent(now_date=dt_start)
        rre.parse(recurrence)
        rre.until = dt_end
        # vobject wants the bare rule, without the leading 'RRULE:' label.
        event.add('rrule').value = rre.get_RFC_rrule()[len('RRULE:'):]
        # Each occurrence ends on its start date, at the end's time of day.
        event.add('dtend').value = cst.localize(datetime.combine(dt_start.date(), dt_end.time()))
    else:
        event.add('dtend').value = dt_end
    event.add('dtstamp').value = parse(e['published'])
    event.add('uid').value = f'WUSTL DBBS Event Calendar - Event ID {e["event_id"]}'

In [19]:
# Write the serialized calendar to disk.
# newline='' writes the line endings produced by serialize() unchanged; the
# default text mode would translate every '\n' on Windows and corrupt the
# CRLF endings iCalendar uses. UTF-8 is set explicitly so the file does not
# depend on the platform's default encoding.
with open('out.ics', 'w', newline='', encoding='utf-8') as f:
    f.write(cal.serialize())