In [None]:
from bs4 import BeautifulSoup
from collections import defaultdict
import requests

# Scrape the CECA info-session calendar, one GET per month, into
# companies[year][month] -> list of the link texts found on that month's page.
# NOTE: request_html is defined in a later cell of this notebook; that cell
# has to be executed before this one on a fresh kernel.
SESSIONS_URL = 'http://www.ceca.uwaterloo.ca/students/sessions.php?month_num={month}&year_num={year}'

companies = defaultdict(dict)
for year in range(2006, 2018):
    for month in range(1, 13):
        html = request_html(SESSIONS_URL.format(month=month, year=year))
        if html is None:
            # request_html returns None when the GET fails. Store an empty
            # month instead of passing None to BeautifulSoup, so one failed
            # request does not abort the crawl and every month key exists.
            companies[year][month] = []
            continue
        dom = BeautifulSoup(html, 'html.parser')
        # Each calendar entry is an <a> inside a <p> inside a table cell.
        companies[year][month] = [link.text for link in dom.select('td p a')]

In [140]:
def request_html(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the raw response body.

    Parameters:
        url: address to request.
        timeout: seconds handed to ``requests.get`` as its timeout, so an
            unresponsive server cannot block the notebook forever.

    Returns:
        The undecoded response body (``r.content``), or ``None`` when the
        request raised. The HTTP status code is not checked.
    """
    try:
        r = requests.get(url, timeout=timeout)
        return r.content
    except Exception as e:
        # Best-effort: report the failure and let the caller decide what to
        # do with a missing page.
        print(e)
        print('Error: could not GET from ' + url)
        return None

In [141]:
class Event:
    """One calendar entry, tagged with the year and month it was listed under."""

    # Case-sensitive substrings; a raw title containing any of them is rejected
    # by is_valid().
    _EXCLUDED_MARKERS = ('No info sessions', 'MAIN ', 'Reading week', 'Info Session')
    # Fragments stripped (in this order) from the lower-cased title.
    _TITLE_NOISE = ('cancelled', '*', '-')

    def __init__(self, title, year, month):
        self._title, self.year, self.month = title, year, month

    def is_valid(self):
        """Return False when the raw title contains an excluded marker, else True."""
        return not any(marker in self._title for marker in self._EXCLUDED_MARKERS)

    @property
    def title(self):
        """Lower-cased title with the noise fragments removed and outer whitespace trimmed."""
        cleaned = self._title.lower()
        for fragment in self._TITLE_NOISE:
            cleaned = cleaned.replace(fragment, '')
        return cleaned.strip()

In [142]:
# One concatenated title list per 4-month term, for every scraped year.
# `first` is the month offset of the term (0, 4, 8); the four monthly lists
# are joined in the order +1, +4, +2, +3.
# Note: a later cell rebinds `terms` to the output of group_by_term.
terms = [
    companies[year][first + 1]
    + companies[year][first + 4]
    + companies[year][first + 2]
    + companies[year][first + 3]
    for year in companies
    for first in (0, 4, 8)
]

In [143]:
# Wrap every scraped title in an Event tagged with its year and month, keeping
# only the entries Event.is_valid() accepts.
# Built as a list comprehension rather than filter(): on Python 3 filter()
# returns a one-shot iterator, so `events` would be empty after its first use.
events = [
    event
    for year, months in companies.items()
    for month, titles in months.items()
    for event in (Event(title, year, month) for title in titles)
    if event.is_valid()
]

In [151]:
def group_by_term(events, start_year=2006, end_year=2018):
    """Bucket events into 4-month terms and return the buckets in order.

    Parameters:
        events: iterable of objects exposing integer ``year`` and ``month``
            (1-12) attributes.
        start_year: first year to create buckets for (inclusive).
        end_year: year at which to stop (exclusive).

    Returns:
        A flat list with three lists per year, ordered by year and then by
        term index (0: months 1-4, 1: months 5-8, 2: months 9-12). Terms
        without events are present as empty lists.

    Raises:
        KeyError: if an event's year is outside ``[start_year, end_year)`` or
            its month is outside 1-12.
    """
    years = range(start_year, end_year)
    groups = {year: {term: [] for term in range(3)} for year in years}

    for event in events:
        # Floor division is required: with `/` Python 3 yields a float
        # (e.g. month 2 -> 0.25), which is not one of the keys 0, 1, 2.
        groups[event.year][(event.month - 1) // 4].append(event)

    return [groups[year][term] for year in years for term in range(3)]
            

In [152]:
# Rebind `terms` to the per-term lists of Event objects returned by group_by_term.
terms = group_by_term(events)

In [153]:
# Distinct normalised titles across every term.
all_companies = {event.title for term_events in terms for event in term_events}

In [154]:
# Tally how many events carry each normalised title, across all terms.
count = defaultdict(int)
for term_events in terms:
    for event in term_events:
        count[event.title] = count[event.title] + 1
#for w in sorted(count, key=count.get, reverse=True):
#    print w, count[w]

In [155]:
# Lower-case keywords matched as substrings against Event.title.
# NOTE(review): short keywords such as 'wish', 'snap', 'atomic' and 'square'
# may also match unrelated titles because matching is by substring - confirm.
tech_companies = ['microsoft', 'facebook', 'amazon', 'google', 'yelp', 'linkedin', 'twitter', 'palantir', 'hulu', 'yext', 'wish', 'nvidia', 'meraki', 'yahoo', 'shopify', 'pinterest', 'vmware', 'dropbox', 'mozilla', 'noom', 'stripe', 'atomic', 'zenefits', 'two sigma', 'tesla', 'pagerduty', 'snap', 'expedia', 'jane street', 'wealthsimple', 'quora', 'square']

In [156]:
# For each term, the set of tech_companies keywords that occur as a substring
# of at least one event title in that term.
company_matches = [
    set(
        keyword
        for event in term_events
        for keyword in tech_companies
        if keyword in event.title
    )
    for term_events in terms
]

In [157]:
# Dump the per-term sets of matched keywords.
print(company_matches)

[set(['amazon', 'microsoft', 'vmware', 'nvidia']), set(['amazon', 'microsoft']), set(['expedia', 'amazon', 'google', 'nvidia', 'microsoft', 'vmware']), set(['expedia', 'amazon', 'google', 'nvidia', 'microsoft', 'vmware']), set(['amazon', 'google', 'microsoft']), set(['expedia', 'yahoo', 'amazon', 'google', 'microsoft', 'vmware']), set(['google', 'yahoo', 'hulu', 'amazon', 'expedia', 'facebook', 'nvidia', 'microsoft', 'vmware']), set(['amazon', 'microsoft', 'yahoo']), set(['google', 'yahoo', 'hulu', 'amazon', 'expedia', 'facebook', 'nvidia', 'microsoft', 'vmware']), set(['google', 'hulu', 'amazon', 'facebook', 'nvidia', 'microsoft']), set(['amazon', 'google', 'facebook']), set(['google', 'mozilla', 'yahoo', 'hulu', 'amazon', 'expedia', 'facebook', 'microsoft']), set(['google', 'mozilla', 'hulu', 'amazon', 'facebook', 'microsoft']), set(['amazon', 'mozilla', 'facebook', 'microsoft']), set(['google', 'yelp', 'mozilla', 'hulu', 'amazon', 'facebook', 'microsoft']), set(['google', 'shopify',

In [170]:
# Print how many keywords matched in each term; after every third term
# (three terms per year) also print the index with a marker.
for index, matched in enumerate(company_matches):
    print(len(matched))
    if (index + 1) % 3 == 0:
        print(index, 'wat')

4
2
6
(2, 'wat')
6
3
6
(5, 'wat')
9
3
9
(8, 'wat')
6
3
8
(11, 'wat')
6
4
7
(14, 'wat')
9
6
13
(17, 'wat')
8
10
13
(20, 'wat')
11
12
14
(23, 'wat')
14
13
19
(26, 'wat')
16
19
21
(29, 'wat')
18
16
15
(32, 'wat')
12
9
11
(35, 'wat')
