# Download EPSA 2019 sessions and abstracts

In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
import os
import re
import time

## Create download folders

In [2]:
os.makedirs("html/sessions", exist_ok = True)
os.makedirs("html/abstracts", exist_ok = True)

## Find panel URLs

The `html/presentation-types.html` page was [manually downloaded](https://app.oxfordabstracts.com/events/772/program-app/presentation-types) from the [conference website](https://app.oxfordabstracts.com/events/772/).

In [3]:
panel_page = BeautifulSoup(open("html/presentation-types.html"), "lxml")

In [4]:
panel_urls = []
for i in panel_page.find_all("a", href = re.compile("session")):
    panel_urls.append(i.get("href"))

In [5]:
panel_urls[0:4]

['/events/772/program-app/session/4544',
 '/events/772/program-app/session/4546',
 '/events/772/program-app/session/4545',
 '/events/772/program-app/session/4395']

In [6]:
# check length of panel id is always 4
j = []
for i in panel_urls:
    j.append(len(re.search(r'\d+$', i).group(0)))

assert set(j) == {4}, "Unusual panel ids detected."

In [7]:
len(panel_urls)

191

In [8]:
# unique values
panel_urls = list(set(panel_urls))

In [9]:
len(panel_urls)

191

In [10]:
panel_urls[0:4]

['/events/772/program-app/session/4428',
 '/events/772/program-app/session/4302',
 '/events/772/program-app/session/4367',
 '/events/772/program-app/session/4334']

## Download panels

### Warning: empty pages

Some pages will fail to load properly despite the time allocated for that, and will contain nothing except a "Loading event data" message.

Delete them and run the notebook again.

In [11]:
driver = webdriver.Chrome('/Applications/chromedriver')

In [12]:
left = panel_urls[::-1]

for i in panel_urls:
    u = "https://app.oxfordabstracts.com" + i
    f = "html/sessions/session" + i[-4:] + ".html"
    print("(" + str(left.index(i) + 1).rjust(3) + " left) " + u) #  + " -> " + f
    if os.path.exists(f):
        next
    else:
        driver.get(u)
        # give it some time to load
        time.sleep(7.5)
        with open(f, "w") as file:
            file.write(driver.page_source)

print("done")

(191 left) https://app.oxfordabstracts.com/events/772/program-app/session/4428
(190 left) https://app.oxfordabstracts.com/events/772/program-app/session/4302
(189 left) https://app.oxfordabstracts.com/events/772/program-app/session/4367
(188 left) https://app.oxfordabstracts.com/events/772/program-app/session/4334
(187 left) https://app.oxfordabstracts.com/events/772/program-app/session/4387
(186 left) https://app.oxfordabstracts.com/events/772/program-app/session/4235
(185 left) https://app.oxfordabstracts.com/events/772/program-app/session/4416
(184 left) https://app.oxfordabstracts.com/events/772/program-app/session/4253
(183 left) https://app.oxfordabstracts.com/events/772/program-app/session/4376
(182 left) https://app.oxfordabstracts.com/events/772/program-app/session/4323
(181 left) https://app.oxfordabstracts.com/events/772/program-app/session/4338
(180 left) https://app.oxfordabstracts.com/events/772/program-app/session/4399
(179 left) https://app.oxfordabstracts.com/events/77

## Find abstracts URLs

This will fail if the page loaded improperly. In that case, go back to previous step and download the page again.

In [13]:
files = [f for f in os.listdir("html/sessions") if re.match(r'.*\.html', f)]
abstract_urls = []
for i in files:
    # print(i)
    x = BeautifulSoup(open("html/sessions/" + i), "lxml")
    # determine panel type
    y = x.find("span", string = re.compile("Presentation type$")).parent.text
    # find links to abstracts
    z = []
    for j in x.find_all("a", href = re.compile("submission")):
        z.append(j.get("href"))
    print(i + ": " + y.replace('Presentation type', '') + " with " + str(len(z)) + " abstract(s)")
    abstract_urls.append(z)

session4538.html: Panel with 4 abstract(s)
session4304.html: Panel with 5 abstract(s)
session4345.html: Panel with 3 abstract(s)
session4312.html: Panel with 4 abstract(s)
session4410.html: Panel with 4 abstract(s)
session4555.html: Panel with 5 abstract(s)
session4386.html: Panel with 4 abstract(s)
session4369.html: Panel with 4 abstract(s)
session4294.html: Panel with 5 abstract(s)
session4282.html: Panel with 5 abstract(s)
session4328.html: Panel with 4 abstract(s)
session4406.html: Panel with 4 abstract(s)
session4543.html: Social event with 0 abstract(s)
session4390.html: Panel with 5 abstract(s)
session4308.html: Panel with 4 abstract(s)
session4426.html: Panel with 4 abstract(s)
session4365.html: Panel with 5 abstract(s)
session4277.html: Panel with 4 abstract(s)
session4298.html: Panel with 5 abstract(s)
session4332.html: Panel with 5 abstract(s)
session4324.html: Panel with 2 abstract(s)
session4373.html: Panel with 4 abstract(s)
session4372.html: Panel with 4 abstract(s)
sess

In [14]:
# should equal number of panels
assert len(panel_urls) == len(panel_urls), "Some session pages were not parsed."

In [15]:
# unlist
abstract_urls = [item for sublist in abstract_urls for item in sublist]
abstract_urls[0:4]

['/events/772/program-app/submission/87064',
 '/events/772/program-app/submission/85846',
 '/events/772/program-app/submission/86584',
 '/events/772/program-app/submission/86412']

In [16]:
# actual number of abstracts
len(abstract_urls)

784

In [17]:
# unique values
abstract_urls = list(set(abstract_urls))
len(abstract_urls)

784

In [18]:
abstract_urls[0:4]

['/events/772/program-app/submission/86598',
 '/events/772/program-app/submission/86239',
 '/events/772/program-app/submission/86370',
 '/events/772/program-app/submission/86538']

### Warning: abstract ids

Abstract identifier length is actually variable.

In [19]:
# check length of abstract id is always 5 or 6
j = []
for i in abstract_urls:
    j.append(len(re.search(r'\d+$', i).group(0)))

assert set(j) == {5, 6}, "Unusual abstract ids detected."

## Download abstracts

In [20]:
left = abstract_urls[::-1]

for i in abstract_urls:
    u = "https://app.oxfordabstracts.com" + i
    f = "html/abstracts/abstract_" + re.search(r'\d+$', i).group(0) + ".html"
    print("(" + str(left.index(i) + 1).rjust(3) + " left) " + u) #  + " -> " + f
    if os.path.exists(f):
        next
    else:
        driver.get(u)
        # give it ample time to load
        time.sleep(7.5)
        with open(f, "w") as file:
            file.write(driver.page_source)

print("done")

(784 left) https://app.oxfordabstracts.com/events/772/program-app/submission/86598
(783 left) https://app.oxfordabstracts.com/events/772/program-app/submission/86239
(782 left) https://app.oxfordabstracts.com/events/772/program-app/submission/86370
(781 left) https://app.oxfordabstracts.com/events/772/program-app/submission/86538
(780 left) https://app.oxfordabstracts.com/events/772/program-app/submission/86229
(779 left) https://app.oxfordabstracts.com/events/772/program-app/submission/85285
(778 left) https://app.oxfordabstracts.com/events/772/program-app/submission/91444
(777 left) https://app.oxfordabstracts.com/events/772/program-app/submission/81549
(776 left) https://app.oxfordabstracts.com/events/772/program-app/submission/86176
(775 left) https://app.oxfordabstracts.com/events/772/program-app/submission/86509
(774 left) https://app.oxfordabstracts.com/events/772/program-app/submission/86632
(773 left) https://app.oxfordabstracts.com/events/772/program-app/submission/85953
(772

Have a nice day.