# Download EPSA 2019 Web pages

In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
import os
import re
import time

# Get panel URLs

The `html/presentation-types.html` page was manually downloaded from the [conference website](https://app.oxfordabstracts.com/events/772/), using an Oxford Abstracts user account.

In [2]:
panel_page = BeautifulSoup(open("html/presentation-types.html"), "lxml")

In [3]:
panel_urls = []
for i in panel_page.find_all("a", href = re.compile("session")):
    panel_urls.append(i.get("href"))

In [4]:
panel_urls[0:4]

['/events/772/program-app/session/4544',
 '/events/772/program-app/session/4546',
 '/events/772/program-app/session/4545',
 '/events/772/program-app/session/4395']

In [5]:
len(panel_urls)

191

In [6]:
# unique values
panel_urls = list(set(panel_urls))

In [7]:
len(panel_urls)

191

In [8]:
panel_urls[0:4]

['/events/772/program-app/session/4321',
 '/events/772/program-app/session/4281',
 '/events/772/program-app/session/4277',
 '/events/772/program-app/session/4405']

# Download panels

## Warning: empty pages

Some pages will fail to load properly despite the time allocated for that, and will contain nothing except a "Loading event data" message.

Delete them and run the notebook again.

In [9]:
driver = webdriver.Chrome('/Applications/chromedriver')

In [10]:
left = panel_urls[::-1]

for i in panel_urls:
    u = "https://app.oxfordabstracts.com" + i
    f = "html/sessions/session" + i[-4:] + ".html"
    print("(" + str(left.index(i) + 1) + " left) URL: " + u)
    if os.path.exists(f):
        next
    else:
        driver.get(u)
        # give it some time to load
        time.sleep(7.5)
        with open(f, "w") as file:
            file.write(driver.page_source)

print("done")

(191 left) URL: https://app.oxfordabstracts.com/events/772/program-app/session/4321
(190 left) URL: https://app.oxfordabstracts.com/events/772/program-app/session/4281
(189 left) URL: https://app.oxfordabstracts.com/events/772/program-app/session/4277
(188 left) URL: https://app.oxfordabstracts.com/events/772/program-app/session/4405
(187 left) URL: https://app.oxfordabstracts.com/events/772/program-app/session/4403
(186 left) URL: https://app.oxfordabstracts.com/events/772/program-app/session/4423
(185 left) URL: https://app.oxfordabstracts.com/events/772/program-app/session/4292
(184 left) URL: https://app.oxfordabstracts.com/events/772/program-app/session/4364
(183 left) URL: https://app.oxfordabstracts.com/events/772/program-app/session/4320
(182 left) URL: https://app.oxfordabstracts.com/events/772/program-app/session/4250
(181 left) URL: https://app.oxfordabstracts.com/events/772/program-app/session/4374
(180 left) URL: https://app.oxfordabstracts.com/events/772/program-app/sessi

# Get abstracts URLs

This will fail if the page loaded improperly. In that case, go back to previous step and download the page again.

In [11]:
files = [f for f in os.listdir("html/sessions") if re.match(r'.*\.html', f)]
abstract_urls = []
for i in files:
    # print(i)
    x = BeautifulSoup(open("html/sessions/" + i), "lxml")
    # determine panel type
    y = x.find("span", string = re.compile("Presentation type$")).parent.text
    # find links to abstracts
    z = []
    for j in x.find_all("a", href = re.compile("submission")):
        z.append(j.get("href"))
    print(i + ": " + y.replace('Presentation type', '') + " with " + str(len(z)) + " abstracts")
    abstract_urls.append(z)

session4538.html: Panel with 4 abstracts
session4304.html: Panel with 5 abstracts
session4345.html: Panel with 3 abstracts
session4312.html: Panel with 4 abstracts
session4410.html: Panel with 4 abstracts
session4555.html: Panel with 5 abstracts
session4386.html: Panel with 4 abstracts
session4369.html: Panel with 4 abstracts
session4294.html: Panel with 5 abstracts
session4282.html: Panel with 5 abstracts
session4328.html: Panel with 4 abstracts
session4406.html: Panel with 4 abstracts
session4543.html: Social event with 0 abstracts
session4390.html: Panel with 5 abstracts
session4308.html: Panel with 4 abstracts
session4426.html: Panel with 4 abstracts
session4365.html: Panel with 5 abstracts
session4277.html: Panel with 4 abstracts
session4298.html: Panel with 5 abstracts
session4332.html: Panel with 5 abstracts
session4324.html: Panel with 2 abstracts
session4373.html: Panel with 4 abstracts
session4372.html: Panel with 4 abstracts
session4519.html: Panel with 4 abstracts
session43

In [12]:
# should equal number of panels
len(abstract_urls)

191

In [13]:
abstract_urls = [item for sublist in abstract_urls for item in sublist]
abstract_urls[0:4]

['/events/772/program-app/submission/87064',
 '/events/772/program-app/submission/85846',
 '/events/772/program-app/submission/86584',
 '/events/772/program-app/submission/86412']

In [14]:
# actual number of abstracts
len(abstract_urls)

784

In [15]:
# unique values
abstract_urls = list(set(abstract_urls))
len(abstract_urls)

784

In [16]:
abstract_urls[0:4]

['/events/772/program-app/submission/86547',
 '/events/772/program-app/submission/85418',
 '/events/772/program-app/submission/86035',
 '/events/772/program-app/submission/84026']

# Download abstracts

In [17]:
left = abstract_urls[::-1]

for i in abstract_urls:
    u = "https://app.oxfordabstracts.com" + i
    f = "html/abstracts/abstract_" + i[-4:] + ".html"
    print("(" + str(left.index(i) + 1) + " left) URL: " + u)
    if os.path.exists(f):
        next
    else:
        driver.get(u)
        # give it ample time to load
        time.sleep(7.5)
        with open(f, "w") as file:
            file.write(driver.page_source)

print("done")

(784 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/86547
(783 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/85418
(782 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/86035
(781 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/84026
(780 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/84474
(779 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/86318
(778 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/82238
(777 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/86112
(776 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/86283
(775 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/86355
(774 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/86141
(773 left) URL: https://app.oxfo

(206 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/76582
(205 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/84425
(204 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/86710
(203 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/85778
(202 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/86616
(201 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/78661
(200 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/85952
(199 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/86207
(198 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/85359
(197 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/86357
(196 left) URL: https://app.oxfordabstracts.com/events/772/program-app/submission/91144
(195 left) URL: https://app.oxfo