In [19]:
import re
import typing
from urllib import (request, error)
import threading
from bs4 import BeautifulSoup

In [2]:
def fetch_llvm_releases_github_page() -> str:

    """

    Returns the HTML page at https://github.com/llvm/llvm-project/releases as a string object.

    The response body is decoded as UTF-8, so the returned value is the actual markup
    (not the repr of a bytes object).

    Raises urllib.error.HTTPError if the server answers with an HTTP error status.

    """

    LLVM_RELEASE_PAGE = r"https://github.com/llvm/llvm-project/releases"
    req = request.Request(url = LLVM_RELEASE_PAGE, method = "GET", headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
        "Connection": "close"
    })

    # HTTPError is intentionally left to propagate unchanged: re-wrapping it loses the
    # status code / URL and HTTPError cannot be constructed from a single argument anyway.
    with request.urlopen(req) as response:
        # str(bytes) would yield "b'...'" with escaped newlines; decode the payload instead.
        return response.read().decode("utf-8")

In [3]:
# Download the releases page once and keep the result in `page` for the parsing cells below.

page = fetch_llvm_releases_github_page();

In [4]:
def extract_llvm_release_versions_and_links(html_document: str) -> typing.Dict[str, str]:

    """

    html_document: str
    returns dict[version: str, uri: str]

    Takes the LLVM releases GitHub HTML page as a string, parses the page to extract released LLVM versions and their cognate
    asset-list URIs and returns them paired in a dictionary.
    Requires BeautifulSoup for parsing the HTML document.

    For every <section> element the version is read from the <h2 class="sr-only"> heading and the URI from the
    <include-fragment loading="lazy"> element's "src" attribute, falling back to the "data-deferred-src" attribute of
    <include-fragment class="js-truncated-assets-fragment">.
    Sections that lack the heading, or that carry no usable URI, are skipped instead of raising AttributeError.

    Raises TypeError if html_document is not a string.

    Note that this function is delicate and prone to breaks as it depends on very intricate structural details of the HTML document.
    Any small changes in the structure of the HTML document could potentially break the parsing logic. (If GitHub decides to change
    their page structures)

    """

    if not isinstance(html_document, str):
        raise TypeError("Incompatible types. Argument must be of string <class 'str'> type.")

    links: typing.Dict[str, str] = dict()

    soup = BeautifulSoup(html_document, "html.parser")

    for section in soup.find_all("section"):
        # find() returns None when nothing matches, so every lookup is checked before use.
        heading = section.find("h2", attrs = {"class": "sr-only"})
        if heading is None:
            continue

        uri = None
        include_fragment = section.find("include-fragment", attrs = {"loading": "lazy"})

        if include_fragment is not None:
            uri = include_fragment.get("src")

        else:
            lazy_load = section.find("include-fragment", attrs = {"class": "js-truncated-assets-fragment"})
            if lazy_load is not None:
                uri = lazy_load.get("data-deferred-src")

        # Only record releases for which a URI was actually found; callers pass the values straight to urlopen.
        if uri:
            links[heading.text] = uri
    return links

In [5]:
extract_llvm_release_versions_and_links(page)

{'LLVM 16.0.0-rc3': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-16.0.0-rc3',
 'LLVM 16.0.0-rc2': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-16.0.0-rc2',
 'LLVM 16.0.0-rc1': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-16.0.0-rc1',
 'LLVM 15.0.7': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.7',
 'LLVM 15.0.6': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.6',
 'LLVM 15.0.5': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.5',
 'LLVM 15.0.4': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.4',
 'LLVM 15.0.3': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.3',
 'LLVM 15.0.2': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.2',
 'LLVM 15.0.1': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.1'}

In [7]:
%%timeit -n 1 -r 1

extract_llvm_win64_download_uri('https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.1')

838 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [14]:
def get_llvm_win64_download_uris(version_uris: typing.Dict[str, str]) -> typing.Dict[str, typing.Optional[str]]:

    """

    version_uris: dict[str, str]
    returns dict[str, str | None]

    Receives the LLVM release versions and their asset-list URIs paired in a Python dictionary object.
    (returns of a call to extract_llvm_release_versions_and_links)
    Returns the LLVM release versions and the download URIs for Windows x86-64 executables (.exes), paired in a
    Python dictionary.
    If a specific release does not contain the Windows 64 bit executable, or its asset page answers with an HTTP error
    (which is printed), that release will be paired with None.
    The pages are requested one after another (sequentially), so the run time grows with the number of releases.

    """

    BASE_URL = "https://github.com"
    # The dot before "exe" is escaped so that it only matches a literal ".exe" suffix.
    win64 = re.compile(r"/llvm/llvm-project/releases/download/[\d\w\-\.\/]*win64\.exe")
    result: typing.Dict[str, typing.Optional[str]] = dict.fromkeys(version_uris.keys())

    for (release, uri) in version_uris.items():
        req_custom = request.Request(url = uri, method = "GET", headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
            "Connection": "close"
        })

        # urlopen() itself is what raises HTTPError, so it has to sit inside the try block.
        try:
            with request.urlopen(req_custom) as response:
                resp_body = response.read().decode("utf8")
        except error.HTTPError as err:
            print(f"{release}: {err}")
            continue  # result[release] stays None

        download_link = win64.search(resp_body)
        if download_link:
            result[release] = BASE_URL + download_link.group(0)
    return result

In [15]:
%%timeit -n 1 -r 1

get_llvm_win64_download_uris(extract_llvm_release_versions_and_links(page))

9.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [16]:
# a new fetch and extract function that takes just one url

def extract_LLVM_win64_download_uri(link: str) -> typing.Optional[str]:

    """

    link: str
    returns str | None

    Requests a single release's asset-list page (one value of the dictionary returned by
    extract_llvm_release_versions_and_links) and returns the absolute download URI of the first Windows 64 bit
    installer (…win64.exe) referenced in it.
    Returns None if the page references no such installer, or if the request fails with an HTTP error
    (the error is printed).

    """

    BASE_URL = "https://github.com"
    # The dot before "exe" is escaped so that it only matches a literal ".exe" suffix.
    win64 = re.compile(r"/llvm/llvm-project/releases/download/[\d\w\-\.\/]*win64\.exe")

    req_custom = request.Request(url = link, method = "GET", headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
            "Connection": "close"
        })

    # urlopen() itself is what raises HTTPError, so it has to sit inside the try block.
    try:
        with request.urlopen(req_custom) as response:
            resp_body = response.read().decode("utf8")
    except error.HTTPError as err:
        print(f"{link}: {err}")
        return None

    download_link = win64.search(resp_body)
    if download_link:
        return BASE_URL + download_link.group(0)
    return None

In [17]:
get_llvm_win64_download_uris(extract_llvm_release_versions_and_links(page))

{'LLVM 16.0.0-rc3': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-16.0.0-rc3/LLVM-16.0.0-rc3-win64.exe',
 'LLVM 16.0.0-rc2': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-16.0.0-rc2/LLVM-16.0.0-rc2-win64.exe',
 'LLVM 16.0.0-rc1': None,
 'LLVM 15.0.7': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.7/LLVM-15.0.7-win64.exe',
 'LLVM 15.0.6': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.6/LLVM-15.0.6-win64.exe',
 'LLVM 15.0.5': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.5/LLVM-15.0.5-win64.exe',
 'LLVM 15.0.4': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.4/LLVM-15.0.4-win64.exe',
 'LLVM 15.0.3': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.3/LLVM-15.0.3-win64.exe',
 'LLVM 15.0.2': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.2/LLVM-15.0.2-win64.exe',
 'LLVM 15.0.1': 'https://github.com/llvm/llvm-project/releases

In [18]:
extract_LLVM_win64_download_uri('https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.1')

'https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.1/LLVM-15.0.1-win64.exe'

In [20]:
page_links = extract_llvm_release_versions_and_links(page)
page_links

{'LLVM 16.0.0-rc3': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-16.0.0-rc3',
 'LLVM 16.0.0-rc2': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-16.0.0-rc2',
 'LLVM 16.0.0-rc1': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-16.0.0-rc1',
 'LLVM 15.0.7': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.7',
 'LLVM 15.0.6': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.6',
 'LLVM 15.0.5': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.5',
 'LLVM 15.0.4': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.4',
 'LLVM 15.0.3': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.3',
 'LLVM 15.0.2': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.2',
 'LLVM 15.0.1': 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.1'}

In [21]:
win64_downloads = dict.fromkeys(page_links.keys())

In [23]:
win64_downloads

{'LLVM 16.0.0-rc3': None,
 'LLVM 16.0.0-rc2': None,
 'LLVM 16.0.0-rc1': None,
 'LLVM 15.0.7': None,
 'LLVM 15.0.6': None,
 'LLVM 15.0.5': None,
 'LLVM 15.0.4': None,
 'LLVM 15.0.3': None,
 'LLVM 15.0.2': None,
 'LLVM 15.0.1': None}

In [49]:
class Win64FetcherThread(threading.Thread):

    """

    Thread that requests one release's asset-list page and looks for the Windows 64 bit installer link in it.

    link: str - URI of the asset-list page to request.
    regex: compiled pattern (optional) - pattern matching the site-relative installer path. When omitted,
           DEFAULT_PATTERN (compiled once for the whole class) is used.

    After the thread has finished (join()), `result` holds the absolute download URI, or None if the page
    references no installer or the request failed with an HTTP error (the error is printed).

    """

    BASE_URL = "https://github.com"
    # Compiled once at class creation so that instances never pay for compilation;
    # callers may still inject their own compiled pattern through `regex`.
    DEFAULT_PATTERN = re.compile(r"/llvm/llvm-project/releases/download/[\d\w\-\.\/]*win64\.exe")

    def __init__(self, link: str, regex: typing.Optional[typing.Pattern[str]] = None) -> None:
        super().__init__()
        self.link: str = link
        self.result: typing.Optional[str] = None
        self.pattern = regex if regex is not None else self.DEFAULT_PATTERN

    def run(self) -> None:
        # The return value of Thread.run() is discarded; the outcome is published through self.result.
        req_custom = request.Request(url = self.link, method = "GET", headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
                "Connection": "close"
            })

        # urlopen() itself is what raises HTTPError, so it has to sit inside the try block.
        try:
            with request.urlopen(req_custom) as response:
                resp_body = response.read().decode("utf8")
        except error.HTTPError as err:
            print(f"{self.link}: {err}")
            self.result = None
            return

        download_link = self.pattern.search(resp_body)
        self.result = (self.BASE_URL + download_link.group(0)) if download_link else None

In [50]:
thread_1 = Win64FetcherThread(link = 'https://github.com/llvm/llvm-project/releases/expanded_assets/llvmorg-15.0.3')

In [41]:
thread_1.start()
thread_1.join()

In [42]:
thread_1.result

'https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.3/LLVM-15.0.3-win64.exe'

In [71]:
def execute(page_links: typing.Dict[str, str]) -> typing.Dict[str, str]:

    """

    page_links: dict[str, str]
    returns dict[str, str]

    Creates one Win64FetcherThread per value of page_links (all sharing a single compiled pattern), starts every
    thread, waits for all of them to finish, and returns a dictionary that pairs each key of page_links with the
    `result` attribute of the thread created for that key's value.

    """

    installer_pattern = re.compile(r"/llvm/llvm-project/releases/download/[\d\w\-\.\/]*win64.exe")

    workers = []
    for asset_uri in page_links.values():
        workers.append(Win64FetcherThread(link = asset_uri, regex = installer_pattern))

    # Launch everything first, then wait, so that the requests overlap.
    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()

    # keys() and values() iterate in the same order, so the i-th worker belongs to the i-th release.
    return {release: worker.result for (release, worker) in zip(page_links.keys(), workers)}

In [74]:
%%timeit -n 1 -r 1

execute(page_links)

1.12 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [73]:
execute(page_links)

{'LLVM 16.0.0-rc3': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-16.0.0-rc3/LLVM-16.0.0-rc3-win64.exe',
 'LLVM 16.0.0-rc2': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-16.0.0-rc2/LLVM-16.0.0-rc2-win64.exe',
 'LLVM 16.0.0-rc1': None,
 'LLVM 15.0.7': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.7/LLVM-15.0.7-win64.exe',
 'LLVM 15.0.6': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.6/LLVM-15.0.6-win64.exe',
 'LLVM 15.0.5': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.5/LLVM-15.0.5-win64.exe',
 'LLVM 15.0.4': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.4/LLVM-15.0.4-win64.exe',
 'LLVM 15.0.3': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.3/LLVM-15.0.3-win64.exe',
 'LLVM 15.0.2': 'https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.2/LLVM-15.0.2-win64.exe',
 'LLVM 15.0.1': 'https://github.com/llvm/llvm-project/releases