In [4]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from latex import escape

In [12]:
# Create empty set for storing links
links = set()

# I just want the newest 1000 thesis
for x in range(1,11):
    
    # Create the URL with the current page number
    URL = "https://openaccess.nhh.no/nhh-xmlui/discover?rpp=100&etal=0&scope=/&group_by=none&page=" + str(x) + "&sort_by=dc.date.issued_dt&order=desc&filtertype_0=doctype&filtertype_1=dateIssued&filter_relational_operator_1=contains&filter_relational_operator_0=equals&filter_1=&filter_0=Master+thesis"
    
    # Use the requests library to retrieve the HTML content of the URL
    page = requests.get(URL)
    
    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(page.content, 'html.parser')
    
    # Find all 'a' tags in the HTML
    for link in soup.find_all('a'):
        href = link.get('href')
        
        # Check if the 'href' attribute contains the desired link
        if href and '/nhh-xmlui/handle/11250/' in href:
            
             # Add the link to the set
            links.add("https://openaccess.nhh.no" + href)


After collecting the links for all master thesis, we can read all the abstracts.

In [13]:
abstracts = []

# function for extracting abstract
def extract_abstract(link):
    # Get the HTML content of the webpage
    html_content = requests.get(link).text
    
    # Create a BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Extract the title
    title = soup.find("title").text.replace("NHH Brage: ","")
    
    # Extract the abstract if there is one
    abstract = soup.find("meta", {"name":"DCTERMS.abstract"})
    if abstract:
        abstract = abstract["content"]
        abstracts.append({'title': title, 'abstract': abstract})
    else:
        print(f"No abstract found for link {link}")

# Run in paralell for faster computation
with ThreadPoolExecutor() as executor:
    future_to_link = {executor.submit(extract_abstract, link): link for link in links}
    for future in as_completed(future_to_link):
        link = future_to_link[future]
        try:
            future.result()
        except Exception as e:
            print(f"Error occured while processing link {link}: {e}")


No abstract found for link https://openaccess.nhh.no/nhh-xmlui/handle/11250/2772251
No abstract found for link https://openaccess.nhh.no/nhh-xmlui/handle/11250/2681034
No abstract found for link https://openaccess.nhh.no/nhh-xmlui/handle/11250/2781350
No abstract found for link https://openaccess.nhh.no/nhh-xmlui/handle/11250/2985381
No abstract found for link https://openaccess.nhh.no/nhh-xmlui/handle/11250/2772145
No abstract found for link https://openaccess.nhh.no/nhh-xmlui/handle/11250/3014019
No abstract found for link https://openaccess.nhh.no/nhh-xmlui/handle/11250/2682586No abstract found for link https://openaccess.nhh.no/nhh-xmlui/handle/11250/2982082

No abstract found for link https://openaccess.nhh.no/nhh-xmlui/handle/11250/3032223
No abstract found for link https://openaccess.nhh.no/nhh-xmlui/handle/11250/2734277
No abstract found for link https://openaccess.nhh.no/nhh-xmlui/handle/11250/3027623
No abstract found for link https://openaccess.nhh.no/nhh-xmlui/handle/11250/

Finally, I write the abstracts to a Latex-file which can be compiled to a pdf in Overleaf. Each abstract will be on a separate page with the title in bold to make it easily readable.

In [14]:
with open("abstracts.tex", "w", encoding='utf-8') as file:
    for item in abstracts:
        title = item['title']
        abstract = item['abstract']
        file.write(f"\\textbf{{{escape(title)}}} \\\\ \n")
        file.write(f"{escape(abstract)} \n")
        file.write("\\newpage \n")

I have compiled the file in Overleaf, which can be found in pdf-format on the Github repository :)