In [None]:
import getpass
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Step 1: Start a session and log in
#
# Prompts for the credentials, reads the `form_build_id` token from the page that
# carries the login form, and posts the form through a shared `requests.Session`
# so that later cells reuse the authenticated cookies.

url="https://drupal.star.bnl.gov/STAR/theses?page="
# getpass keeps the typed values out of the notebook source and its saved output
username = getpass.getpass(prompt='Please, enter username: ')
password = getpass.getpass(prompt='Please, enter password: ')
otp_code = getpass.getpass(prompt='Please, enter OneTimePass from Authenticator App: ')

session = requests.Session()
# This payload will need to be tailored to the specific site's login parameters
login_page = session.get(url)
soup = BeautifulSoup(login_page.text, 'html.parser')

# Find form_build_id; fail with a readable message instead of an AttributeError on None
form_build_input = soup.find('input', {'name': 'form_build_id'})
if form_build_input is None:
    raise RuntimeError(
        f"No 'form_build_id' input found at {url} "
        f"(HTTP {login_page.status_code}); cannot submit the login form."
    )
form_build_id = form_build_input.get('value')

# Step 2: Submit the login form
login_data = {
    'form_build_id': form_build_id,
    'form_id': 'user_login_block',
    'name': username,
    'pass': password,
    'gacode': otp_code,
    # requests URL-encodes form values itself, so pass the plain button label;
    # a pre-encoded 'Log+in' would reach the server with a literal '+'.
    'op': 'Log in'
}
# Perform the login
response = session.post(url, data=login_data)

# A 200 status alone does not prove authentication (a rejected login can also
# return 200 with the form re-rendered), so only the page content is checked.
# NOTE(review): assumes the logged-in page contains the text "Log out" - confirm
# against the site's theme.
if "Log out" in response.text:
    print("Login successful!")
else:
    print(f"Login may have failed (HTTP {response.status_code}): no 'Log out' link found in the response.")

In [None]:
# Walk every listing page and build `thesis_metadata`, a dict that maps a key
# (normally the thesis file name) to {"Title": [title], "URL": file_link}.
# Relies on the authenticated `session` created in the login cell.
url="https://drupal.star.bnl.gov/STAR/theses?page="

# Filters separating thesis title links from navigation/pager links.
# They do not depend on the page, so they are defined once outside the loop.
exclude_keywords = ["STAR Theses", "List", "Search", "Submit A Thesis", "first", "previous", "next", "last"]
exclude_patterns = ["/STAR/theses?", "/STAR/theses?page"]
file_extensions = ('.pdf', '.gz', '.ps')

thesis_metadata = {}
for page in range(0, 36):
    pageurl = url + str(page)
    page_response = session.get(pageurl)
    soup = BeautifulSoup(page_response.text, 'html.parser')

    # Loop through all <a> tags to find thesis titles and file links
    for link in soup.find_all('a', href=True):
        href = link['href']
        title = link.get_text(strip=True)

        # A thesis title link points under 'theses', is not itself a file,
        # and matches none of the navigation keywords/patterns.
        is_thesis_link = (
            'theses' in href
            and not href.endswith(file_extensions)
            and all(keyword not in title for keyword in exclude_keywords)
            and all(pattern not in href for pattern in exclude_patterns)
        )
        if not is_thesis_link:
            continue

        # Find the next <a> tag whose href ends with one of the file extensions.
        # NOTE(review): find_next searches the rest of the document, so a thesis
        # without its own file could pick up the file of a later entry - verify
        # against the page layout.
        file_link_tag = link.find_next(
            'a', href=lambda candidate: candidate and candidate.endswith(file_extensions)
        )

        if file_link_tag:
            file_link = file_link_tag['href']
            metadata_key = file_link.split('/')[-1]
        else:
            # No file found: key the entry by its title instead of reusing the
            # previous thesis' file name (which would overwrite that entry, or
            # raise NameError if this is the very first thesis).
            file_link = None
            metadata_key = title

        # Store the entry in the thesis_metadata dict
        thesis_metadata[metadata_key] = {"Title": [title], "URL": file_link}

# Output the collected metadata
for data in thesis_metadata:
    print (thesis_metadata[data])


print ("Number of theses found: ", len(thesis_metadata))


In [None]:
import json
# Serialise the collected thesis metadata as pretty-printed JSON text
metadata_json = json.dumps(thesis_metadata, indent=4)

# Persist the JSON text next to the notebook
with open('metadata.json', mode='w') as json_file:
    json_file.write(metadata_json)

In [None]:
# Download every .pdf/.ps file linked from the thesis listing pages into
# `download_folder`, reusing the authenticated `session` from the login cell.
url="https://drupal.star.bnl.gov/STAR/theses?page="

download_folder="theses"
# Create the target folder up front; open(..., 'wb') does not create directories
os.makedirs(download_folder, exist_ok=True)

# NOTE(review): the metadata cell also records '.gz' links, which are not
# downloaded here - confirm whether that is intentional.
download_extensions = ('.pdf', '.ps')

for page in range(0, 36):
    page_url = f"{url}{page}"

    print (f"Downloading page {page_url}")

    page_response = session.get(page_url)
    soup = BeautifulSoup(page_response.text, 'html.parser')

    # Collect the hrefs of all downloadable files on this page
    pdf_links = [
        link['href']
        for link in soup.find_all('a', href=True)
        if link['href'].endswith(download_extensions)
    ]
    print ('\n'.join(pdf_links))

    for pdf_link in pdf_links:
        pdf_name = pdf_link.split("/")[-1]
        pdf_name = f"{download_folder}/{pdf_name}"
        # urljoin leaves absolute URLs untouched and resolves relative hrefs
        # against the page they were found on
        pdf_response = session.get(urljoin(page_url, pdf_link))

        # Do not save an HTTP error page under a .pdf/.ps name
        if not pdf_response.ok:
            print(f"Skipping {pdf_link}: HTTP {pdf_response.status_code}")
            continue

        with open(pdf_name, 'wb') as pdf_file:
            pdf_file.write(pdf_response.content)
        print(f"Downloaded {pdf_name}")


In [None]:
# convert .ps to .pdf
#
# Runs the external `ps2pdf` tool on every .ps file in `folder_path`, writing the
# .pdf next to the source file, and reports any file whose conversion failed.
import os
import subprocess

# Define the folder containing the .ps files
folder_path = 'theses'

failed_conversions = []

# List all files in the folder
for file_name in os.listdir(folder_path):
    # Only .ps files are converted
    if not file_name.endswith('.ps'):
        continue

    print (file_name)
    # Construct the full file paths
    ps_file = os.path.join(folder_path, file_name)
    # Swap only the trailing extension; str.replace('.ps', '.pdf') would also
    # rewrite a '.ps' that appears earlier in the file name
    pdf_file = os.path.splitext(ps_file)[0] + '.pdf'

    # Convert the .ps file to .pdf using ps2pdf and keep track of failures
    result = subprocess.run(['ps2pdf', ps_file, pdf_file])
    if result.returncode != 0:
        failed_conversions.append(file_name)
        print(f"ps2pdf failed for {file_name} (exit code {result.returncode})")

if failed_conversions:
    print(f"Conversion finished with {len(failed_conversions)} failure(s): {', '.join(failed_conversions)}")
else:
    print("Conversion complete!")