# Data for all Questions - Applications/Offers/Entrances for departments by nationality

Converting the PDF to CSV:

import requests
import pandas as pd
import tabula
import jpype


# URL of the PDF download link
#pdf_download_url = "https://public.tableau.com/vizql/w/ApplicationsOffersandEntrants/v/About/tempfile/sessions/6CFB072D66F548F0BD680B6D8269BD44-0:0/?key=2889495739&keepfile=yes&attachment=yes"
#pdf_download_url = "https://public.tableau.com/vizql/w/LSEStatisticsonStudents/v/TableA/tempfile/sessions/E54A8D2F36614981A37995A0DF6C69BC-0:0/?key=674594458&keepfile=yes&attachment=yes"
# Send a GET request to download the PDF file
#response = requests.get(pdf_download_url)

# Save the content of the response to a file with .pdf extension
file_path = "Data/LSE_Students_acceptance_program_v1.pdf"
#with open(file_path, "wb") as f:
#    f.write(response.content)

# Extract the table from the PDF file
tables = tabula.read_pdf(file_path, pages='all', multiple_tables=True)

# Check if any tables are extracted
if tables:
    # Concatenate all tables into one DataFrame if there are multiple
    combined_table = pd.concat(tables, ignore_index=True)
    
    # Save the combined table to a CSV file
    csv_file = "Data/table_v2.csv"
    combined_table.to_csv(csv_file, index=False)
    print(f"Table saved to {csv_file}")
else:
    print("No table found in the PDF file.")



# Question 2

Undergraduate courses:

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import csv
from bs4 import BeautifulSoup

def setup_driver():
    """Create and return a Chrome WebDriver with the flags this scraper needs."""
    chrome_options = webdriver.ChromeOptions()
    # --no-sandbox bypasses the OS security model;
    # --disable-dev-shm-usage works around limited shared-memory resources.
    for flag in ('--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

def print_department_text(html_content):
    """Return the stripped text of every <h2 class="card__title"> in the HTML.

    Despite the name, nothing is printed: the titles are collected, in
    document order, into a list that is handed back to the caller.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    departments = []
    for heading in parsed.find_all('h2', class_='card__title'):
        departments.append(heading.text.strip())
    return departments

def print_why_study_with_us_section(driver, course_link):
    """Open a course page and return the card titles of its 'why study with us' section.

    Args:
        driver: Selenium WebDriver used to load the page.
        course_link: URL of the course page.

    Returns:
        The list produced by ``print_department_text`` for the section's outer
        HTML, or an empty list if the section does not appear within 10 seconds.
    """
    body_locator = (By.TAG_NAME, 'body')
    section_locator = (By.ID, 'why-study-with-us')

    driver.get(course_link)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(body_locator))
    try:
        section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(section_locator)
        )
        return print_department_text(section.get_attribute('outerHTML'))
    except TimeoutException:
        print("Failed to locate or extract the 'why study with us' section.")
        return []

# Course-search listing page the crawl starts from
initial_url = "https://www.lse.ac.uk/programmes/search-courses?studyType=0%2F1%2F26%2F85%2F86"

# A single browser session is shared by the functions and the loop below
driver = setup_driver()
driver.get(initial_url)

def process_course_page(course_link):
    """Scrape one course page with the module-level ``driver``.

    Returns:
        A ``(course_name, median_salary, department)`` tuple of strings. Each
        field falls back to a "... not found." placeholder when its element
        cannot be located.
    """
    # Loading the page happens inside print_why_study_with_us_section.
    departments = print_why_study_with_us_section(driver, course_link)
    if departments:
        department = ', '.join(departments)
    else:
        department = "Department not found."

    try:
        title_elem = driver.find_element(By.CSS_SELECTOR, 'h1.hero__title span')
        course_name = title_elem.text.strip()
    except NoSuchElementException:
        course_name = "Course name not found."

    salary_locator = (By.CSS_SELECTOR, 'section#graduate-destinations div.salary')
    try:
        salary_elem = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(salary_locator))
        median_salary = salary_elem.text.strip()
    except TimeoutException:
        median_salary = "Salary not found."

    return course_name, median_salary, department

def navigate_to_next_page():
    """Click the pagination 'Next' button using the module-level ``driver``.

    Returns:
        The driver's current URL after the click, or ``None`` when no clickable
        'Next' button shows up within 20 seconds.
    """
    next_button_locator = (
        By.XPATH,
        "//li[contains(@class, 'next')]//button[contains(., 'Next')]",
    )
    try:
        button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable(next_button_locator))
        button.click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'body')))
    except TimeoutException:
        return None
    return driver.current_url

# Accumulates one (course name, median salary, department) tuple per course
courses_info = []
current_page_url = initial_url

while True:
    # Collect the course links of the listing page currently shown
    WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h2.card__title a')))
    courses = [
        (elem.get_attribute('href'), elem.text)
        for elem in driver.find_elements(By.CSS_SELECTOR, 'h2.card__title a')
    ]

    for course_link, _ in courses:
        courses_info.append(process_course_page(course_link))
        # Return to the listing so the pagination button is available afterwards
        driver.get(current_page_url)

    next_url = navigate_to_next_page()
    if not next_url:
        break
    current_page_url = next_url

driver.quit()

# Save results to a CSV file
csv_file_path = 'Data/output2804.csv'
with open(csv_file_path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Course Name', 'Median Salary', 'Department'])
    for course_row in courses_info:
        writer.writerow(course_row)

print("Data extraction complete. Results saved to CSV.")


Data extraction complete. Results saved to CSV.


Postgraduate courses:

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import csv
from bs4 import BeautifulSoup
import os

def setup_driver():
    """Return a new Chrome WebDriver started with the sandbox and /dev/shm flags disabled."""
    opts = webdriver.ChromeOptions()
    opts.add_argument('--no-sandbox')
    opts.add_argument('--disable-dev-shm-usage')
    return webdriver.Chrome(options=opts)

def _extract_median_salary(soup):
    """Return the text following the 'Median salary' label in the Careers accordion."""
    not_found = "Salary not found."

    careers_accordion = soup.find(
        'h1', class_='accordion__title',
        string=lambda text: 'Careers' in text if text else False)
    if not careers_accordion:
        return not_found

    careers_content = careers_accordion.find_next('div', class_='accordion__content')
    if not careers_content:
        return not_found

    salary_tag = careers_content.find(
        'strong', string=lambda text: "Median salary" in text if text else False)
    if not salary_tag:
        return not_found

    sibling = salary_tag.next_sibling
    if sibling is None:
        return not_found
    # A text node is a str subclass and can be stripped directly. An element
    # node has no .strip() (attribute access on a Tag is a child lookup), so
    # its text content is used instead.
    salary_text = sibling if isinstance(sibling, str) else sibling.get_text()
    # An empty or whitespace-only value carries no salary either.
    return salary_text.strip() or not_found


def process_course_page(driver, course_link):
    """Visit a postgraduate course page and extract its key details.

    Args:
        driver: Selenium WebDriver used to load the page.
        course_link: URL of the course page.

    Returns:
        A ``(course_name, median_salary, department)`` tuple of strings. Each
        field falls back to a "... not found." placeholder when the
        corresponding element is missing from the page.
    """
    driver.get(course_link)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Extracting course name
    course_name_elem = soup.find('h1', class_='pageTitle')
    course_name = course_name_elem.text.strip() if course_name_elem else "Course name not found."

    # Finding department
    department_elem = soup.find('li', class_='keyDetails__item--dept')
    department = department_elem.text.strip() if department_elem else "Department not found."

    # Extracting median salary directly from the Careers accordion content
    median_salary = _extract_median_salary(soup)

    # Progress output: one salary line per processed course
    print(median_salary)
    return course_name, median_salary, department


# Main scraping function
def scrape_courses(base_url):
    """Crawl every page of a course listing and scrape each linked course.

    Args:
        base_url: URL of the first listing page.

    Returns:
        A list with one ``process_course_page`` result per course, in the
        order the courses were visited.

    The browser is always closed, even when a step raises.
    """
    card_links = (By.CSS_SELECTOR, 'h2.card__title a')
    driver = setup_driver()
    try:
        driver.get(base_url)
        courses_info = []
        current_page_url = base_url  # Listing page to return to after each course

        while True:
            # Wait for the course links to be present, then snapshot their URLs.
            # Only plain strings are kept because the elements go stale as soon
            # as the browser leaves the listing page.
            WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located(card_links))
            course_urls = [elem.get_attribute('href') for elem in driver.find_elements(*card_links)]

            for course_link in course_urls:
                # process_course_page loads the page itself, so it is not
                # fetched here a second time.
                courses_info.append(process_course_page(driver, course_link))
                driver.get(current_page_url)  # Go back to the current list page, not the base URL
                WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located(card_links))

            new_page_url = navigate_to_next_page(driver)
            if not new_page_url:
                break  # No more pages

            current_page_url = new_page_url
            driver.get(new_page_url)
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))

        return courses_info
    finally:
        driver.quit()

def navigate_to_next_page(driver):
    """Click the pagination 'Next' button in ``driver``.

    Returns:
        The driver's current URL after the click, or ``None`` if no clickable
        'Next' button is found within 20 seconds.
    """
    next_xpath = "//li[contains(@class, 'next')]//button[contains(., 'Next')]"
    try:
        clickable_next = EC.element_to_be_clickable((By.XPATH, next_xpath))
        WebDriverWait(driver, 20).until(clickable_next).click()
        body_present = EC.presence_of_element_located((By.TAG_NAME, 'body'))
        WebDriverWait(driver, 10).until(body_present)
    except TimeoutException:
        return None
    return driver.current_url

    


def save_to_csv(data, filename):
    """Append rows to a CSV file, writing the header first if the file is new.

    Args:
        data: Iterable of rows (each an iterable of cell values).
        filename: Path of the CSV file to append to.
    """
    # Decide about the header before open() creates the file.
    needs_header = not os.path.isfile(filename)

    with open(filename, 'a', newline='') as out:
        row_writer = csv.writer(out)
        if needs_header:
            row_writer.writerow(['Course Name', 'Median Salary', 'Department'])
        row_writer.writerows(data)


# URL for postgraduate courses (first page of the course-search listing)
postgraduate_url = "https://www.lse.ac.uk/programmes/search-courses?studyType=0%2F1%2F26%2F85%2F87"
# Crawl every listing page, then append the rows to the CSV used for the
# undergraduate results (save_to_csv opens the file in append mode).
postgraduate_courses = scrape_courses(postgraduate_url)
save_to_csv(postgraduate_courses, 'Data/output2804.csv')
print("Data extraction complete. Results added to 'output2804.csv'.")



£33,000
£42,000
Salary not found.
£35,000
£30,000
£30,000
£34,000
£30,000
£30,000
Salary not found.
Salary not found.
£34,000
Salary not found.
£30,000
£30,000
£38,000
£32,000
£35,000
Salary not found.
£32,000
Salary not found.
£33,000
£30,000
£34,000
£32,000
£30,000
Salary not found.
£34,000
Salary not found.
Salary not found.
Salary not found.
£30,000
£30,000
Salary not found.
£32,000
Salary not found.
£35,000
£39,500
£35,000
Salary not found.
£38,000
Salary not found.
£33,000
Salary not found.
£33,000
£30,000
£28,000
£30,000
£32,000
£30,000
£32,000
£32,000
£30,000
£34,000
Salary not found.
£34,000
Salary not found.
Salary not found.
Salary not found.
Salary not found.
Salary not found.
Salary not found.
£33,000
Salary not found.
£30,000
£38,000
£33,000
£34,500
Salary not found.
£33,000
£28,000
Salary not found.
£32,000
£30,000
£42,000
£30,000
£28,000
£38,000
Salary not found.
£38,000
Salary not found.
£33,000
£30,000
£35,000
£28,000
£38,000
Salary not found.
£38,000
Salary not found

# Question 3

In [3]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_gdp_by_country():
    """Scrape the GDP-by-country table from worldometers.info.

    Returns:
        A list of ``(country, gdp, population, gdp_per_capita)`` string tuples,
        taken from cells 1, 2, 5 and 6 of every table row that has at least
        seven ``<td>`` cells. ``$`` signs and thousands separators are removed
        from the numeric fields.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    url = "https://www.worldometers.info/gdp/gdp-by-country/"
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    gdp_by_country_data = []

    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 7:
            # Rows with too few data cells (presumably headers) are skipped.
            continue
        country = cells[1].text.strip()
        gdp = cells[2].text.strip().replace('$', '').replace(',', '')
        population = cells[5].text.strip().replace(',', '')
        gdp_per_capita = cells[6].text.strip().replace('$', '').replace(',', '')
        gdp_by_country_data.append((country, gdp, population, gdp_per_capita))

    return gdp_by_country_data

def merge_with_country_nationality_mapping(gdp_data, country_nationality_file):
    """Attach a nationality to every GDP record.

    Args:
        gdp_data: Iterable of ``(country, gdp, population, gdp_per_capita)``.
        country_nationality_file: UTF-8 CSV with ``en_short_name`` and
            ``nationality`` columns.

    Returns:
        A list of ``(country, nationality, gdp, population, gdp_per_capita)``
        tuples. The nationality is an empty string for countries that are
        missing from the mapping file.
    """
    with open(country_nationality_file, 'r', newline='', encoding='utf-8') as file:
        nationality_by_country = {
            record['en_short_name']: record['nationality']
            for record in csv.DictReader(file)
        }

    return [
        (country, nationality_by_country.get(country, ''), gdp, population, gdp_per_capita)
        for country, gdp, population, gdp_per_capita in gdp_data
    ]

def write_to_csv(data, csv_file):
    """Write the merged GDP rows to ``csv_file`` (UTF-8), preceded by a header row.

    An existing file at that path is overwritten.
    """
    header = ['Country', 'Nationality', 'GDP', 'Population', 'GDP per Capita']
    with open(csv_file, 'w', newline='', encoding='utf-8') as out:
        out_writer = csv.writer(out)
        out_writer.writerow(header)
        for record in data:
            out_writer.writerow(record)

def main():
    """Scrape the GDP table, add nationalities and save the merged result.

    Nothing is written when the scrape yields no rows.
    """
    gdp_by_country_data = scrape_gdp_by_country()
    if not gdp_by_country_data:
        return

    merged = merge_with_country_nationality_mapping(gdp_by_country_data, 'Data/countries.csv')
    write_to_csv(merged, 'Data/merged_gdp_data.csv')

if __name__ == "__main__":
    main()


# Question 4

In [4]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Setup ChromeDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
search_url = "https://eprints.lse.ac.uk/cgi/search/advanced"

# Years queried for every department; one CSV column per year
years = range(2010, 2024)
columns = ['Department'] + [str(year) for year in years]
# results_df is rebuilt from result_rows after each department, which avoids
# repeatedly concatenating onto an initially empty DataFrame.
results_df = pd.DataFrame(columns=columns)
result_rows = []

# List of departments (must match the visible option text of the "divisions" dropdown)
departments = [
    'Geography & Environment', 'Philosophy, Logic and Scientific Method', 
    'Psychological and Behavioural Science', 'Government', 'Law', 
    'Social Policy', 'Mathematics', 'Economic History', 'Sociology', 
    'International History', 'Statistics', 'Management', 'International Relations', 
    'Anthropology', 'Economics', 'Language Centre', 'Accounting', 'Finance', 
    'Methodology', 'School of Public Policy', 
    'European Institute', 'Media and Communications', 'Health Policy', 
    'International Development', 'Gender Studies'
]

# try/finally guarantees the browser is closed even if a lookup raises part-way
try:
    driver.get(search_url)
    # One short-timeout waiter is reused for every lookup
    wait = WebDriverWait(driver, 2)

    # Process each department
    for department in departments:
        row_data = {'Department': department}
        driver.get(search_url)  # Navigate back to the main search page for each department
        try:
            divisions_select = Select(driver.find_element(By.ID, "divisions"))  # Locate the dropdown again
            divisions_select.select_by_visible_text(department)
            available = True
        except NoSuchElementException:
            available = False
            print(f"Department {department} not found.")

        if available:
            for year in years:
                try:
                    date_input = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[name='date']")))
                    date_input.clear()
                    date_input.send_keys(str(year))
                    date_input.send_keys(Keys.RETURN)

                    # Wait for the page to load and scrape the total results
                    total_results_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "span.ep_search_number")))
                    total_results = total_results_elements[-1].text  # Get the text of the last element
                    row_data[str(year)] = total_results
                    print(f"Results for {department} in {year}: {total_results}")
                except NoSuchElementException:
                    row_data[str(year)] = 'Element not found'
                    print(f"Element not found for {department} in {year}.")
                except TimeoutException:
                    row_data[str(year)] = 'Timeout or no results'
                    print(f"Timeout or no results for {department} in {year}.")
                # Reload the search form and re-select the department for the next year
                driver.get('https://eprints.lse.ac.uk/cgi/search/archive/advanced')
                divisions_select = Select(driver.find_element(By.ID, "divisions"))  # Locate the dropdown again
                divisions_select.select_by_visible_text(department)

        # Record this department and save incrementally so partial progress survives a crash
        result_rows.append(row_data)
        results_df = pd.DataFrame(result_rows, columns=columns)
        results_df.to_csv('Data/department_yearly_results.csv', index=False)
finally:
    # Close the browser
    driver.quit()

print("Data scraping completed and saved to 'Data/department_yearly_results.csv'.")


Results for Geography & Environment in 2010: 220
Results for Geography & Environment in 2011: 296
Results for Geography & Environment in 2012: 243
Results for Geography & Environment in 2013: 160
Results for Geography & Environment in 2014: 160
Results for Geography & Environment in 2015: 108
Results for Geography & Environment in 2016: 129
Results for Geography & Environment in 2017: 119
Results for Geography & Environment in 2018: 119
Results for Geography & Environment in 2019: 117
Results for Geography & Environment in 2020: 175
Results for Geography & Environment in 2021: 177
Results for Geography & Environment in 2022: 170
Results for Geography & Environment in 2023: 145
Results for Philosophy, Logic and Scientific Method in 2010: 98
Results for Philosophy, Logic and Scientific Method in 2011: 60
Results for Philosophy, Logic and Scientific Method in 2012: 66
Results for Philosophy, Logic and Scientific Method in 2013: 68
Results for Philosophy, Logic and Scientific Method in 201

Results for Anthropology in 2022: 57
Results for Anthropology in 2023: 50
Results for Economics in 2010: 164
Results for Economics in 2011: 171
Results for Economics in 2012: 149
Results for Economics in 2013: 164
Results for Economics in 2014: 115
Results for Economics in 2015: 100
Results for Economics in 2016: 120
Results for Economics in 2017: 107
Results for Economics in 2018: 97
Results for Economics in 2019: 114
Results for Economics in 2020: 194
Results for Economics in 2021: 139
Results for Economics in 2022: 142
Results for Economics in 2023: 134
Timeout or no results for Language Centre in 2010.
Timeout or no results for Language Centre in 2011.
Timeout or no results for Language Centre in 2012.
Timeout or no results for Language Centre in 2013.
Timeout or no results for Language Centre in 2014.
Timeout or no results for Language Centre in 2015.
Timeout or no results for Language Centre in 2016.
Timeout or no results for Language Centre in 2017.
Timeout or no results for Lan

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import os
import time
import re

def setup_driver(download_dir_absolute):
    """Start Chrome configured to save PDFs into ``download_dir_absolute``.

    The preferences set the default download directory, disable the download
    prompt, and make PDFs open externally (i.e. download) instead of in the
    built-in viewer.
    """
    download_prefs = {
        "download.default_directory": download_dir_absolute,
        "download.prompt_for_download": False,
        "plugins.always_open_pdf_externally": True
    }
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_experimental_option("prefs", download_prefs)
    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def extract_year_from_filename(filename):
    """Return the year found in ``filename`` as a four-character string, or None.

    The first run of four digits wins. If there is none, the first run of two
    digits is taken and prefixed with '20'.
    """
    # (pattern, prefix) pairs tried in order of preference
    candidates = ((r'(\d{4})', ''), (r'(\d{2})', '20'))
    for pattern, prefix in candidates:
        found = re.search(pattern, filename)
        if found:
            return prefix + found.group(1)
    return None

def rename_downloaded_file(download_dir, original_filename, year):
    """Rename a downloaded file inside ``download_dir`` to ``Fees<year>.pdf``."""
    new_filename = f"Fees{year}.pdf"
    source = os.path.join(download_dir, original_filename)
    target = os.path.join(download_dir, new_filename)
    os.rename(source, target)
    print(f"Renamed {original_filename} to {new_filename}")

def _wait_for_download(path, timeout):
    """Poll until ``path`` exists or ``timeout`` seconds pass; return whether it exists."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if os.path.isfile(path):
            return True
        time.sleep(0.25)
    return os.path.isfile(path)


def download_pdfs_by_class(base_url, class_name, download_dir, download_timeout=30):
    """Download every PDF linked from elements of a given class and rename it by year.

    Args:
        base_url: Page that lists the PDF links.
        class_name: Class name used to locate the link elements.
        download_dir: Absolute directory the browser downloads into.
        download_timeout: Seconds to wait for each file to appear in
            ``download_dir`` before it is skipped.

    Links without an ``href``, links that do not end in ``.pdf`` and the
    "Fee-approval-cycle-2024.pdf" document are ignored, as are files whose
    name contains no recognisable year. The browser is always closed.
    """
    driver = setup_driver(download_dir)
    try:
        driver.get(base_url)
        time.sleep(1)  # Adjust based on your internet speed
        links = driver.find_elements(By.CLASS_NAME, class_name)

        for link in links:
            href = link.get_attribute('href')
            # Check for a missing href first: the substring test below would
            # raise TypeError on None.
            if not href or not href.endswith('.pdf'):
                continue
            # Skip the unwanted PDF
            if "Fee-approval-cycle-2024.pdf" in href:
                continue

            original_filename = href.split('/')[-1]
            year = extract_year_from_filename(original_filename)
            if year:
                # Pass the URL as a script argument rather than splicing it
                # into JavaScript source, so quotes in the URL cannot break it.
                driver.execute_script("window.open(arguments[0]);", href)
                # Rename only once the file is actually on disk.
                # NOTE(review): assumes the browser saves the file under the
                # final path segment of the URL — confirm for encoded names.
                downloaded_path = os.path.join(download_dir, original_filename)
                if _wait_for_download(downloaded_path, download_timeout):
                    rename_downloaded_file(download_dir, original_filename, year)
                else:
                    print(f"Download of {original_filename} did not finish within {download_timeout}s; skipped.")
            # Switch back to the main window
            driver.switch_to.window(driver.window_handles[0])
    finally:
        # Close the driver
        driver.quit()

# Page listing the fee tables, and the class name used to find its PDF links
base_url = 'https://info.lse.ac.uk/staff/divisions/Planning-Division/Table-of-Fees'
class_name = 'sys_21'

# The download directory is passed on as an absolute path; it is created on
# first use.
download_dir_relative = 'Data/TuitionFees'
download_dir_absolute = os.path.abspath(download_dir_relative)
directory_missing = not os.path.exists(download_dir_absolute)
if directory_missing:
    os.makedirs(download_dir_absolute)

# Fetch and rename every fee table
download_pdfs_by_class(base_url, class_name, download_dir_absolute)


Renamed Table-of-fees-2024-25-20Feb24-Updated-Home-PGR-fee.pdf to Fees2024.pdf
Renamed Table-of-fees-2023-24-7Nov23.pdf to Fees2023.pdf
Renamed Comb2022ToF-Final-19July23.pdf to Fees2022.pdf
Renamed ToF-3Aug21FinalComb.pdf to Fees2021.pdf
Renamed 2020-Table-of-Fees-25Jun20.pdf to Fees2020.pdf
Renamed 2019-Table-of-Fees.pdf to Fees2019.pdf
Renamed 2018-19-Fees-Table.pdf to Fees2018.pdf
Renamed 2017-18-Fees-Table.pdf to Fees2017.pdf
