# Question 1 - Applications/Offers/Entrances for departments by nationality

The data used came from the following website: https://public.tableau.com/app/profile/lseplanningdivision/vizzes. This contains all the official statistics on students which LSE publishes. Specifically, we used the "LSE applications, Offers and Entrants" tableau which breaks down the applications for various departments for both levels of study over the past 5 years by country.

The problem with this data source was that the website is dynamically generated, and therefore we cannot use BeautifulSoup on its own to scrape the data. We further tried to obtain the data via an FOI (Freedom of Information) request, but unfortunately, due to restricted access to the data, this was not possible.

Therefore, to circumvent this issue we downloaded the data as a pdf. Then we used the tabula and jpype modules to convert this pdf into a csv. 

IMPORTANT: To run this code you have to have Java installed. 

Converting the pdf to csv:

In [None]:
import requests
import pandas as pd
import tabula
import jpype


# Convert the manually downloaded Tableau PDF export into a single CSV file.
#
# The download step below is kept for reference only and is intentionally
# commented out: the script reads the PDF already stored under Data/.
# NOTE(review): the URLs embed a session identifier, so they presumably expire
# and would have to be regenerated before the download could be re-enabled.

# URL of the PDF download link
#pdf_download_url = "https://public.tableau.com/vizql/w/ApplicationsOffersandEntrants/v/About/tempfile/sessions/6CFB072D66F548F0BD680B6D8269BD44-0:0/?key=2889495739&keepfile=yes&attachment=yes"
#pdf_download_url = "https://public.tableau.com/vizql/w/LSEStatisticsonStudents/v/TableA/tempfile/sessions/E54A8D2F36614981A37995A0DF6C69BC-0:0/?key=674594458&keepfile=yes&attachment=yes"
# Send a GET request to download the PDF file
#response = requests.get(pdf_download_url)

# Save the content of the response to a file with .pdf extension
file_path = "Data/LSE_Students_acceptance_program_v1.pdf"
#with open(file_path, "wb") as f:
#    f.write(response.content)

# Extract every table from every page of the PDF file
tables = tabula.read_pdf(file_path, pages='all', multiple_tables=True)

# Check if any tables are extracted
if tables:
    # Concatenate all tables into one DataFrame if there are multiple
    combined_table = pd.concat(tables, ignore_index=True)

    # Save the combined table to a CSV file
    csv_file = "Data/data_table.csv"
    combined_table.to_csv(csv_file, index=False)
    print(f"Table saved to {csv_file}")
else:
    print("No table found in the PDF file.")



Notably the data extracted above will not only be used for question 1 but constantly throughout the project. Therefore, it arguably is our most important data source as it contains all the information regarding applications from different countries for the last five years which forms the basis of our analysis.

# Question 2

The salary data was systematically gathered from the individual course websites using the Selenium and Beautiful Soup modules. Due to structural differences between undergraduate and postgraduate course websites, separate scraping scripts were used for each category. Initially, both scripts navigate to the central course overview website, where they apply filters to select either undergraduate or postgraduate courses. Subsequently, the scripts access each course's specific page from the listings.

On reaching a course's individual page, the scripts are programmed to extract essential information, such as the name of the course, the department to which it belongs, and the median salary of graduates 15 months after completing the course. Upon extracting this information, going back to the main page and reaching the end of the page, the scripts automatically find and activate the “next” button to proceed to subsequent pages, ensuring a comprehensive capture of available data.

For postgraduate courses, while the overall procedure remains the same, the specific HTML elements from which data is extracted differ due to variations in page layout. BeautifulSoup is used across both scripts to parse HTML content and accurately locate and extract the targeted data elements. The extracted data is then compiled and stored in a CSV file.

Undergraduate courses:

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import csv
from bs4 import BeautifulSoup


# Scrape course name, median salary and department for every course on the
# listing below, then write the rows to Data/output_median_salary.csv.
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(options=options)

# Define URL of the course listing (first results page)
initial_url = "https://www.lse.ac.uk/programmes/search-courses?studyType=0%2F1%2F26%2F85%2F86"

course_data = []

# try/finally guarantees the browser process is shut down even when a wait
# times out or an unexpected error interrupts the scrape.
try:
    driver.get(initial_url)

    #Continue scraping until all pages are visited
    while True:
        WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h2.card__title a'))
        )
        # Collect the hrefs up front: the elements themselves go stale as soon
        # as the driver navigates away from the listing page.
        course_links = [
            elem.get_attribute('href')
            for elem in driver.find_elements(By.CSS_SELECTOR, 'h2.card__title a')
        ]

        for link in course_links:
            driver.get(link)
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))

            #Extract department name(s) from the card titles inside the
            #'why-study-with-us' section
            try:
                department_section = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, 'why-study-with-us'))
                )
                soup = BeautifulSoup(department_section.get_attribute('outerHTML'), 'html.parser')
                department_titles = [title.text.strip() for title in soup.find_all('h2', class_='card__title')]
                department = ', '.join(department_titles) if department_titles else "Department not found."
            except TimeoutException:
                department = "Department not found."

            #Extract course name
            try:
                course_name = driver.find_element(By.CSS_SELECTOR, 'h1.hero__title span').text.strip()
            except NoSuchElementException:
                course_name = "Course name not found."

            #Extract median salary
            try:
                salary_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'section#graduate-destinations div.salary'))
                )
                median_salary = salary_element.text.strip()
            except TimeoutException:
                median_salary = "Salary not found."

            course_data.append((course_name, median_salary, department))

            #Navigate back to the list page
            driver.get(initial_url)

        #Attempt to move to the next page; a timeout means there is no
        #clickable "Next" button left, i.e. the last page was processed.
        try:
            next_page_button = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.XPATH, "//li[contains(@class, 'next')]//button[contains(., 'Next')]"))
            )
            next_page_button.click()
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
            # Remember the new listing URL so "navigate back" returns to this page
            initial_url = driver.current_url
        except TimeoutException:
            break
finally:
    driver.quit()

#save data to CSV (mode 'w' overwrites any previous output)
csv_path = 'Data/output_median_salary.csv'
with open(csv_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Course Name', 'Median Salary', 'Department'])
    writer.writerows(course_data)

print("Data extraction complete. Results saved to 'output_median_salary.csv'.")

Data extraction complete. Results saved to CSV.


Postgraduate courses:

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import csv
from bs4 import BeautifulSoup
import os

# Scrape course name, median salary and department for every course on the
# listing below and append the rows to Data/output_median_salary.csv.

#Set up the WebDriver for Chrome
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)

courses_info = []

#Define the base URL filtered for postgraduate courses
base_url = "https://www.lse.ac.uk/programmes/search-courses?studyType=0%2F1%2F26%2F85%2F87"
current_page_url = base_url

# try/finally guarantees the browser process is shut down even when a wait
# times out or an unexpected error interrupts the scrape.
try:
    driver.get(base_url)

    while True:
        WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h2.card__title a')))
        # Collect the hrefs up front: the elements themselves go stale as soon
        # as the driver navigates away from the listing page.
        course_links = [
            elem.get_attribute('href')
            for elem in driver.find_elements(By.CSS_SELECTOR, 'h2.card__title a')
        ]

        for course_link in course_links:
            driver.get(course_link)
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))

            #Extract course details using BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            #Extract course name
            course_name_elem = soup.find('h1', class_='pageTitle')
            course_name = course_name_elem.text.strip() if course_name_elem else "Course name not found."

            #Extract department name
            department_elem = soup.find('li', class_='keyDetails__item--dept')
            department = department_elem.text.strip() if department_elem else "Department not found."

            #Extract median salary: the text node that directly follows the
            #<strong> label containing "Median salary" inside the "Careers" accordion
            median_salary = "Salary not found."
            careers_accordion = soup.find(
                'h1', class_='accordion__title',
                string=lambda text: 'Careers' in text if text else False,
            )
            if careers_accordion:
                careers_content = careers_accordion.find_next('div', class_='accordion__content')
                if careers_content:
                    salary_tag = careers_content.find(
                        'strong',
                        string=lambda text: "Median salary" in text if text else False,
                    )
                    salary_text = salary_tag.next_sibling if salary_tag else None
                    # Only text nodes (str subclasses) have a real .strip();
                    # on a Tag sibling the attribute lookup yields None and
                    # calling it would raise TypeError.
                    if salary_text and isinstance(salary_text, str):
                        median_salary = salary_text.strip()

            #Store extracted data
            courses_info.append((course_name, median_salary, department))

            #Navigate back to current page URL
            driver.get(current_page_url)
            WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h2.card__title a')))

        #Navigate to the next page if possible; a timeout means there is no
        #clickable "Next" button left, i.e. the last page was processed.
        try:
            next_page_button = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.XPATH, "//li[contains(@class, 'next')]//button[contains(., 'Next')]")))
            next_page_button.click()
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
            current_page_url = driver.current_url
        except TimeoutException:
            break
finally:
    driver.quit()

#Save results to a CSV file; rows are appended, and the header is written
#only when the file does not exist yet.
csv_file_path = 'Data/output_median_salary.csv'
file_exists = os.path.isfile(csv_file_path)

with open(csv_file_path, 'a', newline='') as file:
    writer = csv.writer(file)
    if not file_exists:
        writer.writerow(['Course Name', 'Median Salary', 'Department'])
    writer.writerows(courses_info)

print("Data extraction complete. Results added to 'output_median_salary.csv'.")

Data extraction complete. Results added to 'output2804.csv'.


# Question 3

This question aims to relate the Brexit-induced changes at LSE to economic characteristics of the countries students applied from. We thus acquire the GDP per capita as well as the population for every country. We conduct a series of operations to gather GDP data by country from a designated webpage, integrate it with nationality information from a CSV file, and then store the merged data in a new CSV file. The original data is from the World Bank and reflects figures from 2022. A function iterates over the HTML table rows, retrieving the information, namely country name, GDP, population, and GDP per capita.
Subsequently, another function combines the scraped GDP data with nationality information fetched from the CSV file we obtained in Question 1. It reads the CSV file containing country-nationality mappings, matches countries with their corresponding nationalities, and creates a merged dataset. Upon execution, the code facilitates the compilation of comprehensive GDP data by country, along with their respective nationalities, for further analysis or visualization purposes.


In [3]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_gdp_by_country():
    """Scrape the GDP-by-country table from worldometers.info.

    Every ``<tr>`` on the page with at least seven ``<td>`` cells is treated
    as a data row. Currency symbols and thousands separators are stripped,
    but values are otherwise returned as the raw strings found on the page.

    Returns:
        list[tuple[str, str, str, str]]: ``(country, gdp, population,
        gdp_per_capita)`` for each data row; empty if no such row is found.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
        (including when the request exceeds the timeout).
    """
    url = "https://www.worldometers.info/gdp/gdp-by-country/"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    gdp_by_country_data = []

    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        # Skip rows without the expected number of data cells (e.g. header rows).
        if len(cells) < 7:
            continue
        country = cells[1].text.strip()
        gdp = cells[2].text.strip().replace('$', '').replace(',', '')
        population = cells[5].text.strip().replace(',', '')
        gdp_per_capita = cells[6].text.strip().replace('$', '').replace(',', '')
        gdp_by_country_data.append((country, gdp, population, gdp_per_capita))

    return gdp_by_country_data

def merge_with_country_nationality_mapping(gdp_data, country_nationality_file):
    """Attach a nationality to every GDP record.

    Args:
        gdp_data: iterable of ``(country, gdp, population, gdp_per_capita)``
            tuples.
        country_nationality_file: path to a UTF-8 CSV file with the columns
            ``en_short_name`` and ``nationality``.

    Returns:
        list of ``(country, nationality, gdp, population, gdp_per_capita)``
        tuples in input order; ``nationality`` is ``''`` for countries that
        do not appear in the mapping file.
    """
    with open(country_nationality_file, 'r', newline='', encoding='utf-8') as mapping_fh:
        # Later rows win if a country name appears more than once.
        nationality_by_country = {
            record['en_short_name']: record['nationality']
            for record in csv.DictReader(mapping_fh)
        }

    return [
        (name, nationality_by_country.get(name, ''), gdp_value, inhabitants, per_capita)
        for name, gdp_value, inhabitants, per_capita in gdp_data
    ]

def write_to_csv(data, csv_file):
    """Write the merged GDP rows to ``csv_file`` (UTF-8), overwriting it.

    A fixed header row is written first, followed by one line per item of
    ``data`` (an iterable of row sequences).
    """
    header = ['Country', 'Nationality', 'GDP', 'Population', 'GDP per Capita']
    with open(csv_file, 'w', newline='', encoding='utf-8') as out_fh:
        csv_out = csv.writer(out_fh)
        csv_out.writerow(header)
        csv_out.writerows(data)

def main():
    """Scrape the GDP table, add nationalities and save the merged CSV.

    Nothing is written when the scrape returns no rows.
    """
    scraped_rows = scrape_gdp_by_country()
    if not scraped_rows:
        return
    rows_with_nationality = merge_with_country_nationality_mapping(scraped_rows, 'Data/countries.csv')
    write_to_csv(rows_with_nationality, 'Data/merged_gdp_data.csv')


if __name__ == "__main__":
    main()


# Question 4

For question 4 we decided to use publications data to proxy a department’s research effort. A reliable data source for this was “LSE Research Online” (https://eprints.lse.ac.uk/cgi/search/advanced) with which it is possible to search through LSE’s publications. Especially useful was the fact that here it is possible to directly specify the division whose publications you want to search.
As this is an extensive process we again decided to use the selenium webdriver to automate the web browsing and thus the data acquisition process. 

The list of departments initialized in the code contains the departments whose publications were searched. Specifically, the code attempts to select the department from a dropdown menu on the page. For those where the department is available, it then enters a year into the search field and submits the form to load the search results for that year. These are then stored in a dataframe for the corresponding department and year. Through this, the code collects the number of publications from each department in the initialized list for each year from 2019 to 2023 and then stores the results in a csv. Notably, the dataframe is saved to the csv file incrementally, as the automated webdriving is a long process and can be interrupted frequently.


In [5]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Count LSE Research Online search results per department and year
# (2019-2023) and save them incrementally to
# Data/department_yearly_results.csv.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
search_url = "https://eprints.lse.ac.uk/cgi/search/advanced"

#Prepare a DataFrame for storing results
columns = ['Department'] + [str(year) for year in range(2019, 2024)]
results_df = pd.DataFrame(columns=columns)
# Rows are accumulated in a plain list and the DataFrame is rebuilt from it
# after each department, instead of concatenating onto an empty frame.
collected_rows = []

#List of departments
departments = [
    'Geography & Environment', 'Philosophy, Logic and Scientific Method', 
    'Psychological and Behavioural Science', 'Government', 'Law', 
    'Social Policy', 'Mathematics', 'Economic History', 'Sociology', 
    'International History', 'Statistics', 'Management', 'International Relations', 
    'Anthropology', 'Economics', 'Language Centre', 'Accounting', 'Finance', 
    'Methodology', 'School of Public Policy', 
    'European Institute', 'Media and Communications', 'Health Policy', 
    'International Development', 'Gender Studies'
]

# try/finally guarantees the browser process is shut down even when an
# unexpected error interrupts the scrape; results gathered so far are already
# on disk thanks to the incremental save below.
try:
    driver.get(search_url)

    #Process each department
    for department in departments:
        row_data = {'Department': department}
        driver.get(search_url)  #Navigate back to the main search page for each department
        try:
            divisions_select = Select(driver.find_element(By.ID, "divisions"))  #Locate the dropdown again
            divisions_select.select_by_visible_text(department)
            available = True
        except NoSuchElementException:
            # Raised when the dropdown or the requested option is missing
            available = False
            print(f"Department {department} not found.")

        if available:
            for year in range(2019, 2024):
                try:
                    wait = WebDriverWait(driver, 2)
                    date_input = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[name='date']")))
                    date_input.clear()
                    date_input.send_keys(str(year))
                    date_input.send_keys(Keys.RETURN)

                    # Wait for the page to load and scrape the total results
                    total_results_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "span.ep_search_number")))
                    total_results = total_results_elements[-1].text  #Get the text of the last element
                    row_data[str(year)] = total_results
                except NoSuchElementException:
                    row_data[str(year)] = 'Element not found'
                except TimeoutException:
                    row_data[str(year)] = 'Timeout or no results'
                # Return to the search form and re-select the department so the
                # next year starts from a fresh form.
                driver.get('https://eprints.lse.ac.uk/cgi/search/archive/advanced')
                divisions_select = Select(driver.find_element(By.ID, "divisions"))  #Locate the dropdown again
                divisions_select.select_by_visible_text(department)

        #Add this department's row and save incrementally; years that were
        #not scraped are left empty in the CSV.
        collected_rows.append(row_data)
        results_df = pd.DataFrame(collected_rows, columns=columns)
        results_df.to_csv('Data/department_yearly_results.csv', index=False)
finally:
    #Close the browser
    driver.quit()

print("Data scraping completed and saved to 'Data/department_yearly_results.csv'.")
display(results_df)

Data scraping completed and saved to 'Data/department_yearly_results.csv'.


Unnamed: 0,Department,2019,2020,2021,2022,2023
0,Geography & Environment,117,175,177,170,145
1,"Philosophy, Logic and Scientific Method",64,60,63,66,61
2,Psychological and Behavioural Science,77,125,152,156,146
3,Government,106,126,85,100,113
4,Law,85,118,146,92,117
5,Social Policy,112,124,116,138,142
6,Mathematics,53,91,56,64,50
7,Economic History,49,45,39,38,53
8,Sociology,45,54,59,57,50
9,International History,41,43,37,35,32


This question also required the data on LSE's fees. LSE's website contains a section specifically for this (https://info.lse.ac.uk/staff/divisions/Planning-Division/Table-of-Fees). Here the fees for each academic year from 2017/18 to the upcoming one (2024/2025) are available in the form of downloadable PDFs.

Again we used the selenium webdriver to automate the process of downloading these PDFs. As their file names followed an inconsistent pattern, we renamed the files in a way that is easier to interpret, which was especially important later on, once this data was cleaned and visualised.

In [14]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import os
import time
import re

# Download every fee-table PDF linked from the Table-of-Fees page into
# Data/TuitionFees and rename each file to Fees<year>.pdf.
download_dir_relative = 'Data/TuitionFees'
download_dir_absolute = os.path.abspath(download_dir_relative)
os.makedirs(download_dir_absolute, exist_ok=True)


def _year_from_filename(filename):
    """Return the year found in ``filename`` as a 4-character string, or None.

    The first run of four digits is used if there is one; otherwise the first
    run of two digits is taken and prefixed with '20'.
    """
    four_digit_year_match = re.search(r'(\d{4})', filename)
    if four_digit_year_match:
        return four_digit_year_match.group(1)
    two_digit_year_match = re.search(r'(\d{2})', filename)
    return '20' + two_digit_year_match.group(1) if two_digit_year_match else None


def _wait_for_file(path, timeout=30.0, poll_interval=0.25):
    """Poll until ``path`` exists or ``timeout`` seconds elapse; return whether it exists."""
    deadline = time.monotonic() + timeout
    while not os.path.exists(path) and time.monotonic() < deadline:
        time.sleep(poll_interval)
    return os.path.exists(path)


options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", {
    "download.default_directory": download_dir_absolute,
    "download.prompt_for_download": False,
    "plugins.always_open_pdf_externally": True
})
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

#Base URL and class name for PDF downloads
base_url = 'https://info.lse.ac.uk/staff/divisions/Planning-Division/Table-of-Fees'
class_name = 'sys_21'

# try/finally guarantees the browser process is shut down even if a download
# or rename fails.
try:
    driver.get(base_url)
    time.sleep(2)  # Allow time for the page to load
    # Read all hrefs before opening new windows so no element lookups are
    # needed once downloads have started.
    hrefs = [link.get_attribute('href') for link in driver.find_elements(By.CLASS_NAME, class_name)]

    for href in hrefs:
        # href is None for matching elements without the attribute, so it must
        # be checked before any string operation on it.
        if not href or not href.endswith('.pdf'):
            continue
        # This PDF is deliberately excluded from the download.
        if "Fee-approval-cycle-2024.pdf" in href:
            continue

        original_filename = href.split('/')[-1]
        # Extract year from the file name
        year = _year_from_filename(original_filename)
        if not year:
            continue

        # Pass the URL as a script argument instead of interpolating it into
        # the JavaScript source, so quotes in the URL cannot break the script.
        driver.execute_script("window.open(arguments[0]);", href)

        # Path handling for renaming the file
        original_path = os.path.join(download_dir_absolute, original_filename)
        new_filename = f"Fees{year}.pdf"
        new_path = os.path.join(download_dir_absolute, new_filename)
        # Wait for the download to appear rather than assuming it finishes
        # within a fixed delay.
        # NOTE(review): assumes the browser only creates the final file name
        # once the download is complete - confirm for the Chrome version used.
        _wait_for_file(original_path)
        # os.replace overwrites a file left over from a previous run on every
        # platform; it still raises FileNotFoundError if the download never
        # arrived.
        os.replace(original_path, new_path)
        print(f"Renamed {original_filename} to {new_filename}")
        driver.switch_to.window(driver.window_handles[0])
finally:
    driver.quit()


Renamed Table-of-fees-2024-25-20Feb24-Updated-Home-PGR-fee.pdf to Fees2024.pdf
Renamed Table-of-fees-2023-24-7Nov23.pdf to Fees2023.pdf
Renamed Comb2022ToF-Final-19July23.pdf to Fees2022.pdf
Renamed ToF-3Aug21FinalComb.pdf to Fees2021.pdf
Renamed 2020-Table-of-Fees-25Jun20.pdf to Fees2020.pdf
Renamed 2019-Table-of-Fees.pdf to Fees2019.pdf
Renamed 2018-19-Fees-Table.pdf to Fees2018.pdf
Renamed 2017-18-Fees-Table.pdf to Fees2017.pdf
