In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import csv
from bs4 import BeautifulSoup

def setup_driver():
    """Build and return a Chrome WebDriver configured for this scraper.

    Returns:
        A ``webdriver.Chrome`` instance started with the flags listed below.
    """
    chrome_options = webdriver.ChromeOptions()
    launch_flags = (
        '--no-sandbox',             # bypass the OS security model
        '--disable-dev-shm-usage',  # work around limited shared-memory resources
    )
    for flag in launch_flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

def print_department_text(html_content):
    """Return the stripped text of every ``<h2 class="card__title">`` in the HTML.

    Despite the name, nothing is printed: the titles are collected and returned.

    Args:
        html_content: HTML markup to parse.

    Returns:
        A list of strings, one per matching heading, in document order.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    heading_texts = []
    for heading in parsed.find_all('h2', class_='card__title'):
        heading_texts.append(heading.text.strip())
    return heading_texts

def print_why_study_with_us_section(driver, course_link):
    """Open a course page and return the card titles from its 'why study with us' section.

    Args:
        driver: WebDriver used to load the page.
        course_link: URL of the course page.

    Returns:
        The list produced by ``print_department_text`` for the outer HTML of the
        element with id ``why-study-with-us``, or an empty list when that
        element does not appear within 10 seconds.
    """
    driver.get(course_link)
    body_locator = (By.TAG_NAME, 'body')
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(body_locator))

    section_locator = (By.ID, 'why-study-with-us')
    try:
        section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(section_locator)
        )
        return print_department_text(section.get_attribute('outerHTML'))
    except TimeoutException:
        print("Failed to locate or extract the 'why study with us' section.")
        return []

# Setup WebDriver
# `driver` is a module-level browser session: process_course_page() and
# navigate_to_next_page() in this cell read it as a global instead of taking
# it as a parameter.
driver = setup_driver()
# Search-results listing that the pagination loop below starts from.
initial_url = "https://www.lse.ac.uk/programmes/search-courses?studyType=0%2F1%2F26%2F85%2F86"
driver.get(initial_url)

def process_course_page(course_link):
    """Scrape a single course page using the module-level ``driver``.

    Args:
        course_link: URL of the course page to visit.

    Returns:
        A ``(course_name, median_salary, department)`` tuple of strings. Each
        field falls back to a "... not found." placeholder when the matching
        element is missing.
    """
    # Loading the page happens inside print_why_study_with_us_section.
    found_departments = print_why_study_with_us_section(driver, course_link)
    if found_departments:
        department = ', '.join(found_departments)
    else:
        department = "Department not found."

    try:
        title_span = driver.find_element(By.CSS_SELECTOR, 'h1.hero__title span')
        course_name = title_span.text.strip()
    except NoSuchElementException:
        course_name = "Course name not found."

    salary_locator = (By.CSS_SELECTOR, 'section#graduate-destinations div.salary')
    try:
        salary_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(salary_locator)
        )
        median_salary = salary_element.text.strip()
    except TimeoutException:
        median_salary = "Salary not found."

    return course_name, median_salary, department

def navigate_to_next_page():
    """Click the pagination "Next" button and return the URL of the new page.

    Uses the module-level ``driver``.

    Returns:
        The URL of the page reached after clicking "Next", or ``None`` when no
        clickable "Next" button appears within 20 seconds or the URL does not
        change within 10 seconds of the click.
    """
    previous_url = driver.current_url
    try:
        next_page_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//li[contains(@class, 'next')]//button[contains(., 'Next')]")))
        next_page_button.click()
        # Waiting only for <body> is satisfied immediately by the page we are
        # leaving, so current_url could be read before the navigation happened.
        # The caller revisits the returned URL with driver.get(), so a stale
        # URL would make it scrape the same page again. Wait for the URL to
        # actually change first.
        WebDriverWait(driver, 10).until(EC.url_changes(previous_url))
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        return driver.current_url
    except TimeoutException:
        return None

# Rows of (course_name, median_salary, department), one per course page visited.
courses_info = []
current_page_url = initial_url

try:
    while True:
        WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h2.card__title a')))
        # Collect the hrefs up front: the link elements themselves are no
        # longer usable once the browser navigates to a course page.
        course_links = [
            elem.get_attribute('href')
            for elem in driver.find_elements(By.CSS_SELECTOR, 'h2.card__title a')
        ]
        for course_link in course_links:
            courses_info.append(process_course_page(course_link))

        # Go back to the listing once per page (not once per course) so the
        # "Next" button is available to navigate_to_next_page().
        driver.get(current_page_url)
        new_page_url = navigate_to_next_page()
        if not new_page_url:
            break
        current_page_url = new_page_url
finally:
    # Always close the browser, even when a wait times out mid-crawl.
    driver.quit()

# Save results to a CSV file
csv_file_path = 'output.csv'
with open(csv_file_path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Course Name', 'Median Salary', 'Department'])
    writer.writerows(courses_info)

print("Data extraction complete. Results saved to CSV.")


Postgraduate Courses:

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import csv
from bs4 import BeautifulSoup
import os

def setup_driver():
    """Start Chrome through Selenium and hand back the WebDriver.

    Returns:
        A ``webdriver.Chrome`` instance launched with ``--no-sandbox`` and
        ``--disable-dev-shm-usage``.
    """
    chrome_options = webdriver.ChromeOptions()
    for launch_flag in ('--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(launch_flag)
    browser = webdriver.Chrome(options=chrome_options)
    return browser

def process_course_page(driver, course_link):
    """Load one postgraduate course page and extract its key details.

    Args:
        driver: WebDriver used to load the page.
        course_link: URL of the course page.

    Returns:
        A ``(course_name, median_salary, department)`` tuple of strings. Each
        field falls back to a "... not found." placeholder when the matching
        markup is missing.
    """
    driver.get(course_link)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Extracting course name
    course_name_elem = soup.find('h1', class_='pageTitle')
    course_name = course_name_elem.text.strip() if course_name_elem is not None else "Course name not found."

    # Finding department
    department_elem = soup.find('li', class_='keyDetails__item--dept')
    department = department_elem.text.strip() if department_elem is not None else "Department not found."

    # Extracting median salary from the Careers accordion: find the accordion
    # title, then its content <div>, then the text right after the
    # <strong>Median salary</strong> label.
    median_salary = "Salary not found."
    careers_accordion = soup.find(
        'h1', class_='accordion__title',
        string=lambda text: bool(text) and 'Careers' in text)
    careers_content = (
        careers_accordion.find_next('div', class_='accordion__content')
        if careers_accordion is not None else None
    )
    salary_tag = (
        careers_content.find('strong', string=lambda text: bool(text) and "Median salary" in text)
        if careers_content is not None else None
    )
    if salary_tag is not None:
        following = salary_tag.next_sibling
        # next_sibling can be another tag instead of text; calling .strip() on
        # a Tag raises TypeError. Text nodes are str subclasses, so only those
        # are accepted, and a whitespace-only node counts as "not found".
        if isinstance(following, str) and following.strip():
            median_salary = following.strip()

    print(median_salary)
    return course_name, median_salary, department


# Main scraping function
def scrape_courses(base_url):
    """Crawl every page of a course listing and scrape each linked course.

    Args:
        base_url: URL of the first listing page.

    Returns:
        A list with one ``process_course_page`` result per course link found,
        in the order the links were visited.

    Raises:
        TimeoutException: if a listing page shows no course links within
            20 seconds. The browser is still closed in that case.
    """
    driver = setup_driver()
    try:
        driver.get(base_url)
        courses_info = []
        current_page_url = base_url  # Listing page currently being processed

        while True:
            # Wait for the course links to be present, then collect their
            # hrefs before navigating away from the listing.
            WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h2.card__title a')))
            course_links = [
                elem.get_attribute('href')
                for elem in driver.find_elements(By.CSS_SELECTOR, 'h2.card__title a')
            ]

            for course_link in course_links:
                # process_course_page loads the page itself, so it is not
                # fetched here as well (that doubled every page load).
                courses_info.append(process_course_page(driver, course_link))

            # Return to the current listing page once, not after every course,
            # so that the "Next" button can be clicked.
            driver.get(current_page_url)
            WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h2.card__title a')))

            new_page_url = navigate_to_next_page(driver)
            if not new_page_url:
                break  # No more pages
            current_page_url = new_page_url
            driver.get(new_page_url)  # Fresh load of the next listing page
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))

        return courses_info
    finally:
        # Close the browser even if a wait times out part-way through.
        driver.quit()

def navigate_to_next_page(driver):
    """Click the pagination "Next" button and return the URL of the new page.

    Args:
        driver: WebDriver currently showing a listing page.

    Returns:
        The URL of the page reached after clicking "Next", or ``None`` when no
        clickable "Next" button appears within 20 seconds or the URL does not
        change within 10 seconds of the click.
    """
    previous_url = driver.current_url
    try:
        next_page_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//li[contains(@class, 'next')]//button[contains(., 'Next')]")))
        next_page_button.click()
        # The old page's <body> already satisfies a presence check, so wait
        # for the URL to change first. Otherwise current_url may still be the
        # page being left, and the caller would scrape it again.
        WebDriverWait(driver, 10).until(EC.url_changes(previous_url))
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        return driver.current_url
    except TimeoutException:
        return None

    


def save_to_csv(data, filename):
    """Append rows to a CSV file, writing the header row first when needed.

    Args:
        data: Iterable of rows, each an iterable of
            (course name, median salary, department) values.
        filename: Path of the CSV file to append to. It is created if missing.
    """
    # A zero-byte file (for example one left by an interrupted run) needs the
    # header just as much as a missing file does.
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    with open(filename, 'a', newline='') as file:  # Append mode keeps existing rows
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(['Course Name', 'Median Salary', 'Department'])
        writer.writerows(data)


# URL for postgraduate courses
postgraduate_url = "https://www.lse.ac.uk/programmes/search-courses?studyType=0%2F1%2F26%2F85%2F87"
# scrape_courses starts its own browser, crawls every listing page and quits
# the browser before returning the collected rows.
postgraduate_courses = scrape_courses(postgraduate_url)
# save_to_csv opens the file in append mode, so these rows are added after
# whatever 'output.csv' already contains.
save_to_csv(postgraduate_courses, 'output.csv')
print("Data extraction complete. Results added to 'output.csv'.")



Cleaning of webscraping output:

In [None]:
import pandas as pd

def clean_course_data(data):
    """Return a cleaned copy of the scraped course table.

    Steps, in order:
      1. Strip the "Department of " prefix from 'Department'.
      2. Turn 'Median Salary' into a number by taking the first numeric token
         of the text. Placeholders such as "Salary not found." become NaN.
      3. Drop duplicate rows.
      4. Add 'Salary Group': 'High' when the salary is at or above the median
         of the de-duplicated salaries, 'Low' when below, missing when the
         salary itself is missing.

    Args:
        data: DataFrame with 'Department' and 'Median Salary' columns.

    Returns:
        A new DataFrame; ``data`` is not modified.
    """
    cleaned = data.copy()

    # Simplify department names by removing "Department of "
    cleaned['Department'] = cleaned['Department'].str.replace('Department of ', '', regex=False)

    # Extract the first number instead of stripping a fixed set of characters.
    # The file is read as ISO-8859-1, so a pound sign written as UTF-8 arrives
    # as two characters; removing only '£' and ',' would leave the extra one
    # behind and turn every salary into NaN.
    salary_text = cleaned['Median Salary'].astype(str)
    first_number = salary_text.str.extract(r'(\d[\d,]*(?:\.\d+)?)', expand=False)
    cleaned['Median Salary'] = pd.to_numeric(
        first_number.str.replace(',', '', regex=False), errors='coerce')

    # Remove duplicate rows before computing the median, so that repeated
    # scrapes of the same course do not shift the High/Low threshold.
    cleaned = cleaned.drop_duplicates()

    # Determine median salary to categorize into 'High' and 'Low' groups
    median_salary = cleaned['Median Salary'].median()

    def salary_group(value):
        # NaN >= x is False, which used to put unknown salaries into 'Low'.
        if pd.isna(value):
            return pd.NA
        return 'High' if value >= median_salary else 'Low'

    cleaned['Salary Group'] = cleaned['Median Salary'].map(salary_group)
    return cleaned


if __name__ == "__main__":
    # Load the original CSV file
    data_path = 'output.csv'
    data = pd.read_csv(data_path, encoding='ISO-8859-1')

    data = clean_course_data(data)

    # Save the cleaned data to a new CSV file
    clean_data_path = 'cleaned_output.csv'
    data.to_csv(clean_data_path, index=False)

    print(f"Data has been cleaned, duplicates removed, and saved to {clean_data_path}.")


Basic plots:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('PhilipEUCleaned.csv')

# Define the years to aggregate data over
years = ['2019', '2020', '2021', '2022', '2023']

# Sum applications, offers, and entrances for every year within each salary group
agg_columns = {
    f'{measure} {year}': 'sum'
    for year in years
    for measure in ('Applications', 'Offers', 'Entrances')
}

# Aggregating the data
agg_data = data.groupby('Salary Group').agg(agg_columns).reset_index()

# Year-over-year growth of applications (in percent) for each salary group.
# 2019 has no previous year, so growth starts at 2020.
for previous_year, year in zip(years, years[1:]):
    previous_total = agg_data[f'Applications {previous_year}']
    agg_data[f'Applications Growth {year}'] = (
        (agg_data[f'Applications {year}'] - previous_total) / previous_total) * 100

# Prepare data for plotting growth rates
growth_columns = [f'Applications Growth {year}' for year in years[1:]]
melted_growth_data = agg_data.melt(id_vars=['Salary Group'], value_vars=growth_columns, var_name='Year_Type', value_name='Growth Rate')
# Raw string: '\d' in a normal string literal is an invalid escape sequence and
# triggers a warning on recent Python versions. expand=False returns a Series
# rather than a one-column DataFrame.
melted_growth_data['Year'] = melted_growth_data['Year_Type'].str.extract(r'(\d+)', expand=False).astype(int)
melted_growth_data['Type'] = 'Applications Growth'

# Combine growth data with other rate data for plotting
# NOTE(review): `melted_data` is not created anywhere in this cell. It has to
# exist already in the notebook session, with 'Salary Group', 'Year', 'Type'
# and 'Rate' columns (the pivot below reads them); otherwise this line raises
# NameError. Confirm which cell defines it.
combined_data = pd.concat([melted_growth_data, melted_data[melted_data['Type'].str.contains('Rate')]])

# Plotting with applications growth rate
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 18))

# One bar chart per rate type; growth rows keep their values in 'Growth Rate',
# all other rows in 'Rate'.
types = ['Applications Growth', 'Offer Rate', 'Entrance Rate']
colors = ['purple', 'green', 'red']
for ax, rate_type in zip(axes.flatten(), types):
    is_growth = 'Growth' in rate_type
    pivot_df = combined_data[combined_data['Type'] == rate_type].pivot(
        index='Year', columns='Salary Group',
        values='Growth Rate' if is_growth else 'Rate')
    pivot_df.plot(kind='bar', ax=ax, color=colors)
    ax.set_title(f'{rate_type} Over the Years by Salary Group')
    ax.set_ylabel('Growth Rate (%)' if is_growth else 'Rate (%)')
    ax.set_xlabel('Year')

plt.tight_layout()
plt.show()

Regression of difference in applications on median salary:

In [None]:
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression

# Load the CSV files
file_eu_cleaned = pd.read_csv('PhilipEUCleaned.csv')
file_output = pd.read_csv('data/PhilipOutput.csv')

# Filter for only undergraduate courses in the EU cleaned data.
# na=False makes rows with a missing 'Program' count as non-matching; without
# it the mask contains NaN and pandas refuses to index with it.
is_undergrad = file_eu_cleaned['Program'].str.contains("UG Degree", na=False)
undergrad_eu_cleaned = file_eu_cleaned[is_undergrad]

# Total applications per department for 2019 (pre-Brexit) and 2023 (post-Brexit)
applications_by_department = undergrad_eu_cleaned.groupby('Department')
pre_brexit_apps = applications_by_department['Applications 2019'].sum()
post_brexit_apps = applications_by_department['Applications 2023'].sum()

# Calculate the difference in applications
application_difference = post_brexit_apps - pre_brexit_apps

# Median salary per department from the second file, restricted to courses
# whose name contains "BSc" (rows with a missing course name are skipped)
is_bsc = file_output['Course Name'].str.contains("BSc", na=False)
median_salaries = file_output[is_bsc].groupby('Simple Department')['Median Salary'].median()

# Join on the department index; dropna() discards departments that are missing
# from either source
department_data = pd.DataFrame({
    'Median Salary': median_salaries,
    'Application Difference': application_difference,
    'Total Pre-Brexit Applications': pre_brexit_apps,
    'Total Post-Brexit Applications': post_brexit_apps
}).dropna()

# Add department names for plotting
department_data['Department'] = department_data.index

# Perform linear regression
X = department_data['Median Salary'].values.reshape(-1, 1)
y = department_data['Application Difference'].values
model = LinearRegression()
model.fit(X, y)

# Predictions for the line. The plot below passes trendline='ols' and does not
# read this column; it is stored on the frame only.
department_data['Predicted Difference'] = model.predict(X)

# Create the interactive plot
fig = px.scatter(department_data, x='Median Salary', y='Application Difference', trendline='ols',
                 labels={
                     'Median Salary': 'Median Salary (£)',
                     'Application Difference': 'Difference in Total Applications (2023 vs 2019)'
                 },
                 hover_data=['Department', 'Total Pre-Brexit Applications', 'Total Post-Brexit Applications'],
                 title='Interactive Plot: Median Salary vs. Application Difference Post vs Pre Brexit for Undergraduate Courses')
fig.update_traces(marker=dict(size=8),
                  selector=dict(mode='markers'))
fig.show()

Summary table:

In [None]:
import pandas as pd
import statsmodels.api as sm

# Load the CSV files
file_eu_cleaned = pd.read_csv('PhilipEUCleaned.csv')
file_output = pd.read_csv('data/PhilipOutput.csv')

# Filter for only undergraduate courses in the EU cleaned data.
# na=False makes rows with a missing 'Program' count as non-matching; without
# it the mask contains NaN and pandas refuses to index with it.
is_undergrad = file_eu_cleaned['Program'].str.contains("UG Degree", na=False)
undergrad_eu_cleaned = file_eu_cleaned[is_undergrad]

# Total applications per department for 2019 (pre-Brexit) and 2023 (post-Brexit)
applications_by_department = undergrad_eu_cleaned.groupby('Department')
pre_brexit_apps = applications_by_department['Applications 2019'].sum()
post_brexit_apps = applications_by_department['Applications 2023'].sum()

# Calculate the difference in applications
application_difference = post_brexit_apps - pre_brexit_apps

# Median salary per department from the second file, restricted to courses
# whose name contains "BSc" (rows with a missing course name are skipped)
is_bsc = file_output['Course Name'].str.contains("BSc", na=False)
median_salaries = file_output[is_bsc].groupby('Simple Department')['Median Salary'].median()

# Join on the department index; dropna() discards departments that are missing
# from either source
department_data = pd.DataFrame({
    'Median Salary': median_salaries,
    'Application Difference': application_difference,
    'Total Pre-Brexit Applications': pre_brexit_apps,
    'Total Post-Brexit Applications': post_brexit_apps
}).dropna()

# Add department names for indexing
department_data['Department'] = department_data.index

# Regress the application difference on median salary, with an intercept
X = sm.add_constant(department_data[['Median Salary']])  # Adds the constant term
y = department_data['Application Difference']

# Fit the model
model = sm.OLS(y, X)
results = model.fit()

# Print the summary of the regression
print(results.summary())


Regression Discontinuity:

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Load the dataset
cleaned_data = pd.read_csv('PhilipEUCleaned.csv')

# Reshape the per-year application columns into long form, then total the
# applications for each (year, salary group) pair
applications_by_salary_group = cleaned_data.melt(
    id_vars=['Nationality', 'Department', 'Program', 'Salary Group'],
    value_vars=[f'Applications {year}' for year in range(2019, 2024)],
    var_name='Year',
    value_name='Applications'
)
# Raw string: '\d' in a normal string literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
applications_by_salary_group['Year'] = (
    applications_by_salary_group['Year'].str.extract(r'(\d+)', expand=False).astype(int)
)
applications_by_salary_group = applications_by_salary_group.groupby(['Year', 'Salary Group'])['Applications'].sum().reset_index()


def fit_year_trend(segment):
    """Fit Applications ~ const + Year by OLS on one segment and return the results."""
    return sm.OLS(segment['Applications'], sm.add_constant(segment['Year'])).fit()


def plot_salary_group(position, observed, pre_years, pre_fit, post_years, post_fit,
                      point_color, post_color, title):
    """Draw one subplot: observed totals, both fitted lines and the cutoff marker."""
    plt.subplot(1, 2, position)
    plt.scatter(observed['Year'], observed['Applications'], color=point_color)
    plt.plot(pre_years, pre_fit, color='red', label='Extended Pre-Brexit Fit')
    plt.plot(post_years, post_fit, color=post_color, label='Post-Brexit Fit')
    plt.axvline(x=2021, color='black', linestyle='--', label='Brexit Cutoff (2021)')
    plt.title(title)
    plt.xlabel('Year')
    plt.ylabel('Applications')
    plt.xticks(range(2019, 2024))
    plt.grid(True)
    plt.legend()


# Create rdf_high and rdf_low from the applications_by_salary_group
rdf_high = applications_by_salary_group[applications_by_salary_group['Salary Group'] == 'High']
rdf_low = applications_by_salary_group[applications_by_salary_group['Salary Group'] == 'Low']

# Piecewise linear fit: split each group at the Brexit cutoff.
# Years up to and including 2020 are "pre", later years are "post".
pre_brexit_data_high = rdf_high[rdf_high['Year'] <= 2020]
pre_brexit_data_low = rdf_low[rdf_low['Year'] <= 2020]
post_brexit_data_high = rdf_high[rdf_high['Year'] > 2020]
post_brexit_data_low = rdf_low[rdf_low['Year'] > 2020]

# Fit the models for each segment
model_pre_high = fit_year_trend(pre_brexit_data_high)
model_post_high = fit_year_trend(post_brexit_data_high)
model_pre_low = fit_year_trend(pre_brexit_data_low)
model_post_low = fit_year_trend(post_brexit_data_low)

# Post-Brexit fitted values at the post-Brexit years
post_brexit_years = [2021, 2022, 2023]
fit_post_high = model_post_high.predict(sm.add_constant(post_brexit_years))
fit_post_low = model_post_low.predict(sm.add_constant(post_brexit_years))

# Pre-Brexit line extrapolated one year past the cutoff so it can be compared
# with the first post-Brexit observation. The fits restricted to 2019-2020
# were never plotted, so they are no longer computed.
extended_pre_brexit_years_high = np.array([2019, 2020, 2021])
extended_pre_brexit_years_low = np.array([2019, 2020, 2021])
extended_fit_pre_high = model_pre_high.predict(sm.add_constant(extended_pre_brexit_years_high))
extended_fit_pre_low = model_pre_low.predict(sm.add_constant(extended_pre_brexit_years_low))

# Plotting with the adjusted cutoff line, extrapolated pre-Brexit line, and gridlines
plt.figure(figsize=(14, 6))

# High salary departments
plot_salary_group(1, rdf_high, extended_pre_brexit_years_high, extended_fit_pre_high,
                  post_brexit_years, fit_post_high,
                  point_color='blue', post_color='green', title='High Salary Departments')

# Low salary departments
plot_salary_group(2, rdf_low, extended_pre_brexit_years_low, extended_fit_pre_low,
                  post_brexit_years, fit_post_low,
                  point_color='green', post_color='blue', title='Low Salary Departments')

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

# Load the dataset
cleaned_data = pd.read_csv('PhilipEUCleaned.csv')

# Reshape the per-year application columns into long form, then total the
# applications for each (year, salary group) pair
applications_by_salary_group = cleaned_data.melt(
    id_vars=['Nationality', 'Department', 'Program', 'Salary Group'],
    value_vars=[f'Applications {year}' for year in range(2019, 2024)],
    var_name='Year',
    value_name='Applications'
)
# Raw string: '\d' in a normal string literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
applications_by_salary_group['Year'] = (
    applications_by_salary_group['Year'].str.extract(r'(\d+)', expand=False).astype(int)
)
applications_by_salary_group = applications_by_salary_group.groupby(['Year', 'Salary Group'])['Applications'].sum().reset_index()


def fit_year_trend(segment):
    """Fit Applications ~ const + Year by OLS on one segment and return the results."""
    return sm.OLS(segment['Applications'], sm.add_constant(segment['Year'])).fit()


# Splitting the data based on the Brexit cutoff for high and low salary groups.
# Years up to and including 2020 are "pre", later years are "post".
is_high = applications_by_salary_group['Salary Group'] == 'High'
is_low = applications_by_salary_group['Salary Group'] == 'Low'
is_pre_brexit = applications_by_salary_group['Year'] <= 2020
is_post_brexit = applications_by_salary_group['Year'] > 2020

pre_brexit_data_high = applications_by_salary_group[is_high & is_pre_brexit]
post_brexit_data_high = applications_by_salary_group[is_high & is_post_brexit]

pre_brexit_data_low = applications_by_salary_group[is_low & is_pre_brexit]
post_brexit_data_low = applications_by_salary_group[is_low & is_post_brexit]

# Fit the models for each segment
model_pre_high = fit_year_trend(pre_brexit_data_high)
model_post_high = fit_year_trend(post_brexit_data_high)

model_pre_low = fit_year_trend(pre_brexit_data_low)
model_post_low = fit_year_trend(post_brexit_data_low)

# Summary tables for regression models
summary_pre_high = model_pre_high.summary()
summary_post_high = model_post_high.summary()
summary_pre_low = model_pre_low.summary()
summary_post_low = model_post_low.summary()

# Output the summaries
for heading, summary in (
    ("High Salary Group - Pre-Brexit Model Summary:\n", summary_pre_high),
    ("High Salary Group - Post-Brexit Model Summary:\n", summary_post_high),
    ("Low Salary Group - Pre-Brexit Model Summary:\n", summary_pre_low),
    ("Low Salary Group - Post-Brexit Model Summary:\n", summary_post_low),
):
    print(heading, summary)
