In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import os
import time
import re

def setup_driver(download_dir_absolute):
    """Sets up the WebDriver for Chrome."""
    options = webdriver.ChromeOptions()
    options.add_experimental_option("prefs", {
        "download.default_directory": download_dir_absolute,
        "download.prompt_for_download": False,
        "plugins.always_open_pdf_externally": True
    })
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

def extract_year_from_filename(filename):
    # Try to find a four-digit year first
    four_digit_year_match = re.search(r'(\d{4})', filename)
    if four_digit_year_match:
        return four_digit_year_match.group(1)
    # If not found, look for a two-digit year
    two_digit_year_match = re.search(r'(\d{2})', filename)
    if two_digit_year_match:
        return '20' + two_digit_year_match.group(1)
    # Return None if no year pattern is found
    return None

def rename_downloaded_file(download_dir, original_filename, year):
    original_path = os.path.join(download_dir, original_filename)
    new_filename = f"Fees{year}.pdf"
    new_path = os.path.join(download_dir, new_filename)
    os.rename(original_path, new_path)
    print(f"Renamed {original_filename} to {new_filename}")

def download_pdfs_by_class(base_url, class_name, download_dir):
    driver = setup_driver(download_dir)
    driver.get(base_url)
    time.sleep(1)  # Adjust based on your internet speed
    links = driver.find_elements(By.CLASS_NAME, class_name)

    for link in links:
        href = link.get_attribute('href')
        # Skip the unwanted PDF
        if "Fee-approval-cycle-2024.pdf" in href:
            continue
        if href and href.endswith('.pdf'):
            # Extract the original file name
            original_filename = href.split('/')[-1]
            # Extract year from the file name
            year = extract_year_from_filename(original_filename)
            if year:
                # Open the link in a new tab and download the file
                driver.execute_script(f"window.open('{href}');")
                time.sleep(1)  # Adjust for page load
                # The file is automatically downloaded to `download_dir`
                # Need to wait for the download to complete here (omitted for simplicity)
                # Rename the file after ensuring the download has completed
                rename_downloaded_file(download_dir, original_filename, year)
            # Switch back to the main window
            driver.switch_to.window(driver.window_handles[0])
    
    # Close the driver
    driver.quit()

# Base URL and class name remain the same
base_url = 'https://info.lse.ac.uk/staff/divisions/Planning-Division/Table-of-Fees'
class_name = 'sys_21'
download_dir_relative = 'Data/TuitionFees'

# Create the download directory if it doesn't exist
download_dir_absolute = os.path.abspath(download_dir_relative)
if not os.path.exists(download_dir_absolute):
    os.makedirs(download_dir_absolute)

# Call the download function
download_pdfs_by_class(base_url, class_name, download_dir_absolute)


In [1]:
import pdfplumber
import pandas as pd
import csv
import re

def extract_tables_from_pdf(pdf_path, output_csv_path):
    with pdfplumber.open(pdf_path) as pdf:
        all_tables = []
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                all_tables.extend(table)

        with open(output_csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in all_tables:
                writer.writerow(row)

def process_csv(output_csv_path):
    df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
    df_modified = df.drop(columns=['Unnamed: 3', 'Unnamed: 4'])
    
    contains_new_entrants = df_modified['Undergraduate programmes'].str.contains('New entrants', na=False)
    starts_with_msc_or_llm = df_modified['Undergraduate programmes'].str.startswith(('MSc', 'LLM'), na=False)
    df_filtered = df_modified[contains_new_entrants | starts_with_msc_or_llm]

    df_filtered.rename(columns={
        'Undergraduate programmes': 'Program',
        'Unnamed: 1': 'Home fees',
        'Unnamed: 2': 'Overseas fees'
    }, inplace=True)

    df_filtered['Program'] = df_filtered['Program'].apply(lambda x: 'UG' if 'New entrants' in x else x)
    df_filtered['Home fees'] = df_filtered['Home fees'].str.replace(',', '').str.replace('£', '').str.split().str[0]
    df_filtered['Overseas fees'] = df_filtered['Overseas fees'].str.replace(',', '').str.replace('£', '').str.split().str[0]
    df_filtered = df_filtered[df_filtered['Home fees'].str.isnumeric() & df_filtered['Overseas fees'].str.isnumeric()]
    df_filtered.reset_index(drop=True, inplace=True)

    return df_filtered

# Define years and corresponding paths
years = [2020, 2019, 2018, 2017]
data_folder = 'Data/TuitionFees/'
dfs = []

for year in years:
    pdf_path = f'{data_folder}Fees{year}.pdf'
    output_csv_path = f'{data_folder}{year}_Fees.csv'
    
    extract_tables_from_pdf(pdf_path, output_csv_path)
    df = process_csv(output_csv_path)
    df['Year'] = year  # Optional: Add a year column to each dataframe
    dfs.append(df)

final_df = pd.concat(dfs, axis=0)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
final_df




  df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)


  df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
Skipping line 244: expected 5 fields, saw 8
Skipping line 245: expected 5 fields, saw 8
Skipping line 246: expected 5 fields, saw 8
Skipping line 247: expected 5 fields, saw 8
Skipping line 248: expected 5 fields, saw 8
Skipping line 249: expected 5 fields, saw 8
Skipping line 250: expected 5 fields, saw 9
Skipping line 251: expected 5 fields, saw 9
Skipping line 252: expected 5 fields, saw 9
Skipping line 253: expected 5 fields, saw 9
Skipping line 254: expected 5 fields, saw 9
Skipping line 255: expected 5 fields, saw 9
Skipping line 256: expected 5 fields, saw 9
Skipping line 257: expected 5 fields, saw 9
Skipping line 258: expected 5 fields, saw 9
Skipping line 259: expected 5 fields, saw 9
Skipping line 260: expected 5 fields, saw 9
Skipping line 261: expected 5 fields, saw 9
Skipping line 262: expected 5 fields, saw 9




  df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)


  df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Program'] = df_filtered['Program'].apply(lambda x: 'UG' if 'New entrants' in x else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

Unnamed: 0,Program,Home fees,Overseas fees,Year
0,UG,9250,21570,2020
1,MSc in Accounting and Finance,29184,29760,2020
2,"MSc in Accounting, Organisations and Institutions",29184,29760,2020
3,MSc in Anthropology and Development,14640,22608,2020
4,MSc in Anthropology and Development Management,22608,22608,2020
5,MSc in Applicable Mathematics,14640,22608,2020
6,MSc in Applied Social Data Science,29184,29760,2020
7,MSc in Behavioural Science,22608,22608,2020
8,MSc in China in Comparative Perspective,14640,22608,2020
9,MSc in City Design and Social Science,29184,29760,2020


In [2]:
# Convert 'Overseas fees' and 'Home fees' columns to numeric type
final_df['Overseas fees'] = pd.to_numeric(final_df['Overseas fees'])
final_df['Home fees'] = pd.to_numeric(final_df['Home fees'])

# Calculate the difference and store it in a new column
final_df['Difference in Fees'] = final_df['Overseas fees'] - final_df['Home fees']

# Drop the original 'Home fees' and 'Overseas fees' columns
final_df.drop(['Home fees', 'Overseas fees'], axis=1, inplace=True)

final_df

Unnamed: 0,Program,Year,Difference in Fees
0,UG,2020,12320
1,MSc in Accounting and Finance,2020,576
2,"MSc in Accounting, Organisations and Institutions",2020,576
3,MSc in Anthropology and Development,2020,7968
4,MSc in Anthropology and Development Management,2020,0
5,MSc in Applicable Mathematics,2020,7968
6,MSc in Applied Social Data Science,2020,576
7,MSc in Behavioural Science,2020,0
8,MSc in China in Comparative Perspective,2020,7968
9,MSc in City Design and Social Science,2020,576


In [3]:
import pandas as pd

# Assuming final_df contains your data in some format

# Convert final_df to a DataFrame (replace this line with your actual data conversion)
final_df = pd.DataFrame(final_df)

# Add an empty column named 'Department'
final_df['Department'] = pd.Series(dtype='float64')  # Adding a series with NaN values
final_df[final_df.columns[0]] = final_df[final_df.columns[0]].str.replace("in ", " ", regex=False)
final_df[final_df.columns[0]] = final_df[final_df.columns[0]].str.replace("MSc", "MSc ", regex=False)
# Remove leading and trailing whitespaces
final_df[final_df.columns[0]] = final_df[final_df.columns[0]].str.strip()

# Replace multiple spaces with a single space
final_df[final_df.columns[0]] = final_df[final_df.columns[0]].str.replace(r'\s+', ' ', regex=True)
final_df

Unnamed: 0,Program,Year,Difference in Fees,Department
0,UG,2020,12320,
1,MSc Accounting and Finance,2020,576,
2,"MSc Accounting, Organisations and Institutions",2020,576,
3,MSc Anthropology and Development,2020,7968,
4,MSc Anthropology and Development Management,2020,0,
5,MSc Applicable Mathematics,2020,7968,
6,MSc Applied Social Data Science,2020,576,
7,MSc Behavioural Science,2020,0,
8,MSc China Comparative Perspective,2020,7968,
9,MSc City Design and Social Science,2020,576,


In [4]:
csv_df = pd.read_csv('Data/PhilipOutput.csv')
pattern = r"BSc|BA|LLB"
csv_df = csv_df[~csv_df[csv_df.columns[0]].str.contains(pattern, na=False, case=False)]

csv_df

Unnamed: 0,Course Name,Median Salary,Department,Salary Category,Simple Department
39,MSc Geographic Data Science,35000,Department of Geography and Environment,High,Geography and Environment
40,MSc Economy and Society,28000,Department of Sociology,Low,Sociology
41,MSc Media and Communications (Media and Commun...,30000,Department of Media and Communications,Low,Media and Communications
43,Double Degree with Peking University: Environm...,35000,Department of Geography and Environment,High,Geography and Environment
44,"MSc Health Policy, Planning and Financing",38000,Department of Health Policy,High,Health Policy
45,MSc European and International Politics and Po...,30000,European Institute,Low,European Institute
46,MSc Social and Cultural Psychology,33000,Department of Psychological and Behavioural Sc...,Low,Psychological and Behavioural Science
47,MSc Media and Communications (Data and Society),30000,Department of Media and Communications,Low,Media and Communications
48,MSc Political Economy of Late Development,42000,Department of Economic History,High,Economic History
49,MSc Financial Mathematics,39500,Department of Mathematics,High,Mathematics


In [5]:
brackets_pattern = r"\[.*?\]|\(.*?\)"

# Remove content in brackets from the first column's values in final_df
final_df[final_df.columns[0]] = final_df[final_df.columns[0]].str.replace(brackets_pattern, '', regex=True)

# Remove content in brackets from the first column's values in csv_df
csv_df[csv_df.columns[0]] = csv_df[csv_df.columns[0]].str.replace(brackets_pattern, '', regex=True)


In [6]:
# Merge the dataframes based on the matches in the first columns

Unnamed: 0,Program,Year,Difference in Fees,Department
0,UG,2020,12320,
1,MSc Accounting and Finance,2020,576,Accounting
2,"MSc Accounting, Organisations and Institutions",2020,576,Accounting
3,MSc Anthropology and Development,2020,7968,Anthropology
4,MSc Anthropology and Development Management,2020,0,
5,MSc Applicable Mathematics,2020,7968,Mathematics
6,MSc Applied Social Data Science,2020,576,Methodology
7,MSc Behavioural Science,2020,0,Psychological and Behavioural Science
8,MSc China Comparative Perspective,2020,7968,
9,MSc City Design and Social Science,2020,576,Sociology
