In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import os
import time
import re

def setup_driver(download_dir_absolute):
    """Sets up the WebDriver for Chrome."""
    options = webdriver.ChromeOptions()
    options.add_experimental_option("prefs", {
        "download.default_directory": download_dir_absolute,
        "download.prompt_for_download": False,
        "plugins.always_open_pdf_externally": True
    })
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

def extract_year_from_filename(filename):
    # Try to find a four-digit year first
    four_digit_year_match = re.search(r'(\d{4})', filename)
    if four_digit_year_match:
        return four_digit_year_match.group(1)
    # If not found, look for a two-digit year
    two_digit_year_match = re.search(r'(\d{2})', filename)
    if two_digit_year_match:
        return '20' + two_digit_year_match.group(1)
    # Return None if no year pattern is found
    return None

def rename_downloaded_file(download_dir, original_filename, year):
    original_path = os.path.join(download_dir, original_filename)
    new_filename = f"Fees{year}.pdf"
    new_path = os.path.join(download_dir, new_filename)
    os.rename(original_path, new_path)
    print(f"Renamed {original_filename} to {new_filename}")

def download_pdfs_by_class(base_url, class_name, download_dir):
    driver = setup_driver(download_dir)
    driver.get(base_url)
    time.sleep(1)  # Adjust based on your internet speed
    links = driver.find_elements(By.CLASS_NAME, class_name)

    for link in links:
        href = link.get_attribute('href')
        # Skip the unwanted PDF
        if "Fee-approval-cycle-2024.pdf" in href:
            continue
        if href and href.endswith('.pdf'):
            # Extract the original file name
            original_filename = href.split('/')[-1]
            # Extract year from the file name
            year = extract_year_from_filename(original_filename)
            if year:
                # Open the link in a new tab and download the file
                driver.execute_script(f"window.open('{href}');")
                time.sleep(1)  # Adjust for page load
                # The file is automatically downloaded to `download_dir`
                # Need to wait for the download to complete here (omitted for simplicity)
                # Rename the file after ensuring the download has completed
                rename_downloaded_file(download_dir, original_filename, year)
            # Switch back to the main window
            driver.switch_to.window(driver.window_handles[0])
    
    # Close the driver
    driver.quit()

# Base URL and class name remain the same
base_url = 'https://info.lse.ac.uk/staff/divisions/Planning-Division/Table-of-Fees'
class_name = 'sys_21'
download_dir_relative = 'Data/TuitionFees'

# Create the download directory if it doesn't exist
download_dir_absolute = os.path.abspath(download_dir_relative)
if not os.path.exists(download_dir_absolute):
    os.makedirs(download_dir_absolute)

# Call the download function
download_pdfs_by_class(base_url, class_name, download_dir_absolute)


In [1]:
import pdfplumber
import pandas as pd
import csv
import re

def extract_tables_from_pdf_2020(pdf_path_2020, output_csv_path_2020):
    with pdfplumber.open(pdf_path_2020) as pdf_2020:
        all_tables_2020 = []
        for page_2020 in pdf_2020.pages:
            tables_2020 = page_2020.extract_tables()
            for table_2020 in tables_2020:
                all_tables_2020.extend(table_2020)
       
        with open(output_csv_path_2020, 'w', newline='') as csvfile_2020:
            writer_2020 = csv.writer(csvfile_2020)
            for row_2020 in all_tables_2020:
                writer_2020.writerow(row_2020)

pdf_path_2020 = 'Data/TuitionFees/Fees2020.pdf'
output_csv_path_2020 = 'Data/TuitionFees/2020_Fees.csv'

extract_tables_from_pdf_2020(pdf_path_2020, output_csv_path_2020)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df = pd.read_csv(output_csv_path_2020, error_bad_lines=False, warn_bad_lines=True)

df_modified = df.drop(columns=['Unnamed: 3', 'Unnamed: 4'])

# Define the criteria for filtering the DataFrame
contains_new_entrants = df_modified['Undergraduate programmes'].str.contains('New entrants', na=False)
starts_with_msc_or_llm = df_modified['Undergraduate programmes'].str.startswith(('MSc', 'LLM'), na=False)

# Apply the criteria
df_filtered = df_modified[contains_new_entrants | starts_with_msc_or_llm]

# Rename columns
df_filtered.rename(columns={
    'Undergraduate programmes': 'Program',
    'Unnamed: 1': 'Home fees',
    'Unnamed: 2': 'Overseas fees'
}, inplace=True)

# Change the value in the 'Program' column
df_filtered['Program'] = df_filtered['Program'].apply(lambda x: 'UG' if 'New entrants' in x else x)

# Remove commas, pound signs, and unwanted numbers from the 'Home fees' and 'Overseas fees' columns
df_filtered['Home fees'] = df_filtered['Home fees'].str.replace(',', '').str.replace('£', '').str.split().str[0]
df_filtered['Overseas fees'] = df_filtered['Overseas fees'].str.replace(',', '').str.replace('£', '').str.split().str[0]

# Remove rows where 'Home fees' and 'Overseas fees' are not numerical
df_filtered = df_filtered[df_filtered['Home fees'].str.isnumeric() & df_filtered['Overseas fees'].str.isnumeric()]

# Reset the index
df_filtered.reset_index(drop=True, inplace=True)


df_filtered



  df = pd.read_csv(output_csv_path_2020, error_bad_lines=False, warn_bad_lines=True)


  df = pd.read_csv(output_csv_path_2020, error_bad_lines=False, warn_bad_lines=True)
Skipping line 244: expected 5 fields, saw 8
Skipping line 245: expected 5 fields, saw 8
Skipping line 246: expected 5 fields, saw 8
Skipping line 247: expected 5 fields, saw 8
Skipping line 248: expected 5 fields, saw 8
Skipping line 249: expected 5 fields, saw 8
Skipping line 250: expected 5 fields, saw 9
Skipping line 251: expected 5 fields, saw 9
Skipping line 252: expected 5 fields, saw 9
Skipping line 253: expected 5 fields, saw 9
Skipping line 254: expected 5 fields, saw 9
Skipping line 255: expected 5 fields, saw 9
Skipping line 256: expected 5 fields, saw 9
Skipping line 257: expected 5 fields, saw 9
Skipping line 258: expected 5 fields, saw 9
Skipping line 259: expected 5 fields, saw 9
Skipping line 260: expected 5 fields, saw 9
Skipping line 261: expected 5 fields, saw 9
Skipping line 262: expected 5 fiel

Unnamed: 0,Program,Home fees,Overseas fees
0,UG,9250,21570
1,MSc in Accounting and Finance,29184,29760
2,"MSc in Accounting, Organisations and Institutions",29184,29760
3,MSc in Anthropology and Development,14640,22608
4,MSc in Anthropology and Development Management,22608,22608
5,MSc in Applicable Mathematics,14640,22608
6,MSc in Applied Social Data Science,29184,29760
7,MSc in Behavioural Science,22608,22608
8,MSc in China in Comparative Perspective,14640,22608
9,MSc in City Design and Social Science,29184,29760


In [10]:
import pdfplumber
import pandas as pd
import csv
import re

def extract_tables_from_pdf_2020(pdf_path_2020, output_csv_path_2020):
    with pdfplumber.open(pdf_path_2020) as pdf_2020:
        all_tables_2020 = []
        for page_2020 in pdf_2020.pages:
            tables_2020 = page_2020.extract_tables()
            for table_2020 in tables_2020:
                all_tables_2020.extend(table_2020)
       
        with open(output_csv_path_2020, 'w', newline='') as csvfile_2020:
            writer_2020 = csv.writer(csvfile_2020)
            for row_2020 in all_tables_2020:
                writer_2020.writerow(row_2020)

pdf_path_2020 = 'Data/TuitionFees/Fees2019.pdf'
output_csv_path_2020 = 'Data/TuitionFees/2019_Fees.csv'

extract_tables_from_pdf_2020(pdf_path_2020, output_csv_path_2020)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df = pd.read_csv(output_csv_path_2020, error_bad_lines=False, warn_bad_lines=True)

df_modified = df.drop(columns=['Unnamed: 3', 'Unnamed: 4'])

# Define the criteria for filtering the DataFrame
contains_new_entrants = df_modified['Undergraduate programmes'].str.contains('New entrants', na=False)
starts_with_msc_or_llm = df_modified['Undergraduate programmes'].str.startswith(('MSc', 'LLM'), na=False)

# Apply the criteria
df_filtered = df_modified[contains_new_entrants | starts_with_msc_or_llm]

# Rename columns
df_filtered.rename(columns={
    'Undergraduate programmes': 'Program',
    'Unnamed: 1': 'Home fees',
    'Unnamed: 2': 'Overseas fees'
}, inplace=True)

# Change the value in the 'Program' column
df_filtered['Program'] = df_filtered['Program'].apply(lambda x: 'UG' if 'New entrants' in x else x)

# Remove commas, pound signs, and unwanted numbers from the 'Home fees' and 'Overseas fees' columns
df_filtered['Home fees'] = df_filtered['Home fees'].str.replace(',', '').str.replace('£', '').str.split().str[0]
df_filtered['Overseas fees'] = df_filtered['Overseas fees'].str.replace(',', '').str.replace('£', '').str.split().str[0]

# Remove rows where 'Home fees' and 'Overseas fees' are not numerical
df_filtered = df_filtered[df_filtered['Home fees'].str.isnumeric() & df_filtered['Overseas fees'].str.isnumeric()]

# Reset the index
df_filtered.reset_index(drop=True, inplace=True)


df_filtered



  df = pd.read_csv(output_csv_path_2020, error_bad_lines=False, warn_bad_lines=True)


  df = pd.read_csv(output_csv_path_2020, error_bad_lines=False, warn_bad_lines=True)
Skipping line 234: expected 5 fields, saw 8
Skipping line 235: expected 5 fields, saw 8
Skipping line 236: expected 5 fields, saw 8
Skipping line 237: expected 5 fields, saw 8
Skipping line 238: expected 5 fields, saw 8
Skipping line 239: expected 5 fields, saw 8
Skipping line 240: expected 5 fields, saw 9
Skipping line 241: expected 5 fields, saw 9
Skipping line 242: expected 5 fields, saw 9
Skipping line 243: expected 5 fields, saw 9
Skipping line 244: expected 5 fields, saw 9
Skipping line 245: expected 5 fields, saw 9
Skipping line 246: expected 5 fields, saw 9
Skipping line 247: expected 5 fields, saw 9
Skipping line 248: expected 5 fields, saw 9
Skipping line 249: expected 5 fields, saw 9
Skipping line 250: expected 5 fields, saw 9
Skipping line 251: expected 5 fields, saw 9
Skipping line 252: expected 5 fiel

Unnamed: 0,Program,Home fees,Overseas fees
0,UG,9250,19920
1,MSc in Accounting and Finance,28056,28608
2,"MSc in Accounting, Organisations and Institutions",28056,28608
3,MSc in Anthropology and Development,14088,21744
4,MSc in Anthropology and Development Management,21744,21744
5,MSc in Applicable Mathematics,14088,21744
6,MSc in Applied Social Data Science,28056,28608
7,MSc in Behavioural Science,21744,21744
8,MSc in China in Comparative Perspective,14088,21744
9,MSc in City Design and Social Science,28056,28608


In [11]:
import pdfplumber
import pandas as pd
import csv
import re

def extract_tables_from_pdf_2020(pdf_path_2020, output_csv_path_2020):
    with pdfplumber.open(pdf_path_2020) as pdf_2020:
        all_tables_2020 = []
        for page_2020 in pdf_2020.pages:
            tables_2020 = page_2020.extract_tables()
            for table_2020 in tables_2020:
                all_tables_2020.extend(table_2020)
       
        with open(output_csv_path_2020, 'w', newline='') as csvfile_2020:
            writer_2020 = csv.writer(csvfile_2020)
            for row_2020 in all_tables_2020:
                writer_2020.writerow(row_2020)

pdf_path_2020 = 'Data/TuitionFees/Fees2018.pdf'
output_csv_path_2020 = 'Data/TuitionFees/2018_Fees.csv'

extract_tables_from_pdf_2020(pdf_path_2020, output_csv_path_2020)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df = pd.read_csv(output_csv_path_2020, error_bad_lines=False, warn_bad_lines=True)

df_modified = df.drop(columns=['Unnamed: 3', 'Unnamed: 4'])

# Define the criteria for filtering the DataFrame
contains_new_entrants = df_modified['Undergraduate programmes'].str.contains('New entrants', na=False)
starts_with_msc_or_llm = df_modified['Undergraduate programmes'].str.startswith(('MSc', 'LLM'), na=False)

# Apply the criteria
df_filtered = df_modified[contains_new_entrants | starts_with_msc_or_llm]

# Rename columns
df_filtered.rename(columns={
    'Undergraduate programmes': 'Program',
    'Unnamed: 1': 'Home fees',
    'Unnamed: 2': 'Overseas fees'
}, inplace=True)

# Change the value in the 'Program' column
df_filtered['Program'] = df_filtered['Program'].apply(lambda x: 'UG' if 'New entrants' in x else x)

# Remove commas, pound signs, and unwanted numbers from the 'Home fees' and 'Overseas fees' columns
df_filtered['Home fees'] = df_filtered['Home fees'].str.replace(',', '').str.replace('£', '').str.split().str[0]
df_filtered['Overseas fees'] = df_filtered['Overseas fees'].str.replace(',', '').str.replace('£', '').str.split().str[0]

# Remove rows where 'Home fees' and 'Overseas fees' are not numerical
df_filtered = df_filtered[df_filtered['Home fees'].str.isnumeric() & df_filtered['Overseas fees'].str.isnumeric()]

# Reset the index
df_filtered.reset_index(drop=True, inplace=True)


df_filtered



  df = pd.read_csv(output_csv_path_2020, error_bad_lines=False, warn_bad_lines=True)


  df = pd.read_csv(output_csv_path_2020, error_bad_lines=False, warn_bad_lines=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Program'] = df_filtered['Program'].apply(lambda x: 'UG' if 'New entrants' in x else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexi

Unnamed: 0,Program,Home fees,Overseas fees
0,UG,9250,19152
1,MSc in Accounting and Finance,26976,27504
2,"MSc in Accounting, Organisations and Institutions",26976,27504
3,MSc in A frican Development,13536,20904
4,MSc in Anthropology and Development,13536,20904
5,MSc in Anthropology and Development Management,20904,20904
6,MSc in Applicable Mathematics,13536,20904
7,MSc in Applied Social Data Science,26976,27504
8,MSc in China in Comparative Perspective,13536,20904
9,MSc in City Design and Social Science,26976,27504


In [12]:
import pdfplumber
import pandas as pd
import csv
import re

def extract_tables_from_pdf_2020(pdf_path_2020, output_csv_path_2020):
    with pdfplumber.open(pdf_path_2020) as pdf_2020:
        all_tables_2020 = []
        for page_2020 in pdf_2020.pages:
            tables_2020 = page_2020.extract_tables()
            for table_2020 in tables_2020:
                all_tables_2020.extend(table_2020)
       
        with open(output_csv_path_2020, 'w', newline='') as csvfile_2020:
            writer_2020 = csv.writer(csvfile_2020)
            for row_2020 in all_tables_2020:
                writer_2020.writerow(row_2020)

pdf_path_2020 = 'Data/TuitionFees/Fees2017.pdf'
output_csv_path_2020 = 'Data/TuitionFees/2017_Fees.csv'

extract_tables_from_pdf_2020(pdf_path_2020, output_csv_path_2020)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df = pd.read_csv(output_csv_path_2020, error_bad_lines=False, warn_bad_lines=True)

df_modified = df.drop(columns=['Unnamed: 3', 'Unnamed: 4'])

# Define the criteria for filtering the DataFrame
contains_new_entrants = df_modified['Undergraduate programmes'].str.contains('New entrants', na=False)
starts_with_msc_or_llm = df_modified['Undergraduate programmes'].str.startswith(('MSc', 'LLM'), na=False)

# Apply the criteria
df_filtered = df_modified[contains_new_entrants | starts_with_msc_or_llm]

# Rename columns
df_filtered.rename(columns={
    'Undergraduate programmes': 'Program',
    'Unnamed: 1': 'Home fees',
    'Unnamed: 2': 'Overseas fees'
}, inplace=True)

# Change the value in the 'Program' column
df_filtered['Program'] = df_filtered['Program'].apply(lambda x: 'UG' if 'New entrants' in x else x)

# Remove commas, pound signs, and unwanted numbers from the 'Home fees' and 'Overseas fees' columns
df_filtered['Home fees'] = df_filtered['Home fees'].str.replace(',', '').str.replace('£', '').str.split().str[0]
df_filtered['Overseas fees'] = df_filtered['Overseas fees'].str.replace(',', '').str.replace('£', '').str.split().str[0]

# Remove rows where 'Home fees' and 'Overseas fees' are not numerical
df_filtered = df_filtered[df_filtered['Home fees'].str.isnumeric() & df_filtered['Overseas fees'].str.isnumeric()]

# Reset the index
df_filtered.reset_index(drop=True, inplace=True)


df_filtered



  df = pd.read_csv(output_csv_path_2020, error_bad_lines=False, warn_bad_lines=True)


  df = pd.read_csv(output_csv_path_2020, error_bad_lines=False, warn_bad_lines=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Program'] = df_filtered['Program'].apply(lambda x: 'UG' if 'New entrants' in x else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexi

Unnamed: 0,Program,Home fees,Overseas fees
0,UG,9250,18408
1,MSc in Accounting and Finance,25944,26448
2,"MSc in Accounting, Organisations and Institutions",25944,26448
3,MSc in African Development,13008,20112
4,MSc in Anthropology and Development,13008,20112
5,MSc in Anthropology and Development Management,20112,20112
6,MSc in Applicable Mathematics,13008,20112
7,MSc in China in Comparative Perspective,13008,20112
8,MSc in City Design and Social Science,25944,26448
9,MSc in Comparative Politics (All tracks),20112,20112


In [22]:
import pdfplumber
import pandas as pd
import csv
import re

def extract_tables_from_pdf(pdf_path, output_csv_path):
    with pdfplumber.open(pdf_path) as pdf:
        all_tables = []
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                all_tables.extend(table)

        with open(output_csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in all_tables:
                writer.writerow(row)

def process_csv(output_csv_path):
    df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
    df_modified = df.drop(columns=['Unnamed: 3', 'Unnamed: 4'])
    
    contains_new_entrants = df_modified['Undergraduate programmes'].str.contains('New entrants', na=False)
    starts_with_msc_or_llm = df_modified['Undergraduate programmes'].str.startswith(('MSc', 'LLM'), na=False)
    df_filtered = df_modified[contains_new_entrants | starts_with_msc_or_llm]

    df_filtered.rename(columns={
        'Undergraduate programmes': 'Program',
        'Unnamed: 1': 'Home fees',
        'Unnamed: 2': 'Overseas fees'
    }, inplace=True)

    df_filtered['Program'] = df_filtered['Program'].apply(lambda x: 'UG' if 'New entrants' in x else x)
    df_filtered['Home fees'] = df_filtered['Home fees'].str.replace(',', '').str.replace('£', '').str.split().str[0]
    df_filtered['Overseas fees'] = df_filtered['Overseas fees'].str.replace(',', '').str.replace('£', '').str.split().str[0]
    df_filtered = df_filtered[df_filtered['Home fees'].str.isnumeric() & df_filtered['Overseas fees'].str.isnumeric()]
    df_filtered.reset_index(drop=True, inplace=True)

    return df_filtered

# Define years and corresponding paths
years = [2020, 2019, 2018, 2017]
data_folder = 'Data/TuitionFees/'
dfs = []

for year in years:
    pdf_path = f'{data_folder}Fees{year}.pdf'
    output_csv_path = f'{data_folder}{year}_Fees.csv'
    
    extract_tables_from_pdf(pdf_path, output_csv_path)
    df = process_csv(output_csv_path)
    df['Year'] = year  # Optional: Add a year column to each dataframe
    dfs.append(df)

# Merge all dataframes horizontally (side by side)
final_df = pd.concat(dfs, axis=1)

final_df




  df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)


  df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
Skipping line 244: expected 5 fields, saw 8
Skipping line 245: expected 5 fields, saw 8
Skipping line 246: expected 5 fields, saw 8
Skipping line 247: expected 5 fields, saw 8
Skipping line 248: expected 5 fields, saw 8
Skipping line 249: expected 5 fields, saw 8
Skipping line 250: expected 5 fields, saw 9
Skipping line 251: expected 5 fields, saw 9
Skipping line 252: expected 5 fields, saw 9
Skipping line 253: expected 5 fields, saw 9
Skipping line 254: expected 5 fields, saw 9
Skipping line 255: expected 5 fields, saw 9
Skipping line 256: expected 5 fields, saw 9
Skipping line 257: expected 5 fields, saw 9
Skipping line 258: expected 5 fields, saw 9
Skipping line 259: expected 5 fields, saw 9
Skipping line 260: expected 5 fields, saw 9
Skipping line 261: expected 5 fields, saw 9
Skipping line 262: expected 5 fields, saw 9




  df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)


  df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Program'] = df_filtered['Program'].apply(lambda x: 'UG' if 'New entrants' in x else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

Unnamed: 0,Program,Home fees,Overseas fees,Year,Program.1,Home fees.1,Overseas fees.1,Year.1,Program.2,Home fees.2,Overseas fees.2,Year.2,Program.3,Home fees.3,Overseas fees.3,Year.3
0,UG,9250.0,21570.0,2020.0,UG,9250.0,19920.0,2019.0,UG,9250.0,19152.0,2018.0,UG,9250,18408,2017
1,MSc in Accounting and Finance,29184.0,29760.0,2020.0,MSc in Accounting and Finance,28056.0,28608.0,2019.0,MSc in Accounting and Finance,26976.0,27504.0,2018.0,MSc in Accounting and Finance,25944,26448,2017
2,"MSc in Accounting, Organisations and Institutions",29184.0,29760.0,2020.0,"MSc in Accounting, Organisations and Institutions",28056.0,28608.0,2019.0,"MSc in Accounting, Organisations and Institutions",26976.0,27504.0,2018.0,"MSc in Accounting, Organisations and Institutions",25944,26448,2017
3,MSc in Anthropology and Development,14640.0,22608.0,2020.0,MSc in Anthropology and Development,14088.0,21744.0,2019.0,MSc in A frican Development,13536.0,20904.0,2018.0,MSc in African Development,13008,20112,2017
4,MSc in Anthropology and Development Management,22608.0,22608.0,2020.0,MSc in Anthropology and Development Management,21744.0,21744.0,2019.0,MSc in Anthropology and Development,13536.0,20904.0,2018.0,MSc in Anthropology and Development,13008,20112,2017
5,MSc in Applicable Mathematics,14640.0,22608.0,2020.0,MSc in Applicable Mathematics,14088.0,21744.0,2019.0,MSc in Anthropology and Development Management,20904.0,20904.0,2018.0,MSc in Anthropology and Development Management,20112,20112,2017
6,MSc in Applied Social Data Science,29184.0,29760.0,2020.0,MSc in Applied Social Data Science,28056.0,28608.0,2019.0,MSc in Applicable Mathematics,13536.0,20904.0,2018.0,MSc in Applicable Mathematics,13008,20112,2017
7,MSc in Behavioural Science,22608.0,22608.0,2020.0,MSc in Behavioural Science,21744.0,21744.0,2019.0,MSc in Applied Social Data Science,26976.0,27504.0,2018.0,MSc in China in Comparative Perspective,13008,20112,2017
8,MSc in China in Comparative Perspective,14640.0,22608.0,2020.0,MSc in China in Comparative Perspective,14088.0,21744.0,2019.0,MSc in China in Comparative Perspective,13536.0,20904.0,2018.0,MSc in City Design and Social Science,25944,26448,2017
9,MSc in City Design and Social Science,29184.0,29760.0,2020.0,MSc in City Design and Social Science,28056.0,28608.0,2019.0,MSc in City Design and Social Science,26976.0,27504.0,2018.0,MSc in Comparative Politics (All tracks),20112,20112,2017


In [23]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
final_df


Unnamed: 0,Program,Home fees,Overseas fees,Year,Program.1,Home fees.1,Overseas fees.1,Year.1,Program.2,Home fees.2,Overseas fees.2,Year.2,Program.3,Home fees.3,Overseas fees.3,Year.3
0,UG,9250.0,21570.0,2020.0,UG,9250.0,19920.0,2019.0,UG,9250.0,19152.0,2018.0,UG,9250,18408,2017
1,MSc in Accounting and Finance,29184.0,29760.0,2020.0,MSc in Accounting and Finance,28056.0,28608.0,2019.0,MSc in Accounting and Finance,26976.0,27504.0,2018.0,MSc in Accounting and Finance,25944,26448,2017
2,"MSc in Accounting, Organisations and Institutions",29184.0,29760.0,2020.0,"MSc in Accounting, Organisations and Institutions",28056.0,28608.0,2019.0,"MSc in Accounting, Organisations and Institutions",26976.0,27504.0,2018.0,"MSc in Accounting, Organisations and Institutions",25944,26448,2017
3,MSc in Anthropology and Development,14640.0,22608.0,2020.0,MSc in Anthropology and Development,14088.0,21744.0,2019.0,MSc in A frican Development,13536.0,20904.0,2018.0,MSc in African Development,13008,20112,2017
4,MSc in Anthropology and Development Management,22608.0,22608.0,2020.0,MSc in Anthropology and Development Management,21744.0,21744.0,2019.0,MSc in Anthropology and Development,13536.0,20904.0,2018.0,MSc in Anthropology and Development,13008,20112,2017
5,MSc in Applicable Mathematics,14640.0,22608.0,2020.0,MSc in Applicable Mathematics,14088.0,21744.0,2019.0,MSc in Anthropology and Development Management,20904.0,20904.0,2018.0,MSc in Anthropology and Development Management,20112,20112,2017
6,MSc in Applied Social Data Science,29184.0,29760.0,2020.0,MSc in Applied Social Data Science,28056.0,28608.0,2019.0,MSc in Applicable Mathematics,13536.0,20904.0,2018.0,MSc in Applicable Mathematics,13008,20112,2017
7,MSc in Behavioural Science,22608.0,22608.0,2020.0,MSc in Behavioural Science,21744.0,21744.0,2019.0,MSc in Applied Social Data Science,26976.0,27504.0,2018.0,MSc in China in Comparative Perspective,13008,20112,2017
8,MSc in China in Comparative Perspective,14640.0,22608.0,2020.0,MSc in China in Comparative Perspective,14088.0,21744.0,2019.0,MSc in China in Comparative Perspective,13536.0,20904.0,2018.0,MSc in City Design and Social Science,25944,26448,2017
9,MSc in City Design and Social Science,29184.0,29760.0,2020.0,MSc in City Design and Social Science,28056.0,28608.0,2019.0,MSc in City Design and Social Science,26976.0,27504.0,2018.0,MSc in Comparative Politics (All tracks),20112,20112,2017


In [39]:
import pdfplumber
import pandas as pd
import csv
import re

def extract_tables_from_pdf(pdf_path, output_csv_path):
    with pdfplumber.open(pdf_path) as pdf:
        all_tables = []
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                all_tables.extend(table)

        with open(output_csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in all_tables:
                writer.writerow(row)

def process_csv(output_csv_path):
    df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
    df_modified = df.drop(columns=['Unnamed: 3', 'Unnamed: 4'])
    
    contains_new_entrants = df_modified['Undergraduate programmes'].str.contains('New entrants', na=False)
    starts_with_msc_or_llm = df_modified['Undergraduate programmes'].str.startswith(('MSc', 'LLM'), na=False)
    df_filtered = df_modified[contains_new_entrants | starts_with_msc_or_llm]

    df_filtered.rename(columns={
        'Undergraduate programmes': 'Program',
        'Unnamed: 1': 'Home fees',
        'Unnamed: 2': 'Overseas fees'
    }, inplace=True)

    df_filtered['Program'] = df_filtered['Program'].apply(lambda x: 'UG' if 'New entrants' in x else x)
    df_filtered['Home fees'] = df_filtered['Home fees'].str.replace(',', '').str.replace('£', '').str.split().str[0]
    df_filtered['Overseas fees'] = df_filtered['Overseas fees'].str.replace(',', '').str.replace('£', '').str.split().str[0]
    df_filtered = df_filtered[df_filtered['Home fees'].str.isnumeric() & df_filtered['Overseas fees'].str.isnumeric()]
    df_filtered.reset_index(drop=True, inplace=True)

    return df_filtered

# Define years and corresponding paths
years = [2020, 2019, 2018, 2017]
data_folder = 'Data/TuitionFees/'
dfs = []

for year in years:
    pdf_path = f'{data_folder}Fees{year}.pdf'
    output_csv_path = f'{data_folder}{year}_Fees.csv'
    
    extract_tables_from_pdf(pdf_path, output_csv_path)
    df = process_csv(output_csv_path)
    df['Year'] = year  # Optional: Add a year column to each dataframe
    dfs.append(df)

final_df = pd.concat(dfs, axis=0)

final_df




  df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)


  df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
Skipping line 244: expected 5 fields, saw 8
Skipping line 245: expected 5 fields, saw 8
Skipping line 246: expected 5 fields, saw 8
Skipping line 247: expected 5 fields, saw 8
Skipping line 248: expected 5 fields, saw 8
Skipping line 249: expected 5 fields, saw 8
Skipping line 250: expected 5 fields, saw 9
Skipping line 251: expected 5 fields, saw 9
Skipping line 252: expected 5 fields, saw 9
Skipping line 253: expected 5 fields, saw 9
Skipping line 254: expected 5 fields, saw 9
Skipping line 255: expected 5 fields, saw 9
Skipping line 256: expected 5 fields, saw 9
Skipping line 257: expected 5 fields, saw 9
Skipping line 258: expected 5 fields, saw 9
Skipping line 259: expected 5 fields, saw 9
Skipping line 260: expected 5 fields, saw 9
Skipping line 261: expected 5 fields, saw 9
Skipping line 262: expected 5 fields, saw 9




  df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)


  df = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Program'] = df_filtered['Program'].apply(lambda x: 'UG' if 'New entrants' in x else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

Unnamed: 0,Program,Home fees,Overseas fees,Year
0,UG,9250,21570,2020
1,MSc in Accounting and Finance,29184,29760,2020
2,"MSc in Accounting, Organisations and Institutions",29184,29760,2020
3,MSc in Anthropology and Development,14640,22608,2020
4,MSc in Anthropology and Development Management,22608,22608,2020
5,MSc in Applicable Mathematics,14640,22608,2020
6,MSc in Applied Social Data Science,29184,29760,2020
7,MSc in Behavioural Science,22608,22608,2020
8,MSc in China in Comparative Perspective,14640,22608,2020
9,MSc in City Design and Social Science,29184,29760,2020


In [32]:
# Convert 'Overseas fees' and 'Home fees' columns to numeric type
final_df['Overseas fees'] = pd.to_numeric(final_df['Overseas fees'])
final_df['Home fees'] = pd.to_numeric(final_df['Home fees'])

# Calculate the difference and store it in a new column
final_df['Difference in Fees'] = final_df['Overseas fees'] - final_df['Home fees']

# Drop the original 'Home fees' and 'Overseas fees' columns
final_df.drop(['Home fees', 'Overseas fees'], axis=1, inplace=True)

final_df

Unnamed: 0,Program,Year,Difference in Fees
0,UG,2020,12320
1,MSc in Accounting and Finance,2020,576
2,"MSc in Accounting, Organisations and Institutions",2020,576
3,MSc in Anthropology and Development,2020,7968
4,MSc in Anthropology and Development Management,2020,0
5,MSc in Applicable Mathematics,2020,7968
6,MSc in Applied Social Data Science,2020,576
7,MSc in Behavioural Science,2020,0
8,MSc in China in Comparative Perspective,2020,7968
9,MSc in City Design and Social Science,2020,576


In [40]:
years = final_df['Year'].unique()  # Get all unique years
year_dfs = []  # List to store each year's DataFrame

for year in years:
    # Filter data for each year
    df_year = final_df[final_df['Year'] == year]
    
    # Rename columns to include the year for clarity
    df_year.rename(columns=lambda x: f"{x}_{year}" if x != 'Year' else f"Year_{year}", inplace=True)
    
    # Add to the list
    year_dfs.append(df_year)

# Concatenate all the DataFrames horizontally
final_merged_df = pd.concat(year_dfs, axis=1)

# Since we might have duplicate 'Year' columns, let's drop duplicates keeping the first one
final_merged_df = final_merged_df.loc[:, ~final_merged_df.columns.duplicated()]

# Print the resulting DataFrame structure
final_merged_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_year.rename(columns=lambda x: f"{x}_{year}" if x != 'Year' else f"Year_{year}", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_year.rename(columns=lambda x: f"{x}_{year}" if x != 'Year' else f"Year_{year}", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_year.rename(columns=lambda x: f"{x}_{year}" if x != 'Year' else f"Year_{year}", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in

Unnamed: 0,Program_2020,Home fees_2020,Overseas fees_2020,Year_2020,Program_2019,Home fees_2019,Overseas fees_2019,Year_2019,Program_2018,Home fees_2018,Overseas fees_2018,Year_2018,Program_2017,Home fees_2017,Overseas fees_2017,Year_2017
0,UG,9250.0,21570.0,2020.0,UG,9250.0,19920.0,2019.0,UG,9250.0,19152.0,2018.0,UG,9250,18408,2017
1,MSc in Accounting and Finance,29184.0,29760.0,2020.0,MSc in Accounting and Finance,28056.0,28608.0,2019.0,MSc in Accounting and Finance,26976.0,27504.0,2018.0,MSc in Accounting and Finance,25944,26448,2017
2,"MSc in Accounting, Organisations and Institutions",29184.0,29760.0,2020.0,"MSc in Accounting, Organisations and Institutions",28056.0,28608.0,2019.0,"MSc in Accounting, Organisations and Institutions",26976.0,27504.0,2018.0,"MSc in Accounting, Organisations and Institutions",25944,26448,2017
3,MSc in Anthropology and Development,14640.0,22608.0,2020.0,MSc in Anthropology and Development,14088.0,21744.0,2019.0,MSc in A frican Development,13536.0,20904.0,2018.0,MSc in African Development,13008,20112,2017
4,MSc in Anthropology and Development Management,22608.0,22608.0,2020.0,MSc in Anthropology and Development Management,21744.0,21744.0,2019.0,MSc in Anthropology and Development,13536.0,20904.0,2018.0,MSc in Anthropology and Development,13008,20112,2017
5,MSc in Applicable Mathematics,14640.0,22608.0,2020.0,MSc in Applicable Mathematics,14088.0,21744.0,2019.0,MSc in Anthropology and Development Management,20904.0,20904.0,2018.0,MSc in Anthropology and Development Management,20112,20112,2017
6,MSc in Applied Social Data Science,29184.0,29760.0,2020.0,MSc in Applied Social Data Science,28056.0,28608.0,2019.0,MSc in Applicable Mathematics,13536.0,20904.0,2018.0,MSc in Applicable Mathematics,13008,20112,2017
7,MSc in Behavioural Science,22608.0,22608.0,2020.0,MSc in Behavioural Science,21744.0,21744.0,2019.0,MSc in Applied Social Data Science,26976.0,27504.0,2018.0,MSc in China in Comparative Perspective,13008,20112,2017
8,MSc in China in Comparative Perspective,14640.0,22608.0,2020.0,MSc in China in Comparative Perspective,14088.0,21744.0,2019.0,MSc in China in Comparative Perspective,13536.0,20904.0,2018.0,MSc in City Design and Social Science,25944,26448,2017
9,MSc in City Design and Social Science,29184.0,29760.0,2020.0,MSc in City Design and Social Science,28056.0,28608.0,2019.0,MSc in City Design and Social Science,26976.0,27504.0,2018.0,MSc in Comparative Politics (All tracks),20112,20112,2017


In [42]:
def add_columns_after_difference(df):
    # Get all columns that contain 'Difference'
    difference_columns = [col for col in df.columns if 'Difference' in col]

    # Sort columns to ensure they are in original order when modifying DataFrame
    difference_columns.sort(key=lambda x: df.columns.get_loc(x))

    # Iterate over these columns in reverse order to avoid indexing issues when inserting new columns
    for col in reversed(difference_columns):
        new_col_name = col + "New"  # Name of the new column
        location = df.columns.get_loc(col) + 1  # Find the index of the column + 1 for the next position
        df.insert(location, new_col_name, pd.NA)  # Insert a new column with NA values

    return df

# Apply the function to 'final_df'
final_merged_df = add_columns_after_difference(final_merged_df)
final_merged_df

Unnamed: 0,Program_2020,Home fees_2020,Overseas fees_2020,Year_2020,Program_2019,Home fees_2019,Overseas fees_2019,Year_2019,Program_2018,Home fees_2018,Overseas fees_2018,Year_2018,Program_2017,Home fees_2017,Overseas fees_2017,Year_2017
0,UG,9250.0,21570.0,2020.0,UG,9250.0,19920.0,2019.0,UG,9250.0,19152.0,2018.0,UG,9250,18408,2017
1,MSc in Accounting and Finance,29184.0,29760.0,2020.0,MSc in Accounting and Finance,28056.0,28608.0,2019.0,MSc in Accounting and Finance,26976.0,27504.0,2018.0,MSc in Accounting and Finance,25944,26448,2017
2,"MSc in Accounting, Organisations and Institutions",29184.0,29760.0,2020.0,"MSc in Accounting, Organisations and Institutions",28056.0,28608.0,2019.0,"MSc in Accounting, Organisations and Institutions",26976.0,27504.0,2018.0,"MSc in Accounting, Organisations and Institutions",25944,26448,2017
3,MSc in Anthropology and Development,14640.0,22608.0,2020.0,MSc in Anthropology and Development,14088.0,21744.0,2019.0,MSc in A frican Development,13536.0,20904.0,2018.0,MSc in African Development,13008,20112,2017
4,MSc in Anthropology and Development Management,22608.0,22608.0,2020.0,MSc in Anthropology and Development Management,21744.0,21744.0,2019.0,MSc in Anthropology and Development,13536.0,20904.0,2018.0,MSc in Anthropology and Development,13008,20112,2017
5,MSc in Applicable Mathematics,14640.0,22608.0,2020.0,MSc in Applicable Mathematics,14088.0,21744.0,2019.0,MSc in Anthropology and Development Management,20904.0,20904.0,2018.0,MSc in Anthropology and Development Management,20112,20112,2017
6,MSc in Applied Social Data Science,29184.0,29760.0,2020.0,MSc in Applied Social Data Science,28056.0,28608.0,2019.0,MSc in Applicable Mathematics,13536.0,20904.0,2018.0,MSc in Applicable Mathematics,13008,20112,2017
7,MSc in Behavioural Science,22608.0,22608.0,2020.0,MSc in Behavioural Science,21744.0,21744.0,2019.0,MSc in Applied Social Data Science,26976.0,27504.0,2018.0,MSc in China in Comparative Perspective,13008,20112,2017
8,MSc in China in Comparative Perspective,14640.0,22608.0,2020.0,MSc in China in Comparative Perspective,14088.0,21744.0,2019.0,MSc in China in Comparative Perspective,13536.0,20904.0,2018.0,MSc in City Design and Social Science,25944,26448,2017
9,MSc in City Design and Social Science,29184.0,29760.0,2020.0,MSc in City Design and Social Science,28056.0,28608.0,2019.0,MSc in City Design and Social Science,26976.0,27504.0,2018.0,MSc in Comparative Politics (All tracks),20112,20112,2017


In [43]:
csv_data = pd.read_csv('Data/PhilipOutput.csv')

# Function to update final_merged_df based on matches in csv_data
def update_dataframe(df, csv):
    # Get columns that contain 'program'
    program_columns = [col for col in df.columns if 'Program' in col]

    for program_col in program_columns:
        # Iterate over rows in df
        for index, row in df.iterrows():
            # Get the program value
            program_value = row[program_col]
            # Find match in csv_data
            match = csv_data[csv_data['Course Name'] == program_value]
            if not match.empty:
                # Extract value from 5th column of csv
                value_to_place = match.iloc[0, 4]  # 'Simple Department' is the 5th column
                # Calculate target column index (3 to the right)
                target_col_index = df.columns.get_loc(program_col) + 3
                if target_col_index < len(df.columns):
                    # Replace the value in target column
                    df.iat[index, target_col_index] = value_to_place

    return df

# Update final_merged_df
final_merged_df = update_dataframe(final_merged_df, csv_data)
final_merged_df

Unnamed: 0,Program_2020,Home fees_2020,Overseas fees_2020,Year_2020,Program_2019,Home fees_2019,Overseas fees_2019,Year_2019,Program_2018,Home fees_2018,Overseas fees_2018,Year_2018,Program_2017,Home fees_2017,Overseas fees_2017,Year_2017
0,UG,9250.0,21570.0,2020.0,UG,9250.0,19920.0,2019.0,UG,9250.0,19152.0,2018.0,UG,9250,18408,2017
1,MSc in Accounting and Finance,29184.0,29760.0,2020.0,MSc in Accounting and Finance,28056.0,28608.0,2019.0,MSc in Accounting and Finance,26976.0,27504.0,2018.0,MSc in Accounting and Finance,25944,26448,2017
2,"MSc in Accounting, Organisations and Institutions",29184.0,29760.0,2020.0,"MSc in Accounting, Organisations and Institutions",28056.0,28608.0,2019.0,"MSc in Accounting, Organisations and Institutions",26976.0,27504.0,2018.0,"MSc in Accounting, Organisations and Institutions",25944,26448,2017
3,MSc in Anthropology and Development,14640.0,22608.0,2020.0,MSc in Anthropology and Development,14088.0,21744.0,2019.0,MSc in A frican Development,13536.0,20904.0,2018.0,MSc in African Development,13008,20112,2017
4,MSc in Anthropology and Development Management,22608.0,22608.0,2020.0,MSc in Anthropology and Development Management,21744.0,21744.0,2019.0,MSc in Anthropology and Development,13536.0,20904.0,2018.0,MSc in Anthropology and Development,13008,20112,2017
5,MSc in Applicable Mathematics,14640.0,22608.0,2020.0,MSc in Applicable Mathematics,14088.0,21744.0,2019.0,MSc in Anthropology and Development Management,20904.0,20904.0,2018.0,MSc in Anthropology and Development Management,20112,20112,2017
6,MSc in Applied Social Data Science,29184.0,29760.0,2020.0,MSc in Applied Social Data Science,28056.0,28608.0,2019.0,MSc in Applicable Mathematics,13536.0,20904.0,2018.0,MSc in Applicable Mathematics,13008,20112,2017
7,MSc in Behavioural Science,22608.0,22608.0,2020.0,MSc in Behavioural Science,21744.0,21744.0,2019.0,MSc in Applied Social Data Science,26976.0,27504.0,2018.0,MSc in China in Comparative Perspective,13008,20112,2017
8,MSc in China in Comparative Perspective,14640.0,22608.0,2020.0,MSc in China in Comparative Perspective,14088.0,21744.0,2019.0,MSc in China in Comparative Perspective,13536.0,20904.0,2018.0,MSc in City Design and Social Science,25944,26448,2017
9,MSc in City Design and Social Science,29184.0,29760.0,2020.0,MSc in City Design and Social Science,28056.0,28608.0,2019.0,MSc in City Design and Social Science,26976.0,27504.0,2018.0,MSc in Comparative Politics (All tracks),20112,20112,2017
