In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

def setup_driver(download_dir):
    """Sets up the WebDriver for Chrome."""
    options = webdriver.ChromeOptions()
    # Disable PDF viewer to automatically download PDF files
    options.add_experimental_option("prefs", {
        "download.default_directory": download_dir,
        "download.prompt_for_download": False,  # Disable download prompt
        "plugins.always_open_pdf_externally": True  # It will not show PDF directly in chrome
    })
    # Set up Chrome driver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

def download_pdfs_by_class(base_url, class_name, download_dir):
    driver = setup_driver(download_dir)
    driver.get(base_url)

    # Wait for the page to load
    time.sleep(1)  # Increase or decrease based on your internet speed

    # Find all elements with the specified class and download the linked files
    links = driver.find_elements(By.CLASS_NAME, class_name)
    for link in links:
        href = link.get_attribute('href')
        if href and href.endswith('.pdf'):
            # Open the link in a new tab
            driver.execute_script(f"window.open('{href}');")
            time.sleep(1)  # Adjust time for page load as necessary
            # Switch back to the main window
            driver.switch_to.window(driver.window_handles[0])

    # Close the driver
    driver.quit()

# Base URL of the page containing the links
base_url = 'https://info.lse.ac.uk/staff/divisions/Planning-Division/Table-of-Fees'
# Class shared by PDF links
class_name = 'sys_21'
# Path to the download directory
download_dir = 'Data\TuitionFees'

download_pdfs_by_class(base_url, class_name, download_dir)


In [2]:
## 2020

import pdfplumber
import csv
import pandas as pd
import re

def extract_tables_from_pdf(pdf_path, output_csv_path):
    with pdfplumber.open(pdf_path) as pdf:
        all_tables = []
        # Iterate through each page of the PDF
        for page in pdf.pages:
            # Extract tables from the current page
            tables = page.extract_tables()
            for table in tables:
                all_tables.extend(table)  # Add the rows of the table to all_tables list
       
        # Write all extracted tables to a CSV file
        with open(output_csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in all_tables:
                writer.writerow(row)

# Specify the path to your PDF and the output CSV file
pdf_path = 'Data/2020-Table-of-Fees-25Jun20.pdf'
output_csv_path = 'Data/2020_Fees.csv'

extract_tables_from_pdf(pdf_path, output_csv_path)


# Load the CSV file containing the tuition fees and program data
output_csv_path = 'Data/2020_Fees.csv'  
csv_path = 'Data/Florian_Wirtz_eigentlich_noch_was_mit_der_v2.csv'  

# Read the CSV files
data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
data_df = pd.read_csv(csv_path)

# Combine 'Department' and 'Program' into a new column for unique combinations
data_df['Dept_Program'] = data_df['Department'] + " " + data_df['Program']

# Get unique combinations
unique_dept_programs = data_df['Dept_Program'].unique()

# Dictionary to store original to transformed mappings
original_to_transformed = {}

# Process only Master's programs and adjust department names
processed_dept_programs = []
for combo in unique_dept_programs:
    if isinstance(combo, str):  # Check if the item is a string
        if "PG Taught" in combo:
            original_department = combo.replace(" PG Taught", "")
            transformed_department = original_department
            # Handle specific naming transformations
            if "And" in transformed_department:
                transformed_department = transformed_department.split("And")[0].strip()
            if transformed_department == "International History":
                transformed_department = "History"
            if transformed_department == "European Institute":
                transformed_department = "European"
            if transformed_department == "Law School" or transformed_department == "Law":
                transformed_department = "LLM"
            if transformed_department == "Philosophy Logic":
                transformed_department = "Philosophy"
            if transformed_department == "School of Public Policy":
                transformed_department = "Public Policy"
            if transformed_department == "Gender Studies":
                transformed_department = "Gender"
            if "Psychological" in transformed_department:
                transformed_department = "Psychology"
            
            # Save mapping
            original_to_transformed[transformed_department] = original_department
            processed_dept_programs.append(transformed_department)

# Create DataFrame from processed list
processed_df = pd.DataFrame(processed_dept_programs, columns=['Department'])

# Insert "Program" column with "PG Taught" as the value for all entries
processed_df.insert(1, 'Program', 'PG Taught')

def find_matching_data(dept_program):
    matches = data_df_one[data_df_one.iloc[:, 0].str.contains(dept_program, na=False)]
    if not matches.empty:
        col2_values = []
        col3_values = []
        for _, row in matches.iterrows():
            # Process and clean fee data for averaging
            try:
                cleaned_value_col2 = float(str(row[1]).replace('£', '').replace(',', ''))
                if cleaned_value_col2.is_integer():
                    col2_values.append(int(cleaned_value_col2))
            except ValueError:
                col2_values.append(pd.NA)
            try:
                cleaned_value_col3 = float(str(row[2]).replace('£', '').replace(',', ''))
                if cleaned_value_col3.is_integer():
                    col3_values.append(int(cleaned_value_col3))
            except ValueError:
                col3_values.append(pd.NA)
        
        # Calculate averages while ignoring N/A values
        avg_col2 = pd.Series(col2_values).dropna().mean()
        avg_col3 = pd.Series(col3_values).dropna().mean()
        return pd.Series([avg_col2, avg_col3])
    return pd.Series([pd.NA, pd.NA])

# Apply the function to find and average matching tuition fees
processed_df[['Home fees', 'Overseas fees']] = processed_df['Department'].apply(find_matching_data)

# Remove rows where either column contains NA or NaN values
processed_df.dropna(subset=['Home fees', 'Overseas fees'], inplace=True)

# Ensure all remaining values are integers
processed_df['Home fees'] = processed_df['Home fees'].astype(int)
processed_df['Overseas fees'] = processed_df['Overseas fees'].astype(int)

# Revert department names to original values
processed_df['Department'] = processed_df['Department'].map(original_to_transformed)

# Display the final DataFrame
processed_df

# Find the row index for the specific phrase and extract the fees and year
idx = data_df_one.index[data_df_one.iloc[:, 0].str.contains("Students commencing their degree in" or "New Entrants", na=False)].tolist()
if idx:
    target_idx = idx[0]  # Assume the first occurrence
    if "Undergraduate" in data_df_one.iloc[target_idx - 1, 0]:
        year_match = re.search(r'\b(\d{4})\b', data_df_one.iloc[target_idx, 0])
        year = year_match.group(0) if year_match else "Unknown"
        home_fee_match = re.search(r'£(\d{4})', data_df_one.iloc[target_idx, 1].replace(',', ''))
        overseas_fee_match = re.search(r'£(\d{5})', data_df_one.iloc[target_idx, 2].replace(',', ''))
        home_fee = int(home_fee_match.group(1)) if home_fee_match else None
        overseas_fee = int(overseas_fee_match.group(1)) if overseas_fee_match else None

# Add the extracted row to the top of the processed_df DataFrame
additional_row = pd.DataFrame({
    'Department': [f"All {year}"],
    'Program': ["UG Degree"],
    'Home fees': [home_fee],
    'Overseas fees': [overseas_fee]
})

# Append the additional row to the processed_df DataFrame
processed_df = pd.concat([additional_row, processed_df]).reset_index(drop=True)

# Display the final DataFrame including the new row
processed_df



  data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)


  data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
Skipping line 244: expected 5 fields, saw 8
Skipping line 245: expected 5 fields, saw 8
Skipping line 246: expected 5 fields, saw 8
Skipping line 247: expected 5 fields, saw 8
Skipping line 248: expected 5 fields, saw 8
Skipping line 249: expected 5 fields, saw 8
Skipping line 250: expected 5 fields, saw 9
Skipping line 251: expected 5 fields, saw 9
Skipping line 252: expected 5 fields, saw 9
Skipping line 253: expected 5 fields, saw 9
Skipping line 254: expected 5 fields, saw 9
Skipping line 255: expected 5 fields, saw 9
Skipping line 256: expected 5 fields, saw 9
Skipping line 257: expected 5 fields, saw 9
Skipping line 258: expected 5 fields, saw 9
Skipping line 259: expected 5 fields, saw 9
Skipping line 260: expected 5 fields, saw 9
Skipping line 261: expected 5 fields, saw 9
Skipping line 262: expecte

Unnamed: 0,Department,Program,Home fees,Overseas fees
0,All 2020,UG Degree,9250,21570
1,Accounting,PG Taught,28080,28464
2,Economic History,PG Taught,14640,22608
3,Economics,PG Taught,23179,24134
4,European Institute,PG Taught,19952,22608
5,Finance,PG Taught,28969,29185
6,Gender Studies,PG Taught,14640,22608
7,Geography And Environment,PG Taught,14640,22608
8,Health Policy,PG Taught,18160,25768
9,International Development,PG Taught,14640,22608


In [3]:
## 2018

def extract_tables_from_pdf(pdf_path, output_csv_path):
    with pdfplumber.open(pdf_path) as pdf:
        all_tables = []
        # Iterate through each page of the PDF
        for page in pdf.pages:
            # Extract tables from the current page
            tables = page.extract_tables()
            for table in tables:
                all_tables.extend(table)  # Add the rows of the table to all_tables list
       
        # Write all extracted tables to a CSV file
        with open(output_csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in all_tables:
                writer.writerow(row)

# Specify the path to your PDF and the output CSV file
pdf_path = 'Data/2018-19-Fees-Table.pdf'
output_csv_path = 'Data/2018_Fees.csv'

extract_tables_from_pdf(pdf_path, output_csv_path)


# Load the CSV file containing the tuition fees and program data
output_csv_path = 'Data/2018_Fees.csv'  
csv_path = 'Data/Florian_Wirtz_eigentlich_noch_was_mit_der_v2.csv'  

# Read the CSV files
data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
data_df = pd.read_csv(csv_path)

# Combine 'Department' and 'Program' into a new column for unique combinations
data_df['Dept_Program'] = data_df['Department'] + " " + data_df['Program']

# Get unique combinations
unique_dept_programs = data_df['Dept_Program'].unique()

# Dictionary to store original to transformed mappings
original_to_transformed = {}

# Process only Master's programs and adjust department names
processed_dept_programs = []
for combo in unique_dept_programs:
    if isinstance(combo, str):  # Check if the item is a string
        if "PG Taught" in combo:
            original_department = combo.replace(" PG Taught", "")
            transformed_department = original_department
            # Handle specific naming transformations
            if "And" in transformed_department:
                transformed_department = transformed_department.split("And")[0].strip()
            if transformed_department == "International History":
                transformed_department = "History"
            if transformed_department == "European Institute":
                transformed_department = "European"
            if transformed_department == "Law School" or transformed_department == "Law":
                transformed_department = "LLM"
            if transformed_department == "Philosophy Logic":
                transformed_department = "Philosophy"
            if transformed_department == "School of Public Policy":
                transformed_department = "Public Policy"
            if transformed_department == "Gender Studies":
                transformed_department = "Gender"
            if "Psychological" in transformed_department:
                transformed_department = "Psychology"
            
            # Save mapping
            original_to_transformed[transformed_department] = original_department
            processed_dept_programs.append(transformed_department)

# Create DataFrame from processed list
processed_df = pd.DataFrame(processed_dept_programs, columns=['Department'])

# Insert "Program" column with "PG Taught" as the value for all entries
processed_df.insert(1, 'Program', 'PG Taught')

def find_matching_data(dept_program):
    matches = data_df_one[data_df_one.iloc[:, 0].str.contains(dept_program, na=False)]
    if not matches.empty:
        col2_values = []
        col3_values = []
        for _, row in matches.iterrows():
            # Process and clean fee data for averaging
            try:
                cleaned_value_col2 = float(str(row[1]).replace('£', '').replace(',', ''))
                if cleaned_value_col2.is_integer():
                    col2_values.append(int(cleaned_value_col2))
            except ValueError:
                col2_values.append(pd.NA)
            try:
                cleaned_value_col3 = float(str(row[2]).replace('£', '').replace(',', ''))
                if cleaned_value_col3.is_integer():
                    col3_values.append(int(cleaned_value_col3))
            except ValueError:
                col3_values.append(pd.NA)
        
        # Calculate averages while ignoring N/A values
        avg_col2 = pd.Series(col2_values).dropna().mean()
        avg_col3 = pd.Series(col3_values).dropna().mean()
        return pd.Series([avg_col2, avg_col3])
    return pd.Series([pd.NA, pd.NA])

# Apply the function to find and average matching tuition fees
processed_df[['Home fees', 'Overseas fees']] = processed_df['Department'].apply(find_matching_data)

# Remove rows where either column contains NA or NaN values
processed_df.dropna(subset=['Home fees', 'Overseas fees'], inplace=True)

# Ensure all remaining values are integers
processed_df['Home fees'] = processed_df['Home fees'].astype(int)
processed_df['Overseas fees'] = processed_df['Overseas fees'].astype(int)

# Revert department names to original values
processed_df['Department'] = processed_df['Department'].map(original_to_transformed)

# Display the final DataFrame
processed_df

# Find the row index for the specific phrase and extract the fees and year
idx = data_df_one.index[data_df_one.iloc[:, 0].str.contains("Students commencing their degree in" or "New Entrants", na=False)].tolist()
if idx:
    target_idx = idx[0]  # Assume the first occurrence
    if "Undergraduate" in data_df_one.iloc[target_idx - 1, 0]:
        year_match = re.search(r'\b(\d{4})\b', data_df_one.iloc[target_idx, 0])
        year = year_match.group(0) if year_match else "Unknown"
        home_fee_match = re.search(r'£(\d{4})', data_df_one.iloc[target_idx, 1].replace(',', ''))
        overseas_fee_match = re.search(r'£(\d{5})', data_df_one.iloc[target_idx, 2].replace(',', ''))
        home_fee = int(home_fee_match.group(1)) if home_fee_match else None
        overseas_fee = int(overseas_fee_match.group(1)) if overseas_fee_match else None

# Add the extracted row to the top of the processed_df DataFrame
additional_row = pd.DataFrame({
    'Department': [f"All {year}"],
    'Program': ["UG Degree"],
    'Home fees': [home_fee],
    'Overseas fees': [overseas_fee]
})

# Append the additional row to the processed_df DataFrame
processed_df = pd.concat([additional_row, processed_df]).reset_index(drop=True)

# Display the final DataFrame including the new row
processed_df



  data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)


  data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
  avg_col3 = pd.Series(col3_values).dropna().mean()


Unnamed: 0,Department,Program,Home fees,Overseas fees
0,All 2018,UG Degree,9250,19152
1,Accounting,PG Taught,25080,25344
2,Economic History,PG Taught,13536,20904
3,Economics,PG Taught,20848,21796
4,European Institute,PG Taught,19936,20904
5,Finance,PG Taught,26599,26797
6,Gender Studies,PG Taught,13536,20904
7,Geography And Environment,PG Taught,13536,20904
8,Health Policy,PG Taught,16392,23448
9,International Development,PG Taught,17480,20904


In [4]:
## 2019

def extract_tables_from_pdf(pdf_path, output_csv_path):
    with pdfplumber.open(pdf_path) as pdf:
        all_tables = []
        # Iterate through each page of the PDF
        for page in pdf.pages:
            # Extract tables from the current page
            tables = page.extract_tables()
            for table in tables:
                all_tables.extend(table)  # Add the rows of the table to all_tables list
       
        # Write all extracted tables to a CSV file
        with open(output_csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in all_tables:
                writer.writerow(row)

# Specify the path to your PDF and the output CSV file
pdf_path = 'Data/2019-Table-of-Fees.pdf'
output_csv_path = 'Data/2019_Fees.csv'

extract_tables_from_pdf(pdf_path, output_csv_path)


# Load the CSV file containing the tuition fees and program data
output_csv_path = 'Data/2019_Fees.csv'  
csv_path = 'Data/Florian_Wirtz_eigentlich_noch_was_mit_der_v2.csv'  

# Read the CSV files
data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
data_df = pd.read_csv(csv_path)

# Combine 'Department' and 'Program' into a new column for unique combinations
data_df['Dept_Program'] = data_df['Department'] + " " + data_df['Program']

# Get unique combinations
unique_dept_programs = data_df['Dept_Program'].unique()

# Dictionary to store original to transformed mappings
original_to_transformed = {}

# Process only Master's programs and adjust department names
processed_dept_programs = []
for combo in unique_dept_programs:
    if isinstance(combo, str):  # Check if the item is a string
        if "PG Taught" in combo:
            original_department = combo.replace(" PG Taught", "")
            transformed_department = original_department
            # Handle specific naming transformations
            if "And" in transformed_department:
                transformed_department = transformed_department.split("And")[0].strip()
            if transformed_department == "International History":
                transformed_department = "History"
            if transformed_department == "European Institute":
                transformed_department = "European"
            if transformed_department == "Law School" or transformed_department == "Law":
                transformed_department = "LLM"
            if transformed_department == "Philosophy Logic":
                transformed_department = "Philosophy"
            if transformed_department == "School of Public Policy":
                transformed_department = "Public Policy"
            if transformed_department == "Gender Studies":
                transformed_department = "Gender"
            if "Psychological" in transformed_department:
                transformed_department = "Psychology"
            
            # Save mapping
            original_to_transformed[transformed_department] = original_department
            processed_dept_programs.append(transformed_department)

# Create DataFrame from processed list
processed_df = pd.DataFrame(processed_dept_programs, columns=['Department'])

# Insert "Program" column with "PG Taught" as the value for all entries
processed_df.insert(1, 'Program', 'PG Taught')

def find_matching_data(dept_program):
    matches = data_df_one[data_df_one.iloc[:, 0].str.contains(dept_program, na=False)]
    if not matches.empty:
        col2_values = []
        col3_values = []
        for _, row in matches.iterrows():
            # Process and clean fee data for averaging
            try:
                cleaned_value_col2 = float(str(row[1]).replace('£', '').replace(',', ''))
                if cleaned_value_col2.is_integer():
                    col2_values.append(int(cleaned_value_col2))
            except ValueError:
                col2_values.append(pd.NA)
            try:
                cleaned_value_col3 = float(str(row[2]).replace('£', '').replace(',', ''))
                if cleaned_value_col3.is_integer():
                    col3_values.append(int(cleaned_value_col3))
            except ValueError:
                col3_values.append(pd.NA)
        
        # Calculate averages while ignoring N/A values
        avg_col2 = pd.Series(col2_values).dropna().mean()
        avg_col3 = pd.Series(col3_values).dropna().mean()
        return pd.Series([avg_col2, avg_col3])
    return pd.Series([pd.NA, pd.NA])

# Apply the function to find and average matching tuition fees
processed_df[['Home fees', 'Overseas fees']] = processed_df['Department'].apply(find_matching_data)

# Remove rows where either column contains NA or NaN values
processed_df.dropna(subset=['Home fees', 'Overseas fees'], inplace=True)

# Ensure all remaining values are integers
processed_df['Home fees'] = processed_df['Home fees'].astype(int)
processed_df['Overseas fees'] = processed_df['Overseas fees'].astype(int)

# Revert department names to original values
processed_df['Department'] = processed_df['Department'].map(original_to_transformed)

# Display the final DataFrame
processed_df

# Find the row index for the specific phrase and extract the fees and year
idx = data_df_one.index[data_df_one.iloc[:, 0].str.contains("Students commencing their degree in" or "New Entrants", na=False)].tolist()
if idx:
    target_idx = idx[0]  # Assume the first occurrence
    if "Undergraduate" in data_df_one.iloc[target_idx - 1, 0]:
        year_match = re.search(r'\b(\d{4})\b', data_df_one.iloc[target_idx, 0])
        year = year_match.group(0) if year_match else "Unknown"
        home_fee_match = re.search(r'£(\d{4})', data_df_one.iloc[target_idx, 1].replace(',', ''))
        overseas_fee_match = re.search(r'£(\d{5})', data_df_one.iloc[target_idx, 2].replace(',', ''))
        home_fee = int(home_fee_match.group(1)) if home_fee_match else None
        overseas_fee = int(overseas_fee_match.group(1)) if overseas_fee_match else None

# Add the extracted row to the top of the processed_df DataFrame
additional_row = pd.DataFrame({
    'Department': [f"All {year}"],
    'Program': ["UG Degree"],
    'Home fees': [home_fee],
    'Overseas fees': [overseas_fee]
})

# Append the additional row to the processed_df DataFrame
processed_df = pd.concat([additional_row, processed_df]).reset_index(drop=True)

# Display the final DataFrame including the new row
processed_df



  data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)


  data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
Skipping line 234: expected 5 fields, saw 8
Skipping line 235: expected 5 fields, saw 8
Skipping line 236: expected 5 fields, saw 8
Skipping line 237: expected 5 fields, saw 8
Skipping line 238: expected 5 fields, saw 8
Skipping line 239: expected 5 fields, saw 8
Skipping line 240: expected 5 fields, saw 9
Skipping line 241: expected 5 fields, saw 9
Skipping line 242: expected 5 fields, saw 9
Skipping line 243: expected 5 fields, saw 9
Skipping line 244: expected 5 fields, saw 9
Skipping line 245: expected 5 fields, saw 9
Skipping line 246: expected 5 fields, saw 9
Skipping line 247: expected 5 fields, saw 9
Skipping line 248: expected 5 fields, saw 9
Skipping line 249: expected 5 fields, saw 9
Skipping line 250: expected 5 fields, saw 9
Skipping line 251: expected 5 fields, saw 9
Skipping line 252: expecte

Unnamed: 0,Department,Program,Home fees,Overseas fees
0,All 2019,UG Degree,9250,19920
1,Accounting,PG Taught,26082,26358
2,Economic History,PG Taught,14088,21744
3,Economics,PG Taught,21917,22870
4,European Institute,PG Taught,19192,21744
5,Finance,PG Taught,27663,27870
6,Gender Studies,PG Taught,14088,21744
7,Geography And Environment,PG Taught,14088,21744
8,Health Policy,PG Taught,17512,24832
9,International Development,PG Taught,14088,21744


In [12]:
## 2017

def extract_tables_from_pdf(pdf_path, output_csv_path):
    with pdfplumber.open(pdf_path) as pdf:
        all_tables = []
        # Iterate through each page of the PDF
        for page in pdf.pages:
            # Extract tables from the current page
            tables = page.extract_tables()
            for table in tables:
                all_tables.extend(table)  # Add the rows of the table to all_tables list
       
        # Write all extracted tables to a CSV file
        with open(output_csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in all_tables:
                writer.writerow(row)

# Specify the path to your PDF and the output CSV file
pdf_path = 'Data/2017-18-Fees-Table.pdf'
output_csv_path = 'Data/2017_Fees.csv'

extract_tables_from_pdf(pdf_path, output_csv_path)


# Load the CSV file containing the tuition fees and program data
output_csv_path = 'Data/2017_Fees.csv'  
csv_path = 'Data/Florian_Wirtz_eigentlich_noch_was_mit_der_v2.csv'  

# Read the CSV files
data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
data_df = pd.read_csv(csv_path)

# Combine 'Department' and 'Program' into a new column for unique combinations
data_df['Dept_Program'] = data_df['Department'] + " " + data_df['Program']

# Get unique combinations
unique_dept_programs = data_df['Dept_Program'].unique()

# Dictionary to store original to transformed mappings
original_to_transformed = {}

# Process only Master's programs and adjust department names
processed_dept_programs = []
for combo in unique_dept_programs:
    if isinstance(combo, str):  # Check if the item is a string
        if "PG Taught" in combo:
            original_department = combo.replace(" PG Taught", "")
            transformed_department = original_department
            # Handle specific naming transformations
            if "And" in transformed_department:
                transformed_department = transformed_department.split("And")[0].strip()
            if transformed_department == "International History":
                transformed_department = "History"
            if transformed_department == "European Institute":
                transformed_department = "European"
            if transformed_department == "Law School" or transformed_department == "Law":
                transformed_department = "LLM"
            if transformed_department == "Philosophy Logic":
                transformed_department = "Philosophy"
            if transformed_department == "School of Public Policy":
                transformed_department = "Public Policy"
            if transformed_department == "Gender Studies":
                transformed_department = "Gender"
            if "Psychological" in transformed_department:
                transformed_department = "Psychology"
            
            # Save mapping
            original_to_transformed[transformed_department] = original_department
            processed_dept_programs.append(transformed_department)

# Create DataFrame from processed list
processed_df = pd.DataFrame(processed_dept_programs, columns=['Department'])

# Insert "Program" column with "PG Taught" as the value for all entries
processed_df.insert(1, 'Program', 'PG Taught')

def find_matching_data(dept_program):
    matches = data_df_one[data_df_one.iloc[:, 0].str.contains(dept_program, na=False)]
    if not matches.empty:
        col2_values = []
        col3_values = []
        for _, row in matches.iterrows():
            # Process and clean fee data for averaging
            try:
                cleaned_value_col2 = float(str(row[1]).replace('£', '').replace(',', ''))
                if cleaned_value_col2.is_integer():
                    col2_values.append(int(cleaned_value_col2))
            except ValueError:
                col2_values.append(pd.NA)
            try:
                cleaned_value_col3 = float(str(row[2]).replace('£', '').replace(',', ''))
                if cleaned_value_col3.is_integer():
                    col3_values.append(int(cleaned_value_col3))
            except ValueError:
                col3_values.append(pd.NA)
        
        # Calculate averages while ignoring N/A values
        avg_col2 = pd.Series(col2_values).dropna().mean()
        avg_col3 = pd.Series(col3_values).dropna().mean()
        return pd.Series([avg_col2, avg_col3])
    return pd.Series([pd.NA, pd.NA])

# Apply the function to find and average matching tuition fees
processed_df[['Home fees', 'Overseas fees']] = processed_df['Department'].apply(find_matching_data)

# Remove rows where either column contains NA or NaN values
processed_df.dropna(subset=['Home fees', 'Overseas fees'], inplace=True)

# Ensure all remaining values are integers
processed_df['Home fees'] = processed_df['Home fees'].astype(int)
processed_df['Overseas fees'] = processed_df['Overseas fees'].astype(int)

# Revert department names to original values
processed_df['Department'] = processed_df['Department'].map(original_to_transformed)

# Display the final DataFrame
processed_df

# Find the row index for the specific phrase and extract the fees and year
idx = data_df_one.index[data_df_one.iloc[:, 0].str.contains("Students commencing their degree in" or "New Entrants", na=False)].tolist()
if idx:
    target_idx = idx[0]  # Assume the first occurrence
    if "Undergraduate" in data_df_one.iloc[target_idx - 1, 0]:
        year_match = re.search(r'\b(\d{4})\b', data_df_one.iloc[target_idx, 0])
        year = year_match.group(0) if year_match else "Unknown"
        home_fee_match = re.search(r'£(\d{4})', data_df_one.iloc[target_idx, 1].replace(',', ''))
        overseas_fee_match = re.search(r'£(\d{5})', data_df_one.iloc[target_idx, 2].replace(',', ''))
        home_fee = int(home_fee_match.group(1)) if home_fee_match else None
        overseas_fee = int(overseas_fee_match.group(1)) if overseas_fee_match else None

# Add the extracted row to the top of the processed_df DataFrame
additional_row = pd.DataFrame({
    'Department': [f"All {year}"],
    'Program': ["UG Degree"],
    'Home fees': [home_fee],
    'Overseas fees': [overseas_fee]
})

# Append the additional row to the processed_df DataFrame
processed_df = pd.concat([additional_row, processed_df]).reset_index(drop=True)

# Display the final DataFrame including the new row
processed_df



  data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)


  data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)


Unnamed: 0,Department,Program,Home fees,Overseas fees
0,All 2019,UG Degree,9250,19920
1,Accounting,PG Taught,24120,24372
2,Economic History,PG Taught,13008,20112
3,Economics,PG Taught,19607,21054
4,European Institute,PG Taught,17587,20112
5,Finance,PG Taught,25578,25767
6,Gender Studies,PG Taught,13008,20112
7,Geography And Environment,PG Taught,13008,20112
8,Government,PG Taught,13208,20312
9,Health Policy,PG Taught,15792,22584


In [6]:
## 2021

def extract_tables_from_pdf(pdf_path, output_csv_path):
    with pdfplumber.open(pdf_path) as pdf:
        all_tables = []
        # Iterate through each page of the PDF
        for page in pdf.pages:
            # Extract tables from the current page
            tables = page.extract_tables()
            for table in tables:
                all_tables.extend(table)  # Add the rows of the table to all_tables list
       
        # Write all extracted tables to a CSV file
        with open(output_csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in all_tables:
                writer.writerow(row)

# Specify the path to your PDF and the output CSV file
pdf_path = 'Data/ToF-3Aug21FinalComb.pdf'
output_csv_path = 'Data/2021_Fees.csv'

extract_tables_from_pdf(pdf_path, output_csv_path)


# Load the CSV file containing the tuition fees and program data
output_csv_path = 'Data/2021_Fees.csv'  
csv_path = 'Data/Florian_Wirtz_eigentlich_noch_was_mit_der_v2.csv'  

# Read the CSV files
data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
data_df = pd.read_csv(csv_path)

# Combine 'Department' and 'Program' into a new column for unique combinations
data_df['Dept_Program'] = data_df['Department'] + " " + data_df['Program']

# Get unique combinations
unique_dept_programs = data_df['Dept_Program'].unique()

# Dictionary to store original to transformed mappings
original_to_transformed = {}

# Process only Master's programs and adjust department names
processed_dept_programs = []
for combo in unique_dept_programs:
    if isinstance(combo, str):  # Check if the item is a string
        if "PG Taught" in combo:
            original_department = combo.replace(" PG Taught", "")
            transformed_department = original_department
            # Handle specific naming transformations
            if "And" in transformed_department:
                transformed_department = transformed_department.split("And")[0].strip()
            if transformed_department == "International History":
                transformed_department = "History"
            if transformed_department == "European Institute":
                transformed_department = "European"
            if transformed_department == "Law School" or transformed_department == "Law":
                transformed_department = "LLM"
            if transformed_department == "Philosophy Logic":
                transformed_department = "Philosophy"
            if transformed_department == "School of Public Policy":
                transformed_department = "Public Policy"
            if transformed_department == "Gender Studies":
                transformed_department = "Gender"
            if "Psychological" in transformed_department:
                transformed_department = "Psychology"
            
            # Save mapping
            original_to_transformed[transformed_department] = original_department
            processed_dept_programs.append(transformed_department)

# Create DataFrame from processed list
processed_df = pd.DataFrame(processed_dept_programs, columns=['Department'])

# Insert "Program" column with "PG Taught" as the value for all entries
processed_df.insert(1, 'Program', 'PG Taught')

def find_matching_data(dept_program):
    matches = data_df_one[data_df_one.iloc[:, 0].str.contains(dept_program, na=False)]
    if not matches.empty:
        col2_values = []
        col3_values = []
        for _, row in matches.iterrows():
            # Process and clean fee data for averaging
            try:
                cleaned_value_col2 = float(str(row[1]).replace('£', '').replace(',', ''))
                if cleaned_value_col2.is_integer():
                    col2_values.append(int(cleaned_value_col2))
            except ValueError:
                col2_values.append(pd.NA)
            try:
                cleaned_value_col3 = float(str(row[2]).replace('£', '').replace(',', ''))
                if cleaned_value_col3.is_integer():
                    col3_values.append(int(cleaned_value_col3))
            except ValueError:
                col3_values.append(pd.NA)
        
        # Calculate averages while ignoring N/A values
        avg_col2 = pd.Series(col2_values).dropna().mean()
        avg_col3 = pd.Series(col3_values).dropna().mean()
        return pd.Series([avg_col2, avg_col3])
    return pd.Series([pd.NA, pd.NA])

# Apply the function to find and average matching tuition fees
processed_df[['Home fees', 'Overseas fees']] = processed_df['Department'].apply(find_matching_data)

# Remove rows where either column contains NA or NaN values
processed_df.dropna(subset=['Home fees', 'Overseas fees'], inplace=True)

# Ensure all remaining values are integers
processed_df['Home fees'] = processed_df['Home fees'].astype(int)
processed_df['Overseas fees'] = processed_df['Overseas fees'].astype(int)

# Revert department names to original values
processed_df['Department'] = processed_df['Department'].map(original_to_transformed)

# Display the final DataFrame
processed_df

# Find the row index for the specific phrase and extract the fees and year
idx = data_df_one.index[data_df_one.iloc[:, 0].str.contains("Students commencing their degree in" or "New Entrants", na=False)].tolist()
if idx:
    target_idx = idx[0]  # Assume the first occurrence
    if "Undergraduate" in data_df_one.iloc[target_idx - 1, 0]:
        year_match = re.search(r'\b(\d{4})\b', data_df_one.iloc[target_idx, 0])
        year = year_match.group(0) if year_match else "Unknown"
        home_fee_match = re.search(r'£(\d{4})', data_df_one.iloc[target_idx, 1].replace(',', ''))
        overseas_fee_match = re.search(r'£(\d{5})', data_df_one.iloc[target_idx, 2].replace(',', ''))
        home_fee = int(home_fee_match.group(1)) if home_fee_match else None
        overseas_fee = int(overseas_fee_match.group(1)) if overseas_fee_match else None

# Add the extracted row to the top of the processed_df DataFrame
additional_row = pd.DataFrame({
    'Department': [f"All {year}"],
    'Program': ["UG Degree"],
    'Home fees': [home_fee],
    'Overseas fees': [overseas_fee]
})

# Append the additional row to the processed_df DataFrame
processed_df = pd.concat([additional_row, processed_df]).reset_index(drop=True)

# Display the final DataFrame including the new row
processed_df



  data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)


  data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)
Skipping line 7: expected 3 fields, saw 5
Skipping line 8: expected 3 fields, saw 5
Skipping line 9: expected 3 fields, saw 5
Skipping line 10: expected 3 fields, saw 5
Skipping line 11: expected 3 fields, saw 5
Skipping line 12: expected 3 fields, saw 5
Skipping line 13: expected 3 fields, saw 5
Skipping line 14: expected 3 fields, saw 5
Skipping line 15: expected 3 fields, saw 5
Skipping line 16: expected 3 fields, saw 5
Skipping line 17: expected 3 fields, saw 5
Skipping line 18: expected 3 fields, saw 5
Skipping line 19: expected 3 fields, saw 5
Skipping line 20: expected 3 fields, saw 5
Skipping line 21: expected 3 fields, saw 5
Skipping line 22: expected 3 fields, saw 5
Skipping line 23: expected 3 fields, saw 5
Skipping line 24: expected 3 fields, saw 5
Skipping line 25: expected 3 fields, saw 5
Skip

Unnamed: 0,Department,Program,Home fees,Overseas fees
0,All 2019,UG Degree,9250,19920


In [11]:
import pandas as pd
import pdfplumber
import csv
import re

def extract_tables_from_pdf(pdf_path, output_csv_path):
    with pdfplumber.open(pdf_path) as pdf:
        all_tables = []
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                # Ensuring that each row in the table has the same number of columns
                if all(len(row) == len(tables[0][0]) for row in table):
                    all_tables.extend(table)
                else:
                    print("Skipped a malformed table")
        with open(output_csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in all_tables:
                writer.writerow(row)

def process_csv_data(output_csv_path):
    try:
        # Attempt to read the CSV with a fixed number of columns to avoid skipping lines
        data_df_one = pd.read_csv(output_csv_path, header=None, names=['Column1', 'Column2', 'Column3'], on_bad_lines='warn')
    except Exception as e:
        print(f"Failed to read CSV: {str(e)}")

    return data_df_one

# Specify paths
pdf_path = 'Data/ToF-3Aug21FinalComb.pdf'
output_csv_path = 'Data/2021_Fees.csv'

# Extract tables from PDF
extract_tables_from_pdf(pdf_path, output_csv_path)

# Process CSV data
data_df_one = process_csv_data(output_csv_path)

# Debugging: Output the first few lines to check if they are read correctly
print(data_df_one.head())


Skipped a malformed table
Skipped a malformed table
                                             Column1    Column2  \
0                        1\nUndergraduate Programmes        NaN   
1                                                NaN  Full time   
2                                                NaN       Home   
3  Undergraduate first degree - New entrants in 2...        NaN   
4  Home students (does not include EU students) c...   £9,250 2   

                    Column3  
0                       NaN  
1                       NaN  
2  Overseas\n(including EU)  
3                       NaN  
4                       NaN  


Skipping line 7: expected 3 fields, saw 5
Skipping line 8: expected 3 fields, saw 5
Skipping line 9: expected 3 fields, saw 5
Skipping line 10: expected 3 fields, saw 5
Skipping line 11: expected 3 fields, saw 5
Skipping line 12: expected 3 fields, saw 5
Skipping line 13: expected 3 fields, saw 5
Skipping line 14: expected 3 fields, saw 5
Skipping line 15: expected 3 fields, saw 5
Skipping line 16: expected 3 fields, saw 5
Skipping line 17: expected 3 fields, saw 5
Skipping line 18: expected 3 fields, saw 5
Skipping line 19: expected 3 fields, saw 5
Skipping line 20: expected 3 fields, saw 5
Skipping line 21: expected 3 fields, saw 5
Skipping line 22: expected 3 fields, saw 5
Skipping line 23: expected 3 fields, saw 5
Skipping line 24: expected 3 fields, saw 5
Skipping line 25: expected 3 fields, saw 5
Skipping line 26: expected 3 fields, saw 5
Skipping line 27: expected 3 fields, saw 5
Skipping line 28: expected 3 fields, saw 5
Skipping line 29: expected 3 fields, saw 5
Skipping line 