In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import os
import time
import re

def setup_driver(download_dir_absolute):
    """Sets up the WebDriver for Chrome."""
    options = webdriver.ChromeOptions()
    options.add_experimental_option("prefs", {
        "download.default_directory": download_dir_absolute,
        "download.prompt_for_download": False,
        "plugins.always_open_pdf_externally": True
    })
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

def extract_year_from_filename(filename):
    # Try to find a four-digit year first
    four_digit_year_match = re.search(r'(\d{4})', filename)
    if four_digit_year_match:
        return four_digit_year_match.group(1)
    # If not found, look for a two-digit year
    two_digit_year_match = re.search(r'(\d{2})', filename)
    if two_digit_year_match:
        return '20' + two_digit_year_match.group(1)
    # Return None if no year pattern is found
    return None

def rename_downloaded_file(download_dir, original_filename, year):
    original_path = os.path.join(download_dir, original_filename)
    new_filename = f"Fees{year}.pdf"
    new_path = os.path.join(download_dir, new_filename)
    os.rename(original_path, new_path)
    print(f"Renamed {original_filename} to {new_filename}")

def download_pdfs_by_class(base_url, class_name, download_dir):
    driver = setup_driver(download_dir)
    driver.get(base_url)
    time.sleep(1)  # Adjust based on your internet speed
    links = driver.find_elements(By.CLASS_NAME, class_name)

    for link in links:
        href = link.get_attribute('href')
        # Skip the unwanted PDF
        if "Fee-approval-cycle-2024.pdf" in href:
            continue
        if href and href.endswith('.pdf'):
            # Extract the original file name
            original_filename = href.split('/')[-1]
            # Extract year from the file name
            year = extract_year_from_filename(original_filename)
            if year:
                # Open the link in a new tab and download the file
                driver.execute_script(f"window.open('{href}');")
                time.sleep(1)  # Adjust for page load
                # The file is automatically downloaded to `download_dir`
                # Need to wait for the download to complete here (omitted for simplicity)
                # Rename the file after ensuring the download has completed
                rename_downloaded_file(download_dir, original_filename, year)
            # Switch back to the main window
            driver.switch_to.window(driver.window_handles[0])
    
    # Close the driver
    driver.quit()

# Base URL and class name remain the same
base_url = 'https://info.lse.ac.uk/staff/divisions/Planning-Division/Table-of-Fees'
class_name = 'sys_21'
download_dir_relative = 'Data/TuitionFees'

# Create the download directory if it doesn't exist
download_dir_absolute = os.path.abspath(download_dir_relative)
if not os.path.exists(download_dir_absolute):
    os.makedirs(download_dir_absolute)

# Call the download function
download_pdfs_by_class(base_url, class_name, download_dir_absolute)


Renamed Table-of-fees-2024-25-20Feb24-Updated-Home-PGR-fee.pdf to Fees2024.pdf
Renamed Table-of-fees-2023-24-7Nov23.pdf to Fees2023.pdf
Renamed Comb2022ToF-Final-19July23.pdf to Fees2022.pdf
Renamed ToF-3Aug21FinalComb.pdf to Fees2021.pdf
Renamed 2020-Table-of-Fees-25Jun20.pdf to Fees2020.pdf
Renamed 2019-Table-of-Fees.pdf to Fees2019.pdf
Renamed 2018-19-Fees-Table.pdf to Fees2018.pdf
Renamed 2017-18-Fees-Table.pdf to Fees2017.pdf


In [5]:
import pdfplumber
import pandas as pd
import csv
import re

def extract_tables_from_pdf_2020(pdf_path_2020, output_csv_path_2020):
    with pdfplumber.open(pdf_path_2020) as pdf_2020:
        all_tables_2020 = []
        for page_2020 in pdf_2020.pages:
            tables_2020 = page_2020.extract_tables()
            for table_2020 in tables_2020:
                all_tables_2020.extend(table_2020)
       
        with open(output_csv_path_2020, 'w', newline='') as csvfile_2020:
            writer_2020 = csv.writer(csvfile_2020)
            for row_2020 in all_tables_2020:
                writer_2020.writerow(row_2020)

pdf_path_2020 = 'Data/2020-Table-of-Fees-25Jun20.pdf'
output_csv_path_2020 = 'Data/2020_Fees.csv'

extract_tables_from_pdf_2020(pdf_path_2020, output_csv_path_2020)

output_csv_file_2020 = 'Data/2020_Fees.csv'  
csv_file_2020 = 'Data/Florian_Wirtz_eigentlich_noch_was_mit_der_v2.csv'  

data_df_one_2020 = pd.read_csv(output_csv_file_2020, error_bad_lines=False, warn_bad_lines=True)
data_df_2020 = pd.read_csv(csv_file_2020)

data_df_one_2020['Dept_Program_2020'] = data_df_2020['Department'] + " " + data_df_2020['Program']

unique_dept_programs_2020 = data_df_one_2020['Dept_Program_2020'].unique()

original_to_transformed_2020 = {}
processed_dept_programs_2020 = []
for combo_2020 in unique_dept_programs_2020:
    if isinstance(combo_2020, str):
        if "PG Taught" in combo_2020:
            original_department_2020 = combo_2020.replace(" PG Taught", "")
            transformed_department_2020 = original_department_2020
            if "And" in transformed_department_2020:
                transformed_department_2020 = transformed_department_2020.split("And")[0].strip()
            if transformed_department_2020 == "International History":
                transformed_department_2020 = "History"
            if transformed_department_2020 == "European Institute":
                transformed_department_2020 = "European"
            if transformed_department_2020 == "Law School" or transformed_department_2020 == "Law":
                transformed_department_2020 = "LLM"
            if transformed_department_2020 == "Philosophy Logic":
                transformed_department_2020 = "Philosophy"
            if transformed_department_2020 == "School of Public Policy":
                transformed_department_2020 = "Public Policy"
            if transformed_department_2020 == "Gender Studies":
                transformed_department_2020 = "Gender"
            if "Psychological" in transformed_department_2020:
                transformed_department_2020 = "Psychology"
            
            original_to_transformed_2020[transformed_department_2020] = original_department_2020
            processed_dept_programs_2020.append(transformed_department_2020)

processed_df_2020 = pd.DataFrame(processed_dept_programs_2020, columns=['Department_2020'])

processed_df_2020.insert(1, 'Program_2020', 'PG Taught')

def find_matching_data_2020(dept_program_2020):
    matches_2020 = data_df_one_2020[data_df_one_2020.iloc[:, 0].str.contains(dept_program_2020, na=False)]
    if not matches_2020.empty:
        col2_values_2020 = []
        col3_values_2020 = []
        for _, row_2020 in matches_2020.iterrows():
            try:
                cleaned_value_col2_2020 = float(str(row_2020[1]).replace('£', '').replace(',', ''))
                if cleaned_value_col2_2020.is_integer():
                    col2_values_2020.append(int(cleaned_value_col2_2020))
            except ValueError:
                col2_values_2020.append(pd.NA)
            try:
                cleaned_value_col3_2020 = float(str(row_2020[2]).replace('£', '').replace(',', ''))
                if cleaned_value_col3_2020.is_integer():
                    col3_values_2020.append(int(cleaned_value_col3_2020))
            except ValueError:
                col3_values_2020.append(pd.NA)
        
        avg_col2_2020 = pd.Series(col2_values_2020).dropna().mean()
        avg_col3_2020 = pd.Series(col3_values_2020).dropna().mean()
        return pd.Series([avg_col2_2020, avg_col3_2020])
    return pd.Series([pd.NA, pd.NA])

processed_df_2020[['Home_fees_2020', 'Overseas_fees_2020']] = processed_df_2020['Department_2020'].apply(find_matching_data_2020)

processed_df_2020.dropna(subset=['Home_fees_2020', 'Overseas_fees_2020'], inplace=True)

processed_df_2020['Home_fees_2020'] = processed_df_2020['Home_fees_2020'].astype(int)
processed_df_2020['Overseas_fees_2020'] = processed_df_2020['Overseas_fees_2020'].astype(int)

processed_df_2020['Department_2020'] = processed_df_2020['Department_2020'].map(original_to_transformed_2020)

processed_df_2020

idx_2020 = data_df_one_2020.index[data_df_one_2020.iloc[:, 0].str.contains("Students commencing their degree in" or "New Entrants", na=False)].tolist()
if idx_2020:
    target_idx_2020 = idx_2020[0]
    if "Undergraduate" in data_df_one_2020.iloc[target_idx_2020 - 1, 0]:
        year_match_2020 = re.search(r'\b(\d{4})\b', data_df_one_2020.iloc[target_idx_2020, 0])
        year_2020 = year_match_2020.group(0) if year_match_2020 else "Unknown"
        home_fee_match_2020 = re.search(r'£(\d{4})', data_df_one_2020.iloc[target_idx_2020, 1].replace(',', ''))
        overseas_fee_match_2020 = re.search(r'£(\d{5})', data_df_one_2020.iloc[target_idx_2020, 2].replace(',', ''))
        home_fee_2020 = int(home_fee_match_2020.group(1)) if home_fee_match_2020 else None
        overseas_fee_2020 = int(overseas_fee_match_2020.group(1)) if overseas_fee_match_2020 else None

additional_row_2020 = pd.DataFrame({
    'Department_2020': ["All"],
    'Program_2020': ["UG Degree"],
    'Home_fees_2020': [home_fee_2020],
    'Overseas_fees_2020': [overseas_fee_2020]
})

processed_df_2020 = pd.concat([additional_row_2020, processed_df_2020]).reset_index(drop=True)

processed_df_2020




  data_df_one_2020 = pd.read_csv(output_csv_file_2020, error_bad_lines=False, warn_bad_lines=True)


  data_df_one_2020 = pd.read_csv(output_csv_file_2020, error_bad_lines=False, warn_bad_lines=True)
Skipping line 244: expected 5 fields, saw 8
Skipping line 245: expected 5 fields, saw 8
Skipping line 246: expected 5 fields, saw 8
Skipping line 247: expected 5 fields, saw 8
Skipping line 248: expected 5 fields, saw 8
Skipping line 249: expected 5 fields, saw 8
Skipping line 250: expected 5 fields, saw 9
Skipping line 251: expected 5 fields, saw 9
Skipping line 252: expected 5 fields, saw 9
Skipping line 253: expected 5 fields, saw 9
Skipping line 254: expected 5 fields, saw 9
Skipping line 255: expected 5 fields, saw 9
Skipping line 256: expected 5 fields, saw 9
Skipping line 257: expected 5 fields, saw 9
Skipping line 258: expected 5 fields, saw 9
Skipping line 259: expected 5 fields, saw 9
Skipping line 260: expected 5 fields, saw 9
Skipping line 261: expected 5 fields, saw 9
Skippi

Unnamed: 0,Department_2020,Program_2020,Home_fees_2020,Overseas_fees_2020
0,All,UG Degree,9250,21570
1,Accounting,PG Taught,28080,28464
2,Economic History,PG Taught,14640,22608
3,Economics,PG Taught,23179,24134
4,European Institute,PG Taught,19952,22608
5,Finance,PG Taught,28969,29185
6,Gender Studies,PG Taught,14640,22608
7,Geography And Environment,PG Taught,14640,22608
8,Health Policy,PG Taught,18160,25768
9,International Development,PG Taught,14640,22608


In [6]:
import pdfplumber
import pandas as pd
import csv
import re

def extract_tables_from_pdf_2018(pdf_path_2018, output_csv_path_2018):
    with pdfplumber.open(pdf_path_2018) as pdf_2018:
        all_tables_2018 = []
        for page_2018 in pdf_2018.pages:
            tables_2018 = page_2018.extract_tables()
            for table_2018 in tables_2018:
                all_tables_2018.extend(table_2018)
       
        with open(output_csv_path_2018, 'w', newline='') as csvfile_2018:
            writer_2018 = csv.writer(csvfile_2018)
            for row_2018 in all_tables_2018:
                writer_2018.writerow(row_2018)

pdf_path_2018 = 'Data/2018-19-Fees-Table.pdf'
output_csv_path_2018 = 'Data/2018_Fees.csv'

extract_tables_from_pdf_2018(pdf_path_2018, output_csv_path_2018)

output_csv_file_2018 = 'Data/2018_Fees.csv'  
csv_file_2018 = 'Data/Florian_Wirtz_eigentlich_noch_was_mit_der_v2.csv'  

data_df_one_2018 = pd.read_csv(output_csv_file_2018, error_bad_lines=False, warn_bad_lines=True)
data_df_2018 = pd.read_csv(csv_file_2018)

data_df_one_2018['Dept_Program_2018'] = data_df_2018['Department'] + " " + data_df_2018['Program']

unique_dept_programs_2018 = data_df_one_2018['Dept_Program_2018'].unique()

original_to_transformed_2018 = {}
processed_dept_programs_2018 = []
for combo_2018 in unique_dept_programs_2018:
    if isinstance(combo_2018, str):
        if "PG Taught" in combo_2018:
            original_department_2018 = combo_2018.replace(" PG Taught", "")
            transformed_department_2018 = original_department_2018
            if "And" in transformed_department_2018:
                transformed_department_2018 = transformed_department_2018.split("And")[0].strip()
            if transformed_department_2018 == "International History":
                transformed_department_2018 = "History"
            if transformed_department_2018 == "European Institute":
                transformed_department_2018 = "European"
            if transformed_department_2018 == "Law School" or transformed_department_2018 == "Law":
                transformed_department_2018 = "LLM"
            if transformed_department_2018 == "Philosophy Logic":
                transformed_department_2018 = "Philosophy"
            if transformed_department_2018 == "School of Public Policy":
                transformed_department_2018 = "Public Policy"
            if transformed_department_2018 == "Gender Studies":
                transformed_department_2018 = "Gender"
            if "Psychological" in transformed_department_2018:
                transformed_department_2018 = "Psychology"
            
            original_to_transformed_2018[transformed_department_2018] = original_department_2018
            processed_dept_programs_2018.append(transformed_department_2018)

processed_df_2018 = pd.DataFrame(processed_dept_programs_2018, columns=['Department_2018'])

processed_df_2018.insert(1, 'Program_2018', 'PG Taught')

def find_matching_data_2018(dept_program_2018):
    matches_2018 = data_df_one_2018[data_df_one_2018.iloc[:, 0].str.contains(dept_program_2018, na=False)]
    if not matches_2018.empty:
        col2_values_2018 = []
        col3_values_2018 = []
        for _, row_2018 in matches_2018.iterrows():
            try:
                cleaned_value_col2_2018 = float(str(row_2018[1]).replace('£', '').replace(',', ''))
                if cleaned_value_col2_2018.is_integer():
                    col2_values_2018.append(int(cleaned_value_col2_2018))
            except ValueError:
                col2_values_2018.append(pd.NA)
            try:
                cleaned_value_col3_2018 = float(str(row_2018[2]).replace('£', '').replace(',', ''))
                if cleaned_value_col3_2018.is_integer():
                    col3_values_2018.append(int(cleaned_value_col3_2018))
            except ValueError:
                col3_values_2018.append(pd.NA)
        
        avg_col2_2018 = pd.Series(col2_values_2018).dropna().mean()
        avg_col3_2018 = pd.Series(col3_values_2018).dropna().mean()
        return pd.Series([avg_col2_2018, avg_col3_2018])
    return pd.Series([pd.NA, pd.NA])

processed_df_2018[['Home_fees_2018', 'Overseas_fees_2018']] = processed_df_2018['Department_2018'].apply(find_matching_data_2018)

processed_df_2018.dropna(subset=['Home_fees_2018', 'Overseas_fees_2018'], inplace=True)

processed_df_2018['Home_fees_2018'] = processed_df_2018['Home_fees_2018'].astype(int)
processed_df_2018['Overseas_fees_2018'] = processed_df_2018['Overseas_fees_2018'].astype(int)

processed_df_2018['Department_2018'] = processed_df_2018['Department_2018'].map(original_to_transformed_2018)

processed_df_2018

idx_2018 = data_df_one_2018.index[data_df_one_2018.iloc[:, 0].str.contains("Students commencing their degree in" or "New Entrants", na=False)].tolist()
if idx_2018:
    target_idx_2018 = idx_2018[0]
    if "Undergraduate" in data_df_one_2018.iloc[target_idx_2018 - 1, 0]:
        year_match_2018 = re.search(r'\b(\d{4})\b', data_df_one_2018.iloc[target_idx_2018, 0])
        year_2018 = year_match_2018.group(0) if year_match_2018 else "Unknown"
        home_fee_match_2018 = re.search(r'£(\d{4})', data_df_one_2018.iloc[target_idx_2018, 1].replace(',', ''))
        overseas_fee_match_2018 = re.search(r'£(\d{5})', data_df_one_2018.iloc[target_idx_2018, 2].replace(',', ''))
        home_fee_2018 = int(home_fee_match_2018.group(1)) if home_fee_match_2018 else None
        overseas_fee_2018 = int(overseas_fee_match_2018.group(1)) if overseas_fee_match_2018 else None

additional_row_2018 = pd.DataFrame({
    'Department_2018': ["All"],
    'Program_2018': ["UG Degree"],
    'Home_fees_2018': [home_fee_2018],
    'Overseas_fees_2018': [overseas_fee_2018]
})

processed_df_2018 = pd.concat([additional_row_2018, processed_df_2018]).reset_index(drop=True)

processed_df_2018




  data_df_one_2018 = pd.read_csv(output_csv_file_2018, error_bad_lines=False, warn_bad_lines=True)


  data_df_one_2018 = pd.read_csv(output_csv_file_2018, error_bad_lines=False, warn_bad_lines=True)
  avg_col3_2018 = pd.Series(col3_values_2018).dropna().mean()


Unnamed: 0,Department_2018,Program_2018,Home_fees_2018,Overseas_fees_2018
0,All,UG Degree,9250,19152
1,Accounting,PG Taught,25080,25344
2,Economic History,PG Taught,13536,20904
3,Economics,PG Taught,20848,21796
4,European Institute,PG Taught,19936,20904
5,Finance,PG Taught,26599,26797
6,Gender Studies,PG Taught,13536,20904
7,Geography And Environment,PG Taught,13536,20904
8,Health Policy,PG Taught,16392,23448
9,International Development,PG Taught,17480,20904


In [7]:
import pdfplumber
import pandas as pd
import csv
import re

def extract_tables_from_pdf_2019(pdf_path_2019, output_csv_path_2019):
    with pdfplumber.open(pdf_path_2019) as pdf_2019:
        all_tables_2019 = []
        for page_2019 in pdf_2019.pages:
            tables_2019 = page_2019.extract_tables()
            for table_2019 in tables_2019:
                all_tables_2019.extend(table_2019)
       
        with open(output_csv_path_2019, 'w', newline='') as csvfile_2019:
            writer_2019 = csv.writer(csvfile_2019)
            for row_2019 in all_tables_2019:
                writer_2019.writerow(row_2019)

pdf_path_2019 = 'Data/2019-Table-of-Fees.pdf'
output_csv_path_2019 = 'Data/2019_Fees.csv'

extract_tables_from_pdf_2019(pdf_path_2019, output_csv_path_2019)

output_csv_file_2019 = 'Data/2019_Fees.csv'  
csv_file_2019 = 'Data/Florian_Wirtz_eigentlich_noch_was_mit_der_v2.csv'  

data_df_one_2019 = pd.read_csv(output_csv_file_2019, error_bad_lines=False, warn_bad_lines=True)
data_df_2019 = pd.read_csv(csv_file_2019)

data_df_one_2019['Dept_Program_2019'] = data_df_2019['Department'] + " " + data_df_2019['Program']

unique_dept_programs_2019 = data_df_one_2019['Dept_Program_2019'].unique()

original_to_transformed_2019 = {}
processed_dept_programs_2019 = []
for combo_2019 in unique_dept_programs_2019:
    if isinstance(combo_2019, str):
        if "PG Taught" in combo_2019:
            original_department_2019 = combo_2019.replace(" PG Taught", "")
            transformed_department_2019 = original_department_2019
            if "And" in transformed_department_2019:
                transformed_department_2019 = transformed_department_2019.split("And")[0].strip()
            if transformed_department_2019 == "International History":
                transformed_department_2019 = "History"
            if transformed_department_2019 == "European Institute":
                transformed_department_2019 = "European"
            if transformed_department_2019 == "Law School" or transformed_department_2019 == "Law":
                transformed_department_2019 = "LLM"
            if transformed_department_2019 == "Philosophy Logic":
                transformed_department_2019 = "Philosophy"
            if transformed_department_2019 == "School of Public Policy":
                transformed_department_2019 = "Public Policy"
            if transformed_department_2019 == "Gender Studies":
                transformed_department_2019 = "Gender"
            if "Psychological" in transformed_department_2019:
                transformed_department_2019 = "Psychology"
            
            original_to_transformed_2019[transformed_department_2019] = original_department_2019
            processed_dept_programs_2019.append(transformed_department_2019)

processed_df_2019 = pd.DataFrame(processed_dept_programs_2019, columns=['Department_2019'])

processed_df_2019.insert(1, 'Program_2019', 'PG Taught')

def find_matching_data_2019(dept_program_2019):
    matches_2019 = data_df_one_2019[data_df_one_2019.iloc[:, 0].str.contains(dept_program_2019, na=False)]
    if not matches_2019.empty:
        col2_values_2019 = []
        col3_values_2019 = []
        for _, row_2019 in matches_2019.iterrows():
            try:
                cleaned_value_col2_2019 = float(str(row_2019[1]).replace('£', '').replace(',', ''))
                if cleaned_value_col2_2019.is_integer():
                    col2_values_2019.append(int(cleaned_value_col2_2019))
            except ValueError:
                col2_values_2019.append(pd.NA)
            try:
                cleaned_value_col3_2019 = float(str(row_2019[2]).replace('£', '').replace(',', ''))
                if cleaned_value_col3_2019.is_integer():
                    col3_values_2019.append(int(cleaned_value_col3_2019))
            except ValueError:
                col3_values_2019.append(pd.NA)
        
        avg_col2_2019 = pd.Series(col2_values_2019).dropna().mean()
        avg_col3_2019 = pd.Series(col3_values_2019).dropna().mean()
        return pd.Series([avg_col2_2019, avg_col3_2019])
    return pd.Series([pd.NA, pd.NA])

processed_df_2019[['Home_fees_2019', 'Overseas_fees_2019']] = processed_df_2019['Department_2019'].apply(find_matching_data_2019)

processed_df_2019.dropna(subset=['Home_fees_2019', 'Overseas_fees_2019'], inplace=True)

processed_df_2019['Home_fees_2019'] = processed_df_2019['Home_fees_2019'].astype(int)
processed_df_2019['Overseas_fees_2019'] = processed_df_2019['Overseas_fees_2019'].astype(int)

processed_df_2019['Department_2019'] = processed_df_2019['Department_2019'].map(original_to_transformed_2019)

processed_df_2019

idx_2019 = data_df_one_2019.index[data_df_one_2019.iloc[:, 0].str.contains("Students commencing their degree in" or "New Entrants", na=False)].tolist()
if idx_2019:
    target_idx_2019 = idx_2019[0]
    if "Undergraduate" in data_df_one_2019.iloc[target_idx_2019 - 1, 0]:
        year_match_2019 = re.search(r'\b(\d{4})\b', data_df_one_2019.iloc[target_idx_2019, 0])
        year_2019 = year_match_2019.group(0) if year_match_2019 else "Unknown"
        home_fee_match_2019 = re.search(r'£(\d{4})', data_df_one_2019.iloc[target_idx_2019, 1].replace(',', ''))
        overseas_fee_match_2019 = re.search(r'£(\d{5})', data_df_one_2019.iloc[target_idx_2019, 2].replace(',', ''))
        home_fee_2019 = int(home_fee_match_2019.group(1)) if home_fee_match_2019 else None
        overseas_fee_2019 = int(overseas_fee_match_2019.group(1)) if overseas_fee_match_2019 else None

additional_row_2019 = pd.DataFrame({
    'Department_2019': ["All"],
    'Program_2019': ["UG Degree"],
    'Home_fees_2019': [home_fee_2019],
    'Overseas_fees_2019': [overseas_fee_2019]
})

processed_df_2019 = pd.concat([additional_row_2019, processed_df_2019]).reset_index(drop=True)

processed_df_2019




  data_df_one_2019 = pd.read_csv(output_csv_file_2019, error_bad_lines=False, warn_bad_lines=True)


  data_df_one_2019 = pd.read_csv(output_csv_file_2019, error_bad_lines=False, warn_bad_lines=True)
Skipping line 234: expected 5 fields, saw 8
Skipping line 235: expected 5 fields, saw 8
Skipping line 236: expected 5 fields, saw 8
Skipping line 237: expected 5 fields, saw 8
Skipping line 238: expected 5 fields, saw 8
Skipping line 239: expected 5 fields, saw 8
Skipping line 240: expected 5 fields, saw 9
Skipping line 241: expected 5 fields, saw 9
Skipping line 242: expected 5 fields, saw 9
Skipping line 243: expected 5 fields, saw 9
Skipping line 244: expected 5 fields, saw 9
Skipping line 245: expected 5 fields, saw 9
Skipping line 246: expected 5 fields, saw 9
Skipping line 247: expected 5 fields, saw 9
Skipping line 248: expected 5 fields, saw 9
Skipping line 249: expected 5 fields, saw 9
Skipping line 250: expected 5 fields, saw 9
Skipping line 251: expected 5 fields, saw 9
Skippi

Unnamed: 0,Department_2019,Program_2019,Home_fees_2019,Overseas_fees_2019
0,All,UG Degree,9250,19920
1,Accounting,PG Taught,26082,26358
2,Economic History,PG Taught,14088,21744
3,Economics,PG Taught,21917,22870
4,European Institute,PG Taught,19192,21744
5,Finance,PG Taught,27663,27870
6,Gender Studies,PG Taught,14088,21744
7,Geography And Environment,PG Taught,14088,21744
8,Health Policy,PG Taught,17512,24832
9,International Development,PG Taught,14088,21744


In [9]:
def extract_tables_from_pdf_2017(pdf_path_2017, output_csv_path_2017):
    with pdfplumber.open(pdf_path_2017) as pdf_2017:
        all_tables_2017 = []
        # Iterate through each page of the PDF
        for page_2017 in pdf_2017.pages:
            # Extract tables from the current page
            tables_2017 = page_2017.extract_tables()
            for table_2017 in tables_2017:
                all_tables_2017.extend(table_2017)  # Add the rows of the table to all_tables list
       
        # Write all extracted tables to a CSV file
        with open(output_csv_path_2017, 'w', newline='') as csvfile_2017:
            writer_2017 = csv.writer(csvfile_2017)
            for row_2017 in all_tables_2017:
                writer_2017.writerow(row_2017)

# Specify the path to your PDF and the output CSV file
pdf_path_2017 = 'Data/2017-18-Fees-Table.pdf'
output_csv_path_2017 = 'Data/2017_Fees.csv'

extract_tables_from_pdf_2017(pdf_path_2017, output_csv_path_2017)


# Load the CSV file containing the tuition fees and program data
output_csv_path_2017 = 'Data/2017_Fees.csv'  
csv_path_2017 = 'Data/Florian_Wirtz_eigentlich_noch_was_mit_der_v2.csv'  

# Read the CSV files
data_df_one_2017 = pd.read_csv(output_csv_path_2017, error_bad_lines=False, warn_bad_lines=True)
data_df_2017 = pd.read_csv(csv_path_2017)

# Combine 'Department' and 'Program' into a new column for unique combinations
data_df_2017['Dept_Program_2017'] = data_df_2017['Department'] + " " + data_df_2017['Program']

# Get unique combinations
unique_dept_programs_2017 = data_df_2017['Dept_Program_2017'].unique()

# Dictionary to store original to transformed mappings
original_to_transformed_2017 = {}

# Process only Master's programs and adjust department names
processed_dept_programs_2017 = []
for combo_2017 in unique_dept_programs_2017:
    if isinstance(combo_2017, str):  # Check if the item is a string
        if "PG Taught" in combo_2017:
            original_department_2017 = combo_2017.replace(" PG Taught", "")
            transformed_department_2017 = original_department_2017
            # Handle specific naming transformations
            if "And" in transformed_department_2017:
                transformed_department_2017 = transformed_department_2017.split("And")[0].strip()
            if transformed_department_2017 == "International History":
                transformed_department_2017 = "History"
            if transformed_department_2017 == "European Institute":
                transformed_department_2017 = "European"
            if transformed_department_2017 == "Law School" or transformed_department_2017 == "Law":
                transformed_department_2017 = "LLM"
            if transformed_department_2017 == "Philosophy Logic":
                transformed_department_2017 = "Philosophy"
            if transformed_department_2017 == "School of Public Policy":
                transformed_department_2017 = "Public Policy"
            if transformed_department_2017 == "Gender Studies":
                transformed_department_2017 = "Gender"
            if "Psychological" in transformed_department_2017:
                transformed_department_2017 = "Psychology"
            
            # Save mapping
            original_to_transformed_2017[transformed_department_2017] = original_department_2017
            processed_dept_programs_2017.append(transformed_department_2017)

# Create DataFrame from processed list
processed_df_2017 = pd.DataFrame(processed_dept_programs_2017, columns=['Department'])

# Insert "Program" column with "PG Taught" as the value for all entries
processed_df_2017.insert(1, 'Program', 'PG Taught')

def find_matching_data_2017(dept_program_2017):
    matches_2017 = data_df_one_2017[data_df_one_2017.iloc[:, 0].str.contains(dept_program_2017, na=False)]
    if not matches_2017.empty:
        col2_values_2017 = []
        col3_values_2017 = []
        for _, row_2017 in matches_2017.iterrows():
            # Process and clean fee data for averaging
            try:
                cleaned_value_col2_2017 = float(str(row_2017[1]).replace('£', '').replace(',', ''))
                if cleaned_value_col2_2017.is_integer():
                    col2_values_2017.append(int(cleaned_value_col2_2017))
            except ValueError:
                col2_values_2017.append(pd.NA)
            try:
                cleaned_value_col3_2017 = float(str(row_2017[2]).replace('£', '').replace(',', ''))
                if cleaned_value_col3_2017.is_integer():
                    col3_values_2017.append(int(cleaned_value_col3_2017))
            except ValueError:
                col3_values_2017.append(pd.NA)
        
        # Calculate averages while ignoring N/A values
        avg_col2_2017 = pd.Series(col2_values_2017).dropna().mean()
        avg_col3_2017 = pd.Series(col3_values_2017).dropna().mean()
        return pd.Series([avg_col2_2017, avg_col3_2017])
    return pd.Series([pd.NA, pd.NA])

# Apply the function to find and average matching tuition fees
processed_df_2017[['Home fees', 'Overseas fees']] = processed_df_2017['Department'].apply(find_matching_data_2017)

# Remove rows where either column contains NA or NaN values
processed_df_2017.dropna(subset=['Home fees', 'Overseas fees'], inplace=True)

# Ensure all remaining values are integers
processed_df_2017['Home fees'] = processed_df_2017['Home fees'].astype(int)
processed_df_2017['Overseas fees'] = processed_df_2017['Overseas fees'].astype(int)

# Revert department names to original values
processed_df_2017['Department'] = processed_df_2017['Department'].map(original_to_transformed_2017)

# Display the final DataFrame
processed_df_2017

# Find the row index for the specific phrase and extract the fees and year
idx_2017 = data_df_one_2017.index[data_df_one_2017.iloc[:, 0].str.contains("Students commencing their degree in" or "New Entrants", na=False)].tolist()
if idx_2017:
    target_idx_2017 = idx_2017[0]  # Assume the first occurrence
    if "Undergraduate" in data_df_one_2017.iloc[target_idx_2017 - 1, 0]:
        year_match_2017 = re.search(r'\b(\d{4})\b', data_df_one_2017.iloc[target_idx_2017, 0])
        year_2017 = year_match_2017.group(0) if year_match_2017 else "Unknown"
        home_fee_match_2017 = re.search(r'£(\d{4})', data_df_one_2017.iloc[target_idx_2017, 1].replace(',', ''))
        overseas_fee_match_2017 = re.search(r'£(\d{5})', data_df_one_2017.iloc[target_idx_2017, 2].replace(',', ''))
        home_fee_2017 = int(home_fee_match_2017.group(1)) if home_fee_match_2017 else None
        overseas_fee_2017 = int(overseas_fee_match_2017.group(1)) if overseas_fee_match_2017 else None

# Add the extracted row to the top of the processed_df DataFrame
additional_row_2017 = pd.DataFrame({
    'Department': ["All"],
    'Program': ["UG Degree"],
    'Home fees': [home_fee_2017],
    'Overseas fees': [overseas_fee_2017]
})

# Append the additional row to the processed_df DataFrame
processed_df_2017 = pd.concat([additional_row_2017, processed_df_2017]).reset_index(drop=True)

# Display the final DataFrame including the new row
processed_df_2017




  data_df_one_2017 = pd.read_csv(output_csv_path_2017, error_bad_lines=False, warn_bad_lines=True)


  data_df_one_2017 = pd.read_csv(output_csv_path_2017, error_bad_lines=False, warn_bad_lines=True)


NameError: name 'home_fee_2017' is not defined

In [None]:
# Merge all processed_df dataframes horizontally
merged_df = pd.concat([processed_df_2017, processed_df_2018, processed_df_2019, processed_df_2020], axis=1)

# Filter columns to keep only the first column and those without "Program" in their names
columns_to_keep = [column for column in merged_df.columns if "Program" not in column or column == merged_df.columns[0]]

# Select the filtered columns
filtered_df = merged_df[columns_to_keep]

# Filter columns to keep only the first column and those without "Department" in their names
columns_to_keep_dept = [column for column in filtered_df.columns if "Department" not in column or column == filtered_df.columns[0]]

# Select the filtered columns
filtered_df_dept = filtered_df[columns_to_keep_dept]

# Rename columns for Home and Overseas Fees with corresponding years
filtered_df_dept.columns = [filtered_df_dept.columns[0].replace('_', ' ')] + \
                           [f"{col.replace('_', ' ')} 2017" if idx == 0 else f"{col.replace('_', ' ')} 2017" if idx == 1 else col.replace('_', ' ') for idx, col in enumerate(filtered_df_dept.columns[1:])]

# Replace entry of the first department "All 2017" with simply "All"
filtered_df_dept.iloc[0, 0] = "All UG Degree"

# Display the renamed dataframe
filtered_df_dept


In [None]:
merged_df = pd.concat([processed_df_2017, processed_df_2018, processed_df_2019, processed_df_2020], axis=1)
merged_df