In [34]:
import pandas as pd
import pdfplumber
import csv

def find_next_containing_row(data_frame, start_index, column_index, text):
    for idx in range(start_index, len(data_frame)):
        cell_content = str(data_frame.iloc[idx, column_index])
        if pd.notna(cell_content) and text in cell_content:
            return idx
    return None

def find_first_non_empty_cell_and_extract_fee(data_frame, start_index, column_index):
    for idx in range(start_index, min(start_index + 10, len(data_frame))):
        cell_content = data_frame.iloc[idx, column_index]
        if pd.notna(cell_content) and cell_content != '':
            first_word = cell_content.split()[0]
            fee = ''.join(filter(str.isdigit, first_word))
            fee = fee.replace(',', '')
            return fee
    print("No non-empty cell found within the specified range.")
    return None

def clean_course_name(course_name):
    return str(course_name).replace('MSc in', 'MSc').strip()

def find_department(course_name, department_df):
    match = department_df[department_df['Course Name'].str.strip().eq(course_name.strip())]
    if not match.empty:
        return match['Department'].iloc[0]
    if "LLB" in course_name:
        return "Law School"  # Assign LLB courses to Law School
    return "Department not found"

def integrate_and_process_data(year):
    cleaned_fees_output_path = f'AllYears/CleanedFees{year}.csv'
    grouped_output_path = f'AllYears/Grouped_Fees{year}_by_Department_and_Level.csv'

    cleaned_fees_df = pd.read_csv(cleaned_fees_output_path)
    department_info_df = pd.read_csv('data/cleaned_output2804.csv', encoding='ISO-8859-1')

    cleaned_fees_df['Department'] = cleaned_fees_df['Course'].apply(lambda x: find_department(x, department_info_df))
    cleaned_fees_df['Home Fee'] = pd.to_numeric(cleaned_fees_df['Home Fee'], errors='coerce')
    cleaned_fees_df['Overseas Fee'] = pd.to_numeric(cleaned_fees_df['Overseas Fee'], errors='coerce')

    grouped_data = cleaned_fees_df.groupby(['Department', 'Level']).agg({
        'Home Fee': 'mean',
        'Overseas Fee': 'mean'
    }).reset_index()

    grouped_data.to_csv(grouped_output_path, index=False, encoding='utf-8-sig')
    print(f"Grouped data for {year} saved to:", grouped_output_path)

def extract_tables_from_pdf(pdf_path, output_csv_path):
    with pdfplumber.open(pdf_path) as pdf:
        all_tables = []
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                for row in table:
                    while len(row) < 10:
                        row.append('')
                    processed_row = row[:3] + [''] * 7
                    all_tables.append(processed_row)
        with open(output_csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in all_tables:
                writer.writerow(row)

for year in range(2018, 2025):
    pdf_path = f'Data/TuitionFees/Fees{year}.pdf'
    output_csv_path = f'AllYears/Fees{year}.csv'
    updated_all_courses_output_path = f'AllYears/CleanedFees{year}.csv'

    extract_tables_from_pdf(pdf_path, output_csv_path)
    fees_df = pd.read_csv(output_csv_path, encoding='latin-1')

    home_fee_index = find_next_containing_row(fees_df, 0, 1, "Home")
    undergrad_home_fee = find_first_non_empty_cell_and_extract_fee(fees_df, home_fee_index + 1, 1)
    overseas_fee_index = find_next_containing_row(fees_df, 0, 2, "Overseas")
    undergrad_overseas_fee = find_first_non_empty_cell_and_extract_fee(fees_df, overseas_fee_index + 1, 2)

    has_undergraduate = False
    all_courses_fees_detailed = []
    for index, row in fees_df.iterrows():
        if isinstance(row.iloc[0], str) and any(x in row.iloc[0] for x in ['BSc','LLB', 'BA', 'MSc']):
            course_name = clean_course_name(row.iloc[0].split(' - ')[0])
            level = "Postgraduate" if "MSc" in course_name else "Undergraduate"
            if level == "Undergraduate":
                has_undergraduate = True
            home_fee = row.iloc[1] if not pd.isna(row.iloc[1]) else (undergrad_home_fee if level == "Undergraduate" else '')
            overseas_fee = row.iloc[2] if not pd.isna(row.iloc[2]) else ''
            home_fee = ''.join(filter(str.isdigit, home_fee))
            overseas_fee = ''.join(filter(str.isdigit, overseas_fee))
            all_courses_fees_detailed.append([course_name, home_fee, overseas_fee, level])

    if not has_undergraduate:
        all_courses_fees_detailed.append(["Undergraduate Courses", undergrad_home_fee, undergrad_overseas_fee, "Undergraduate"])

    all_courses_df_detailed = pd.DataFrame(all_courses_fees_detailed, columns=['Course', 'Home Fee', 'Overseas Fee', 'Level'])
    all_courses_df_detailed.to_csv(updated_all_courses_output_path, index=False, encoding='utf-8-sig')

    print(f"Processed data for {year}.")
    integrate_and_process_data(year)


Processed data for 2018.
Grouped data for 2018 saved to: AllYears/Grouped_Fees2018_by_Department_and_Level.csv
Processed data for 2019.
Grouped data for 2019 saved to: AllYears/Grouped_Fees2019_by_Department_and_Level.csv
Processed data for 2020.
Grouped data for 2020 saved to: AllYears/Grouped_Fees2020_by_Department_and_Level.csv
Processed data for 2021.
Grouped data for 2021 saved to: AllYears/Grouped_Fees2021_by_Department_and_Level.csv
Processed data for 2022.
Grouped data for 2022 saved to: AllYears/Grouped_Fees2022_by_Department_and_Level.csv
Processed data for 2023.
Grouped data for 2023 saved to: AllYears/Grouped_Fees2023_by_Department_and_Level.csv
Processed data for 2024.
Grouped data for 2024 saved to: AllYears/Grouped_Fees2024_by_Department_and_Level.csv


In [35]:
import pandas as pd

# Load the CSV files
for year in range(2019,2025):
    globals()[f'fees_{year}'] = pd.read_csv(f'AllYears/Grouped_Fees{year}_by_Department_and_Level.csv')

# Initialize final merged DataFrame
final_merged_fees = fees_2024.rename(columns={
    'Home Fee': 'Home Fee_2024',
    'Overseas Fee': 'Overseas Fee_2024'
})

# Merge and rename for all years
for year in reversed(range(2019, 2024)):
    fees = eval(f'fees_{year}').rename(columns={
        'Home Fee': f'Home Fee_{year}',
        'Overseas Fee': f'Overseas Fee_{year}'
    })
    final_merged_fees = pd.merge(final_merged_fees, fees[['Department', 'Level', f'Home Fee_{year}', f'Overseas Fee_{year}']], 
                                 on=['Department', 'Level'], how='outer')

# Fill missing values
# Assuming universal fees are found in the 'fees_2022' DataFrame
universal_home_fee = fees_2022.loc[
    (fees_2022['Department'] == 'Department not found') & (fees_2022['Level'] == 'Undergraduate'),
    'Home Fee'
].values[0]
universal_overseas_fee = fees_2022.loc[
    (fees_2022['Department'] == 'Department not found') & (fees_2022['Level'] == 'Undergraduate'),
    'Overseas Fee'
].values[0]

# Fill missing undergraduate fees
for year in range(2019, 2025):
    final_merged_fees[f'Home Fee_{year}'].fillna(universal_home_fee, inplace=True)
    final_merged_fees[f'Overseas Fee_{year}'].fillna(universal_overseas_fee, inplace=True)

# Format fees to show only two decimal places
for year in range(2019, 2025):
    final_merged_fees[f'Home Fee_{year}'] = final_merged_fees[f'Home Fee_{year}'].astype(float).round(2)
    final_merged_fees[f'Overseas Fee_{year}'] = final_merged_fees[f'Overseas Fee_{year}'].astype(float).round(2)

# Display the final table
display(final_merged_fees)

final_merged_fees.to_csv('Data/FinalMergedFeesAllYears.csv', index=False, encoding='utf-8-sig')



Unnamed: 0,Department,Level,Home Fee_2024,Overseas Fee_2024,Home Fee_2023,Overseas Fee_2023,Home Fee_2022,Overseas Fee_2022,Home Fee_2021,Overseas Fee_2021,Home Fee_2020,Overseas Fee_2020,Home Fee_2019,Overseas Fee_2019
0,Accounting,Postgraduate,35472.0,36168.0,33480.0,34128.0,31584.0,32208.0,30360.0,30960.0,29184.0,29760.0,28056.0,28608.0
1,Accounting,Undergraduate,9250.0,27192.0,9250.0,25656.0,9250.0,23330.0,9250.0,23330.0,9250.0,23330.0,9250.0,23330.0
2,Anthropology,Undergraduate,9250.0,26184.0,9250.0,24720.0,9250.0,23330.0,9250.0,23330.0,9250.0,23330.0,9250.0,23330.0
3,Department not found,Postgraduate,26493.71,30643.43,24956.53,28816.8,23257.56,26662.93,22518.67,25754.52,21482.42,24662.64,20178.65,23178.2
4,Department not found,Undergraduate,9250.0,26940.0,9250.0,25188.0,9250.0,23330.0,9250.0,22430.0,9250.0,21570.0,9250.0,19920.0
5,Economic History,Postgraduate,22452.0,28476.0,16440.0,25920.0,15816.0,24456.0,15216.0,23520.0,14640.0,22608.0,14088.0,21744.0
6,Economic History,Undergraduate,9250.0,26848.0,9250.0,25344.0,9250.0,23330.0,9250.0,23330.0,9250.0,23330.0,9250.0,23330.0
7,Economics,Postgraduate,33144.0,33376.0,29700.0,30024.0,28020.0,28332.0,28080.0,28480.0,26992.0,27376.0,25952.0,26320.0
8,Economics,Undergraduate,9250.0,28176.0,9250.0,26592.0,9250.0,23330.0,9250.0,23330.0,9250.0,23330.0,9250.0,23330.0
9,European Institute,Postgraduate,27480.0,27480.0,25920.0,25920.0,24456.0,24456.0,23520.0,23520.0,22608.0,22608.0,21744.0,21744.0


In [36]:
import pandas as pd

# Load data
country_eu_indicator = pd.read_csv('Data/countryEUindicator.csv')
final_merged_fees_all_years = pd.read_csv('Data/FinalMergedFeesAllYears.csv')

# Define the cleaning function for department names
def clean_department_name(name):
    # Standardize to "Department of X" format
    name = " ".join(name.title().split())  # Clean up extra spaces and capitalize correctly
    if not name.startswith("Department Of"):
        name = "Department of " + name
    return name.replace("Department Of ", "Department of ")

# Apply the cleaning function to the department names
country_eu_indicator['Department'] = country_eu_indicator['Department'].astype(str).apply(clean_department_name)
final_merged_fees_all_years['Department'] = final_merged_fees_all_years['Department'].astype(str).apply(clean_department_name)

# Identify the years 2019 to 2023 from the fee data columns
years = [year for year in range(2019, 2024)]

# Prepare the results DataFrame
results_df = pd.DataFrame()

# Calculate fees for each department and each year
for year in years:
    total_fees_by_department = {}
    for department in country_eu_indicator['Department'].unique():
        fee_data = final_merged_fees_all_years[
            (final_merged_fees_all_years['Department'] == department) &
            final_merged_fees_all_years['Level'].isin(['Undergraduate', 'Postgraduate'])
        ][['Level', f'Home Fee_{year}', f'Overseas Fee_{year}']].drop_duplicates()

        if not fee_data.empty:
            entries = country_eu_indicator[
                (country_eu_indicator['Department'] == department) &
                country_eu_indicator['Program'].str.contains('UG|PG', regex=True)
            ][['Nationality', 'Program', f'Entrances {year}', 'EU']].dropna()

            entries['Total Fees'] = entries.apply(
                lambda row: row[f'Entrances {year}'] * (
                    fee_data.loc[fee_data['Level'] == ('Undergraduate' if 'UG' in row['Program'] else 'Postgraduate'),
                                 f'Home Fee_{year}' if row['EU'] == 'Yes' and year <= 2020 else f'Overseas Fee_{year}'].iloc[0]
                    if not fee_data[fee_data['Level'] == ('Undergraduate' if 'UG' in row['Program'] else 'Postgraduate')].empty else 0),
                axis=1
            )
            total_fees_by_department[department] = entries['Total Fees'].sum()
        # Remove departments where fee data is not available
        elif department not in total_fees_by_department:
            total_fees_by_department[department] = "Fee data not available."

    # Append the results for the year to the results DataFrame
    year_results_df = pd.DataFrame.from_dict(total_fees_by_department, orient='index', columns=[year])
    results_df = pd.concat([results_df, year_results_df], axis=1)

# Display the final results table
results_df = results_df.loc[results_df.min(axis=1) != "Fee data not available."]  # Remove rows with fee data not available
display(results_df)


Unnamed: 0,2019,2020,2021,2022,2023
Department of Accounting,8188479.0,8156484.0,8881405.0,8237279.0,8194524.0
Department of Anthropology,1655405.0,1830175.0,1901395.0,2169690.0,2051760.0
Department of Economic History,4127372.0,5396615.0,4817495.0,6122149.0,6521184.0
Department of Economics,9306974.0,10676435.0,9640105.0,9846689.0,9823332.0
Department of European Institute,5707800.0,7098912.0,6914880.0,6407472.0,7452000.0
Department of Finance,7131769.0,8508526.0,7896590.0,9527769.0,10826160.0
Department of Gender Studies,2598876.0,3343128.0,4515840.0,4683324.0,4950720.0
Department of Geography And Environment,7221519.0,8450746.0,8629743.78,9192720.545,10223121.075
Department of Government,8124777.0,8633997.0,9508500.0,8915200.0,11317377.0
Department of Health Policy,4173405.0,5239705.0,5808546.0,6978960.0,6569280.0
