In [7]:
import pdfplumber
import csv
import pandas as pd
import re

# Function to extract tables from a PDF and write to CSV
def extract_tables_from_pdf(pdf_path, output_csv_path):
    with pdfplumber.open(pdf_path) as pdf:
        all_tables = []
        # Iterate through each page of the PDF
        for page in pdf.pages:
            # Extract tables from the current page
            tables = page.extract_tables()
            for table in tables:
                all_tables.extend(table)  # Add the rows of the table to all_tables list

        # Write all extracted tables to a CSV file
        with open(output_csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in all_tables:
                writer.writerow(row)

# Process each PDF and extract data

# 2017
pdf_path_2017 = 'Data/2017-18-Fees-Table.pdf'
output_csv_path_2017 = 'Data/2017_Fees'

extract_tables_from_pdf(pdf_path_2017, output_csv_path_2017)

# Load the CSV file containing the tuition fees and program data
output_csv_path_2017 = 'Data/2017_Fees.csv'
csv_path_2017 = 'Data/Data/Florian_Wirtz_eigentlich_noch_was_mit_der_v2.csv'

# Read the CSV files
data_df_one_2017 = pd.read_csv(output_csv_path_2017, error_bad_lines=False, warn_bad_lines=True)
data_df_2017 = pd.read_csv(csv_path_2017)

# Combine 'Department' and 'Program' into a new column for unique combinations
data_df_2017['Dept_Program'] = data_df_2017['Department'] + " " + data_df_2017['Program']

# Get unique combinations
unique_dept_programs_2017 = data_df_2017['Dept_Program'].unique()

# Dictionary to store original to transformed mappings
original_to_transformed_2017 = {}

# Process only Master's programs and adjust department names
processed_dept_programs_2017 = []
for combo in unique_dept_programs_2017:
    if isinstance(combo, str):  # Check if the item is a string
        if "PG Taught" in combo:
            original_department = combo.replace(" PG Taught", "")
            transformed_department = original_department
            # Handle specific naming transformations
            if "And" in transformed_department:
                transformed_department = transformed_department.split("And")[0].strip()
            if transformed_department == "International History":
                transformed_department = "History"
            if transformed_department == "European Institute":
                transformed_department = "European"
            if transformed_department == "Law School" or transformed_department == "Law":
                transformed_department = "LLM"
            if transformed_department == "Philosophy Logic":
                transformed_department = "Philosophy"
            if transformed_department == "School of Public Policy":
                transformed_department = "Public Policy"
            if transformed_department == "Gender Studies":
                transformed_department = "Gender"
            if "Psychological" in transformed_department:
                transformed_department = "Psychology"

            # Save mapping
            original_to_transformed_2017[transformed_department] = original_department
            processed_dept_programs_2017.append(transformed_department)

# Create DataFrame from processed list
processed_df_2017 = pd.DataFrame(processed_dept_programs_2017, columns=['Department'])

# Insert "Program" column with "PG Taught" as the value for all entries
processed_df_2017.insert(1, 'Program', 'PG Taught')

def find_matching_data_2017(dept_program):
    matches = data_df_one_2017[data_df_one_2017.iloc[:, 0].str.contains(dept_program, na=False)]
    if not matches.empty:
        col2_values = []
        col3_values = []
        for _, row in matches.iterrows():
            # Process and clean fee data for averaging
            try:
                cleaned_value_col2 = float(str(row[1]).replace('£', '').replace(',', ''))
                if cleaned_value_col2.is_integer():
                    col2_values.append(int(cleaned_value_col2))
            except ValueError:
                col2_values.append(pd.NA)
            try:
                cleaned_value_col3 = float(str(row[2]).replace('£', '').replace(',', ''))
                if cleaned_value_col3.is_integer():
                    col3_values.append(int(cleaned_value_col3))
            except ValueError:
                col3_values.append(pd.NA)

        # Calculate averages while ignoring N/A values
        avg_col2 = pd.Series(col2_values).dropna().mean()
        avg_col3 = pd.Series(col3_values).dropna().mean()
        return pd.Series([avg_col2, avg_col3])
    return pd.Series([pd.NA, pd.NA])

# Apply the function to find and average matching tuition fees
processed_df_2017[['Home fees', 'Overseas fees']] = processed_df_2017['Department'].apply(find_matching_data_2017)

# Remove rows where either column contains NA or NaN values
processed_df_2017.dropna(subset=['Home fees', 'Overseas fees'], inplace=True)

# Ensure all remaining values are integers
processed_df_2017['Home fees'] = processed_df_2017['Home fees'].astype(int)
processed_df_2017['Overseas fees'] = processed_df_2017['Overseas fees'].astype(int)

# Revert department names to original values
processed_df_2017['Department'] = processed_df_2017['Department'].map(original_to_transformed_2017)

# Find the row index for the specific phrase and extract the fees and year
idx_2017 = data_df_one_2017.index[data_df_one_2017.iloc[:, 0].str.contains("Students commencing their degree in" or "New Entrants", na=False)].tolist()
if idx_2017:
    target_idx_2017 = idx_2017[0]  # Assume the first occurrence
    if "Undergraduate" in data_df_one_2017.iloc[target_idx_2017 - 1, 0]:
        year_match_2017 = re.search(r'\b(\d{4})\b', data_df_one_2017.iloc[target_idx_2017, 0])
        year_2017 = year_match_2017.group(0) if year_match_2017 else "Unknown"
        home_fee_match_2017 = re.search(r'£(\d{4})', data_df_one_2017.iloc[target_idx_2017, 1].replace(',', ''))
        overseas_fee_match_2017 = re.search(r'£(\d{5})', data_df_one_2017.iloc[target_idx_2017, 2].replace(',', ''))
        home_fee_2017 = int(home_fee_match_2017.group(1)) if home_fee_match_2017 else None
        overseas_fee_2017 = int(overseas_fee_match_2017.group(1)) if overseas_fee_match_2017 else None
        print("Year:", year_2017)
        print("Home Fee:", home_fee_2017)
        print("Overseas Fee:", overseas_fee_2017)

# 2018
pdf_path_2018 = 'Data/2018-19-Fees-Table.pdf'
output_csv_path_2018 = 'Data/2018_Fees.csv'

extract_tables_from_pdf(pdf_path_2018, output_csv_path_2018)

# Load the CSV file containing the tuition fees and program data
output_csv_path_2018 = 'Data/2018_Fees.csv'
csv_path_2018 = 'Data/Florian_Wirtz_eigentlich_noch_was_mit_der_v2.csv'

# Read the CSV files
data_df_one_2018 = pd.read_csv(output_csv_path_2018, error_bad_lines=False, warn_bad_lines=True)
data_df_2018 = pd.read_csv(csv_path_2018)

# Combine 'Department' and 'Program' into a new column for unique combinations
data_df_2018['Dept_Program'] = data_df_2018['Department'] + " " + data_df_2018['Program']

# Get unique combinations
unique_dept_programs_2018 = data_df_2018['Dept_Program'].unique()

# Dictionary to store original to transformed mappings
original_to_transformed_2018 = {}

# Process only Master's programs and adjust department names
processed_dept_programs_2018 = []
for combo in unique_dept_programs_2018:
    if isinstance(combo, str):  # Check if the item is a string
        if "PG Taught" in combo:
            original_department = combo.replace(" PG Taught", "")
            transformed_department = original_department
            # Handle specific naming transformations
            if "And" in transformed_department:
                transformed_department = transformed_department.split("And")[0].strip()
            if transformed_department == "International History":
                transformed_department = "History"
            if transformed_department == "European Institute":
                transformed_department = "European"
            if transformed_department == "Law School" or transformed_department == "Law":
                transformed_department = "LLM"
            if transformed_department == "Philosophy Logic":
                transformed_department = "Philosophy"
            if transformed_department == "School of Public Policy":
                transformed_department = "Public Policy"
            if transformed_department == "Gender Studies":
                transformed_department = "Gender"
            if "Psychological" in transformed_department:
                transformed_department = "Psychology"

            # Save mapping
            original_to_transformed_2018[transformed_department] = original_department
            processed_dept_programs_2018.append(transformed_department)

# Create DataFrame from processed list
processed_df_2018 = pd.DataFrame(processed_dept_programs_2018, columns=['Department'])

# Insert "Program" column with "PG Taught" as the value for all entries
processed_df_2018.insert(1, 'Program', 'PG Taught')

def find_matching_data_2018(dept_program):
    matches = data_df_one_2018[data_df_one_2018.iloc[:, 0].str.contains(dept_program, na=False)]
    if not matches.empty:
        col2_values = []
        col3_values = []
        for _, row in matches.iterrows():
            # Process and clean fee data for averaging
            try:
                cleaned_value_col2 = float(str(row[1]).replace('£', '').replace(',', ''))
                if cleaned_value_col2.is_integer():
                    col2_values.append(int(cleaned_value_col2))
            except ValueError:
                col2_values.append(pd.NA)
            try:
                cleaned_value_col3 = float(str(row[2]).replace('£', '').replace(',', ''))
                if cleaned_value_col3.is_integer():
                    col3_values.append(int(cleaned_value_col3))
            except ValueError:
                col3_values.append(pd.NA)

        # Calculate averages while ignoring N/A values
        avg_col2 = pd.Series(col2_values).dropna().mean()
        avg_col3 = pd.Series(col3_values).dropna().mean()
        return pd.Series([avg_col2, avg_col3])
    return pd.Series([pd.NA, pd.NA])

# Apply the function to find and average matching tuition fees
processed_df_2018[['Home fees', 'Overseas fees']] = processed_df_2018['Department'].apply(find_matching_data_2018)

# Remove rows where either column contains NA or NaN values
processed_df_2018.dropna(subset=['Home fees', 'Overseas fees'], inplace=True)

# Ensure all remaining values are integers
processed_df_2018['Home fees'] = processed_df_2018['Home fees'].astype(int)
processed_df_2018['Overseas fees'] = processed_df_2018['Overseas fees'].astype(int)

# Revert department names to original values
processed_df_2018['Department'] = processed_df_2018['Department'].map(original_to_transformed_2018)

# Find the row index for the specific phrase and extract the fees and year
idx_2018 = data_df_one_2018.index[data_df_one_2018.iloc[:, 0].str.contains("Students commencing their degree in" or "New Entrants", na=False)].tolist()
if idx_2018:
    target_idx_2018 = idx_2018[0]  # Assume the first occurrence
    if "Undergraduate" in data_df_one_2018.iloc[target_idx_2018 - 1, 0]:
        year_match_2018 = re.search(r'\b(\d{4})\b', data_df_one_2018.iloc[target_idx_2018, 0])
        year_2018 = year_match_2018.group(0) if year_match_2018 else "Unknown"
        home_fee_match_2018 = re.search(r'£(\d{4})', data_df_one_2018.iloc[target_idx_2018, 1].replace(',', ''))
        overseas_fee_match_2018 = re.search(r'£(\d{5})', data_df_one_2018.iloc[target_idx_2018, 2].replace(',', ''))
        home_fee_2018 = int(home_fee_match_2018.group(1)) if home_fee_match_2018 else None
        overseas_fee_2018 = int(overseas_fee_match_2018.group(1)) if overseas_fee_match_2018 else None
        print("Year:", year_2018)
        print("Home Fee:", home_fee_2018)
        print("Overseas Fee:", overseas_fee_2018)

# 2019
pdf_path_2019 = 'Data/2019-Table-of-Fees.pdf'
output_csv_path_2019 = 'Data/2019_Fees.csv'

extract_tables_from_pdf(pdf_path_2019, output_csv_path_2019)

# Load the CSV file containing the tuition fees and program data
output_csv_path_2019 = 'Data/2019_Fees.csv'
csv_path_2019 = 'Data/Florian_Wirtz_eigentlich_noch_was_mit_der_v2.csv'

# Read the CSV files
data_df_one_2019 = pd.read_csv(output_csv_path_2019, error_bad_lines=False, warn_bad_lines=True)
data_df_2019 = pd.read_csv(csv_path_2019)

# Combine 'Department' and 'Program' into a new column for unique combinations

# Combine 'Department' and 'Program' into a new column for unique combinations
data_df_2019['Dept_Program'] = data_df_2019['Department'] + " " + data_df_2019['Program']

# Get unique combinations
unique_dept_programs_2019 = data_df_2019['Dept_Program'].unique()

# Dictionary to store original to transformed mappings
original_to_transformed_2019 = {}

# Process only Master's programs and adjust department names
processed_dept_programs_2019 = []
for combo in unique_dept_programs_2019:
    if isinstance(combo, str):  # Check if the item is a string
        if "PG Taught" in combo:
            original_department = combo.replace(" PG Taught", "")
            transformed_department = original_department
            # Handle specific naming transformations
            if "And" in transformed_department:
                transformed_department = transformed_department.split("And")[0].strip()
            if transformed_department == "International History":
                transformed_department = "History"
            if transformed_department == "European Institute":
                transformed_department = "European"
            if transformed_department == "Law School" or transformed_department == "Law":
                transformed_department = "LLM"
            if transformed_department == "Philosophy Logic":
                transformed_department = "Philosophy"
            if transformed_department == "School of Public Policy":
                transformed_department = "Public Policy"
            if transformed_department == "Gender Studies":
                transformed_department = "Gender"
            if "Psychological" in transformed_department:
                transformed_department = "Psychology"

            # Save mapping
            original_to_transformed_2019[transformed_department] = original_department
            processed_dept_programs_2019.append(transformed_department)

# Create DataFrame from processed list
processed_df_2019 = pd.DataFrame(processed_dept_programs_2019, columns=['Department'])

# Insert "Program" column with "PG Taught" as the value for all entries
processed_df_2019.insert(1, 'Program', 'PG Taught')

def find_matching_data_2019(dept_program):
    matches = data_df_one_2019[data_df_one_2019.iloc[:, 0].str.contains(dept_program, na=False)]
    if not matches.empty:
        col2_values = []
        col3_values = []
        for _, row in matches.iterrows():
            # Process and clean fee data for averaging
            try:
                cleaned_value_col2 = float(str(row[1]).replace('£', '').replace(',', ''))
                if cleaned_value_col2.is_integer():
                    col2_values.append(int(cleaned_value_col2))
            except ValueError:
                col2_values.append(pd.NA)
            try:
                cleaned_value_col3 = float(str(row[2]).replace('£', '').replace(',', ''))
                if cleaned_value_col3.is_integer():
                    col3_values.append(int(cleaned_value_col3))
            except ValueError:
                col3_values.append(pd.NA)

        # Calculate averages while ignoring N/A values
        avg_col2 = pd.Series(col2_values).dropna().mean()
        avg_col3 = pd.Series(col3_values).dropna().mean()
        return pd.Series([avg_col2, avg_col3])
    return pd.Series([pd.NA, pd.NA])

# Apply the function to find and average matching tuition fees
processed_df_2019[['Home fees', 'Overseas fees']] = processed_df_2019['Department'].apply(find_matching_data_2019)

# Remove rows where either column contains NA or NaN values
processed_df_2019.dropna(subset=['Home fees', 'Overseas fees'], inplace=True)

# Ensure all remaining values are integers
processed_df_2019['Home fees'] = processed_df_2019['Home fees'].astype(int)
processed_df_2019['Overseas fees'] = processed_df_2019['Overseas fees'].astype(int)

# Revert department names to original values
processed_df_2019['Department'] = processed_df_2019['Department'].map(original_to_transformed_2019)

# Find the row index for the specific phrase and extract the fees and year
idx_2019 = data_df_one_2019.index[data_df_one_2019.iloc[:, 0].str.contains("Students commencing their degree in" or "New Entrants", na=False)].tolist()
if idx_2019:
    target_idx_2019 = idx_2019[0]  # Assume the first occurrence
    if "Undergraduate" in data_df_one_2019.iloc[target_idx_2019 - 1, 0]:
        year_match_2019 = re.search(r'\b(\d{4})\b', data_df_one_2019.iloc[target_idx_2019, 0])
        year_2019 = year_match_2019.group(0) if year_match_2019 else "Unknown"
        home_fee_match_2019 = re.search(r'£(\d{4})', data_df_one_2019.iloc[target_idx_2019, 1].replace(',', ''))
        overseas_fee_match_2019 = re.search(r'£(\d{5})', data_df_one_2019.iloc[target_idx_2019, 2].replace(',', ''))
        home_fee_2019 = int(home_fee_match_2019.group(1)) if home_fee_match_2019 else None
        overseas_fee_2019 = int(overseas_fee_match_2019.group(1)) if overseas_fee_match_2019 else None
        print("Year:", year_2019)
        print("Home Fee:", home_fee_2019)
        print("Overseas Fee:", overseas_fee_2019)

# 2020
pdf_path_2020 = "Data/2020-Table-of-Fees-25Jun20.pdf"
output_csv_path_2020 = 'Data/2020_Fees.csv'

extract_tables_from_pdf(pdf_path_2020, output_csv_path_2020)

# Load the CSV file containing the tuition fees and program data
output_csv_path_2020 = 'Data/2020_Fees.csv'
csv_path_2020 = 'Data/Florian_Wirtz_eigentlich_noch_was_mit_der_v2.csv'

# Read the CSV files
data_df_one_2020 = pd.read_csv(output_csv_path_2020, error_bad_lines=False, warn_bad_lines=True)
data_df_2020 = pd.read_csv(csv_path_2020)

# Combine 'Department' and 'Program' into a new column for unique combinations
data_df_2020['Dept_Program'] = data_df_2020['Department'] + " " + data_df_2020['Program']

# Get unique combinations
unique_dept_programs_2020 = data_df_2020['Dept_Program'].unique()

# Dictionary to store original to transformed mappings
original_to_transformed_2020 = {}

# Process only Master's programs and adjust department names
processed_dept_programs_2020 = []
for combo in unique_dept_programs_2020:
    if isinstance(combo, str):  # Check if the item is a string
        if "PG Taught" in combo:
            original_department = combo.replace(" PG Taught", "")
            transformed_department = original_department
            # Handle specific naming transformations
            if "And" in transformed_department:
                transformed_department = transformed_department.split("And")[0].strip()
            if transformed_department == "International History":
                transformed_department = "History"
            if transformed_department == "European Institute":
                transformed_department = "European"
            if transformed_department == "Law School" or transformed_department == "Law":
                transformed_department = "LLM"
            if transformed_department == "Philosophy Logic":
                transformed_department = "Philosophy"
            if transformed_department == "School of Public Policy":
                transformed_department = "Public Policy"
            if transformed_department == "Gender Studies":
                transformed_department = "Gender"
            if "Psychological" in transformed_department:
                transformed_department = "Psychology"

            # Save mapping
            original_to_transformed_2020[transformed_department] = original_department
            processed_dept_programs_2020.append(transformed_department)

# Create DataFrame from processed list
processed_df_2020 = pd.DataFrame(processed_dept_programs_2020, columns=['Department'])

# Insert "Program" column with "PG Taught" as the value for all entries
processed_df_2020.insert(1, 'Program', 'PG Taught')

def find_matching_data_2020(dept_program):
    matches = data_df_one_2020[data_df_one_2020.iloc[:, 0].str.contains(dept_program, na=False)]
    if not matches.empty:
        col2_values = []
        col3_values = []
        for _, row in matches.iterrows():
            # Process and clean fee data for averaging
            try:
                cleaned_value_col2 = float(str(row[1]).replace('£', '').replace(',', ''))
                if cleaned_value_col2.is_integer():
                    col2_values.append(int(cleaned_value_col2))
            except ValueError:
                col2_values.append(pd.NA)
            try:
                cleaned_value_col3 = float(str(row[2]).replace('£', '').replace(',', ''))
                if cleaned_value_col3.is_integer():
                    col3_values.append(int(cleaned_value_col3))
            except ValueError:
                col3_values.append(pd.NA)

        # Calculate averages while ignoring N/A values
        avg_col2 = pd.Series(col2_values).dropna().mean()
        avg_col3 = pd.Series(col3_values).dropna().mean()
        return pd.Series([avg_col2, avg_col3])
    return pd.Series([pd.NA, pd.NA])

# Apply the function to find and average matching tuition fees
processed_df_2020[['Home fees', 'Overseas fees']] = processed_df_2020['Department'].apply(find_matching_data_2020)

# Remove rows where either column contains NA or NaN values
processed_df_2020.dropna(subset=['Home fees', 'Overseas fees'], inplace=True)

# Ensure all remaining values are integers
processed_df_2020['Home fees'] = processed_df_2020['Home fees'].astype(int)
processed_df_2020['Overseas fees'] = processed_df_2020['Overseas fees'].astype(int)

# Revert department names to original values
processed_df_2020['Department'] = processed_df_2020['Department'].map(original_to_transformed_2020)

# Find the row index for the specific phrase and extract the fees and year
idx_2020 = data_df_one_2020.index[data_df_one_2020.iloc[:, 0].str.contains("Students commencing their degree in" or "New Entrants", na=False)].tolist()
if idx_2020:
    target_idx_2020 = idx_2020[0]  # Assume the first occurrence
    if "Undergraduate" in data_df_one_2020.iloc[target_idx_2020 - 1, 0]:
        year_match_2020 = re.search(r'\b(\d{4})\b', data_df_one_2020.iloc[target_idx_2020, 0])
        year_2020 = year_match_2020.group(0) if year_match_2020 else "Unknown"
        home_fee_match_2020 = re.search(r'£(\d{4})', data_df_one_2020.iloc[target_idx_2020, 1].replace(',', ''))
        overseas_fee_match_2020 = re.search(r'£(\d{5})', data_df_one_2020.iloc[target_idx_2020, 2].replace(',', ''))
        home_fee_2020 = int(home_fee_match_2020.group(1)) if home_fee_match_2020 else None
        overseas_fee_2020 = int(overseas_fee_match_2020.group(1)) if overseas_fee_match_2020 else None
        print("Year:", year_2020)
        print("Home Fee:", home_fee_2020)
        print("Overseas Fee:", overseas_fee_2020)

# Merge dataframes from different years
frames = [processed_df_2017, processed_df_2018, processed_df_2019, processed_df_2020]
result_df = pd.concat(frames, ignore_index=True)

# Display the combined dataframe
result_df




  data_df_one_2017 = pd.read_csv(output_csv_path_2017, error_bad_lines=False, warn_bad_lines=True)


  data_df_one_2017 = pd.read_csv(output_csv_path_2017, error_bad_lines=False, warn_bad_lines=True)


FileNotFoundError: [Errno 2] No such file or directory: 'Data/Your_File_2017.csv'