# In [None]:
import pdfplumber
import csv

def extract_tables_from_pdf(pdf_path, output_csv_path):
    """Extract every table found in a PDF and dump all rows into one CSV file.

    The rows of all tables on all pages are concatenated in page order and
    written without any separator or header between tables, so rows of
    different tables may have different numbers of fields.

    Args:
        pdf_path: Path of the PDF file to read.
        output_csv_path: Path of the CSV file to create (overwritten if it
            already exists).
    """
    all_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        # Iterate through each page of the PDF
        for page in pdf.pages:
            # Each extracted table is a list of rows; flatten them into one list
            for table in page.extract_tables():
                all_tables.extend(table)

    # Write with an explicit encoding: open() otherwise uses the platform's
    # locale encoding, while pandas.read_csv (used on this file later) assumes
    # UTF-8, so non-ASCII cells such as '£' would not round-trip everywhere.
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows(all_tables)

# Input PDF holding the fee tables, and the CSV file that receives the
# extracted rows (adjust both paths as necessary)
pdf_path = 'Data/2020-Table-of-Fees-25Jun20.pdf'
output_csv_path = 'Data/2020_Fees.csv'

# Run the extraction; the CSV at output_csv_path is opened in 'w' mode, so an
# existing file is overwritten
extract_tables_from_pdf(pdf_path, output_csv_path)


# ---------------------------


import pandas as pd
import re

# Paths of the extracted fee table and of the department/program data set
output_csv_path = 'Data/2020_Fees.csv'  # Adjust path as necessary
csv_path = 'Data/Florian_Wirtz_eigentlich_noch_was_mit_der_v2.csv'  # Adjust path as necessary

# Read the fee table. Lines with too many fields are skipped with a warning
# instead of aborting the load.
# `error_bad_lines` / `warn_bad_lines` were deprecated in pandas 1.3 and
# removed in pandas 2.0; `on_bad_lines='warn'` is the equivalent setting.
# NOTE(review): the first CSV line is consumed as the header row; if the
# extracted file starts directly with data, `header=None` may be needed -
# confirm against the generated CSV.
try:
    data_df_one = pd.read_csv(output_csv_path, on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines`; fall back to the old flags
    data_df_one = pd.read_csv(output_csv_path, error_bad_lines=False, warn_bad_lines=True)

# Read the department/program data set
data_df = pd.read_csv(csv_path)

# Build one "<Department> <Program>" label per row and enumerate the distinct
# labels
data_df['Dept_Program'] = data_df['Department'] + " " + data_df['Program']
unique_dept_programs = data_df['Dept_Program'].unique()

# Exact-match renames applied after the "And" truncation below
department_aliases = {
    "International History": "History",
    "European Institute": "European",
    "Law School": "LLM",
    "Law": "LLM",
    "Philosophy Logic": "Philosophy",
    "School of Public Policy": "Public Policy",
    "Gender Studies": "Gender",
}

# NOTE: despite its name, this dict is keyed by the *transformed* department
# name and stores the original name as its value
original_to_transformed = {}

# Transformed department names of all "PG Taught" labels, in encounter order
processed_dept_programs = []
for combo in unique_dept_programs:
    # Skip non-string entries (presumably NaN from missing values - verify)
    # and every label that is not a "PG Taught" one
    if not isinstance(combo, str) or "PG Taught" not in combo:
        continue

    original_department = combo.replace(" PG Taught", "")
    transformed_department = original_department

    # Keep only the part in front of the first "And"
    if "And" in transformed_department:
        transformed_department = transformed_department.partition("And")[0].strip()

    # Apply the exact-match renames, then the substring-based one
    transformed_department = department_aliases.get(transformed_department, transformed_department)
    if "Psychological" in transformed_department:
        transformed_department = "Psychology"

    # Remember how to get back from the transformed name to the original one
    original_to_transformed[transformed_department] = original_department
    processed_dept_programs.append(transformed_department)

# One row per processed department, all tagged with the "PG Taught" program
processed_df = pd.DataFrame(processed_dept_programs, columns=['Department'])
processed_df['Program'] = 'PG Taught'

def _parse_fee(cell):
    """Convert one fee cell such as '£12,345' to an int, or None if unusable."""
    try:
        amount = float(str(cell).replace('£', '').replace(',', ''))
    except ValueError:
        return None
    # NaN, infinities and non-integral amounts fail is_integer() and are
    # skipped, so they do not take part in the average
    return int(amount) if amount.is_integer() else None


def _mean_fee(cells):
    """Average the parseable fees in *cells*; NaN when there are none."""
    fees = [fee for fee in map(_parse_fee, cells) if fee is not None]
    # Explicit dtype keeps the empty case a plain float NaN
    return pd.Series(fees, dtype='float64').mean()


def find_matching_data(dept_program):
    """Average the fees of all fee-table rows that mention a department.

    Rows of the module-level ``data_df_one`` are selected when their first
    column contains ``dept_program`` as a literal substring. The second and
    third columns of the selected rows are cleaned of '£' and thousands
    separators and averaged independently.

    Args:
        dept_program: Text to look for in the first column of ``data_df_one``.

    Returns:
        A two-element ``pd.Series``: the mean of the second column and the
        mean of the third column (NaN when a column has no usable value), or
        two ``pd.NA`` values when no row matches.
    """
    # regex=False: the name is plain text, not a pattern, so characters such
    # as '(' or '+' must be matched literally
    first_column = data_df_one.iloc[:, 0]
    matches = data_df_one[first_column.str.contains(dept_program, na=False, regex=False)]
    if matches.empty:
        return pd.Series([pd.NA, pd.NA])

    # Columns are addressed by position via iloc; integer keys on a
    # string-labelled row are deprecated in pandas
    avg_col2 = _mean_fee(matches.iloc[:, 1])
    avg_col3 = _mean_fee(matches.iloc[:, 2])
    return pd.Series([avg_col2, avg_col3])

# Look up the averaged fees for every department; the two values returned per
# department become the two fee columns
fee_columns = ['Home fees', 'Overseas fees']
processed_df[fee_columns] = processed_df['Department'].apply(find_matching_data)

# Discard departments for which at least one of the two fees is missing
processed_df.dropna(subset=fee_columns, inplace=True)

# Cast the averaged fees to int (this truncates any fractional part)
for fee_column in fee_columns:
    processed_df[fee_column] = processed_df[fee_column].astype(int)

# Swap the shortened department names back to the original ones; the dict is
# keyed by the shortened name
processed_df['Department'] = processed_df['Department'].map(original_to_transformed)

# Display the final DataFrame
processed_df

# Locate the undergraduate fee row: a row whose first cell contains the marker
# phrase and that sits directly below a row mentioning "Undergraduate".
marker_mask = data_df_one.iloc[:, 0].str.contains(
    "Students commencing their degree in", na=False, regex=False
)
# Positional row numbers (not index labels), because the rows are read back
# with iloc below
idx = marker_mask.to_numpy().nonzero()[0].tolist()

# Defaults keep these names defined even when no suitable row exists
year = None
home_fee = None
overseas_fee = None
ug_row_found = False

for target_idx in idx:
    # Row 0 has no row above it; target_idx - 1 would otherwise wrap around
    # to the last row of the table
    if target_idx == 0:
        continue
    # str() guards against non-string cells such as NaN
    if "Undergraduate" not in str(data_df_one.iloc[target_idx - 1, 0]):
        continue

    year_match = re.search(r'\b(\d{4})\b', str(data_df_one.iloc[target_idx, 0]))
    year = year_match.group(0) if year_match else "Unknown"

    # NOTE(review): the patterns accept exactly 4 digits for the home fee and
    # 5 for the overseas fee; confirm this still fits the fee levels in the PDF
    home_fee_match = re.search(r'£(\d{4})', str(data_df_one.iloc[target_idx, 1]).replace(',', ''))
    overseas_fee_match = re.search(r'£(\d{5})', str(data_df_one.iloc[target_idx, 2]).replace(',', ''))
    home_fee = int(home_fee_match.group(1)) if home_fee_match else None
    overseas_fee = int(overseas_fee_match.group(1)) if overseas_fee_match else None

    ug_row_found = True
    break  # Use the first suitable occurrence

if ug_row_found:
    # Add the extracted row to the top of the processed_df DataFrame
    additional_row = pd.DataFrame({
        'Department': [f"All {year}"],
        'Program': ["UG Degree"],
        'Home fees': [home_fee],
        'Overseas fees': [overseas_fee]
    })
    processed_df = pd.concat([additional_row, processed_df]).reset_index(drop=True)
else:
    # Previously this situation ended in a NameError; leave the table as is
    print("Undergraduate fee row not found; no 'UG Degree' row was added.")

# Display the final DataFrame including the new row
processed_df