In [4]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Setup ChromeDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
search_url = "https://eprints.lse.ac.uk/cgi/search/advanced"
driver.get(search_url)

# Prepare a DataFrame for storing results
columns = ['Department'] + [str(year) for year in range(2010, 2024)]
results_df = pd.DataFrame(columns=columns)

# List of departments
departments = [
    'Geography & Environment', 'Philosophy, Logic and Scientific Method', 
    'Psychological and Behavioural Science', 'Government', 'Law', 
    'Social Policy', 'Mathematics', 'Economic History', 'Sociology', 
    'International History', 'Statistics', 'Management', 'International Relations', 
    'Anthropology', 'Economics', 'Language Centre', 'Accounting', 'Finance', 
    'Methodology', 'School of Public Policy', 
    'European Institute', 'Media and Communications', 'Health Policy', 
    'International Development', 'Gender Studies'
]

# Process each department
for department in departments:
    row_data = {'Department': department}
    driver.get(search_url)  # Navigate back to the main search page for each department
    try:
        divisions_select = Select(driver.find_element(By.ID, "divisions"))  # Locate the dropdown again
        divisions_select.select_by_visible_text(department)
        available = True
    except NoSuchElementException:
        available = False
        print(f"Department {department} not found.")
    
    if available:
        for year in range(2010, 2024):
            try:
                wait = WebDriverWait(driver, 5)
                date_input = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[name='date']")))
                date_input.clear()
                date_input.send_keys(str(year))
                date_input.send_keys(Keys.RETURN)

                # Wait for the page to load and scrape the total results
                total_results_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "span.ep_search_number")))
                total_results = total_results_elements[-1].text  # Get the text of the last element
                row_data[str(year)] = total_results
                print(f"Results for {department} in {year}: {total_results}")
            except NoSuchElementException:
                row_data[str(year)] = 'Element not found'
                print(f"Element not found for {department} in {year}.")
            except TimeoutException:
                row_data[str(year)] = 'Timeout or no results'
                print(f"Timeout or no results for {department} in {year}.")
            driver.get('https://eprints.lse.ac.uk/cgi/search/archive/advanced')
            divisions_select = Select(driver.find_element(By.ID, "divisions"))  # Locate the dropdown again
            divisions_select.select_by_visible_text(department)
    # Append the results of this department to the DataFrame and save incrementally
    new_row = pd.DataFrame([row_data])
    results_df = pd.concat([results_df, new_row], ignore_index=True)
    results_df.to_csv('department_yearly_results.csv', index=False)

# Close the browser
driver.quit()

print("Data scraping completed and saved to 'Data/department_yearly_results.csv'.")


Results for Geography & Environment in 2010: 220
Results for Geography & Environment in 2011: 296
Results for Geography & Environment in 2012: 243
Results for Geography & Environment in 2013: 160
Results for Geography & Environment in 2014: 160
Results for Geography & Environment in 2015: 108
Results for Geography & Environment in 2016: 129
Results for Geography & Environment in 2017: 119
Results for Geography & Environment in 2018: 119
Results for Geography & Environment in 2019: 117
Results for Geography & Environment in 2020: 175
Results for Geography & Environment in 2021: 177
Results for Geography & Environment in 2022: 170
Results for Geography & Environment in 2023: 145
Results for Philosophy, Logic and Scientific Method in 2010: 98
Results for Philosophy, Logic and Scientific Method in 2011: 60
Results for Philosophy, Logic and Scientific Method in 2012: 66
Results for Philosophy, Logic and Scientific Method in 2013: 68
Results for Philosophy, Logic and Scientific Method in 201

Results for Anthropology in 2022: 57
Results for Anthropology in 2023: 50
Results for Economics in 2010: 164
Results for Economics in 2011: 171
Results for Economics in 2012: 149
Results for Economics in 2013: 164
Results for Economics in 2014: 115
Results for Economics in 2015: 100
Results for Economics in 2016: 120
Results for Economics in 2017: 107
Results for Economics in 2018: 97
Results for Economics in 2019: 114
Results for Economics in 2020: 194
Results for Economics in 2021: 139
Results for Economics in 2022: 142
Results for Economics in 2023: 134
Timeout or no results for Language Centre in 2010.
Timeout or no results for Language Centre in 2011.
Timeout or no results for Language Centre in 2012.
Timeout or no results for Language Centre in 2013.
Timeout or no results for Language Centre in 2014.
Timeout or no results for Language Centre in 2015.
Timeout or no results for Language Centre in 2016.
Timeout or no results for Language Centre in 2017.
Timeout or no results for Lan

In [1]:
###########FINISHED 15:08


import pandas as pd
import pdfplumber
import csv

def find_next_containing_row(data_frame, start_index, column_index, text):
    for idx in range(start_index, len(data_frame)):
        cell_content = str(data_frame.iloc[idx, column_index])
        if pd.notna(cell_content) and text in cell_content:
            return idx
    return None

def find_first_non_empty_cell_and_extract_fee(data_frame, start_index, column_index):
    for idx in range(start_index, min(start_index + 10, len(data_frame))):
        cell_content = data_frame.iloc[idx, column_index]
        if pd.notna(cell_content) and cell_content != '':
            first_word = cell_content.split()[0]
            fee = ''.join(filter(str.isdigit, first_word))
            fee = fee.replace(',', '')
            return fee
    print("No non-empty cell found within the specified range.")
    return None

def clean_course_name(course_name):
    return str(course_name).replace('MSc in', 'MSc').strip()  # Removes 'MSc in' and trims any leading/trailing whitespace

def integrate_and_process_data(year):
    cleaned_fees_output_path = f'Test/CleanedFees{year}.csv'
    grouped_output_path = f'Test/Grouped_Fees{year}_by_Department_and_Level.csv'

    # Load the cleaned fees data and the department information
    cleaned_fees_df = pd.read_csv(cleaned_fees_output_path)
    department_info_df = pd.read_csv('data/PhilipOutput.csv')

    # Define a function to find the department for a given course name
    def find_department(course_name, department_df):
        match = department_df[department_df['Course Name'].str.strip().eq(course_name.strip())]
        if not match.empty:
            return match['Department'].iloc[0]
        return "Department not found"

    # Apply the function to add a new 'Department' column to the cleaned fees DataFrame
    cleaned_fees_df['Department'] = cleaned_fees_df['Course'].apply(lambda x: find_department(x, department_info_df))

    # Convert the Home Fee and Overseas Fee columns to numeric values for calculation
    cleaned_fees_df['Home Fee'] = pd.to_numeric(cleaned_fees_df['Home Fee'], errors='coerce')
    cleaned_fees_df['Overseas Fee'] = pd.to_numeric(cleaned_fees_df['Overseas Fee'], errors='coerce')

    # Calculate the fee difference and add it as a new column
    cleaned_fees_df['Fee Difference'] = cleaned_fees_df['Overseas Fee'] - cleaned_fees_df['Home Fee']

    # Group the data by 'Department' and 'Level', and calculate the average fee difference for each group
    grouped_data = cleaned_fees_df.groupby(['Department', 'Level']).agg({
        'Fee Difference': 'mean'
    }).reset_index()

    # Save the grouped data to a new CSV file
    grouped_data.to_csv(grouped_output_path, index=False, encoding='utf-8-sig')

    # Output the path to the saved file to confirm where it has been saved
    print(f"Grouped data for {year} saved to:", grouped_output_path)

    # Display the grouped data for verification
    display(grouped_data)

def extract_tables_from_pdf(pdf_path, output_csv_path):
    with pdfplumber.open(pdf_path) as pdf:
        all_tables = []
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                for row in table:
                    while len(row) < 10:
                        row.append('')
                    processed_row = row[:3] + [''] * 7
                    all_tables.append(processed_row)
        with open(output_csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in all_tables:
                writer.writerow(row)

for year in range(2018, 2025):  # from 2018 to 2024
    pdf_path = f'Data/TuitionFees/Fees{year}.pdf'
    output_csv_path = f'Test/Fees{year}.csv'
    updated_all_courses_output_path = f'Test/CleanedFees{year}.csv'

    extract_tables_from_pdf(pdf_path, output_csv_path)
    fees_df = pd.read_csv(output_csv_path, encoding='latin-1')

    home_fee_index = find_next_containing_row(fees_df, 0, 1, "Home")
    undergrad_home_fee = find_first_non_empty_cell_and_extract_fee(fees_df, home_fee_index + 1, 1)
    overseas_fee_index = find_next_containing_row(fees_df, 0, 2, "Overseas")
    undergrad_overseas_fee = find_first_non_empty_cell_and_extract_fee(fees_df, overseas_fee_index + 1, 2)

    has_undergraduate = False
    all_courses_fees_detailed = []
    for index, row in fees_df.iterrows():
        if isinstance(row.iloc[0], str) and any(x in row.iloc[0] for x in ['BSc', 'BA', 'MSc']):
            course_name = clean_course_name(row.iloc[0].split(' - ')[0])
            level = "Postgraduate" if "MSc" in course_name else "Undergraduate"
            if level == "Undergraduate":
                has_undergraduate = True
            home_fee = row.iloc[1] if not pd.isna(row.iloc[1]) else (undergrad_home_fee if level == "Undergraduate" else '')
            overseas_fee = row.iloc[2] if not pd.isna(row.iloc[2]) else ''
            home_fee = ''.join(filter(str.isdigit, home_fee))
            overseas_fee = ''.join(filter(str.isdigit, overseas_fee))
            all_courses_fees_detailed.append([course_name, home_fee, overseas_fee, level])

    if not has_undergraduate:
        all_courses_fees_detailed.append(["Undergraduate Courses", undergrad_home_fee, undergrad_overseas_fee, "Undergraduate"])

    all_courses_df_detailed = pd.DataFrame(all_courses_fees_detailed, columns=['Course', 'Home Fee', 'Overseas Fee', 'Level'])
    all_courses_df_detailed.to_csv(updated_all_courses_output_path, index=False, encoding='utf-8-sig')

    print(f"Processed data for {year}.")

    # Integrate and process the data for the current year
    integrate_and_process_data(year)


Processed data for 2018.
Grouped data for 2018 saved to: Test/Grouped_Fees2018_by_Department_and_Level.csv


Unnamed: 0,Department,Level,Fee Difference
0,Department not found,Postgraduate,2542.5
1,Department not found,Undergraduate,9902.0
2,Department of Accounting,Postgraduate,528.0
3,Department of Anthropology,Postgraduate,7368.0
4,Department of Economic History,Postgraduate,7368.0
5,Department of Gender Studies,Postgraduate,7368.0
6,Department of Geography and Environment,Postgraduate,6315.428571
7,Department of Health Policy,Postgraduate,
8,Department of International Development,Postgraduate,4912.0
9,Department of Mathematics,Postgraduate,3948.0


Processed data for 2019.
Grouped data for 2019 saved to: Test/Grouped_Fees2019_by_Department_and_Level.csv


Unnamed: 0,Department,Level,Fee Difference
0,Department not found,Postgraduate,2321.540984
1,Department not found,Undergraduate,10670.0
2,Department of Accounting,Postgraduate,552.0
3,Department of Anthropology,Postgraduate,7656.0
4,Department of Economic History,Postgraduate,7656.0
5,Department of Gender Studies,Postgraduate,7656.0
6,Department of Geography and Environment,Postgraduate,6657.0
7,Department of Government,Postgraduate,7656.0
8,Department of Health Policy,Postgraduate,7320.0
9,Department of International Development,Postgraduate,3828.0


Processed data for 2020.
Grouped data for 2020 saved to: Test/Grouped_Fees2020_by_Department_and_Level.csv


Unnamed: 0,Department,Level,Fee Difference
0,Department not found,Postgraduate,2362.983051
1,Department not found,Undergraduate,12320.0
2,Department of Accounting,Postgraduate,576.0
3,Department of Anthropology,Postgraduate,7968.0
4,Department of Economic History,Postgraduate,7968.0
5,Department of Gender Studies,Postgraduate,7968.0
6,Department of Geography and Environment,Postgraduate,6837.0
7,Department of Government,Postgraduate,7968.0
8,Department of Health Policy,Postgraduate,7608.0
9,Department of International Development,Postgraduate,3984.0


Processed data for 2021.
Grouped data for 2021 saved to: Test/Grouped_Fees2021_by_Department_and_Level.csv


Unnamed: 0,Department,Level,Fee Difference
0,Department not found,Postgraduate,2409.684211
1,Department not found,Undergraduate,13180.0
2,Department of Accounting,Postgraduate,600.0
3,Department of Anthropology,Postgraduate,8304.0
4,Department of Economic History,Postgraduate,8304.0
5,Department of Gender Studies,Postgraduate,8304.0
6,Department of Geography and Environment,Postgraduate,6328.0
7,Department of Government,Postgraduate,8304.0
8,Department of Health Policy,Postgraduate,5264.0
9,Department of International Development,Postgraduate,4152.0


Processed data for 2022.
Grouped data for 2022 saved to: Test/Grouped_Fees2022_by_Department_and_Level.csv


Unnamed: 0,Department,Level,Fee Difference
0,Department not found,Postgraduate,2761.655172
1,Department not found,Undergraduate,14080.0
2,Department of Accounting,Postgraduate,624.0
3,Department of Anthropology,Postgraduate,8640.0
4,Department of Economic History,Postgraduate,8640.0
5,Department of Gender Studies,Postgraduate,8640.0
6,Department of Geography and Environment,Postgraduate,6536.0
7,Department of Government,Postgraduate,0.0
8,Department of Health Policy,Postgraduate,5472.0
9,Department of International Development,Postgraduate,4320.0


Processed data for 2023.
Grouped data for 2023 saved to: Test/Grouped_Fees2023_by_Department_and_Level.csv


Unnamed: 0,Department,Level,Fee Difference
0,Department not found,Postgraduate,2865.5
1,Department not found,Undergraduate,15626.0
2,Department of Accounting,Postgraduate,648.0
3,Department of Accounting,Undergraduate,16406.0
4,Department of Anthropology,Postgraduate,9480.0
5,Department of Anthropology,Undergraduate,15470.0
6,Department of Economic History,Postgraduate,9480.0
7,Department of Economic History,Undergraduate,16094.0
8,Department of Economics,Undergraduate,17342.0
9,Department of Finance,Undergraduate,17342.0


Processed data for 2024.
Grouped data for 2024 saved to: Test/Grouped_Fees2024_by_Department_and_Level.csv


Unnamed: 0,Department,Level,Fee Difference
0,Department not found,Postgraduate,2964.0
1,Department not found,Undergraduate,17270.0
2,Department of Accounting,Postgraduate,696.0
3,Department of Accounting,Undergraduate,17942.0
4,Department of Anthropology,Postgraduate,10056.0
5,Department of Anthropology,Undergraduate,16934.0
6,Department of Economic History,Postgraduate,6024.0
7,Department of Economic History,Undergraduate,17598.0
8,Department of Economics,Undergraduate,18926.0
9,Department of Finance,Undergraduate,18926.0


In [5]:
###########FINISHED 15:08


import pandas as pd

FinalPath='Test/FinalTableFees.csv'

# Load the CSV files
fees_2018 = pd.read_csv('Test/Grouped_Fees2019_by_Department_and_Level.csv')
fees_2019 = pd.read_csv('Test/Grouped_Fees2019_by_Department_and_Level.csv')
fees_2020 = pd.read_csv('Test/Grouped_Fees2020_by_Department_and_Level.csv')
fees_2021 = pd.read_csv('Test/Grouped_Fees2021_by_Department_and_Level.csv')
fees_2022 = pd.read_csv('Test/Grouped_Fees2022_by_Department_and_Level.csv')
fees_2023 = pd.read_csv('Test/Grouped_Fees2023_by_Department_and_Level.csv')
fees_2024 = pd.read_csv('Test/Grouped_Fees2024_by_Department_and_Level.csv')



# Merge 2023 and 2024 data
merged_fees = pd.merge(fees_2023, fees_2024, on=['Department', 'Level'], suffixes=('_2023', '_2024'))



# Rename the Fee Difference column for 2022 data
fees_2022.rename(columns={'Fee Difference': 'Fee Difference_2022'}, inplace=True)

# Extract the universal undergraduate fee for 2022
universal_undergrad_fee_2022 = fees_2022.loc[
    (fees_2022['Department'] == 'Department not found') & (fees_2022['Level'] == 'Undergraduate'),
    'Fee Difference_2022'
].values[0]

# Merge the 2022 data into the merged 2023 and 2024 data
final_merged_fees = pd.merge(merged_fees, fees_2022[['Department', 'Level', 'Fee Difference_2022']], on=['Department', 'Level'], how='left')

# Fill NaN values for undergraduate levels with the universal undergraduate fee for 2022
final_merged_fees.loc[(final_merged_fees['Level'] == 'Undergraduate') & (final_merged_fees['Fee Difference_2022'].isna()), 'Fee Difference_2022'] = universal_undergrad_fee_2022

# Reorder columns so that 'Fee Difference_2022' is to the left of 'Fee Difference_2023'
column_order = ['Department', 'Level', 'Fee Difference_2022', 'Fee Difference_2023', 'Fee Difference_2024']
final_merged_fees = final_merged_fees[column_order]




fees_2021.rename(columns={'Fee Difference': 'Fee Difference_2021'}, inplace=True)

universal_undergrad_fee_2021 = fees_2021.loc[
    (fees_2021['Department'] == 'Department not found') & (fees_2021['Level'] == 'Undergraduate'),
    'Fee Difference_2021'
].values[0]

final_merged_fees = pd.merge(final_merged_fees, fees_2021[['Department', 'Level', 'Fee Difference_2021']], on=['Department', 'Level'], how='left')

final_merged_fees.loc[(final_merged_fees['Level'] == 'Undergraduate') & (final_merged_fees['Fee Difference_2021'].isna()), 'Fee Difference_2021'] = universal_undergrad_fee_2021

column_order = ['Department', 'Level', 'Fee Difference_2021', 'Fee Difference_2022', 'Fee Difference_2023', 'Fee Difference_2024']
final_merged_fees = final_merged_fees[column_order]



fees_2020.rename(columns={'Fee Difference': 'Fee Difference_2020'}, inplace=True)

universal_undergrad_fee_2020 = fees_2020.loc[
    (fees_2020['Department'] == 'Department not found') & (fees_2020['Level'] == 'Undergraduate'),
    'Fee Difference_2020'
].values[0]

final_merged_fees = pd.merge(final_merged_fees, fees_2020[['Department', 'Level', 'Fee Difference_2020']], on=['Department', 'Level'], how='left')

final_merged_fees.loc[(final_merged_fees['Level'] == 'Undergraduate') & (final_merged_fees['Fee Difference_2020'].isna()), 'Fee Difference_2020'] = universal_undergrad_fee_2020

column_order = ['Department', 'Level', 'Fee Difference_2020', 'Fee Difference_2021','Fee Difference_2022', 'Fee Difference_2023', 'Fee Difference_2024']
final_merged_fees = final_merged_fees[column_order]



fees_2019.rename(columns={'Fee Difference': 'Fee Difference_2019'}, inplace=True)

universal_undergrad_fee_2019 = fees_2019.loc[
    (fees_2019['Department'] == 'Department not found') & (fees_2019['Level'] == 'Undergraduate'),
    'Fee Difference_2019'
].values[0]

final_merged_fees = pd.merge(final_merged_fees, fees_2019[['Department', 'Level', 'Fee Difference_2019']], on=['Department', 'Level'], how='left')

final_merged_fees.loc[(final_merged_fees['Level'] == 'Undergraduate') & (final_merged_fees['Fee Difference_2019'].isna()), 'Fee Difference_2019'] = universal_undergrad_fee_2019

column_order = ['Department', 'Level','Fee Difference_2019', 'Fee Difference_2020', 'Fee Difference_2021','Fee Difference_2022', 'Fee Difference_2023', 'Fee Difference_2024']
final_merged_fees = final_merged_fees[column_order]



fees_2018.rename(columns={'Fee Difference': 'Fee Difference_2018'}, inplace=True)

universal_undergrad_fee_2018 = fees_2018.loc[
    (fees_2018['Department'] == 'Department not found') & (fees_2018['Level'] == 'Undergraduate'),
    'Fee Difference_2018'
].values[0]

final_merged_fees = pd.merge(final_merged_fees, fees_2018[['Department', 'Level', 'Fee Difference_2018']], on=['Department', 'Level'], how='left')

final_merged_fees.loc[(final_merged_fees['Level'] == 'Undergraduate') & (final_merged_fees['Fee Difference_2018'].isna()), 'Fee Difference_2018'] = universal_undergrad_fee_2018

column_order = ['Department', 'Level','Fee Difference_2018','Fee Difference_2019', 'Fee Difference_2020', 'Fee Difference_2021','Fee Difference_2022', 'Fee Difference_2023', 'Fee Difference_2024']
final_merged_fees = final_merged_fees[column_order]


final_merged_fees.to_csv(FinalPath, index=False, encoding='utf-8-sig')


# Display the final table
display(final_merged_fees)


Unnamed: 0,Department,Level,Fee Difference_2018,Fee Difference_2019,Fee Difference_2020,Fee Difference_2021,Fee Difference_2022,Fee Difference_2023,Fee Difference_2024
0,Department not found,Postgraduate,2321.540984,2321.540984,2362.983051,2409.684211,2761.655172,2865.5,2964.0
1,Department not found,Undergraduate,10670.0,10670.0,12320.0,13180.0,14080.0,15626.0,17270.0
2,Department of Accounting,Postgraduate,552.0,552.0,576.0,600.0,624.0,648.0,696.0
3,Department of Accounting,Undergraduate,10670.0,10670.0,12320.0,13180.0,14080.0,16406.0,17942.0
4,Department of Anthropology,Postgraduate,7656.0,7656.0,7968.0,8304.0,8640.0,9480.0,10056.0
5,Department of Anthropology,Undergraduate,10670.0,10670.0,12320.0,13180.0,14080.0,15470.0,16934.0
6,Department of Economic History,Postgraduate,7656.0,7656.0,7968.0,8304.0,8640.0,9480.0,6024.0
7,Department of Economic History,Undergraduate,10670.0,10670.0,12320.0,13180.0,14080.0,16094.0,17598.0
8,Department of Economics,Undergraduate,10670.0,10670.0,12320.0,13180.0,14080.0,17342.0,18926.0
9,Department of Finance,Undergraduate,10670.0,10670.0,12320.0,13180.0,14080.0,17342.0,18926.0
