import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Scrape, for every department and every year 2010-2023, the result count
# shown by the LSE ePrints advanced search, and store one CSV row per
# department. The CSV is rewritten after each department so that partial
# progress survives an interruption.
import os

search_url = "https://eprints.lse.ac.uk/cgi/search/advanced"
# Form URL used when returning to the search form between yearly queries.
archive_search_url = 'https://eprints.lse.ac.uk/cgi/search/archive/advanced'
# Must match the path announced in the completion message below.
output_csv_path = 'Data/department_yearly_results.csv'
scrape_years = range(2010, 2024)

# Column layout of the results table
columns = ['Department'] + [str(year) for year in scrape_years]

# List of departments
departments = [
    'Geography & Environment', 'Philosophy, Logic and Scientific Method', 
    'Psychological and Behavioural Science', 'Government', 'Law', 
    'Social Policy', 'Mathematics', 'Economic History', 'Sociology', 
    'International History', 'Statistics', 'Management', 'International Relations', 
    'Anthropology', 'Economics', 'Language Centre', 'Accounting', 'Finance', 
    'Methodology', 'School of Public Policy', 
    'European Institute', 'Media and Communications', 'Health Policy', 
    'International Development', 'Gender Studies'
]

os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

collected_rows = []
results_df = pd.DataFrame(columns=columns)

# Setup ChromeDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    # Process each department
    for department in departments:
        row_data = {'Department': department}
        driver.get(search_url)  # Start every department from a fresh search form
        try:
            Select(driver.find_element(By.ID, "divisions")).select_by_visible_text(department)
            available = True
        except NoSuchElementException:
            available = False
            print(f"Department {department} not found.")

        if available:
            for year in scrape_years:
                try:
                    wait = WebDriverWait(driver, 5)
                    date_input = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[name='date']")))
                    date_input.clear()
                    date_input.send_keys(str(year))
                    date_input.send_keys(Keys.RETURN)

                    # Wait for the page to load and scrape the total results
                    total_results_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "span.ep_search_number")))
                    total_results = total_results_elements[-1].text  # Get the text of the last element
                    row_data[str(year)] = total_results
                    print(f"Results for {department} in {year}: {total_results}")
                except NoSuchElementException:
                    row_data[str(year)] = 'Element not found'
                    print(f"Element not found for {department} in {year}.")
                except TimeoutException:
                    row_data[str(year)] = 'Timeout or no results'
                    print(f"Timeout or no results for {department} in {year}.")

                # Go back to the search form and re-select the department for
                # the next year. A failure here must not abort the whole run.
                try:
                    driver.get(archive_search_url)
                    Select(driver.find_element(By.ID, "divisions")).select_by_visible_text(department)
                except NoSuchElementException:
                    print(f"Could not re-select {department} after {year}; skipping its remaining years.")
                    break

        # Add this department's row and save incrementally
        collected_rows.append(row_data)
        results_df = pd.DataFrame(collected_rows, columns=columns)
        results_df.to_csv(output_csv_path, index=False)
finally:
    # Close the browser even when scraping fails part-way through
    driver.quit()

print("Data scraping completed and saved to 'Data/department_yearly_results.csv'.")


###########FINISHED 15:08


import pandas as pd
import pdfplumber
import csv

def find_next_containing_row(data_frame, start_index, column_index, text):
    """Return the position of the first row whose cell contains ``text``.

    Rows are scanned by position, starting at ``start_index`` and running to
    the end of ``data_frame``. Only the cell in column ``column_index`` is
    inspected.

    Args:
        data_frame: DataFrame to search.
        start_index: Positional row index at which the scan starts.
        column_index: Positional index of the column to inspect.
        text: Substring to look for.

    Returns:
        The positional row index of the first match, or ``None`` when no cell
        in the scanned range contains ``text``.
    """
    for idx in range(start_index, len(data_frame)):
        cell_content = data_frame.iloc[idx, column_index]
        # Test for missing values BEFORE converting to text: str(NaN) is the
        # literal 'nan', which would otherwise match substrings such as 'an'.
        if pd.notna(cell_content) and text in str(cell_content):
            return idx
    return None

def find_first_non_empty_cell_and_extract_fee(data_frame, start_index, column_index):
    """Extract the digits of the first word of the next non-empty cell.

    At most ten rows are examined, starting at ``start_index``. Missing
    values and blank (empty or whitespace-only) cells are skipped. Cells are
    expected to hold strings.

    Args:
        data_frame: DataFrame to search.
        start_index: Positional row index at which the scan starts.
        column_index: Positional index of the column to inspect.

    Returns:
        A string made of the digit characters of the first whitespace-separated
        word of the first non-blank cell (e.g. ``'£9,250 per year'`` gives
        ``'9250'``), or ``None`` when every examined cell is blank.
    """
    last_row = min(start_index + 10, len(data_frame))
    for idx in range(start_index, last_row):
        cell_content = data_frame.iloc[idx, column_index]
        if pd.isna(cell_content):
            continue
        words = cell_content.split()
        if not words:
            # Whitespace-only cell: nothing to extract, keep looking.
            continue
        # Filtering on digits already removes thousands separators.
        return ''.join(filter(str.isdigit, words[0]))
    print("No non-empty cell found within the specified range.")
    return None

def clean_course_name(course_name):
    """Return the course name as text with 'MSc in' shortened to 'MSc'.

    The argument is converted with ``str`` first; leading and trailing
    whitespace is removed from the result.
    """
    as_text = str(course_name)
    shortened = as_text.replace('MSc in', 'MSc')
    return shortened.strip()

def integrate_and_process_data(year):
    """Attach departments to one year's cleaned fees and average the fee gap.

    Reads ``Test/CleanedFees{year}.csv`` and ``data/PhilipOutput.csv``, labels
    every course with its department, computes ``Overseas Fee - Home Fee`` per
    course, averages that difference per (Department, Level) and writes the
    result to ``Test/Grouped_Fees{year}_by_Department_and_Level.csv``.

    Courses without a matching entry in the department table are labelled
    ``"Department not found"``.

    Args:
        year: Year used to build the input and output file names.
    """
    cleaned_fees_output_path = f'Test/CleanedFees{year}.csv'
    grouped_output_path = f'Test/Grouped_Fees{year}_by_Department_and_Level.csv'

    # Load the cleaned fees data and the department information
    cleaned_fees_df = pd.read_csv(cleaned_fees_output_path)
    department_info_df = pd.read_csv('data/PhilipOutput.csv')

    # Build the course -> department lookup once instead of scanning the
    # department table for every course. setdefault keeps the first match,
    # which is what a filter followed by .iloc[0] would return.
    department_by_course = {}
    stripped_names = department_info_df['Course Name'].str.strip()
    for known_course, known_department in zip(stripped_names, department_info_df['Department']):
        if isinstance(known_course, str):
            department_by_course.setdefault(known_course, known_department)

    def find_department(course_name):
        # A blank course name is read back from CSV as NaN (a float).
        if not isinstance(course_name, str):
            return "Department not found"
        return department_by_course.get(course_name.strip(), "Department not found")

    # Add a new 'Department' column to the cleaned fees DataFrame
    cleaned_fees_df['Department'] = cleaned_fees_df['Course'].apply(find_department)

    # Convert the fee columns to numbers; unparsable values become NaN
    cleaned_fees_df['Home Fee'] = pd.to_numeric(cleaned_fees_df['Home Fee'], errors='coerce')
    cleaned_fees_df['Overseas Fee'] = pd.to_numeric(cleaned_fees_df['Overseas Fee'], errors='coerce')

    # Calculate the fee difference and add it as a new column
    cleaned_fees_df['Fee Difference'] = cleaned_fees_df['Overseas Fee'] - cleaned_fees_df['Home Fee']

    # Average fee difference for each (Department, Level) group
    grouped_data = cleaned_fees_df.groupby(['Department', 'Level']).agg({
        'Fee Difference': 'mean'
    }).reset_index()

    # Save the grouped data to a new CSV file
    grouped_data.to_csv(grouped_output_path, index=False, encoding='utf-8-sig')

    # Report where the file was saved
    print(f"Grouped data for {year} saved to:", grouped_output_path)

    # Display the grouped data for verification (`display` is not defined in
    # this file; presumably the IPython/Jupyter helper)
    display(grouped_data)

def extract_tables_from_pdf(pdf_path, output_csv_path):
    """Write the first three columns of every table row in a PDF to a CSV.

    Every row of every table on every page becomes one CSV row of exactly ten
    fields: the row's first three cells (padded with empty strings when the
    row is shorter) followed by seven empty fields.

    Args:
        pdf_path: Path of the PDF to read.
        output_csv_path: Path of the CSV file to create or overwrite.
    """
    kept_columns = 3
    total_columns = 10
    collected_rows = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                for cells in table:
                    leading = list(cells[:kept_columns])
                    leading += [''] * (kept_columns - len(leading))
                    collected_rows.append(leading + [''] * (total_columns - kept_columns))
        with open(output_csv_path, 'w', newline='') as csvfile:
            csv.writer(csvfile).writerows(collected_rows)

def _digits_only(value):
    """Return the digit characters of ``value``; '' when ``value`` is None."""
    if value is None:
        return ''
    return ''.join(filter(str.isdigit, value))


# For each year: dump the fee PDF to CSV, pull out per-course home/overseas
# fees, save them, then group them by department and level.
for year in range(2018, 2025):  # from 2018 to 2024
    pdf_path = f'Data/TuitionFees/Fees{year}.pdf'
    output_csv_path = f'Test/Fees{year}.csv'
    updated_all_courses_output_path = f'Test/CleanedFees{year}.csv'

    extract_tables_from_pdf(pdf_path, output_csv_path)
    fees_df = pd.read_csv(output_csv_path, encoding='latin-1')

    # The general undergraduate fees are taken from the first non-empty cell
    # below the "Home" / "Overseas" header cells. A missing header must not
    # crash the whole year with `None + 1`.
    home_fee_index = find_next_containing_row(fees_df, 0, 1, "Home")
    if home_fee_index is None:
        print(f"No 'Home' header found for {year}; undergraduate home fee left empty.")
        undergrad_home_fee = None
    else:
        undergrad_home_fee = find_first_non_empty_cell_and_extract_fee(fees_df, home_fee_index + 1, 1)

    overseas_fee_index = find_next_containing_row(fees_df, 0, 2, "Overseas")
    if overseas_fee_index is None:
        print(f"No 'Overseas' header found for {year}; undergraduate overseas fee left empty.")
        undergrad_overseas_fee = None
    else:
        undergrad_overseas_fee = find_first_non_empty_cell_and_extract_fee(fees_df, overseas_fee_index + 1, 2)

    has_undergraduate = False
    all_courses_fees_detailed = []
    for index, row in fees_df.iterrows():
        first_cell = row.iloc[0]
        if isinstance(first_cell, str) and any(x in first_cell for x in ['BSc', 'BA', 'MSc']):
            course_name = clean_course_name(first_cell.split(' - ')[0])
            level = "Postgraduate" if "MSc" in course_name else "Undergraduate"
            if level == "Undergraduate":
                has_undergraduate = True
            # Undergraduate rows without their own home fee use the general one.
            home_fee = row.iloc[1] if not pd.isna(row.iloc[1]) else (undergrad_home_fee if level == "Undergraduate" else '')
            # NOTE(review): the overseas fee has no equivalent fallback to
            # undergrad_overseas_fee - confirm this asymmetry is intended.
            overseas_fee = row.iloc[2] if not pd.isna(row.iloc[2]) else ''
            all_courses_fees_detailed.append(
                [course_name, _digits_only(home_fee), _digits_only(overseas_fee), level]
            )

    # When no individual undergraduate course was listed, record the general
    # undergraduate fees as one catch-all row.
    if not has_undergraduate:
        all_courses_fees_detailed.append(["Undergraduate Courses", undergrad_home_fee, undergrad_overseas_fee, "Undergraduate"])

    all_courses_df_detailed = pd.DataFrame(all_courses_fees_detailed, columns=['Course', 'Home Fee', 'Overseas Fee', 'Level'])
    all_courses_df_detailed.to_csv(updated_all_courses_output_path, index=False, encoding='utf-8-sig')

    print(f"Processed data for {year}.")

    # Integrate and process the data for the current year
    integrate_and_process_data(year)


In [None]:
###########FINISHED 15:08


import pandas as pd

# Build one wide table with the average fee difference per department and
# level for every year 2018-2024, and save it to FinalPath.
FinalPath = 'Test/FinalTableFees.csv'
grouped_fees_template = 'Test/Grouped_Fees{year}_by_Department_and_Level.csv'

# Load the CSV files (each year from its own file)
fees_2018 = pd.read_csv(grouped_fees_template.format(year=2018))
fees_2019 = pd.read_csv(grouped_fees_template.format(year=2019))
fees_2020 = pd.read_csv(grouped_fees_template.format(year=2020))
fees_2021 = pd.read_csv(grouped_fees_template.format(year=2021))
fees_2022 = pd.read_csv(grouped_fees_template.format(year=2022))
fees_2023 = pd.read_csv(grouped_fees_template.format(year=2023))
fees_2024 = pd.read_csv(grouped_fees_template.format(year=2024))

key_columns = ['Department', 'Level']

# Merge 2023 and 2024 data; only (Department, Level) pairs present in both
# years are kept, and they define the rows of the final table.
merged_fees = pd.merge(fees_2023, fees_2024, on=key_columns, suffixes=('_2023', '_2024'))
final_merged_fees = merged_fees

# Add the earlier years one by one, newest first, each as a new leftmost
# fee column.
earlier_years = [
    (2022, fees_2022),
    (2021, fees_2021),
    (2020, fees_2020),
    (2019, fees_2019),
    (2018, fees_2018),
]
for fee_year, yearly_fees in earlier_years:
    fee_column = f'Fee Difference_{fee_year}'
    yearly_fees.rename(columns={'Fee Difference': fee_column}, inplace=True)

    # The catch-all undergraduate row ("Department not found") carries the
    # universal undergraduate fee difference for that year.
    universal_undergrad_rows = yearly_fees.loc[
        (yearly_fees['Department'] == 'Department not found') & (yearly_fees['Level'] == 'Undergraduate'),
        fee_column
    ]

    final_merged_fees = pd.merge(
        final_merged_fees, yearly_fees[key_columns + [fee_column]], on=key_columns, how='left'
    )

    # Fill missing undergraduate values with the universal fee. If that year
    # has no catch-all row, the gaps are left as NaN instead of crashing.
    if not universal_undergrad_rows.empty:
        missing_undergrad = (final_merged_fees['Level'] == 'Undergraduate') & (final_merged_fees[fee_column].isna())
        final_merged_fees.loc[missing_undergrad, fee_column] = universal_undergrad_rows.iloc[0]

    # Keep the fee columns in chronological order
    column_order = key_columns + [f'Fee Difference_{y}' for y in range(fee_year, 2025)]
    final_merged_fees = final_merged_fees[column_order]


final_merged_fees.to_csv(FinalPath, index=False, encoding='utf-8-sig')


# Display the final table
display(final_merged_fees)


In [None]:

# Tidy the fee table for the later joins: drop the catch-all rows, shorten
# the labels and keep only the years 2019-2023.
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not act on a slice of the previous table.
final_merged_fees = final_merged_fees[final_merged_fees['Department'] != "Department not found"].copy()
final_merged_fees['Department'] = final_merged_fees['Department'].str.replace("Department of ", "", regex=False)

second_column_name = final_merged_fees.columns[1]  # Adjust the index if your column order is different
final_merged_fees[second_column_name] = final_merged_fees[second_column_name].str.replace("Undergraduate", "UG", regex=False)
final_merged_fees[second_column_name] = final_merged_fees[second_column_name].str.replace("Postgraduate", "PG", regex=False)

final_merged_fees.columns = final_merged_fees.columns.str.replace("_", " ", regex=False)

# Drop the first and last year by name rather than by position, so the cell
# can be re-run and does not depend on the column order.
final_merged_fees.drop(columns=['Fee Difference 2018', 'Fee Difference 2024'], errors='ignore', inplace=True)

final_merged_fees.reset_index(drop=True, inplace=True)

final_merged_fees


In [None]:
# Load the EU entrances table and preview it.
eu_entrances_csv = "Data/EU_Entrances_Question_Three.csv"
entrances_df_eu = pd.read_csv(eu_entrances_csv)
entrances_df_eu

In [None]:
# Join fee differences with entrance counts; rows are matched on department
# and level, and only pairs present in both tables are kept.
merge_keys = ['Department', 'Level']
extra_eu_fees_merged = final_merged_fees.merge(
    entrances_df_eu, on=merge_keys, suffixes=('_fees', '_entrances')
)

# The result starts out with just the key columns
extra_eu_fees_result = extra_eu_fees_merged[merge_keys].copy()

# Extra fees per year = fee difference x number of entrances
years = ['2019', '2020', '2021', '2022', '2023']
year_columns = []
for year in years:
    fee_col = f'Fee Difference {year}'
    entrance_col = f'Entrances {year}'
    result_col = f'Extra Fees through EU {year}'
    extra_eu_fees_result[result_col] = extra_eu_fees_merged[fee_col].mul(extra_eu_fees_merged[entrance_col])
    year_columns.append(result_col)

# Cast the computed columns to integers to remove decimals
extra_eu_fees_result[year_columns] = extra_eu_fees_result[year_columns].astype(int)

# Preview the result
extra_eu_fees_result

In [None]:
# Remove the 'Level' column before aggregating per department.
extra_eu_fees_result = extra_eu_fees_result.drop(columns='Level')
extra_eu_fees_result

In [None]:
# Total extra EU fees per department for each year 2019-2023.
eu_fee_columns = [f'Extra Fees through EU {fee_year}' for fee_year in range(2019, 2024)]
new_dataframe = extra_eu_fees_result.groupby('Department')[eu_fee_columns].sum()
new_dataframe

In [None]:
# Load the scraped publication counts and keep only the years 2019 onwards.
publications = pd.read_csv("Data/department_yearly_results.csv")
# Literal replacement, so department names match the spelling with 'and'.
publications['Department'] = publications['Department'].str.replace('&', 'and', regex=False)
years_to_drop = [str(year) for year in range(2010, 2019)]
publications = publications.drop(columns=years_to_drop)

# The scraper stores placeholder text (e.g. 'Timeout or no results') when a
# count could not be read. Turn those cells into NaN so that the year columns
# are numeric for the sums and plots that follow.
publication_year_columns = [name for name in publications.columns if name != 'Department']
publications[publication_year_columns] = publications[publication_year_columns].apply(
    pd.to_numeric, errors='coerce'
)

publications

In [None]:
# Horizontal stacked bar chart of publications per department, 2019-2023,
# with departments ordered by their total over those years.
import matplotlib.pyplot as plt
import numpy as np

plot_years = [str(year) for year in range(2019, 2024)]

# Calculate total publications for each department. Summing the year columns
# by name keeps 'Total' itself out of the sum when this cell is re-run.
publications['Total'] = publications[plot_years].apply(pd.to_numeric, errors='coerce').sum(axis=1)

# Sort by total publications in descending order and renumber the rows
publications = publications.sort_values(by='Total', ascending=False)
publications = publications.reset_index(drop=True)

fig, ax = plt.subplots(figsize=(14, 8))
indices = np.arange(len(publications['Department']))

# Plotting
colors = ['#4c72b0', '#dd8452', '#55a868', '#c44e52', '#8172b2']  # Muted blue, soft terracotta, muted green, dark red, purplish blue

# One array of bar widths per year; non-numeric or missing counts become 0
widths = [
    np.nan_to_num(pd.to_numeric(publications[plot_year], errors='coerce').to_numpy(dtype=float))
    for plot_year in plot_years
]

# Cumulative width for the 'left' argument, so each year's segment starts
# where the previous one ended
cumulative_width = np.zeros(len(publications))

for width, color, plot_year in zip(widths, colors, plot_years):
    ax.barh(indices, width, color=color, label=plot_year, left=cumulative_width)
    cumulative_width += width

ax.set(yticks=indices, yticklabels=publications['Department'])
ax.set_xlabel('Number of Publications')
ax.set_title('Publications by Department over 2019-2023')
ax.legend()

plt.show()


The horizontally stacked bar chart above visualises the number of annual publications of different departments over the years 2019 to 2023. 

While the relative ranking of departments by the number of publications appears fairly stable, there are exceptions with notable increases and fluctuations, particularly around 2021. Departments such as Gender Studies, Law, and Economics showed a significant increase in publications during this year. The increase is visually identifiable by a longer green section compared to the blue and orange ones. On the other hand, some departments' publication output decreased from 2020 to 2021.

This might be a direct consequence of the change in tuition fees in 2021 due to Brexit: the fees that these departments collected might have changed significantly and, as a result, their funding available for research may have increased or decreased accordingly.

However, this analysis is limited in the sense that many other factors could have caused these changes in 2021. In particular, the trend for each department depends on its size, funding, and the nature of the research field itself. It is therefore hard to directly identify the post-Brexit change in tuition fee structure as the cause of this change in trend, and a more rigorous analysis is required.



Now the publication data will be merged with the data on the extra fees that were collected from EU students:

In [None]:
# Combine publication counts with the extra EU fees; only departments that
# appear in both tables are kept.
department_fees_publications_df = publications.merge(new_dataframe, on='Department', how='inner')

department_fees_publications_df

In [None]:

# Current column names
column_names = department_fees_publications_df.columns

# Target layout: 'Department', then for each year the publication column
# followed by that year's extra-fee column
new_column_order = ['Department'] + [
    column_name
    for year in range(2019, 2024)
    for column_name in (str(year), f'Extra Fees through EU {year}')
]

# Apply the new column order
department_fees_publications_df = department_fees_publications_df[new_column_order]
department_fees_publications_df

In [None]:
# Columns named only with digits (the bare years) hold publication counts;
# give them a 'Publications <year>' label.
renamed_columns = {
    column: f'Publications {column}'
    for column in department_fees_publications_df.columns
    if column.isdigit()
}

department_fees_publications_df = department_fees_publications_df.rename(columns=renamed_columns)
department_fees_publications_df

In [None]:
# Remove every column whose name mentions 2019 or 2020
mentions_early_year = department_fees_publications_df.columns.str.contains('2019|2020')
columns_to_drop = department_fees_publications_df.columns[mentions_early_year]
department_fees_publications_df = department_fees_publications_df.loc[:, ~mentions_early_year]
department_fees_publications_df

In [None]:

# Heatmap of the extra EU fees per department for 2021-2023, in millions.
import seaborn as sns
import matplotlib.pyplot as plt

# One row per department, one column per extra-fee year; values are summed
# per department and divided by 1,000,000.
heatmap_data = department_fees_publications_df.pivot_table(index="Department", values=['Extra Fees through EU 2021', 'Extra Fees through EU 2022', 'Extra Fees through EU 2023'], aggfunc='sum') / 1_000_000  # Convert to millions
fig, ax = plt.subplots(figsize=(10, 12))
# Cells are annotated with two decimals; vmax=1.00 is passed as the upper
# bound of the color scale.
heat_map = sns.heatmap(heatmap_data, ax=ax, annot=True, fmt=".2f", cmap='viridis', annot_kws={'size':10}, vmax=1.00)
ax.set_title('Heatmap of Extra Fees by Department and Year (in Millions)', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Department', fontsize=14)
# NOTE(review): the tick labels are hard-coded; this assumes the pivot table
# columns come out in 2021, 2022, 2023 order - confirm.
ax.set_xticklabels(['2021', '2022', '2023'], rotation=0, fontsize=12)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=12)

# Append "M" to every text element of the axes (the cell annotations)
for text in ax.texts:
    text.set_text(text.get_text() + "M")

# Keep only colorbar ticks up to 1.00 and label them with an "M" suffix.
# The ticks are set first so that get_ticks() in the label line returns the
# filtered list.
colorbar = ax.collections[0].colorbar
colorbar.set_label('Extra Fees (Millions M)', fontsize=12)
colorbar.set_ticks([x for x in colorbar.get_ticks() if x <= 1.00])
colorbar.set_ticklabels([f"{x:.2f}M" for x in colorbar.get_ticks()])

plt.show()


The heatmap above provides an initial insight into which departments have benefited most from the changes in tuition fee structures following Brexit. It was generated by calculating the increase in fees paid by EU students, defined as the difference between the overseas fee and the home fee, and then multiplying this by the number of EU entrants to these departments over the specified years. This effectively represents the total increase in tuition fees received from EU students by each department.

The yellow Government cell immediately stands out: it shows that the Government department received an additional 1 million pounds in tuition fees from EU students in 2021. However, this result appears to be an anomaly, as the value declines quickly in the following year before increasing again, so it is hard to interpret more comprehensively.

However as this heatmap does not provide any information on department size it does not make sense to look at absolute values further because it is uncertain whether high numbers are a result of high number of EU entrants or high fees. 

Certain departments, such as Finance, Geography, Economic History and International Development, showed consistent growth throughout these periods. This suggests that these courses have increased in popularity among EU students despite the higher fees they now pay. This is an important metric for LSE, as it can examine what makes these courses more attractive despite the raised fees, to ensure the university can continue attracting good talent.

While it is hard to find more general trends and there seems to be considerable fluctuation across departments, the overall trend can be considered to be increasing. This again implies that LSE continues to attract EU students after the change in tuition fees.

Moreover as the tuition fee structure only changed for students entering in 2021 there is not that much historical data from which trends can be interpreted more clearly. Moreover, it is uncertain how this increase in fees is allocated across departments, for example whether it goes into research, etc. Even more importantly this does not provide any insights into the total fees generated by each department which is arguably the more important metric. For example perhaps following Brexit these departments generated more fees from EU students but overall their collected fees went down due to other reasons.

In [None]:
# Reshape the wide table into long format: one row per department and year.
def _long_rows_for(year):
    """Return the rows of one year with generic column names."""
    publications_col = f'Publications {year}'
    extra_fees_col = f'Extra Fees through EU {year}'
    year_rows = department_fees_publications_df[['Department', publications_col, extra_fees_col]].copy()
    year_rows['Year'] = year
    year_rows = year_rows.rename(columns={publications_col: 'Publications',
                                          extra_fees_col: 'Extra Fees'})
    # Make the label unique per year, e.g. 'Law 2021'
    year_rows['Department'] = year_rows['Department'] + ' ' + str(year)
    return year_rows


# One frame per year
new_rows = [_long_rows_for(year) for year in [2021, 2022, 2023]]

# Stack the yearly frames
long_publications_extraFees_df = pd.concat(new_rows)

# Order by department label and renumber the rows for readability
long_publications_extraFees_df = long_publications_extraFees_df.sort_values(by=['Department']).reset_index(drop=True)

# Show the resulting DataFrame
long_publications_extraFees_df

In [None]:
# The year is already part of the 'Department' label, so drop the column.
long_publications_extraFees_df = long_publications_extraFees_df.drop(columns=['Year'])
long_publications_extraFees_df

As previously mentioned, it is hard to determine how the increased tuition fees generated from EU students are allocated within departments. We will now try to examine this more precisely.

Research notably is a very important component of any university department. We try to proxy for research using the number of publications of each department as a higher number of publications indicates a higher research activity.


Note: only a few years of data are available. The effect on research may also appear with a lag, since research can take a relatively long time, so it might only become visible after a few years.

In [None]:
import statsmodels.api as sm
import plotly.express as px

# Regress publications on extra EU fees and plot the points with the fitted
# OLS trendline. long_publications_extraFees_df is expected to exist already.

# Convert both variables to numbers; unparsable values become NaN
for numeric_column in ('Extra Fees', 'Publications'):
    long_publications_extraFees_df[numeric_column] = pd.to_numeric(
        long_publications_extraFees_df[numeric_column], errors='coerce'
    )

# Drop any rows with missing data after the conversion
long_publications_extraFees_df = long_publications_extraFees_df.dropna()

# Perform linear regression
X = sm.add_constant(long_publications_extraFees_df['Extra Fees'])  # adding a constant
model = sm.OLS(long_publications_extraFees_df['Publications'], X).fit()

# Print the regression summary
print(model.summary())

# Scatter plot with OLS trendline. hover_name puts the department label in
# each point's hover box. This replaces a '%{text}' hover template that was
# applied to traces without text (including the trendline) and a second,
# overplotted marker trace that only existed to carry the labels.
fig = px.scatter(
    long_publications_extraFees_df,
    x='Extra Fees',
    y='Publications',
    trendline="ols",
    hover_name='Department',
    title="Regression of Publications on Extra Fees through EU students",
)

# Update layout for centered title
fig.update_layout(
    title={
        'text': "Regression of Publications on Extra Fees through EU students",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    hovermode="closest"
)

fig.show()


The above regresses the number of publications on the additional fees generated by EU students across departments for the years 2021 to 2023. This is meant to examine whether higher extra fees generated by EU students in a particular year correspond to a higher number of publications by that department in the same year. Continuing the notion of publications as a proxy for research, the idea here is that a positive regression coefficient would imply that these additional fees are allocated towards research.

However, as already seen by just inspecting the dots across the graph, these are scattered without indication of any trend. This is confirmed by the coefficient of extra fees being almost 0 and not significant at all (p-value of 0.961). 

Therefore, no trend can be interpreted from this graph and a higher amount of fees generated by EU students does not seem to result in more/less research. Thus, the data is still not revealing of how these additional fees are allocated. 

At least through this simplified lens, the change in tuition fee structure due to Brexit does not seem to have affected the research output of these departments.