In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from rapidfuzz import process, fuzz


In [11]:
# Load the dataset
data_path = 'GraduateEmploymentSurvey.csv'
df = pd.read_csv(data_path)

# Display the first few rows
print("Dataset Head:")
print(df.head())

# Check dataset info and summary statistics
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print("\nSummary Statistics:")
print(df.describe())



Dataset Head:
   year                        university  \
0  2013  Nanyang Technological University   
1  2013  Nanyang Technological University   
2  2013  Nanyang Technological University   
3  2013  Nanyang Technological University   
4  2013  Nanyang Technological University   

                                          school  \
0  College of Business (Nanyang Business School)   
1  College of Business (Nanyang Business School)   
2  College of Business (Nanyang Business School)   
3  College of Business (Nanyang Business School)   
4                         College of Engineering   

                                        degree employment_rate_overall  \
0                     Accountancy and Business                    97.4   
1  Accountancy (3-yr direct Honours Programme)                    97.1   
2     Business (3-yr direct Honours Programme)                    90.9   
3                       Business and Computing                    87.5   
4                        Aerospa

In [14]:
# List every column name currently present in the DataFrame.
column_names = list(df.columns)
print("Columns:", column_names)


Columns: ['year', 'university', 'school', 'degree', 'employment_rate_overall', 'employment_rate_ft_perm', 'basic_monthly_mean', 'basic_monthly_median', 'gross_monthly_mean', 'gross_monthly_median', 'gross_mthly_25_percentile', 'gross_mthly_75_percentile', 'cleaned_degree', 'canonical_degree']


In [28]:

# Clean the course names
def clean_text(text):
    """Normalise a raw degree name so that spelling variants compare equal.

    The text is lower-cased, the marker symbols ``*``, ``#`` and ``^`` are
    removed, runs of whitespace are collapsed to a single space, and leading
    and trailing whitespace is trimmed.

    Parameters
    ----------
    text : str or missing value
        Raw degree name. Anything ``pd.isnull`` reports as missing
        (``None``, ``NaN``) yields an empty string.

    Returns
    -------
    str
        The cleaned name, or ``""`` for a missing input.
    """
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = re.sub(r'[\*\#\^]+', '', text)  # remove special symbols
    text = re.sub(r'\s+', ' ', text)       # collapse extra spaces
    # Trim last: removing a trailing marker (e.g. "business #") exposes the
    # space that preceded it, which an earlier strip() would have left behind.
    return text.strip()

# Derive the normalised degree name for every row (element-wise over 'degree').
df['cleaned_degree'] = df['degree'].map(clean_text)

# Per-university registry: university name -> list of canonical degree names
# seen so far, in first-seen order.
canonical_dict = defaultdict(list)

def get_canonical(row):
    """
    Return the canonical degree name for one row.

    The row's 'cleaned_degree' (title-cased) is compared only against the
    canonical names already recorded for the same 'university', so names are
    never merged across universities. If the best fuzz.ratio score is at
    least 90 the existing canonical name is reused; otherwise the title-cased
    name is recorded as a new canonical name and returned.
    """
    university = row['university']
    candidate = row['cleaned_degree'].title()
    known_names = canonical_dict[university]

    # Only consult the fuzzy matcher when there is something to compare with.
    if known_names:
        best, similarity, _ = process.extractOne(candidate, known_names, scorer=fuzz.ratio)
        if similarity >= 90:
            # Close enough to an existing canonical name: reuse it.
            return best

    # First name for this university, or no sufficiently similar name yet.
    known_names.append(candidate)
    return candidate

# Apply the fuzzy matching row-by-row. Row order matters: the first spelling
# seen for a university becomes the canonical name later rows are matched to.
df['canonical_degree'] = df.apply(get_canonical, axis=1)

# Each university now has its own set of canonical names; group to see the
# unique canonical degrees per university.
grouped = df.groupby('university')['canonical_degree'].unique()
# A bare `grouped` is only rendered when it is the last expression of a cell,
# so display it explicitly.
display(grouped)

display(df[['university', 'degree', 'canonical_degree']].head(10))

# Write the entire DataFrame, including the 'canonical_degree' column, to CSV.
df.to_csv("standardized_degrees.csv", index=False)


Unnamed: 0,university,degree,canonical_degree
0,Nanyang Technological University,Accountancy and Business,Accountancy And Business
1,Nanyang Technological University,Accountancy (3-yr direct Honours Programme),Accountancy (3-Yr Direct Honours Programme)
2,Nanyang Technological University,Business (3-yr direct Honours Programme),Business (3-Yr Direct Honours Programme)
3,Nanyang Technological University,Business and Computing,Business And Computing
4,Nanyang Technological University,Aerospace Engineering,Aerospace Engineering
5,Nanyang Technological University,Bioengineering,Bioengineering
6,Nanyang Technological University,Chemical and Biomolecular Engineering,Chemical And Biomolecular Engineering
7,Nanyang Technological University,Computer Engineering,Computer Engineering
8,Nanyang Technological University,Civil Engineering,Civil Engineering
9,Nanyang Technological University,Computer Science,Computer Science
