In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from rapidfuzz import process, fuzz


In [11]:
# Load the dataset
data_path = 'GraduateEmploymentSurvey.csv'
df = pd.read_csv(data_path)

# Display the first few rows
print("Dataset Head:")
print(df.head())

# Check dataset info and summary statistics
print("\nDataset Info:")
print(df.info())
print("\nSummary Statistics:")
print(df.describe())



Dataset Head:
   year                        university  \
0  2013  Nanyang Technological University   
1  2013  Nanyang Technological University   
2  2013  Nanyang Technological University   
3  2013  Nanyang Technological University   
4  2013  Nanyang Technological University   

                                          school  \
0  College of Business (Nanyang Business School)   
1  College of Business (Nanyang Business School)   
2  College of Business (Nanyang Business School)   
3  College of Business (Nanyang Business School)   
4                         College of Engineering   

                                        degree employment_rate_overall  \
0                     Accountancy and Business                    97.4   
1  Accountancy (3-yr direct Honours Programme)                    97.1   
2     Business (3-yr direct Honours Programme)                    90.9   
3                       Business and Computing                    87.5   
4                        Aerospa

In [14]:
print("Columns:", df.columns.tolist())


Columns: ['year', 'university', 'school', 'degree', 'employment_rate_overall', 'employment_rate_ft_perm', 'basic_monthly_mean', 'basic_monthly_median', 'gross_monthly_mean', 'gross_monthly_median', 'gross_mthly_25_percentile', 'gross_mthly_75_percentile', 'cleaned_degree', 'canonical_degree']


In [26]:

# Clean the course names
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower().strip()
    text = re.sub(r'[\*\#\^]+', '', text)  # remove special symbols
    text = re.sub(r'\s+', ' ', text)       # collapse extra spaces
    return text

df['cleaned_degree'] = df['degree'].apply(clean_text)

# We'll keep a separate list of canonical names for each university
canonical_dict = defaultdict(list)

def get_canonical(row):
    """
    For each row, only compare the 'cleaned_degree' to the canonical names
    that belong to that row's university. This prevents merging across unis.
    """
    uni = row['university']
    course = row['cleaned_degree'].title()

    # If no canonical names yet for this university, just add this course
    if not canonical_dict[uni]:
        canonical_dict[uni].append(course)
        return course

    # Otherwise, find the best match among this uni's canonical names
    match, score, _ = process.extractOne(course, canonical_dict[uni], scorer=fuzz.ratio)
    if score >= 80:
        # If it's similar enough, reuse that canonical name
        return match
    else:
        # Otherwise, treat it as a new canonical name
        canonical_dict[uni].append(course)
        return course

# Apply the fuzzy matching row-by-row
df['canonical_degree'] = df.apply(get_canonical, axis=1)

# Now each university has its own set of canonical names.
# For example, let's group and see the unique canonical degrees per uni
grouped = df.groupby('university')['canonical_degree'].unique()
grouped

display(df[['university', 'degree', 'canonical_degree']].head(10))

# 3) Write the entire DataFrame to Excel (or CSV).
#    This will include the 'canonical_degree' column.
df.to_csv("standardized_degrees.csv", index=False)


Unnamed: 0,university,degree,canonical_degree
0,Nanyang Technological University,Accountancy and Business,Accountancy And Business
1,Nanyang Technological University,Accountancy (3-yr direct Honours Programme),Accountancy (3-Yr Direct Honours Programme)
2,Nanyang Technological University,Business (3-yr direct Honours Programme),Accountancy (3-Yr Direct Honours Programme)
3,Nanyang Technological University,Business and Computing,Business And Computing
4,Nanyang Technological University,Aerospace Engineering,Aerospace Engineering
5,Nanyang Technological University,Bioengineering,Bioengineering
6,Nanyang Technological University,Chemical and Biomolecular Engineering,Chemical And Biomolecular Engineering
7,Nanyang Technological University,Computer Engineering,Computer Engineering
8,Nanyang Technological University,Civil Engineering,Civil Engineering
9,Nanyang Technological University,Computer Science,Computer Science


In [16]:
# Define a cleaning function for course names
def clean_text(text):
    if pd.isnull(text):
        return ""
    # Lowercase, strip whitespace, and remove symbols like *, #, ^
    text = text.lower().strip()
    text = re.sub(r'[\*\#\^]+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

# Create a cleaned version of the 'degree' column
df['cleaned_degree'] = df['degree'].apply(clean_text)

# Group by 'university' and 'year' to see unique cleaned course names per group
grouped_clean = df.groupby(['university', 'year'])['cleaned_degree'].unique()
print("Unique course names by university and year (cleaned):")
print(grouped_clean)

# Now, use RapidFuzz to create canonical names automatically.
# We'll maintain a list of canonical names and, for each course, check if a similar name exists.
canonical_names = []

def get_canonical(course):
    # Convert course name to title case for uniformity
    course_title = course.title()
    if not canonical_names:
        canonical_names.append(course_title)
        return course_title
    # Find the best match among the canonical names using a threshold of 80
    match, score, _ = process.extractOne(course_title, canonical_names, scorer=fuzz.ratio)
    if score >= 80:
        return match
    else:
        canonical_names.append(course_title)
        return course_title

# Apply the canonicalization to the cleaned degree names
df['canonical_degree'] = df['cleaned_degree'].apply(get_canonical)

# Group by 'university' and 'year' using the canonical names
grouped_canonical = df.groupby(['university', 'year'])['canonical_degree'].unique()
print("\nStandardized course names by university and year (canonical):")
print(grouped_canonical)

Unique course names by university and year (cleaned):
university                                     year
Nanyang Technological University               2013    [accountancy and business, accountancy (3-yr d...
                                               2014    [aerospace engineering, accountancy and busine...
                                               2015    [accountancy and business, accountancy (3-yr d...
                                               2016    [accountancy, accountancy and business, busine...
                                               2017    [bachelor of accountancy (hons), double degree...
                                               2018    [accountancy, accountancy and business, busine...
                                               2019    [accountancy, accountancy and business, busine...
                                               2020    [accountancy, accountancy and business, busine...
                                               2021   

In [21]:
# 1) Convert the grouped Series to a DataFrame
df_grouped = grouped_canonical.reset_index()

# Show entire column content without truncation
pd.set_option('display.max_colwidth', None)


df_grouped['courses_str'] = df_grouped['canonical_degree'].apply(lambda x: ', '.join(x))
df_grouped = df_grouped.drop(columns='canonical_degree')
df_grouped



Unnamed: 0,university,year,courses_str
0,Nanyang Technological University,2013,"Accountancy And Business, Accountancy (3-Yr Direct Honours Programme), Business And Computing, Aerospace Engineering, Bioengineering, Chemical And Biomolecular Engineering, Computer Engineering, Civil Engineering, Computer Science, Electrical And Electronic Engineering, Environmental Engineering, Information Engineering And Media, Materials Engineering, Mechanical Engineering, Maritime Studies, Art, Design & Media, Chinese, Communication Studies, Economics, English, Linguistics And Multilingual Studies, Psychology, Sociology, Biomedical Sciences , Biomedical Sciences (Traditional Chinese Medicine) , Chemistry & Biological Chemistry, Mathematics & Economics , Mathematical Science, Physics / Applied Physics, Sports Science And Management, Science (With Education), Arts (With Education)"
1,Nanyang Technological University,2014,"Aerospace Engineering, Accountancy And Business, Accountancy (3-Yr Direct Honours Programme), Business And Computing, Bioengineering, Chemical And Biomolecular Engineering, Computer Engineering, Civil Engineering, Computer Science, Electrical And Electronic Engineering, Environmental Engineering, Information Engineering And Media, Materials Engineering, Mechanical Engineering, Maritime Studies, Art, Design & Media, Chinese, Communication Studies, Economics, English, Linguistics And Multilingual Studies, Psychology, Sociology, Biomedical Sciences , Biomedical Sciences (Traditional Chinese Medicine) , Chemistry & Biological Chemistry, Mathematics & Economics , Mathematical Science, Physics / Applied Physics, Sports Science And Management, Science (With Education), Arts (With Education)"
2,Nanyang Technological University,2015,"Accountancy And Business, Accountancy (3-Yr Direct Honours Programme), Business And Computing, Aerospace Engineering, Aerospace Engineering And Economics , Bioengineering, Business And Computer Engineering , Chemical And Biomolecular Engineering, Computer Engineering, Civil Engineering, Computer Science, Electrical And Electronic Engineering, Environmental Engineering, Environmental Engineering And Economics , Information Engineering And Media, Materials Engineering, Mechanical Engineering, Maritime Studies, Art, Design & Media, Chinese, Communication Studies, Economics, English, Linguistics And Multilingual Studies, Psychology, Sociology, Biomedical Sciences (Traditional Chinese Medicine) , Biomedical Sciences , Chemistry & Biological Chemistry, Mathematics & Economics , Mathematical Science, Physics / Applied Physics, Sports Science And Management, Science (With Education), Arts (With Education)"
3,Nanyang Technological University,2016,"Accountancy, Accountancy And Business, Business, Business And Computing, Aerospace Engineering, Aerospace Engineering And Economics , Bioengineering, Business And Computer Engineering , Chemical And Biomolecular Engineering, Civil Engineering, Computer Engineering, Computer Science, Electrical And Electronic Engineering, Environmental Engineering, Information Engineering And Media, Maritime Studies, Materials Engineering, Mechanical Engineering, Art, Design & Media, Chinese, Communication Studies, Economics, English, History, Linguistics And Multilingual Studies, Psychology, Public Policy And Global Affairs , Sociology, Biomedical Sciences (Traditional Chinese Medicine) , Biomedical Sciences , Chemistry & Biological Chemistry, Mathematical Science, Mathematics & Economics , Physics / Applied Physics, Sports Science And Management, Arts (With Education), Science (With Education)"
4,Nanyang Technological University,2017,"Bachelor Of Accountancy (Hons), Double Degree In Bachelor Of Accountancy (Hons) And Bachelor Of Business (Hons), Bachelor Of Business (Hons), Double Degree In Bachelor Of Business (Hons) And Bachelor Of Engineering (Hons) (Computer Science), Bachelor Of Engineering (Bioengineering), Double Degree In Bachelor Of Engineering (Hons) (Aerospace Engineering) And Bachelor Of Arts (Hons) In Economics , Bachelor Of Engineering (Hons) (Chemical And Biomolecular Engineering), Bachelor Of Engineering (Hons) (Computer Science), Bachelor Of Engineering (Hons) (Environmental Engineering), Bachelor Of Science (Hons) (Maritime Studies), Bachelor Of Arts (Hons) In Chinese, Bachelor Of Communication Studies (Hons), Bachelor Of Arts (Hons) In Linguistics And Multilingual Studies, Bachelor Of Arts (Hons) In Psychology, Bachelor Of Arts (Hons) In Public Policy And Global Affairs, Double Degree In Bachelor Of Science (Hons) In Biomedical Sciences And Bachelor Of Medicine (Chinese Medicine) , Bachelor Of Science (Hons) In Biological Sciences, Bachelor Of Science (Hons) In Chemistry & Biological Chemistry, Bachelor Of Science (Hons) In Mathematics & Economics, Bachelor Of Science (Hons) In Physics/Applied Physics, Bachelor Of Science (Hons) (Sport Science & Management), Bachelor Of Arts (Hons) (Education)"
5,Nanyang Technological University,2018,"Accountancy, Accountancy And Business, Business, Business And Computing, Aerospace Engineering, Aerospace Engineering And Economics , Bioengineering, Business And Computer Engineering , Chemical And Biomolecular Engineering, Chemical & Biomolecular Engineering And Economics , Civil Engineering, Computer Engineering, Computer Science, Electrical And Electronic Engineering, Environmental Engineering, Environmental Engineering And Economics , Information Engineering And Media, Maritime Studies, Materials Engineering, Mechanical Engineering, Art, Design & Media, Chinese, Communication Studies, Economics, English, History, Linguistics And Multilingual Studies, Philosophy, Psychology, Public Policy And Global Affairs , Sociology, Biomedical Sciences , Biomedical Sciences (Traditional Chinese Medicine) , Chemistry & Biological Chemistry, Environmental Earth Systems Sciences , Mathematical Science, Mathematics & Economics , Physics / Applied Physics, Arts (With Education), Science (With Education), Sports Science And Management, Medicine"
6,Nanyang Technological University,2019,"Accountancy, Accountancy And Business, Business, Business And Computing, Aerospace Engineering, Aerospace Engineering And Economics , Bioengineering, Business And Computer Engineering , Chemical And Biomolecular Engineering, Chemical & Biomolecular Engineering And Economics , Civil Engineering, Environmental Engineering And Economics , Computer Engineering, Computer Science, Electrical And Electronic Engineering, Environmental Engineering, Information Engineering And Media, Maritime Studies, Materials Engineering, Mechanical Engineering, Art, Design & Media, Chinese, Communication Studies, Economics, English, History, Linguistics And Multilingual Studies, Philosophy, Psychology, Public Policy And Global Affairs , Sociology, Biomedical Sciences , Biomedical Sciences (Traditional Chinese Medicine) , Chemistry & Biological Chemistry, Environmental Earth Systems Sciences , Mathematical Science, Mathematics & Economics , Physics / Applied Physics, Sports Science And Management, Arts (With Education), Science (With Education), Medicine"
7,Nanyang Technological University,2020,"Accountancy, Accountancy And Business, Business, Double Degree In Business And Computer Engineering/Computing, Aerospace Engineering, Bioengineering, Chemical And Biomolecular Engineering, Civil Engineering, Computer Engineering, Computer Science, Double Degree In Engineering And Economics , Electrical And Electronic Engineering, Environmental Engineering, Information Engineering And Media, Maritime Studies, Materials Engineering, Mechanical Engineering, Art, Design & Media, Chinese, Communication Studies, Economics, English, History, Inter-Disciplinary Double Major , Linguistics And Multilingual Studies, Philosophy, Psychology, Public Policy And Global Affairs , Sociology, Biomedical Sciences , Biomedical Sciences (Traditional Chinese Medicine) , Chemistry & Biological Chemistry, Environmental Earth Systems Sciences , Mathematical Science, Mathematics & Economics , Physics / Applied Physics, Sports Science And Management, Arts (With Education), Science (With Education), Medicine"
8,Nanyang Technological University,2021,"Accountancy, Accountancy And Business, Business, Double Degree In Business And Computer Engineering/Computing, Aerospace Engineering, Bioengineering, Chemical And Biomolecular Engineering, Civil Engineering, Computer Engineering, Computer Science, Double Degree In Engineering And Economics , Electrical And Electronic Engineering, Environmental Engineering, Information Engineering And Media, Maritime Studies, Materials Engineering, Mechanical Engineering, Art, Design & Media, Chinese, Communication Studies, Economics, English, History, Inter-Disciplinary Double Major , Linguistics And Multilingual Studies, Philosophy, Psychology, Public Policy And Global Affairs , Sociology, Biological Sciences / Biomedical Sciences, Biomedical Sciences (Traditional Chinese Medicine) , Chemistry & Biological Chemistry, Environmental Earth Systems Sciences , Mathematical Sciences / Mathematical Sciences And Economics, Physics / Applied Physics, Sports Science And Management, Arts (With Education), Science (With Education), Medicine"
9,Nanyang Technological University,2022,"Accountancy, Accountancy And Business, Business, Double Degree In Business And Computer Engineering/Computing, Aerospace Engineering, Bioengineering, Chemical And Biomolecular Engineering, Civil Engineering, Computer Engineering, Computer Science, Data Science And Artificial Intelligence , Double Degree In Engineering And Economics , Electrical And Electronic Engineering, Environmental Engineering, Information Engineering And Media, Maritime Studies, Materials Engineering, Mechanical Engineering, Art, Design & Media, Chinese, Communication Studies, Economics, English, History, Inter-Disciplinary Double Major , Linguistics And Multilingual Studies, Philosophy, Psychology, Public Policy And Global Affairs , Sociology, Biological Sciences / Biomedical Sciences, Biological Sciences And Psychology , Biomedical Sciences (Traditional Chinese Medicine) , Chemistry & Biological Chemistry, Environmental Earth Systems Sciences , Mathematical Sciences / Mathematical Sciences And Economics, Physics / Applied Physics, Arts (With Education), Science (With Education), Sports Science And Management, Medicine"
