In [63]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from rapidfuzz import process, fuzz


In [64]:
# Load the dataset
data_path = 'GraduateEmploymentSurvey.csv'
df = pd.read_csv(data_path)

# Display the first few rows
print("Dataset Head:")
print(df.head())

# Check dataset info and summary statistics
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()
print("\nSummary Statistics:")
print(df.describe())



Dataset Head:
   year                        university  \
0  2013  Nanyang Technological University   
1  2013  Nanyang Technological University   
2  2013  Nanyang Technological University   
3  2013  Nanyang Technological University   
4  2013  Nanyang Technological University   

                                          school  \
0  College of Business (Nanyang Business School)   
1  College of Business (Nanyang Business School)   
2  College of Business (Nanyang Business School)   
3  College of Business (Nanyang Business School)   
4                         College of Engineering   

                                        degree employment_rate_overall  \
0                     Accountancy and Business                    97.4   
1  Accountancy (3-yr direct Honours Programme)                    97.1   
2     Business (3-yr direct Honours Programme)                    90.9   
3                       Business and Computing                    87.5   
4                        Aerospa

In [65]:
# List every column name on a single line for quick reference.
print(f"Columns: {df.columns.tolist()}")


Columns: ['year', 'university', 'school', 'degree', 'employment_rate_overall', 'employment_rate_ft_perm', 'basic_monthly_mean', 'basic_monthly_median', 'gross_monthly_mean', 'gross_monthly_median', 'gross_mthly_25_percentile', 'gross_mthly_75_percentile']


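## SMU: standardise the Accountancy and Business degree names (e.g. the "4-years programme" variants)
## NTU: remove the double-degree rows and rename the remaining business-school degrees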

In [66]:
# ----- SMU: standardise Accountancy / Business Management degree names -----
# NOTE(review): the NTU rows in this dataset carry the full university name
# ('Nanyang Technological University'), so the abbreviation 'SMU' on its own
# probably matches no rows. Both spellings are accepted here -- confirm against
# df['university'].unique().
mask_smu = (
    df['university'].isin(['SMU', 'Singapore Management University'])
    & df['school'].str.contains('Accountancy|Business', case=False, na=False)
)

# na=False: a missing degree/school counts as "no match" instead of putting
# NaN into the boolean masks.
has_accountancy = df['degree'].str.contains('Accountancy', case=False, na=False)
has_laude = df['degree'].str.contains('Laude', case=False, na=False)
# Accountancy takes precedence when a title mentions both subjects, which is
# what the sequential renames (Accountancy first) have always produced.
has_business = df['degree'].str.contains('Business', case=False, na=False) & ~has_accountancy

# Vectorized updates for SMU:
mask_acc_laude = mask_smu & has_accountancy & has_laude
df.loc[mask_acc_laude, 'degree'] = 'Accountancy Cum Laude and above'

mask_bus_laude = mask_smu & has_business & has_laude
df.loc[mask_bus_laude, 'degree'] = 'Business Management Cum Laude and above'

mask_acc = mask_smu & has_accountancy & ~has_laude
df.loc[mask_acc, 'degree'] = 'Accountancy'

mask_bus = mask_smu & has_business & ~has_laude
df.loc[mask_bus, 'degree'] = 'Business Management'

# ----- NTU: Remove double degree rows and rename degrees -----
def _is_nbs_row(frame):
    """Return a boolean mask selecting NTU Nanyang Business School rows of `frame`."""
    return (
        (frame['university'] == 'Nanyang Technological University') &
        (frame['school'] == 'College of Business (Nanyang Business School)')
    )

# Double degrees are titled "<A> and <B>". Match 'and' as a whole word so that
# a title which merely contains those three letters is not dropped, and treat
# a missing degree as "not a double degree" (na=False).
mask_double = _is_nbs_row(df) & df['degree'].str.contains(r'\band\b', case=False, na=False)
# .copy() makes the filtered frame independent of the original, so the .loc
# assignments below cannot trigger SettingWithCopyWarning.
df = df[~mask_double].copy()

# The masks are rebuilt on the filtered frame so they share its index.
mask_acc_ntu = _is_nbs_row(df) & df['degree'].str.contains('accountancy', case=False, na=False)
df.loc[mask_acc_ntu, 'degree'] = 'Bachelor of Accountancy (Hons)'

# Evaluated after the accountancy rename, exactly as before.
mask_bus_ntu = _is_nbs_row(df) & df['degree'].str.contains('business', case=False, na=False)
df.loc[mask_bus_ntu, 'degree'] = 'Bachelor of Business (Hons)'

# ----- Persist the updated DataFrame as CSV (index column omitted) -----
df.to_csv('standardized_degrees.csv', index=False)

# Show every distinct degree name present after the renaming steps
print("Updated degrees:", df['degree'].unique(), sep="\n")



Updated degrees:
['Bachelor of Accountancy (Hons)' 'Bachelor of Business (Hons)'
 'Aerospace Engineering' 'Bioengineering'
 'Chemical and Biomolecular Engineering' 'Computer Engineering'
 'Civil Engineering' 'Computer Science'
 'Electrical and Electronic Engineering' 'Environmental Engineering'
 'Information Engineering and Media' 'Materials Engineering'
 'Mechanical Engineering' 'Maritime Studies' 'Art, Design & Media'
 'Chinese' 'Communication Studies' 'Economics' 'English'
 'Linguistics and Multilingual Studies' 'Psychology' 'Sociology'
 'Biomedical Sciences **'
 'Biomedical Sciences (Traditional Chinese Medicine) #'
 'Biological Sciences' 'Chemistry & Biological Chemistry'
 'Mathematics & Economics **' 'Mathematical Science'
 'Physics / Applied Physics' 'Sports Science and Management'
 'Science (with Education)' 'Arts (with Education)' 'Bachelor of Arts'
 'Bachelor of Arts (Hons)' 'Bachelor of Social Sciences'
 'Bachelor of Business Administration'
 'Bachelor of Business Administra

In [67]:
def normalize_degree(text):
    """Return the lower-cased degree title with common degree prefixes removed.

    Null input (None / NaN) yields an empty string. Each pattern is removed
    wherever it occurs in the text, one pattern after another.
    """
    if pd.isnull(text):
        return ""

    # Applied in this order: the specific "bachelor of science/engineering"
    # forms come before the generic "bachelor of" so the subject word goes too.
    prefix_patterns = (
        r"bachelor\s+of\s+science\s+",
        r"bachelor\s+of\s+engineering\s+",
        r"bachelor\s+of\s+",
        r"bsc\s+",
        r"beng\s+",
        r"bachelor'?s\s+of\s+science\s+",
        r"bachelor'?s\s+of\s+engineering\s+",
        r"bachelor'?s\s+of\s+",
    )

    core = text.lower().strip()
    for prefix in prefix_patterns:
        core = re.sub(prefix, "", core)
    return core.strip()

def clean_degree(text):
    """Return a tidied, prefix-free version of a degree title.

    Drops the marker symbols *, # and ^, squeezes every whitespace run down to
    a single space, then hands the result to normalize_degree. Null input
    yields an empty string.
    """
    if pd.isnull(text):
        return ""
    # Footnote-style markers (e.g. "**", "#") carry no subject information.
    without_symbols = re.sub(r'[\*\#\^]+', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    # normalize_degree lower-cases the text and strips the degree prefixes.
    return normalize_degree(single_spaced).strip()

# Derive the normalized degree name for every row
df['normalized_degree'] = df['degree'].map(clean_degree)


# Fuzzy matching is scoped per university: each university name maps to the
# list of canonical degree names collected for it so far.
canonical_dict = defaultdict(list)

def get_canonical(row):
    """Return the canonical degree name for one row, registering it if new.

    The row's normalized degree (title-cased) is compared, using fuzz.ratio,
    against the canonical names already recorded for the same university in
    canonical_dict. A best match scoring 80 or more is reused; otherwise the
    name is appended to that university's list and returned unchanged. The
    result therefore depends on the order in which rows are processed.
    """
    uni = row['university']
    # Title case keeps the stored canonical names uniform.
    course = row['normalized_degree'].title()
    known_names = canonical_dict[uni]

    if known_names:
        # Only this university's canonical names are candidates.
        match, score, _ = process.extractOne(course, known_names, scorer=fuzz.ratio)
        if score >= 80:
            return match

    # Either the first degree seen for this university or no close match.
    known_names.append(course)
    return course

# Resolve each row to its canonical degree name (rows are visited in order).
df['canonical_degree'] = df.apply(get_canonical, axis=1)

# Write the frame, now including the normalized/canonical columns, back to CSV.
df.to_csv("standardized_degrees.csv", index=False)


In [71]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report


# --- Step 2: Clean and normalize the degree text ---
def clean_degree_for_model(text):
    """Return a lower-cased degree title stripped of markers and programme notes.

    Removes, in order: the symbols *, # and ^; parenthesised honours markers
    such as "(hons)" or "(honours)"; any single parenthesised group ending in
    "programme"; and the bare word "honours". Whitespace is collapsed last so
    the removals cannot leave double spaces behind. Null input (None / NaN)
    yields an empty string.
    """
    if pd.isnull(text):
        return ""
    text = text.lower()
    # Remove unwanted symbols like *, #, ^
    text = re.sub(r'[\*\#\^]+', '', text)
    # Remove parenthesised honours markers, e.g. "(hons)" or "(honours)"
    text = re.sub(r'\(\s*(?:hons|honours)\s*\)', '', text)
    # Remove programme details. [^()]* keeps the match inside ONE pair of
    # parentheses, so an earlier group such as "(with education)" survives.
    text = re.sub(r'\([^()]*programme\)', '', text)
    # Remove a free-standing "honours" that is not in parentheses
    text = re.sub(r'\bhonours\b', '', text)
    # Collapse whitespace only after all removals, then trim the ends
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Cleaned degree text used for the keyword lookup further down
df['degree_cleaned'] = df['degree'].map(clean_degree_for_model)

# --- Step 3: Create a binary honours flag ---
# 1 when the raw degree title mentions "hons"/"honours" (any case), else 0;
# missing titles count as 0.
df['is_honours'] = (
    df['degree']
    .str.contains('hons|honours', case=False, na=False)
    .astype(int)
)

# --- Step 4: Create the target column 'field_of_study' using a keyword mapping ---
# keyword_to_field maps a lower-case keyword to a field-of-study label.
# map_to_field checks each keyword as a plain substring of the cleaned degree
# text, trying the longest keywords first so that a specific phrase such as
# 'mathematics & economics' wins over 'mathematics' or 'economics'.
# The entries below are grouped by subject area, NOT sorted by length. Because
# the length sort is stable, keywords of equal length are tried in the order
# they are written here -- keep that in mind when reordering or adding entries.
# NOTE(review): short keywords (e.g. 'law', 'civil', 'music') match anywhere
# inside the text, not only on word boundaries.
keyword_to_field = {
    # IT and Computing
    'computer science': 'Computer Science',
    'computing': 'IT',
    'information systems': 'IT',
    'information technology': 'IT',
    'information security': 'Cybersecurity',
    'electronic commerce': 'IT',

    # Engineering disciplines (clear and general enough)
    'aerospace': 'Aerospace Engineering',
    'bioengineering': 'Bioengineering',
    'biomedical engineering': 'Bioengineering',
    'chemical': 'Chemical Engineering',
    'civil': 'Civil Engineering',
    'computer engineering': 'Computer Engineering',
    'electrical': 'Electrical Engineering',
    'environmental engineering': 'Environmental Engineering',
    'industrial and systems engineering': 'Industrial Engineering',
    'industrial engineering': 'Industrial Engineering',
    'systems engineering': 'Systems Engineering',
    'materials': 'Materials Engineering',
    'mechanical': 'Mechanical Engineering',
    'marine': 'Marine Engineering',
    'naval architecture': 'Marine Engineering',
    'mechatronics': 'Mechanical Engineering',
    'engineering science': 'Engineering Science',

    # Sciences (general yet accurate)
    'biomedical sciences': 'Biological Sciences',
    'biological sciences': 'Biological Sciences',
    'chemistry': 'Chemistry',
    'physics': 'Physics',
    'mathematics': 'Mathematics',
    'mathematics & economics': 'Mathematics & Economics',
    'computational biology': 'Computational Biology',
    'environmental earth systems': 'Environmental Science',

    # Business-related degrees
    'accountancy': 'Accountancy',
    'business analytics': 'Business Analytics',
    'business': 'Business',
    'finance': 'Finance',
    'marketing': 'Marketing',
    'supply chain management': 'Supply Chain Management',
    'hospitality': 'Hospitality Management',
    'culinary arts': 'Culinary Arts',

    # Healthcare-related
    'medicine': 'Medicine',
    'surgery': 'Medicine',
    'nursing': 'Nursing',
    'pharmacy': 'Pharmacy',
    'occupational therapy': 'Health Sciences',
    'physiotherapy': 'Health Sciences',
    'radiography': 'Health Sciences',
    'radiation therapy': 'Health Sciences',
    'dentistry': 'Dentistry',
    'nutrition': 'Food Science and Nutrition',
    'traditional chinese medicine': 'Traditional Chinese Medicine',

    # Social sciences and Humanities
    'communication': 'Communication Studies',
    'criminology': 'Criminology',
    'psychology': 'Psychology',
    'public policy': 'Public Policy',
    'global affairs': 'Global Affairs',
    'sociology': 'Sociology',
    'economics': 'Economics',
    'social sciences': 'Social Sciences',
    'linguistics': 'Linguistics',
    'education': 'Education',
    'early childhood': 'Education',
    'human resource': 'Human Resource Management',
    'social work': 'Social Work',

    # Law
    'law': 'Law',

    # Arts and Design
    'architecture': 'Architecture',
    'industrial design': 'Industrial Design',
    'fine arts': 'Fine Arts',
    'digital arts': 'Digital Arts',
    'game design': 'Game Design',
    'music': 'Music',
    'art, design': 'Art & Design',
    'interior design': 'Interior Design',

    # Built environment & Real estate
    'real estate': 'Real Estate',
    'facilities management': 'Facilities Management',

    # General fields and fallback
    'sports science': 'Sports Science',
    'maritime': 'Maritime Studies',
    'history': 'History',
    'english': 'English',
    'philosophy': 'Philosophy',
    'chinese': 'Chinese Studies',
    'applied science': 'Applied Science',
    'environmental studies': 'Environmental Studies',
}

# Keywords ordered longest-first so that more specific phrases are tried before
# shorter ones. Sorted once here instead of on every map_to_field call; the
# sort is stable, so equal-length keywords keep their dictionary order.
_KEYWORDS_LONGEST_FIRST = sorted(keyword_to_field, key=len, reverse=True)


def map_to_field(degree):
    """Map a degree title to a field-of-study label via keyword lookup.

    Parameters
    ----------
    degree : str
        Degree text; it is lower-cased before matching.

    Returns
    -------
    str
        The label of the first (longest) keyword from keyword_to_field that
        occurs as a substring of the text, or 'Other' when no keyword matches
        or the input is not a string (e.g. None / NaN).
    """
    if not isinstance(degree, str):
        return 'Other'
    degree_lower = degree.lower()
    for keyword in _KEYWORDS_LONGEST_FIRST:
        if keyword in degree_lower:
            return keyword_to_field[keyword]
    return 'Other'


# Apply to DataFrame (single definition and single assignment; the notebook
# previously repeated both verbatim).
df['field_of_study'] = df['degree_cleaned'].apply(map_to_field)

# --- Step 5: (Optional) Inspect the distinct field-of-study labels produced ---
print("Unique standardized field of study labels:", df['field_of_study'].unique(), sep="\n")

# --- Step 6: Export the full frame, then preview the key columns ---
df.to_csv('poggers.csv', index=False)
# First ten rows, restricted to the columns relevant to the field mapping.
print(df.head(10)[['university', 'degree', 'degree_cleaned', 'is_honours', 'field_of_study']])



Unique standardized field of study labels:
['Accountancy' 'Business' 'Aerospace Engineering' 'Bioengineering'
 'Chemical Engineering' 'Computer Engineering' 'Civil Engineering'
 'Computer Science' 'Electrical Engineering' 'Environmental Engineering'
 'Other' 'Materials Engineering' 'Mechanical Engineering'
 'Maritime Studies' 'Art & Design' 'Chinese Studies'
 'Communication Studies' 'Economics' 'English' 'Linguistics' 'Psychology'
 'Sociology' 'Biological Sciences' 'Traditional Chinese Medicine'
 'Chemistry' 'Mathematics & Economics' 'Physics' 'Sports Science'
 'Education' 'Social Sciences' 'Computational Biology' 'IT' 'Medicine'
 'Architecture' 'Industrial Design' 'Facilities Management' 'Real Estate'
 'Engineering Science' 'Industrial Engineering' 'Law' 'Nursing' 'Music'
 'Applied Science' 'Pharmacy' 'Game Design' 'Digital Arts'
 'Interior Design' 'Marine Engineering' 'Food Science and Nutrition'
 'Culinary Arts' 'Health Sciences' 'Hospitality Management'
 'Environmental Studies' 'Hi