<a href="https://colab.research.google.com/github/alexgaaranes/malaia-group-2/blob/main/MALAIA_Liyab_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MALAIA - Group 2

---
##### Predicting Starting Salaries of Filipino Graduates Using Academic Background and Industry Placement: A Machine Learning Approach Based on the Liyab First Pay Survey

<br>



Cleaning data from [**Liyab First Pay Survey dataset**](https://docs.google.com/spreadsheets/d/1gnA91Tjr_3UCNV8x1_LoE0oC56r-pXXRdJcgTfOLlm0/edit?gid=549575995#gid=549575995)

### Data Prep

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Read data from shared drive
liyab = pd.read_csv("/content/drive/Shareddrives/MALAIA Group 2/liyab_data/liyab.csv")


### Data Cleaning

In [None]:
# Universities
universities = liyab['What school did you graduate from?'].unique()

# Years
years = liyab['What year did you start your first job?'].unique()

# Filter the years
liyab_filtered_years = liyab[liyab['What year did you start your first job?'].between(1987, 2025)]

# Print filtered years and their frequencies
print("Filtered Years and Frequencies")
print(liyab_filtered_years['What year did you start your first job?'].value_counts().sort_index())

# Print unfiltered years and their frequencies
print("\nUnfiltered Years and Frequencies")
print(liyab['What year did you start your first job?'].value_counts().sort_index())


Filtered Years and Frequencies
What year did you start your first job?
1987      1
1992      1
1997      1
1998      2
1999      3
2000      3
2001      2
2002      5
2003      7
2004      6
2005     18
2006     18
2007     18
2008     21
2009     31
2010     42
2011     65
2012     89
2013    118
2014    142
2015    193
2016    291
2017    441
2018    574
2019    649
2020    112
2021     24
2022     27
2023      6
2024     10
2025      2
Name: count, dtype: int64

Unfiltered Years and Frequencies
What year did you start your first job?
2          1
18         1
19         1
20         2
21         1
208        1
209        1
1987       1
1992       1
1997       1
1998       2
1999       3
2000       3
2001       2
2002       5
2003       7
2004       6
2005      18
2006      18
2007      18
2008      21
2009      31
2010      42
2011      65
2012      89
2013     118
2014     142
2015     193
2016     291
2017     441
2018     574
2019     649
2020     112
2021      24
2022      27
20

In [None]:
# Find the matching universities
import difflib
import numpy as np

MIN_RATIO: float = 0.8

uni_copy = universities.copy()
uni_copy = np.array([x for x in uni_copy if isinstance(x, str)])  # remove non-str (empty)

map = {}

for i in range(len(uni_copy)):
  if len(uni_copy) <= 0: break
  subj = uni_copy[0]
  for uni in uni_copy:
    similarity = difflib.SequenceMatcher(None, subj, uni).ratio()
    if similarity >= MIN_RATIO:
      map[str(uni)] = str(subj)
      uni_copy = uni_copy[~np.isin(uni_copy, uni)]

print(f"Length Diff:\n filtered: {len(set(map.values()))}\n orig: {len(universities)}")

Length Diff:
 filtered: 324
 orig: 598


In [None]:
# Replace the input university name with the common name

liyab_copy = liyab.copy()

print(f"Number of rows: {len(liyab_copy)}")

liyab_copy['What school did you graduate from?'] = liyab_copy['What school did you graduate from?'].replace(map)

# Further cleaning needed esp for acronyms and some campuses of the same univ are overwritten
print(f"Number of filtered univs: {len(liyab_copy['What school did you graduate from?'].unique())}")

# Rows with empty university name
print(f"No. of rows with empty univ name: {len(liyab_copy[liyab_copy['What school did you graduate from?'].isna()])}")

# Remove rows without univ name
liyab_copy = liyab_copy[~liyab_copy['What school did you graduate from?'].isna()]

print(f"Current number of rows: {len(liyab_copy)}")

Number of rows: 2933
Number of filtered univs: 325
No. of rows with empty univ name: 167
Current number of rows: 2766


### **Prompted solution**

In [None]:
!pip install rapidfuzz scikit-learn

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0


In [None]:
from rapidfuzz.distance import Levenshtein
from sklearn.cluster import AgglomerativeClustering
import numpy as np

strings = universities

# Levenshtein distance
n = len(strings)
distance_matrix = np.zeros((n, n))

for i in range(n):
    for j in range(n):
        if not (type(strings[i]) is str and type(strings[j]) is str): continue
        if i != j:
            distance_matrix[i][j] = Levenshtein.distance(strings[i], strings[j])
        else:
            distance_matrix[i][j] = 0

# Agglomerative Clustering
clustering = AgglomerativeClustering(
    metric='precomputed',
    linkage='complete',
    distance_threshold=10.0,
    n_clusters=None
)

labels = clustering.fit_predict(distance_matrix)

# Group by cluster label
from collections import defaultdict

clusters = defaultdict(list)
for label, string in zip(labels, strings):
    clusters[label].append(string)

for i, group in clusters.items():
    print(f"Cluster {i + 1}: {group}")


Cluster 21: ['UP Diliman', 'UP Visayas', 'UP diliman', nan, 'Up Diliman', 'UP-Diliman', 'UP Mindanao', 'UP Diliman ', 'Silliman', 'Up diliman', 'UP Diliiman', 'UE Caloocan', 'UP Tacloban', 'UP DIliman']
Cluster 25: ['Mapua University', 'Silliman University', 'Xavier University', 'San Beda University', 'National University', 'Arellano University', 'Adamson University', 'adamson university', 'Mapúa University', 'xavier university', 'Adamson University ', 'Bicol University', 'Silliman University ', 'Mapúa University ', 'New Era University', 'Baliuag University']
Cluster 10: ['DLSU', 'ADMU', 'UST', 'UP', 'Dlscsb', 'AdMU', 'DLS-CSB', 'UPLB', 'UPD', 'Ateneo ', 'Dlsu', 'Letran', 'PUP', 'feu', 'UPB', 'FEU', 'ust', 'DLSU-M', 'PLM', '2007', 'ua&p', 'RTU', 'MHCN', 'Ateneo', 'DLSCSB', 'CSB', 'DLS CSB', 'SSCR', 'APC', 'PWU', 'SISC', 'UP Cebu', 'UE', 'SPCBA', 'MAPUA', 'DLSUD ', 'uste', 'DLSU-D', 'Mapua ', 'Letran ', 'UP ', 'Mapua', 'uppb', 'Iacademy', 'USC', 'Ust', 'MCU', 'CEU', 'DLSUD', 'FEU ', 'US

In [None]:
pd.set_option('display.max_rows', None)

gender_cleaned = liyab['What is your gender?'].str.lower().str.strip()

# Define mapping
gender_map = {
    # FEMALE variants
    'female': 'female',
    'f': 'female',
    'femaile': 'female',
    'femail': 'female',
    'femali': 'female',
    'femalen': 'female',
    'femal': 'female',
    'femalr': 'female',
    'femalw': 'female',
    'femaled': 'female',
    'femae': 'female',
    'feme': 'female',
    'feme': 'female',
    'babae': 'female',
    'cisgender female': 'female',
    'cis female': 'female',
    'women': 'female',
    'woman': 'female',
    'female (cishet)': 'female',
    'biological female': 'female',
    'heterosexual female': 'female',
    'female (queer)': 'female',
    'cisgender-female': 'female',
    'female, cisgender': 'female',
    'cis woman/female': 'female',
    'frmale': 'female',
    '*sex = female': 'female',

    # MALE variants
    'male': 'male',
    'm': 'male',
    'make': 'male',
    'man': 'male',
    'cisgender male': 'male',
    'cis male': 'male',
    'male cisgender': 'male',
    'heterosexual male': 'male',
    'homosexual man': 'male',
    'males': 'male',
    'mqle': 'male',
    'norzagaray collegemale': 'male',

     # LGBTQ+
    'gay': 'lgbtq',
    'lesbian': 'lgbtq',
    'queer': 'lgbtq',
    'bisexual': 'lgbtq',
    'bisexual woman': 'lgbtq',
    'bisexual female': 'lgbtq',
    'cis-gender, pansexual, masculine': 'lgbtq',
    'nonbinary': 'lgbtq',
    'non-binary': 'lgbtq',
    'nb': 'lgbtq',
    'gender fluid': 'lgbtq',
    'non-conforming': 'lgbtq',
    'non-conforming male': 'lgbtq',
    'non-binary, presenting mainly as male': 'lgbtq',

    # PREFER NOT TO SAY
    'prefer not to say': 'prefer not to say',
    'prefer not to mention': 'prefer not to say',

    # OTHERs
    'tired potato': 'other',
    '21': 'other',
    '24': 'other',
    'philippines': 'other',
    'pogi': 'other',
}

# Apply mapping with fallback
gender_cleaned = gender_cleaned.replace(gender_map)

print(gender_cleaned.value_counts())

What is your gender?
female               1775
male                  995
lgbtq                  24
other                   5
prefer not to say       2
homosexual              1
homosexual male         1
Name: count, dtype: int64


In [None]:
universities_count = liyab['What school did you graduate from?'].value_counts().sort_index()
print(universities_count)

What school did you graduate from?
 De La Salle University-Dasmariñas                    1
(deferred) University of the Philippines - Diliman    1
2007                                                  1
2019                                                  1
2022                                                  1
                                                     ..
uplb                                                  1
uppb                                                  1
ust                                                   3
uste                                                  1
xavier university                                     1
Name: count, Length: 597, dtype: int64


In [None]:
import re

In [None]:
universities_formatted = liyab['What school did you graduate from?'].astype(str).str.lower().str.strip()
universities_formatted = universities_formatted.apply(lambda x: re.sub(r'[^a-z0-9\s]', '', x))

print(universities_formatted.value_counts())

What school did you graduate from?
up diliman                     359
ateneo de manila university    218
nan                            167
ust                            150
university of santo tomas      110
                              ... 
university of makati             1
st scholasticas college          1
universidad de manila            1
adnu                             1
velez college                    1
Name: count, Length: 460, dtype: int64


### Industry Cleaning

In [None]:
import re
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# === STEP 1: Normalize responses ===
def normalize(s):
    if not isinstance(s, str): return ''
    s = s.lower().strip()
    s = re.sub(r'[^a-z0-9\s]', '', s)  # remove punctuation
    s = re.sub(r'\s+', ' ', s)         # remove extra spaces
    return s

# Normalize and deduplicate entries
normalized_series = liyab['In what industry was this job?'].dropna().apply(normalize)
unique_entries = normalized_series.unique()

# === STEP 2: Define master categories ===
master_categories = [
    "Accountancy, banking and finance",
    "Business, consulting and management",
    "Charity and voluntary work",
    "Creative arts and design",
    "Energy and utilities",
    "Engineering and manufacturing",
    "Environment and agriculture",
    "Healthcare",
    "Hospitality and events management",
    "Information technology",
    "Law",
    "Law enforcement and security",
    "Leisure, sport and tourism",
    "Marketing, advertising and PR",
    "Media and internet",
    "Property and construction",
    "Public services and administration",
    "Recruitment and HR",
    "Retail",
    "Sales",
    "Science and pharmaceuticals",
    "Social care",
    "Teacher training and education",
    "Transport and logistics"
]

# === STEP 3: Load SentenceTransformer model ===
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed master categories and industry entries
category_embeddings = model.encode(master_categories)
entry_embeddings = model.encode(unique_entries)

# === STEP 4: Assign each entry to the closest master category ===
entry_to_category = {}
for i, entry_vector in enumerate(entry_embeddings):
    similarities = cosine_similarity([entry_vector], category_embeddings)[0]
    best_category_idx = np.argmax(similarities)
    best_category = master_categories[best_category_idx]
    entry_to_category[unique_entries[i]] = best_category

# === STEP 5: Manual overrides for known misclassifications ===
manual_overrides = {
    # "nonprofitdevelopment work": "Government & NGO",
    # Add more overrides here if needed
}

# Apply overrides
for key, override_cat in manual_overrides.items():
    entry_to_category[key] = override_cat

# === STEP 6: Group entries by category ===
category_map = defaultdict(list)
for entry, category in entry_to_category.items():
    category_map[category].append(entry)

# === STEP 7: Output grouped entries ===
for category, entries in category_map.items():
    print(f"\n== {category} ==")
    for entry in entries:
        print(f"- {entry}")



== Accountancy, banking and finance ==
- banking
- fintech
- financial services
- finance financial technology
- finance
- financial markets
- financial research
- audit
- audit firm
- auditing
- finance bank
- financial services banking
- fund management
- auditing firm
- banking industry
- accounting firm
- financial service
- bank
- financing
- financial industry
- bank customer service
- accounting audit firm
- accounting and finance
- auditaccounting
- banking and finance
- accounting
- rural bank
- financial data bpo
- audit and accounting
- public accounting
- financial institution
- financial
- banking finance
- fund accounting shared services
- bankingfinance
- financial shared service
- financial services audit firm
- accountingconsultancy
- banking it
- financeshared services
- accounting auditassurance
- finance audit
- financial servcies
- accountingaudit
- banking institution
- financial servicesbanking
- finance accounting
- accounting consulting firm banking financial 