<a href="https://colab.research.google.com/github/alexgaaranes/malaia-group-2/blob/main/MALAIA_Liyab_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MALAIA - Group 2\n\n---\n##### Predicting Starting Salaries of Filipino Graduates Using Academic Background and Industry Placement: A Machine Learning Approach Based on the Liyab First Pay Survey\n\n<br>\n\nCleaning data from [**Liyab First Pay Survey dataset**](https://docs.google.com/spreadsheets/d/1gnA91Tjr_3UCNV8x1_LoE0oC56r-pXXRdJcgTfOLlm0/edit?gid=549575995#gid=549575995)

### Data Prep and Initial Exploration

In [None]:
# Resolve where 'liyab.csv' lives.
# On Colab the shared Google Drive is mounted and the file is read from there;
# anywhere else the notebook falls back to a path relative to the working directory.
COLAB_CSV_PATH = "/content/drive/Shareddrives/MALAIA Group 2/liyab_data/liyab.csv"  # edit if the Drive layout differs
LOCAL_CSV_PATH = "liyab.csv"  # edit if the local copy lives elsewhere

try:
    from google.colab import drive

    drive.mount('/content/drive')
    csv_path = COLAB_CSV_PATH
except ModuleNotFoundError:
    # google.colab is only importable inside a Colab runtime.
    print("Not running in Colab. Assuming 'liyab.csv' is in the current directory or accessible via a local path.")
    csv_path = LOCAL_CSV_PATH

In [None]:
import pandas as pd
import numpy as np
import re

# Load the survey export located by the previous cell (`csv_path`).
# A missing file is reported and replaced by an empty DataFrame so that the
# later cells, which all guard on `liyab.empty`, skip their work instead of
# raising NameError.
try:
    liyab = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"Error: The file {csv_path} was not found. Please check the path.")
    liyab = pd.DataFrame()
else:
    # Only announce success when the read actually succeeded; previously this
    # message was also printed (with shape (0, 0)) after a failed load.
    print(f"Successfully loaded data. Shape: {liyab.shape}")

In [None]:
# First look at the raw survey frame: schema, null counts, a sample of rows
# and the exact column names used by the cleaning cells below.
print("Initial Data Information:")
if liyab.empty:
    print("DataFrame is empty. Cannot display info.")
else:
    liyab.info()
    overview_sections = (
        ("\nMissing Values per Column:", liyab.isnull().sum()),
        ("\nFirst 5 Rows:", liyab.head()),
        ("\nColumn Names:", liyab.columns.tolist()),
    )
    for section_title, section_body in overview_sections:
        print(section_title)
        print(section_body)

### Data Cleaning

#### 1. Year Started First Job

In [None]:
year_col = 'What year did you start your first job?'

# Plausible range for the year a respondent started their first job.
MIN_FIRST_JOB_YEAR = 1987
MAX_FIRST_JOB_YEAR = 2025

if not liyab.empty and year_col in liyab.columns:
    print(f"Original value counts for '{year_col}':")
    print(liyab[year_col].value_counts().sort_index().head(10)) # Show some problematic ones if any

    original_rows = len(liyab)

    # Non-numeric answers become NaN; `between` is False for NaN, so one mask
    # drops both unparseable and out-of-range years.
    parsed_years = pd.to_numeric(liyab[year_col], errors='coerce')
    has_valid_year = parsed_years.between(MIN_FIRST_JOB_YEAR, MAX_FIRST_JOB_YEAR)

    # Build a fresh frame instead of writing into a filtered slice: the old
    # `liyab = liyab[mask]` followed by `liyab[col] = ...` triggered
    # SettingWithCopyWarning and relied on in-place mutation.
    liyab = liyab.loc[has_valid_year].assign(
        **{year_col: parsed_years[has_valid_year].astype(int)}
    )

    print(f"\nRows removed due to invalid/outside range year: {original_rows - len(liyab)}")
    print(f"Cleaned value counts for '{year_col}':")
    print(liyab[year_col].value_counts().sort_index())
else:
    print(f"Column '{year_col}' not found or DataFrame is empty.")

#### 2. Gender Cleaning

In [None]:
gender_col = 'What is your gender?'
if not liyab.empty and gender_col in liyab.columns:
    print(f"Original value counts for '{gender_col}':")
    print(liyab[gender_col].value_counts().head(10))

    # Free-text answers (already lower-cased and stripped) grouped by the
    # category they are assigned to.
    gender_variants = {
        'Female': [
            'female', 'f', 'femaile', 'femail', 'femali', 'femalen', 'femal',
            'femalr', 'femalw', 'femaled', 'femae', 'feme', 'babae',
            'cisgender female', 'cis female', 'women', 'woman',
            'female (cishet)', 'biological female', 'heterosexual female',
            'female (queer)', 'cisgender-female', 'female, cisgender',
            'cis woman/female', 'frmale', '*sex = female',
        ],
        'Male': [
            'male', 'm', 'make', 'man', 'cisgender male', 'cis male',
            'male cisgender', 'heterosexual male',
            'homosexual man',  # categorised by gender identity primarily
            'males', 'mqle',
            'norzagaray collegemale',  # seen in earlier exploration, likely a data entry error
            'homosexual male',
        ],
        'LGBTQ+': [
            'lgbtq', 'gay', 'lesbian', 'queer', 'bisexual', 'bisexual woman',
            'bisexual female', 'cis-gender, pansexual, masculine',
            'nonbinary', 'non-binary', 'nb', 'gender fluid', 'non-conforming',
            'non-conforming male', 'non-binary, presenting mainly as male',
            'homosexual',  # general answer with no man/woman qualifier
        ],
        'Prefer not to say': [
            'prefer not to say', 'prefer not to mention',
        ],
        'Other': [
            'tired potato',
            'pogi',  # humorous entry
        ],
    }

    # Flatten to the variant -> category lookup.
    gender_map = {
        variant: category
        for category, variants in gender_variants.items()
        for variant in variants
    }
    allowed_genders = ['Female', 'Male', 'LGBTQ+', 'Prefer not to say', 'Other']

    normalized_gender = liyab[gender_col].astype(str).str.lower().str.strip()

    # Anything without an explicit mapping (including missing answers, which
    # `astype(str)` turns into 'nan') is consolidated into 'Other'.
    liyab['Cleaned Gender'] = normalized_gender.map(gender_map).fillna('Other')

    # The original column is kept alongside 'Cleaned Gender'; drop or rename it
    # downstream if only the cleaned version is needed.

    print("\nCleaned value counts for 'Cleaned Gender':")
    print(liyab['Cleaned Gender'].value_counts())
else:
    print(f"Column '{gender_col}' not found or DataFrame is empty.")

#### 3. University Cleaning

In [None]:
uni_col = 'What school did you graduate from?'

# --- Normalisation vocabulary -------------------------------------------------
# Applied per whitespace-separated token, so 'st' -> 'saint' can no longer
# corrupt words such as 'west' or 'first' the way substring replacement did.
UNI_ABBREVIATIONS = {
    'univ': 'university',
    'inst': 'institute',
    'st': 'saint',
    'sta': 'santa',
    'tech': 'technology',
    'sci': 'science',
}
# Connective words dropped before matching. 'the' and 'ng' are included so that
# 'university of the philippines ...' and 'pamantasan ng lungsod ng maynila'
# reduce to the forms the patterns below expect. 'los' is deliberately NOT
# dropped because the 'los banos' patterns rely on it.
UNI_STOP_WORDS = {'de', 'la', 'and', 'of', 'the', 'ng'}
# Fold the accented letters that occur in school names (banos, mapua, dasmarinas).
UNI_ACCENT_TABLE = str.maketrans('ñáéíóúü', 'naeiouu')

# Ordered regex -> canonical name lookup. Patterns are searched against the
# NORMALISED free-text answer and the FIRST match wins, so specific patterns
# must appear before the generic catch-alls of the same family.
university_map = {
    # UP System
    r'.*university philippines diliman.*': 'University of the Philippines Diliman',
    r'.*\bup diliman.*': 'University of the Philippines Diliman',
    r'^upd$': 'University of the Philippines Diliman',
    r'.*university philippines los banos.*': 'University of the Philippines Los Baños',
    r'.*\bup los banos.*': 'University of the Philippines Los Baños',
    r'^uplb$': 'University of the Philippines Los Baños',
    r'.*university philippines manila.*': 'University of the Philippines Manila',
    r'.*\bup manila.*': 'University of the Philippines Manila',  # \b keeps 'pup manila' / 'tup manila' out
    r'^upm$': 'University of the Philippines Manila',
    r'.*university philippines visayas.*': 'University of the Philippines Visayas',
    r'.*\bup visayas.*': 'University of the Philippines Visayas',
    r'^upv$': 'University of the Philippines Visayas',
    r'.*university philippines cebu.*': 'University of the Philippines Cebu',
    r'.*\bup cebu.*': 'University of the Philippines Cebu',
    r'.*university philippines baguio.*': 'University of the Philippines Baguio',
    r'.*\bup baguio.*': 'University of the Philippines Baguio',
    r'^upb$': 'University of the Philippines Baguio',
    r'.*university philippines mindanao.*': 'University of the Philippines Mindanao',
    r'.*\bup mindanao.*': 'University of the Philippines Mindanao',
    r'.*university philippines open university.*': 'University of the Philippines Open University',
    r'.*\bup open univ.*': 'University of the Philippines Open University',
    r'^upou$': 'University of the Philippines Open University',
    # Anchored at the start so '<something> university of the philippines'
    # (Polytechnic, Technological, ...) is not swallowed by the UP catch-all.
    r'^university philippines.*': 'University of the Philippines (Unspecified Campus)',
    r'^up$': 'University of the Philippines (Unspecified Campus)',

    # Ateneo System
    r'.*ateneo manila university.*': 'Ateneo de Manila University',
    r'^admu$': 'Ateneo de Manila University',
    r'.*ateneo davao university.*': 'Ateneo de Davao University',
    r'^addu$': 'Ateneo de Davao University',
    r'.*ateneo zamboanga university.*': 'Ateneo de Zamboanga University',
    r'^adzu$': 'Ateneo de Zamboanga University',
    r'.*ateneo naga university.*': 'Ateneo de Naga University',
    r'^adnu$': 'Ateneo de Naga University',
    r'.*xavier university ateneo cagayan.*': 'Xavier University - Ateneo de Cagayan',
    r'.*xavier university.*': 'Xavier University - Ateneo de Cagayan',
    r'.*ateneo cagayan.*': 'Xavier University - Ateneo de Cagayan',
    r'.*ateneo.*': 'Ateneo de Manila University',  # unspecified campus defaults to Manila

    # De La Salle System
    r'.*salle university manila.*': 'De La Salle University Manila',
    r'.*salle manila.*': 'De La Salle University Manila',
    r'^dlsum$': 'De La Salle University Manila',
    r'^dlsu$': 'De La Salle University Manila',
    r'.*salle college saint benilde.*': 'De La Salle-College of Saint Benilde',
    r'.*salle csb.*': 'De La Salle-College of Saint Benilde',
    r'^csb$': 'De La Salle-College of Saint Benilde',
    r'^benilde$': 'De La Salle-College of Saint Benilde',
    r'.*salle lipa.*': 'De La Salle Lipa',
    r'^dlsl$': 'De La Salle Lipa',
    r'.*salle university dasmarinas.*': 'De La Salle University Dasmariñas',
    r'^dlsud$': 'De La Salle University Dasmariñas',
    r'.*salle medical health sciences institute.*': 'De La Salle Medical and Health Sciences Institute',
    r'.*salle university.*': 'De La Salle University Manila',
    r'.*salle.*': 'De La Salle University Manila',  # unspecified campus defaults to Manila

    # UST
    r'.*university santo tomas.*': 'University of Santo Tomas',
    r'^ust$': 'University of Santo Tomas',

    # Mapua
    r'.*mapua institute technology.*': 'Mapúa University',
    r'.*mapua university.*': 'Mapúa University',
    r'^mapua$': 'Mapúa University',

    # PUP
    r'.*polytechnic university philippines.*': 'Polytechnic University of the Philippines',
    r'^pup$': 'Polytechnic University of the Philippines',

    # Common Universities (Expanded)
    r'.*adamson university.*': 'Adamson University',
    r'.*far eastern university.*': 'Far Eastern University', '^feu$': 'Far Eastern University',
    r'.*lyceum philippines university.*': 'Lyceum of the Philippines University', '^lpu$': 'Lyceum of the Philippines University',
    r'.*miriam college.*': 'Miriam College',
    r'.*national university.*': 'National University', '^nu$': 'National University',
    r'.*pamantasan lungsod maynila.*': 'Pamantasan ng Lungsod ng Maynila', '^plm$': 'Pamantasan ng Lungsod ng Maynila',
    r'.*san beda university.*': 'San Beda University', r'.*san beda college.*': 'San Beda University', '^sbu$': 'San Beda University', '^sbc$': 'San Beda University',
    r'.*silliman university.*': 'Silliman University',
    r'.*technological institute philippines.*': 'Technological Institute of the Philippines', '^tip$': 'Technological Institute of the Philippines',
    r'.*technological university philippines.*': 'Technological University of the Philippines', '^tup$': 'Technological University of the Philippines',
    r'.*university east\b.*': 'University of the East', '^ue$': 'University of the East',  # \b keeps 'university eastern ...' out
    r'.*university san carlos.*': 'University of San Carlos', '^usc$': 'University of San Carlos',
    r'.*saint louis university.*': 'Saint Louis University Baguio', '^slu$': 'Saint Louis University Baguio',
    r'.*central philippine university.*': 'Central Philippine University', '^cpu$': 'Central Philippine University',
    # Western Mindanao State University must precede the Mindanao State University patterns.
    r'.*western mindanao state university.*': 'Western Mindanao State University', '^wmsu$': 'Western Mindanao State University',
    r'.*mindanao state university iligan institute technology.*': 'Mindanao State University - Iligan Institute of Technology',
    r'.*msu iit.*': 'Mindanao State University - Iligan Institute of Technology',
    r'.*mindanao state university.*': 'Mindanao State University (Unspecified Campus)', '^msu$': 'Mindanao State University (Unspecified Campus)',
    r'.*holy angel university.*': 'Holy Angel University', '^hau$': 'Holy Angel University',
    # '^ub$' replaces the old '.*ub.*', which matched any name containing "ub".
    r'.*university baguio.*': 'University of Baguio', r'^ub$': 'University of Baguio',
    r'.*university makati.*': 'University of Makati', '^umak$': 'University of Makati',
    r'.*cebu institute technology.*': 'Cebu Institute of Technology - University', '^cit u.*': 'Cebu Institute of Technology - University', '^cit$': 'Cebu Institute of Technology - University',
    r'.*university cebu.*': 'University of Cebu',
    r'.*university perpetual help system dalta.*': 'University of Perpetual Help System DALTA',
    r'.*uphsd.*': 'University of Perpetual Help System DALTA',
    r'.*university perpetual help.*': 'University of Perpetual Help System DALTA',
    r'.*asia pacific college.*': 'Asia Pacific College', '^apc$': 'Asia Pacific College',
    r'.*enderun colleges.*': 'Enderun Colleges',
    r'.*iacademy.*': 'iACADEMY',
    r'.*sti college.*': 'STI College', '^sti$': 'STI College',
    r'.*ama computer university.*': 'AMA Computer University', r'.*ama computer college.*': 'AMA Computer University', '^ama$': 'AMA Computer University',
    r'.*assumption college.*': 'Assumption College (San Lorenzo/Makati)',
    r'.*centro escolar university.*': 'Centro Escolar University', '^ceu$': 'Centro Escolar University',
    r'.*colegio san juan letran.*': 'Colegio de San Juan de Letran', r'.*letran.*': 'Colegio de San Juan de Letran',
    r'.*don bosco technical college.*': 'Don Bosco Technical College', r'.*don bosco.*': 'Don Bosco Technical College',
    r'.*holy name university.*': 'Holy Name University',
    r'.*jose rizal university.*': 'Jose Rizal University', '^jru$': 'Jose Rizal University',
    r'.*malayan colleges laguna.*': 'Malayan Colleges Laguna', '^mcl$': 'Malayan Colleges Laguna',
    r'.*manila central university.*': 'Manila Central University', '^mcu$': 'Manila Central University',
    r'.*our lady fatima university.*': 'Our Lady of Fatima University', '^olfu$': 'Our Lady of Fatima University',
    r'.*philippine normal university.*': 'Philippine Normal University', '^pnu$': 'Philippine Normal University',
    r'.*philippine womens university.*': 'Philippine Women\'s University', '^pwu$': 'Philippine Women\'s University',
    r'.*rizal technological university.*': 'Rizal Technological University', '^rtu$': 'Rizal Technological University',
    r'.*san sebastian college recoletos.*': 'San Sebastian College - Recoletos', r'.*san sebastian.*': 'San Sebastian College - Recoletos',
    # 'st' is expanded to 'saint' during normalisation, so only the 'saint ...' spelling can occur here.
    r'.*saint pauls? university.*': 'St. Paul University (Unspecified Campus)',
    r'.*saint scholastica.*': 'St. Scholastica\'s College Manila',
    r'.*trinity university asia.*': 'Trinity University of Asia',
    r'.*university san agustin.*': 'University of San Agustin',
    r'.*university san jose recoletos.*': 'University of San Jose - Recoletos',
    r'.*university immaculate concep[ct]ion.*': 'University of the Immaculate Conception',
    r'.*university mindanao.*': 'University of Mindanao',
    r'.*university northern philippines.*': 'University of Northern Philippines',
    r'.*university rizal system.*': 'University of Rizal System',
    r'.*university southeastern philippines.*': 'University of Southeastern Philippines', '^usep$': 'University of Southeastern Philippines',
    # West Visayas State University must precede the Visayas State University patterns.
    r'.*west visayas state university.*': 'West Visayas State University', '^wvsu$': 'West Visayas State University',
    r'.*visayas state university.*': 'Visayas State University', '^vsu$': 'Visayas State University',
    r'.*bulacan state university.*': 'Bulacan State University', '^bsu$': 'Bulacan State University', # Could be Batangas SU too, need care
    r'.*cavite state university.*': 'Cavite State University', '^cvsu$': 'Cavite State University',
    r'.*laguna state polytechnic university.*': 'Laguna State Polytechnic University', '^lspu$': 'Laguna State Polytechnic University',
    r'.*nueva ecija university science technology.*': 'Nueva Ecija University of Science and Technology', '^neust$': 'Nueva Ecija University of Science and Technology',
    r'.*pampanga state agricultural university.*': 'Pampanga State Agricultural University', '^psau$': 'Pampanga State Agricultural University',
    r'.*tarlac state university.*': 'Tarlac State University', '^tsu$': 'Tarlac State University',
    r'.*batangas state university.*': 'Batangas State University',
    r'.*university cordilleras.*': 'University of the Cordilleras',
    r'.*central luzon state university.*': 'Central Luzon State University', '^clsu$': 'Central Luzon State University',
    r'.*mariano marcos state university.*': 'Mariano Marcos State University', '^mmsu$': 'Mariano Marcos State University',
    r'.*palawan state university.*': 'Palawan State University', '^psu$': 'Palawan State University',
    r'.*philippine state college aeronautics.*': 'Philippine State College of Aeronautics', '^philsca$': 'Philippine State College of Aeronautics',
    r'.*southern luzon state university.*': 'Southern Luzon State University', '^slsu$': 'Southern Luzon State University',
    r'.*university benguet.*': 'University of Benguet',
    r'.*university luzon.*': 'University of Luzon',
    r'.*university manila.*': 'Universidad de Manila', # or University of Manila if distinct
    r'.*university pangasinan.*': 'University of Pangasinan',
    r'.*zamboanga state college marine sciences? technology.*': 'Zamboanga State College of Marine Sciences and Technology',
    r'.*bicol university.*': 'Bicol University',
    r'.*central mindanao university.*': 'Central Mindanao University',
    r'.*negros oriental state university.*': 'Negros Oriental State University', '^norsu$': 'Negros Oriental State University',
    # Generic catch for "State University" or "State College" if not caught above
    r'.*\bstate university\b.*': 'Other State University',
    r'.*\bstate college\b.*': 'Other State College',

    # Non-university / Special Cases
    r'.*still in school.*': 'Still Enrolled',
    r'.*not yet a graduate.*': 'Still Enrolled',
    r'.*not yet graduated.*': 'Still Enrolled',
    r'.*undergrad.*': 'Still Enrolled / Did Not Graduate',
    r'.*didnt graduate.*': 'Did Not Graduate',
    r'.*college dropout.*': 'Did Not Graduate',
    r'.*high school.*': 'High School Graduate',
    r'.*hs grad.*': 'High School Graduate',
    r'^na$': 'Not Applicable',
    r'^n a$': 'Not Applicable',
    r'^nan$': 'Not Specified',  # str(NaN); anchored so 'san fernando' etc. are not caught
    r'.*prefer not to say.*': 'Prefer Not to Say',
    r'.*secret.*': 'Prefer Not to Say',
    r'^\s*$': 'Not Specified',
    r'^\d{4}$': 'Invalid Entry (Year)',
    r'^\d{1,2}$': 'Invalid Entry (Number)',
    r'.*overseas.*': 'Overseas University',
    r'.*vocational.*': 'Vocational Graduate'
}

known_and_special_categories = set(university_map.values())


def normalize_university_name(raw_name):
    """Reduce a free-text school answer to a lower-case, punctuation-free key.

    Steps: stringify (missing values become 'nan'), lower-case, drop
    parenthesised remarks, fold accents, delete periods/apostrophes (so
    'u.p.' -> 'up', "women's" -> 'womens'), turn every other punctuation mark
    into a space (so 'msu-iit' -> 'msu iit'), expand abbreviations and drop
    connective stop words token by token.
    """
    text = str(raw_name).lower().strip()
    text = re.sub(r'\([^)]*\)', ' ', text)
    text = text.translate(UNI_ACCENT_TABLE)
    text = re.sub(r"[.'’`]", '', text)
    text = re.sub(r'[^\w\s]', ' ', text)  # \w keeps underscores, as before
    tokens = [
        UNI_ABBREVIATIONS.get(token, token)
        for token in text.split()
        if token not in UNI_STOP_WORDS
    ]
    return ' '.join(tokens)


def standardize_university(normalized_name, compiled_patterns):
    """Return the canonical name of the first pattern found in `normalized_name`.

    `compiled_patterns` is an ordered sequence of (compiled regex, canonical
    name) pairs. The input is returned unchanged when nothing matches so that
    `categorize_remaining` can decide what to do with it.
    """
    for regex, standard_name in compiled_patterns:
        if regex.search(normalized_name):
            return standard_name
    return normalized_name


def categorize_remaining(uni_name_str):
    """Label whatever the pattern map did not recognise.

    Canonical names pass through, missing/blank values become 'Not Specified',
    and every other leftover is tagged 'UNIDENTIFIED_OR_VAGUE_UNIVERSITY' so it
    can be filtered out below.
    """
    if uni_name_str in known_and_special_categories:
        return uni_name_str
    if pd.isna(uni_name_str) or uni_name_str.strip() == '':
        return 'Not Specified'
    return 'UNIDENTIFIED_OR_VAGUE_UNIVERSITY'


if not liyab.empty and uni_col in liyab.columns:
    compiled_university_patterns = [
        (re.compile(pattern, re.IGNORECASE), standard_name)
        for pattern, standard_name in university_map.items()
    ]

    normalized_universities = liyab[uni_col].map(normalize_university_name)

    # Classify each distinct answer once. Matching always runs against the
    # normalised answer, never against an already-assigned canonical name,
    # so a later generic pattern cannot overwrite an earlier specific match.
    cleaned_by_normalized = {
        name: categorize_remaining(standardize_university(name, compiled_university_patterns))
        for name in normalized_universities.unique()
    }

    # `assign` returns a new frame, avoiding writes into a filtered slice.
    liyab = liyab.assign(
        **{'Cleaned University': normalized_universities.map(cleaned_by_normalized)}
    )

    print("\nCleaned value counts for 'Cleaned University' (before removing vague):")
    print(liyab['Cleaned University'].value_counts(dropna=False))

    rows_before_vague_removal = len(liyab)

    # Rows whose school could not be identified carry no usable signal for
    # the salary model, so they are removed here.
    categories_to_remove_strict = ['UNIDENTIFIED_OR_VAGUE_UNIVERSITY', 'Invalid Entry (Year)',
                                   'Invalid Entry (Number)', 'Not Specified']
    # .copy() so later cells can add columns without SettingWithCopyWarning.
    liyab = liyab[~liyab['Cleaned University'].isin(categories_to_remove_strict)].copy()
    print(f"\nRows removed due to vague, invalid or unspecified university: {rows_before_vague_removal - len(liyab)}")

    print("\nCleaned value counts for 'Cleaned University' (after removing vague/invalid):")
    print(liyab['Cleaned University'].value_counts(dropna=False))

else:
    print(f"Column '{uni_col}' not found or DataFrame is empty.")

#### 4. Industry Cleaning

In [None]:
!pip install sentence-transformers scikit-learn

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# Industry cleaning.
# Free-text industry answers are normalized, embedded with a sentence-transformer model,
# assigned to the most similar master category, then corrected with manual overrides.
# The result is written to the new 'Cleaned Industry' column of `liyab`.
industry_col = 'In what industry was this job?'
if not liyab.empty and industry_col in liyab.columns:
    def normalize_industry(s):
        """Return `s` lower-cased, stripped of punctuation and with whitespace collapsed.

        Any non-string input (e.g. NaN) yields an empty string.
        """
        if not isinstance(s, str):
            return ''
        s = s.lower().strip()
        s = re.sub(r'[^a-z0-9\s]', '', s)  # remove punctuation
        s = re.sub(r'\s+', ' ', s).strip() # remove extra spaces and trim
        return s

    # Define master categories
    master_categories = [
        "Accountancy, Banking and Finance",
        "Business Process Outsourcing (BPO)", # Added BPO
        "Business, Consulting and Management",
        "Charity and Voluntary Work (NGO)",
        "Creative Arts, Design and Media", # Combined Media
        "Energy and Utilities",
        "Engineering and Manufacturing",
        "Environment and Agriculture",
        "Healthcare and Pharmaceuticals", # Combined Science/Pharma
        "Hospitality, Events, Leisure, Sport and Tourism", # Combined related fields
        "Information Technology (IT)",
        "Law and Legal Services",
        "Law Enforcement and Security",
        "Marketing, Advertising and Public Relations (PR)",
        "Property and Construction",
        "Public Services and Administration (Government)",
        "Recruitment and Human Resources (HR)",
        "Retail and E-commerce", # Added E-commerce
        "Sales",
        "Education and Training", # Renamed for clarity
        "Transport and Logistics",
        "Other"
    ]

    # Load SentenceTransformer model and embed the master categories once
    model = SentenceTransformer('all-MiniLM-L6-v2')
    category_embeddings = model.encode(master_categories)

    # Replace missing answers with '' by assignment. A chained
    # `liyab[col].fillna(..., inplace=True)` acts on a temporary Series and is not
    # guaranteed to update the DataFrame.
    liyab[industry_col] = liyab[industry_col].fillna('')

    # Group the distinct raw answers by their normalized form (kept for manual review)
    unique_original_entries = liyab[industry_col].unique()
    normalized_to_original_map = defaultdict(list)
    for entry in unique_original_entries:
        normalized = normalize_industry(entry)
        if normalized: # Only consider non-empty normalized strings
            normalized_to_original_map[normalized].append(entry)

    unique_normalized_entries = [n for n in normalized_to_original_map.keys() if n] # Ensure no empty strings

    normalized_entry_to_category = {}
    if unique_normalized_entries:
        entry_embeddings = model.encode(unique_normalized_entries)
        # One (entries x categories) similarity matrix instead of one call per entry;
        # each entry is assigned the category with the highest cosine similarity.
        similarity_matrix = cosine_similarity(entry_embeddings, category_embeddings)
        best_category_indices = similarity_matrix.argmax(axis=1)
        normalized_entry_to_category = {
            normalized_entry: master_categories[category_idx]
            for normalized_entry, category_idx in zip(unique_normalized_entries, best_category_indices)
        }

    # Manual overrides for normalized keys (applied after similarity mapping)
    manual_overrides = {
        normalize_industry("bpo"): "Business Process Outsourcing (BPO)",
        normalize_industry("call center"): "Business Process Outsourcing (BPO)",
        normalize_industry("kpo"): "Business Process Outsourcing (BPO)",
        normalize_industry("shared services"): "Business Process Outsourcing (BPO)",
        normalize_industry("itbpo"): "Information Technology (IT)", # Or BPO, depends on definition
        normalize_industry("software engineering"): "Information Technology (IT)",
        normalize_industry("fintech"): "Accountancy, Banking and Finance",
        normalize_industry("ecommerce"): "Retail and E-commerce",
        normalize_industry("government"): "Public Services and Administration (Government)",
        normalize_industry("ngo"): "Charity and Voluntary Work (NGO)",
        normalize_industry("real estate"): "Property and Construction",
        normalize_industry("academe"): "Education and Training",
        normalize_industry("education"): "Education and Training",
        normalize_industry("teaching"): "Education and Training",
        normalize_industry("research"): "Other", # Could be IT, Science, etc. Needs context or map to specific if clear
        normalize_industry("architecture"): "Property and Construction",
        normalize_industry("construction"): "Property and Construction",
        normalize_industry("advertising"): "Marketing, Advertising and Public Relations (PR)",
        normalize_industry("media"): "Creative Arts, Design and Media",
        normalize_industry("telecommunications"): "Information Technology (IT)", # Often grouped with IT
        normalize_industry("pharmaceutical"): "Healthcare and Pharmaceuticals",
        normalize_industry("aviation"): "Transport and Logistics",
        normalize_industry("automotive"): "Engineering and Manufacturing", # Or Sales if dealer
        normalize_industry("food and beverage"): "Hospitality, Events, Leisure, Sport and Tourism", # Or Manufacturing if production
        normalize_industry("fmcg"): "Retail and E-commerce", # Or Sales/Manufacturing
        # Entries that are clearly not industries
        normalize_industry("2020"): "Other",
        normalize_industry("na"): "Other",
        normalize_industry("none"): "Other"
    }
    # Overrides win over the similarity-based assignment; empty keys are ignored.
    normalized_entry_to_category.update(
        {norm_key: override_cat for norm_key, override_cat in manual_overrides.items() if norm_key}
    )

    def map_to_final_industry_category(original_value):
        """Map one raw industry answer to its cleaned category.

        Returns 'Not Specified' when the value is missing, not a string, or empty
        after normalization; otherwise the mapped category, falling back to 'Other'
        for normalized values that have no mapping.
        """
        # normalize_industry returns '' for NaN / non-string / blank input, so a single
        # check covers every "no usable answer" case without calling .strip() on a
        # non-string value.
        normalized_val = normalize_industry(original_value)
        if not normalized_val:
            return 'Not Specified'
        return normalized_entry_to_category.get(normalized_val, 'Other') # Default for unmapped

    liyab['Cleaned Industry'] = liyab[industry_col].apply(map_to_final_industry_category)

    print("\nCleaned value counts for 'Cleaned Industry':")
    print(liyab['Cleaned Industry'].value_counts(dropna=False))

    # Display original entries for a specific cleaned category for review
    # print("\nOriginal entries for 'Other' category:")
    # for norm_val, orig_vals in normalized_to_original_map.items():
    #    if normalized_entry_to_category.get(norm_val) == 'Other':
    #        print(f"  Normalized: '{norm_val}' -> Original(s): {orig_vals}")
else:
    print(f"Column '{industry_col}' not found or DataFrame is empty.")

#### 5. Other Date Column Validation (Placeholder)

In [None]:
# Placeholder for additional date validations on columns that may or may not exist,
# e.g. a form 'Timestamp' column or a graduation-year column.
if liyab.empty:
    print("DataFrame is empty.")
else:
    # 'Timestamp' is common in Google Form exports; unparseable values are coerced to NaT.
    has_timestamp = 'Timestamp' in liyab.columns
    if has_timestamp:
        parsed_timestamps = pd.to_datetime(liyab['Timestamp'], errors='coerce')
        liyab['Timestamp'] = parsed_timestamps
        print(f"'Timestamp' column converted to datetime. NaT count: {liyab['Timestamp'].isnull().sum()}")

    # Graduation year: this column name is an assumption and may not be present.
    grad_year_col = 'What year did you graduate?'
    if grad_year_col in liyab.columns:
        numeric_grad_year = pd.to_numeric(liyab[grad_year_col], errors='coerce')
        liyab[grad_year_col] = numeric_grad_year
        # If needed, filter here the same way as 'What year did you start your first job?':
        # liyab.dropna(subset=[grad_year_col], inplace=True)
        # liyab = liyab[liyab[grad_year_col].between(1980, 2025)] # Example range
        print(f"'{grad_year_col}' column converted to numeric. NaN count: {liyab[grad_year_col].isnull().sum()}")

#### 6. Handling Missing Values (Post Cleaning)

In [None]:
# Post-cleaning missing-value handling:
#   1. locate the monthly salary column and coerce it to numeric,
#   2. drop rows with NaN in the key modelling columns,
#   3. drop rows whose 'Cleaned University' marks a non-graduate / unsuitable category.
if not liyab.empty:
    print("Missing values after initial cleaning steps (before final NaN drop based on key columns):")
    print(liyab.isnull().sum())

    # Guess the salary column: first column whose name contains both 'salary' and 'monthly'
    salary_column_guess = next((col for col in liyab.columns if 'salary' in col.lower() and 'monthly' in col.lower()), None)
    if salary_column_guess:
        print(f"Guessed salary column: {salary_column_guess}")
        # Clean any non-numeric column. Checking only `dtype == 'object'` would skip
        # text stored under another dtype (e.g. the pandas string dtype).
        if not pd.api.types.is_numeric_dtype(liyab[salary_column_guess]):
            # Keep digits and dots only; whatever cannot be parsed afterwards becomes NaN.
            # NOTE(review): an answer containing several numbers (e.g. a range such as
            # '20,000-25,000') collapses into one concatenated number here — confirm
            # whether such rows exist and how they should be treated.
            liyab[salary_column_guess] = liyab[salary_column_guess].astype(str).str.replace(r'[^\d.]', '', regex=True)
            liyab[salary_column_guess] = pd.to_numeric(liyab[salary_column_guess], errors='coerce')

        # Key columns for dropping rows if they contain NaNs
        # Cleaned University might have 'Not Specified' etc. which are handled by category dropping later, not here by dropna.
        # NOTE(review): 'Cleaned Industry' stores missing answers as the string
        # 'Not Specified', so dropna does not remove them — confirm this is intended.
        key_columns_for_nan_drop = [salary_column_guess, 'Cleaned Industry', 'What year did you start your first job?']
        # Add 'Cleaned Gender' if it's considered essential and shouldn't have NaNs (already has 'Other'/'Prefer not to say')
        if 'Cleaned Gender' in liyab.columns:
            key_columns_for_nan_drop.append('Cleaned Gender')

        key_columns_present = [col for col in key_columns_for_nan_drop if col in liyab.columns]

        if key_columns_present:
            original_rows = len(liyab)
            # Rebind instead of `inplace=True`, consistent with the other filters in this cell.
            liyab = liyab.dropna(subset=key_columns_present)
            print(f"\nRows removed due to missing values in key columns ({', '.join(key_columns_present)}): {original_rows - len(liyab)}")
        else:
            print("\nCould not find all key columns for NaN removal based on essential data.")

        # Further, remove rows where Cleaned University indicates non-graduates or other unsuitable categories
        if 'Cleaned University' in liyab.columns:
            original_rows = len(liyab)
            # 'UNIDENTIFIED_OR_VAGUE_UNIVERSITY' is already removed in the university cleaning cell itself.
            # 'Invalid Entry' and 'Not Specified' are also removed there now.
            uni_categories_to_drop_final = ['Still Enrolled', 'Still Enrolled / Did Not Graduate', 'Did Not Graduate',
                                      'High School Graduate', 'Not Applicable', 'Prefer Not to Say', 'Vocational Graduate']
            liyab = liyab[~liyab['Cleaned University'].isin(uni_categories_to_drop_final)]
            print(f"Rows removed due to unsuitable university categories (non-grads, etc.): {original_rows - len(liyab)}")
    else:
        print("\nCould not identify the salary column automatically. Skipping NaN removal based on salary and subsequent filtering.")

    print("\nMissing values after NaN strategy and category filtering:")
    print(liyab.isnull().sum())
    print(f"\nFinal shape of the cleaned DataFrame: {liyab.shape}")
else:
    print("DataFrame is empty.")

#### 7. Final Review and Column Selection (Example)

In [None]:
# Final review: preview the cleaned frame and copy the modelling columns into `liyab_final`.
# `liyab_final` is always defined afterwards (an empty DataFrame when nothing can be selected).
if liyab.empty:
    print("DataFrame is empty, no final review possible.")
    liyab_final = pd.DataFrame()
else:
    print("Cleaned DataFrame Head:")
    print(liyab.head())

    # Candidates in output order; the salary column name may be None when it was not identified.
    candidate_columns = [
        salary_column_guess,
        'Cleaned University',
        'Cleaned Industry',
        'Cleaned Gender',
        'What year did you start your first job?',
        # Add other relevant cleaned columns as needed
    ]
    columns_to_keep = [name for name in candidate_columns if name and name in liyab.columns]

    final_columns = [name for name in columns_to_keep if name in liyab.columns]

    if not final_columns:
        print("\nNo columns selected for the final DataFrame. Check column names and cleaning steps.")
        liyab_final = pd.DataFrame()
    else:
        liyab_final = liyab[final_columns].copy()
        print("\nFinal selected DataFrame for modeling (liyab_final):")
        print(liyab_final.head())
        print(f"Shape of liyab_final: {liyab_final.shape}")

### **Original Notebook Cells (For Reference - May be outdated or partially integrated above)**

In [None]:
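# !pip install rapidfuzz scikit-learn # scikit-learn is now installed in the industry cleaning section (4); rapidfuzz appears to be needed only by the commented-out clustering exploration below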

In [None]:
# This cell for university clustering was for exploration.\n# The implemented cleaning uses a more direct mapping approach.\n# from rapidfuzz.distance import Levenshtein\n# from sklearn.cluster import AgglomerativeClustering\n# import numpy as np\n# strings = liyab['What school did you graduate from?'].unique() # Example, use original unique names\n# # ... rest of the clustering code ... \n# print("University clustering output (for reference only, not used in final cleaning pipeline):")

In [None]:
# Original value counts for universities (for reference)\n# universities_count = liyab_original_backup['What school did you graduate from?'].value_counts().sort_index()\n# print(universities_count)

In [None]:
# Original formatted university counts (for reference)\n# universities_formatted = liyab_original_backup['What school did you graduate from?'].astype(str).str.lower().str.strip()\n# universities_formatted = universities_formatted.apply(lambda x: re.sub(r'[^a-z0-9\s]', '', x))\n# print(universities_formatted.value_counts())