In [1]:
import pandas as pd
import numpy as np
import ast
import re
import joblib
import os

# Suppress warnings for a cleaner user experience
# NOTE: these filters are process-wide for the kernel once this cell runs.
import warnings
# Hide UserWarning messages that originate from modules matching 'sklearn'.
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
# Hide every FutureWarning regardless of which library raises it; this also
# hides deprecation notices, so remove this line when upgrading dependencies.
warnings.filterwarnings('ignore', category=FutureWarning)

In [2]:
# Shared container for everything the recommender needs at inference time.
# The loading cell below fills it with the model pipeline, label encoder,
# university map, job table, mean SVD score and the helper models; that cell
# replaces it with None when loading fails.
artifacts = {}

In [3]:
# Start from a fresh dict on every run. On failure this cell sets
# `artifacts = None`, so without this reset a re-run (after fixing the missing
# file/package) would crash with "'NoneType' object does not support item
# assignment" unless the previous cell was re-run first.
artifacts = {}

try:
    # --- Load required libraries ---
    from sentence_transformers import SentenceTransformer
    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer

    # --- Define file paths ---
    model_path = '../data/final_adaboost_model.pkl' # <-- Your final, tuned AdaBoost pipeline
    le_path = '../data/label_encoder.pkl'
    uni_map_path = '../data/university_target_map.pkl'
    svd_mean_path = '../data/mean_svd_score.pkl'
    jobs_path = '../data/processed/all_jobs.csv'
    raw_resumes_path = '../data/raw/resume_data_1.csv'  # used only to re-fit the clustering helpers

    # --- Load artifacts ---
    # SECURITY NOTE: joblib.load unpickles arbitrary objects; only load files
    # produced by your own training notebook.
    artifacts['model_pipeline'] = joblib.load(model_path)
    artifacts['le'] = joblib.load(le_path)
    artifacts['university_map'] = joblib.load(uni_map_path)
    artifacts['all_jobs'] = pd.read_csv(jobs_path)
    artifacts['mean_svd_score'] = joblib.load(svd_mean_path)

    # --- Load or re-create helper models for feature engineering ---
    print("Loading helper models for feature engineering...")
    artifacts['embedding_model'] = SentenceTransformer('all-MiniLM-L6-v2')

    df_orig = pd.read_csv(raw_resumes_path)
    # Normalise headers: trim whitespace and drop a UTF-8 byte-order mark if present.
    df_orig.columns = df_orig.columns.str.strip().str.replace(r'\ufeff', '', regex=True)
    if 'Resume_Years_Exp' not in df_orig.columns:
        df_orig['Resume_Years_Exp'] = 0 # Placeholder if column is missing

    # The TF-IDF vectoriser and KMeans model are re-fitted here rather than loaded.
    # NOTE(review): cluster ids only match the ones seen at training time if the
    # training notebook fitted them on the same data with the same settings -
    # TODO confirm, or persist and load the fitted models instead.
    clustering_text = (
        df_orig['skills'].fillna('') + ' '
        + df_orig['positions'].fillna('')
        + ' experience ' + df_orig['Resume_Years_Exp'].astype(str)
    )
    tfidf_cluster = TfidfVectorizer(stop_words='english', max_features=5000, min_df=2).fit(clustering_text)
    kmeans_model = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42).fit(tfidf_cluster.transform(clustering_text))

    artifacts['kmeans_model'] = kmeans_model
    artifacts['tfidf_cluster'] = tfidf_cluster

    print("\nAll components loaded successfully!")

except FileNotFoundError as e:
    print(f"Error: Could not find a required file: {e.filename}")
    print("Please ensure you have run the saving code in your training notebook.")
    artifacts = None
except ImportError as e:
    # Report the import that actually failed: the try block imports from both
    # sentence-transformers and scikit-learn, so blaming only one is misleading.
    print(f"\nImport failed: {e}")
    print("Please install the missing package, e.g.: %pip install -U sentence-transformers scikit-learn")
    artifacts = None

Loading helper models for feature engineering...

All components loaded successfully!


In [14]:
# Keyword tables for degree parsing. Matching is by substring on the lower-cased
# degree text; entries are listed in priority order (first match wins).
_DEGREE_LEVEL_KEYWORDS = (
    (3, ('ph.d', 'phd', 'doctorate')),
    (2, ('master', 'm.sc', 'msc', 'm.a', 'mba', 'm.com')),
    (1, ('bachelor', 'b.sc', 'bsc', 'b.tech', 'bba', 'b.a', 'b.com')),
)
_DEGREE_LEVEL_NAMES = {3: 'Doctorate', 2: 'Masters', 1: 'Bachelors'}
_DEGREE_TYPE_KEYWORDS = (
    ('STEM', ('science', 'tech', 'eng', 'computer', 'math', 'statistic')),
    ('Business', ('business', 'bba', 'mba', 'account', 'financ', 'commerc')),
    ('Arts', ('arts', 'humanities')),
)

# Values used for resume fields the caller did not supply, mirroring the
# `.get(..., default)` fallbacks already used for the clustering text.
_RESUME_DEFAULTS = {
    'career_objective': '',
    'skills': '',
    'major_field_of_studies': '',
    'positions': '',
    'degree_names': '',
    'Resume_Years_Exp': 0,
    'first_university': None,
}

_RESUME_TEXT_COLUMNS = ['career_objective', 'skills', 'major_field_of_studies', 'positions']
_JOB_TEXT_COLUMNS = ['job_position_name', 'educationaL_requirements', 'skills_required', 'responsibilities.1']


def _extract_job_experience(value):
    """Return the required years of experience as an int (0 when unknown)."""
    if isinstance(value, str):
        match = re.search(r'(\d+)', value)  # first run of digits in the text
        return int(match.group(1)) if match else 0
    # A numeric cell (e.g. an already-cleaned CSV column) is used as-is instead
    # of being silently replaced by 0; NaN/inf and booleans still map to 0.
    if (isinstance(value, (int, float, np.integer, np.floating))
            and not isinstance(value, bool) and np.isfinite(value)):
        return int(value)
    return 0


def _process_degree_data(degree_string):
    """Map a degree field to pd.Series([highest_education_level, degree_type])."""
    if isinstance(degree_string, (list, tuple, set)):
        degrees = list(degree_string)
    else:
        try:
            degrees = ast.literal_eval(str(degree_string))
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (e.g. plain text such as "MSc Physics"):
            # treat the raw text as a single degree instead of discarding it.
            degrees = degree_string
        if not isinstance(degrees, (list, tuple, set)):
            degrees = [degrees]

    found_levels, found_types = [], set()
    for degree in degrees:
        degree_lower = str(degree).lower()
        for level, keywords in _DEGREE_LEVEL_KEYWORDS:
            if any(k in degree_lower for k in keywords):
                found_levels.append(level)
                break
        for type_name, keywords in _DEGREE_TYPE_KEYWORDS:
            if any(k in degree_lower for k in keywords):
                found_types.add(type_name)
                break

    highest_level = _DEGREE_LEVEL_NAMES[max(found_levels)] if found_levels else 'Other'
    # Across all degrees the priority is STEM > Business > Arts.
    degree_type = next((name for name, _ in _DEGREE_TYPE_KEYWORDS if name in found_types), 'Other')
    return pd.Series([highest_level, degree_type])


def _calculate_skill_overlap(row):
    """Return pd.Series([overlap_count, jaccard]) for one resume/job row."""
    # NOTE(review): tokens are split on whitespace only, so "python," and
    # "python" are different tokens. Left unchanged on the assumption that the
    # model was trained with the same tokenisation - TODO confirm.
    r_skills = set(str(row['skills']).lower().split())
    j_skills = set(str(row['skills_required']).lower().split())
    intersection = r_skills & j_skills
    union = r_skills | j_skills
    return pd.Series([len(intersection), len(intersection) / len(union) if union else 0])


def _rowwise_cosine(vector, matrix):
    """Cosine similarity between one vector and every row of a 2-D matrix."""
    vector = np.asarray(vector, dtype=float).ravel()
    matrix = np.asarray(matrix, dtype=float)
    vector_norm = np.linalg.norm(vector) or 1.0
    row_norms = np.linalg.norm(matrix, axis=1)
    row_norms[row_norms == 0] = 1.0  # zero vectors get similarity 0, not NaN
    return (matrix @ vector) / (row_norms * vector_norm)


def create_feature_matrix(new_resume_dict, artifacts):
    """Generates the complete feature matrix for a new resume against all jobs.

    Parameters
    ----------
    new_resume_dict : dict
        Resume fields. Keys read here: 'career_objective', 'skills',
        'major_field_of_studies', 'positions', 'degree_names',
        'Resume_Years_Exp' and 'first_university'. Missing keys fall back to
        empty text / 0 / unknown university instead of raising KeyError.
        'degree_names' may be a list, a string holding a Python list literal,
        or plain text.
    artifacts : dict
        Loaded components: 'all_jobs' (DataFrame), 'embedding_model' (object
        with ``encode(list_of_str)``), 'mean_svd_score', 'kmeans_model',
        'tfidf_cluster' and 'university_map' (must support ``.mean()`` and be
        usable with ``Series.map``).

    Returns
    -------
    pandas.DataFrame
        One row per job (resume columns cross-joined with the job columns)
        plus the engineered feature columns.

    Raises
    ------
    KeyError
        If ``artifacts['all_jobs']`` lacks a job column used below.
    """
    print(f"\nProcessing new resume and preparing {len(artifacts['all_jobs'])} candidate pairs...")

    # Fill in defaults without reordering the keys the caller supplied.
    resume_record = dict(new_resume_dict)
    for field, default in _RESUME_DEFAULTS.items():
        resume_record.setdefault(field, default)

    resume_df = pd.DataFrame([resume_record])
    # Cross join via a constant helper key: every job is paired with the resume.
    candidate_df = pd.merge(resume_df.assign(key=1), artifacts['all_jobs'].assign(key=1), on='key').drop('key', axis=1)

    # --- Engineer all required features ---
    candidate_df['experience_years_required'] = candidate_df['experience_years_required'].apply(_extract_job_experience)
    candidate_df['Experience_Mismatch'] = abs(candidate_df['Resume_Years_Exp'] - candidate_df['experience_years_required'])

    candidate_df[['highest_education_level', 'degree_type']] = candidate_df['degree_names'].apply(_process_degree_data)

    print("Calculating embedding similarity...")
    # The resume text is identical for every row, so it is encoded once
    # instead of once per job.
    resume_text = resume_df[_RESUME_TEXT_COLUMNS].fillna('').astype(str).agg(' '.join, axis=1).iloc[0]
    job_text = candidate_df[_JOB_TEXT_COLUMNS].fillna('').astype(str).agg(' '.join, axis=1)
    resume_emb = artifacts['embedding_model'].encode([resume_text])[0]
    job_emb = artifacts['embedding_model'].encode(job_text.tolist())
    candidate_df['Embedding_Cosine_Similarity'] = _rowwise_cosine(resume_emb, job_emb)

    # An unseen resume has no collaborative-filtering history, so the stored
    # mean SVD score is used for every pair.
    candidate_df['svd_predicted_score'] = artifacts['mean_svd_score']

    candidate_df[['Skill_Overlap_Count', 'Skill_Jaccard_Score']] = candidate_df.apply(_calculate_skill_overlap, axis=1)

    # Same text layout as the one the clustering models were fitted on.
    cluster_text = f"{new_resume_dict.get('skills','')} {new_resume_dict.get('positions','')} experience {new_resume_dict.get('Resume_Years_Exp',0)}"
    candidate_df['Resume_Cluster_KMeans'] = artifacts['kmeans_model'].predict(artifacts['tfidf_cluster'].transform([cluster_text]))[0]

    # Universities missing from the map get the mean of the mapped values.
    candidate_df['university_encoded'] = candidate_df['first_university'].map(artifacts['university_map']).fillna(artifacts['university_map'].mean())

    print("Feature matrix created successfully.")
    return candidate_df

In [15]:
def recommend_jobs(resume_data, artifacts, top_n=7):
    """Generates and displays recommendations from the loaded classification model.

    Builds the resume-vs-job feature matrix, scores every pair with
    ``artifacts['model_pipeline']`` and prints the ``top_n`` jobs ranked by the
    predicted probability of the 'High' class.

    Parameters
    ----------
    resume_data : dict
        Resume fields passed to ``create_feature_matrix``; 'gpa' is read here
        and defaults to 0.0 when absent.
    artifacts : dict
        Loaded components; this function reads 'model_pipeline' (must provide
        ``predict_proba``) and 'le' (must provide ``classes_``).
    top_n : int, default 7
        Number of recommendations to print and return.

    Returns
    -------
    pandas.DataFrame
        The ``top_n`` highest-ranked rows of the feature matrix, including the
        'high_match_probability' column.

    Raises
    ------
    ValueError
        If the label encoder has no 'High' class.

    NOTE(review): the outputs recorded in this notebook show the same
    probability (0.355618) for every job and for both sample resumes, so the
    ranking shown there is effectively arbitrary - TODO investigate the loaded
    pipeline and the feature values it receives.
    """
    feature_df = create_feature_matrix(resume_data, artifacts)

    final_features_list = [
        'experience_years_required', 'Skill_Overlap_Count', 'Skill_Jaccard_Score',
        'Resume_Years_Exp', 'Experience_Mismatch', 'Embedding_Cosine_Similarity',
        'svd_predicted_score', 'Resume_Cluster_KMeans', 'highest_education_level',
        'degree_type', 'university_encoded', 'gpa'
    ]
    # **Add gpa to the resume data to be passed**
    feature_df['gpa'] = resume_data.get('gpa', 0.0)

    # Ensure the dataframe has columns in the correct order for the model
    X_predict = feature_df[final_features_list]

    predicted_probs = artifacts['model_pipeline'].predict_proba(X_predict)

    # Column of predict_proba that corresponds to the 'High' label.
    # presumably the pipeline was trained on labels encoded by this same
    # LabelEncoder, so positions line up - verify against the training notebook.
    class_labels = list(artifacts['le'].classes_)
    if 'High' not in class_labels:
        raise ValueError(f"Label encoder has no 'High' class; found classes: {class_labels}")
    high_match_index = class_labels.index('High')
    feature_df['high_match_probability'] = predicted_probs[:, high_match_index]

    cls_recs = feature_df.sort_values(by='high_match_probability', ascending=False).head(top_n)

    # Display copy only: strip stray whitespace/newlines from job titles so they
    # do not show up as a literal "\n" in the printed table.
    display_df = cls_recs[['job_position_name', 'high_match_probability']].copy()
    display_df['job_position_name'] = display_df['job_position_name'].map(
        lambda title: title.strip() if isinstance(title, str) else title
    )

    print("\n\n" + "="*60 + "\n                     RECOMMENDATION RESULTS\n" + "="*60)
    print(f"\n--- Top {top_n} Recommended Jobs (Ranked by 'High' Match Probability) ---")
    print(display_df.to_string(index=False))
    print("\n" + "="*60)

    return cls_recs

In [16]:
# Demo 1: score a sample data-science resume against every job.
if not artifacts:
    # The loading cell failed (or produced nothing), so there is nothing to score with.
    print("\nCould not run recommendations because artifacts were not loaded.")
else:
    # ** EDIT THE VALUES BELOW TO TEST A NEW RESUME **
    new_resume = dict(
        career_objective='A highly motivated data scientist seeking a challenging role to apply my skills in machine learning, Python, and cloud technologies.',
        skills='python, sql, machine learning, deep learning, pytorch, aws, docker, natural language processing',
        major_field_of_studies='Computer Science',
        positions='Machine Learning Engineer',
        degree_names="['Masters in Computer Science']",
        Resume_Years_Exp=3,
        gpa=3.9,
        first_university='Carnegie Mellon University',  # Use a known university for best results
    )

    # Build the features, score them and print the ranked jobs.
    recommend_jobs(new_resume, artifacts)


Processing new resume and preparing 28 candidate pairs...
Calculating embedding similarity...
Feature matrix created successfully.


                     RECOMMENDATION RESULTS

--- Top 7 Recommended Jobs (Ranked by 'High' Match Probability) ---
                         job_position_name  high_match_probability
                  Senior Software Engineer                0.355618
            Machine Learning (ML) Engineer                0.355618
                         Marketing Officer                0.355618
Manager- Human Resource Management (HRM)\n                0.355618
                     Data Science Engineer                0.355618
                            Civil Engineer                0.355618
    Full Stack Developer (Python,React js)                0.355618



In [18]:
# Demo 2: score a sample entry-level web-development resume against every job.
if not artifacts:
    # The loading cell failed (or produced nothing), so there is nothing to score with.
    print("\nCould not run recommendations because artifacts were not loaded.")
else:
    # ** EDIT THE VALUES BELOW TO TEST A NEW RESUME **
    new_resume_1 = dict(
        career_objective='A highly motivated software engineer seeking a challenging role to apply my coding skills in Web development, HTML, CSS, JS, and cloud technologies.',
        skills='Java, sql, HTML, CSS, JS, aws, docker, React JS, MongoDB',
        major_field_of_studies='Computer Science',
        positions='none',
        degree_names="['Masters in Computer Science']",
        Resume_Years_Exp=0,
        gpa=3.9,
        first_university='DePaul University',  # Use a known university for best results
    )

    # Build the features, score them and print the ranked jobs.
    recommend_jobs(new_resume_1, artifacts)


Processing new resume and preparing 28 candidate pairs...
Calculating embedding similarity...
Feature matrix created successfully.


                     RECOMMENDATION RESULTS

--- Top 7 Recommended Jobs (Ranked by 'High' Match Probability) ---
                         job_position_name  high_match_probability
                  Senior Software Engineer                0.355618
            Machine Learning (ML) Engineer                0.355618
                         Marketing Officer                0.355618
Manager- Human Resource Management (HRM)\n                0.355618
                     Data Science Engineer                0.355618
                            Civil Engineer                0.355618
    Full Stack Developer (Python,React js)                0.355618

