In [1]:
import pandas as pd
import numpy as np
import ast
import re
import joblib

In [3]:
print("\n--- Loading all saved model components ---")
try:
    # Persisted artefacts produced by the training notebook.
    # NOTE(review): joblib.load unpickles its input, so only point these
    # paths at files you trust.
    final_model = joblib.load('../data/final_stacking_model.pkl')
    le = joblib.load('../data/label_encoder.pkl')
    university_map = joblib.load('../data/university_target_map.pkl')
    all_jobs = pd.read_csv('../data/processed/all_jobs.csv')

    # The clustering TF-IDF vectorizer and the KMeans model are not loaded
    # from disk: they are refit here on the processed resume data, with a
    # fixed random_state.
    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer

    df_orig = pd.read_csv('../data/processed/refined_df.csv')
    resume_skills_c = df_orig['skills'].fillna('')
    resume_title_c = df_orig['positions'].fillna('')
    resume_exp_c = 'experience ' + df_orig['Resume_Years_Exp'].astype(str)
    # One text document per resume: skills, job titles and an experience token.
    clustering_text = resume_skills_c + ' ' + resume_title_c + ' ' + resume_exp_c
    tfidf_cluster = TfidfVectorizer(
        stop_words='english',
        max_features=5000,
        min_df=2,
    ).fit(clustering_text)
    kmeans_model = KMeans(
        n_clusters=7,
        init='k-means++',
        n_init=10,
        random_state=42,
    ).fit(tfidf_cluster.transform(clustering_text))
except FileNotFoundError:
    print("Error: Could not find saved model files. Please run the saving code in your previous notebook first.")
    # Leave every component as None so later cells can detect the failure.
    final_model = le = university_map = all_jobs = kmeans_model = tfidf_cluster = None
else:
    print("All components loaded successfully.")


--- Loading all saved model components ---
All components loaded successfully.


In [15]:
def create_feature_matrix(new_resume_dict, all_jobs_df, uni_map, kmeans, tfidf_v):
    """Pair one resume with every job and compute the features used for scoring.

    Parameters
    ----------
    new_resume_dict : dict
        Resume fields. Keys read here: 'career_objective', 'skills',
        'major_field_of_studies', 'positions', 'responsibilities',
        'Resume_Years_Exp', 'gpa' and 'first_university'. Any key that is
        absent falls back to an empty string, 0 years of experience, a GPA
        of 0.0 or the university name 'Unknown'.
    all_jobs_df : pandas.DataFrame
        One row per job. Columns read here: 'experience_years_required',
        'skills_required', 'job_position_name', 'educationaL_requirements'
        and 'responsibilities.1'.
    uni_map
        University-name -> encoded-value lookup passed to ``Series.map``; it
        must also expose ``.mean()``, which is the fallback for universities
        that are not in the map (a pandas Series satisfies both).
    kmeans
        Fitted clustering model exposing ``predict``.
    tfidf_v
        Fitted vectorizer exposing ``transform``; its output is fed to
        ``kmeans.predict``.

    Returns
    -------
    pandas.DataFrame
        The resume/job cross product with the added columns
        'Experience_Mismatch', 'Skill_Overlap_Count', 'Skill_Jaccard_Score',
        'Cosine_Similarity', 'Resume_Cluster_KMeans', 'gpa',
        'first_university' and 'university_encoded'.

    Raises
    ------
    ValueError
        If ``all_jobs_df`` has no rows.
    """
    print(f"Processing new resume and preparing {len(all_jobs_df)} candidate pairs...")
    if len(all_jobs_df) == 0:
        raise ValueError("all_jobs_df is empty; there are no jobs to pair with the resume.")

    # Single source of defaults so that every feature treats a missing resume
    # field the same way instead of some raising KeyError and others not.
    resume_defaults = {
        'career_objective': '',
        'skills': '',
        'major_field_of_studies': '',
        'positions': '',
        'responsibilities': '',
        'Resume_Years_Exp': 0,
        'gpa': 0.0,
        'first_university': 'Unknown',
    }
    resume_record = {**resume_defaults, **new_resume_dict}

    # A real cross join: no temporary 'key' column that could clash with a
    # column of the same name in the jobs frame.
    candidate_df = pd.merge(pd.DataFrame([resume_record]), all_jobs_df, how='cross')

    candidate_df['Experience_Mismatch'] = (
        candidate_df['Resume_Years_Exp'] - candidate_df['experience_years_required']
    ).abs()

    def calculate_skill_overlap(row):
        # NOTE(review): tokens are split on whitespace only, so "python," and
        # "python" are different tokens. Left unchanged on the assumption that
        # the trained model saw features built the same way - confirm against
        # the training notebook before changing the tokenisation.
        r_skills = set(str(row['skills']).lower().split())
        j_skills = set(str(row['skills_required']).lower().split())
        shared = r_skills & j_skills
        combined = r_skills | j_skills
        return pd.Series([len(shared), len(shared) / len(combined) if combined else 0])

    candidate_df[['Skill_Overlap_Count', 'Skill_Jaccard_Score']] = candidate_df.apply(
        calculate_skill_overlap, axis=1
    )

    from sklearn.feature_extraction.text import TfidfVectorizer

    resume_text_cols = ['career_objective', 'skills', 'major_field_of_studies', 'positions', 'responsibilities']
    job_text_cols = ['job_position_name', 'educationaL_requirements', 'skills_required', 'responsibilities.1']
    resume_text = candidate_df[resume_text_cols].fillna('').astype(str).agg(' '.join, axis=1)
    job_text = candidate_df[job_text_cols].fillna('').astype(str).agg(' '.join, axis=1)
    sim_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000).fit(
        pd.concat([resume_text, job_text])
    )
    resume_vecs = sim_vectorizer.transform(resume_text)
    job_vecs = sim_vectorizer.transform(job_text)
    # TfidfVectorizer L2-normalises each row by default, so the row-wise dot
    # product is the cosine similarity of each (resume, job) pair. This avoids
    # building the full N x N distance matrix only to keep its diagonal.
    # All-zero rows yield a similarity of 0.
    row_similarity = np.asarray(resume_vecs.multiply(job_vecs).sum(axis=1)).ravel()
    candidate_df['Cosine_Similarity'] = np.clip(row_similarity, 0.0, 1.0)

    # Same text layout as the documents the clustering vectorizer is fitted on
    # elsewhere in this notebook: skills, positions, then an experience token.
    new_resume_cluster_text = (
        f"{resume_record['skills']} {resume_record['positions']} "
        f"experience {resume_record['Resume_Years_Exp']}"
    )
    new_resume_tfidf = tfidf_v.transform([new_resume_cluster_text])
    candidate_df['Resume_Cluster_KMeans'] = kmeans.predict(new_resume_tfidf)[0]

    candidate_df['gpa'] = resume_record['gpa']
    candidate_df['first_university'] = resume_record['first_university']
    # Universities missing from the map get the mean of the mapped values.
    candidate_df['university_encoded'] = candidate_df['first_university'].map(uni_map).fillna(uni_map.mean())

    print("Feature matrix created successfully.")
    return candidate_df


In [16]:
def recommend_top_n_jobs(resume_data, model, jobs_df, uni_map, label_encoder, kmeans, tfidf_v, top_n=5):
    """Score every job against one resume and return the best ``top_n`` matches.

    Builds the feature matrix with ``create_feature_matrix``, asks ``model``
    for class probabilities, stores one ``P(<class>)`` column per entry of
    ``label_encoder.classes_`` and ranks the jobs by ``P(High)``.

    Returns a DataFrame with the columns 'job_position_name', 'P(High)',
    'P(Medium)' and 'P(Low)', or None (after printing a notice) when
    ``model`` is None.
    """
    if model is None:
        print("Model not loaded.")
        return None

    # Step 1: one row per (resume, job) pair with all engineered features.
    candidates = create_feature_matrix(resume_data, jobs_df, uni_map, kmeans, tfidf_v)

    model_columns = [
        'experience_years_required',
        'Skill_Overlap_Count',
        'Skill_Jaccard_Score',
        'Resume_Years_Exp',
        'Experience_Mismatch',
        'Cosine_Similarity',
        'gpa',
        'Resume_Cluster_KMeans',
        'university_encoded',
    ]

    # Step 2: class probabilities for every candidate pair.
    class_probabilities = model.predict_proba(candidates[model_columns])

    # Column i of the probability matrix is labelled with the i-th encoder
    # class. NOTE(review): this assumes the model's class order matches
    # label_encoder.classes_ - confirm against the training notebook.
    class_names = label_encoder.classes_
    for position in range(len(class_names)):
        candidates[f'P({class_names[position]})'] = class_probabilities[:, position]

    # Step 3: highest 'High' probability first, keep the leading top_n rows.
    ranked = candidates.sort_values(by='P(High)', ascending=False)
    best_matches = ranked.head(top_n)

    return best_matches[['job_position_name', 'P(High)', 'P(Medium)', 'P(Low)']]

In [17]:
# Hand-written example resume used to exercise the recommender below.
sample_resume = dict(
    # Free-text fields
    career_objective='Seeking a challenging role in data science to apply my skills in machine learning and Python.',
    skills='Python, SQL, Scikit-learn, TensorFlow, Keras, Pandas, Matplotlib, AWS',
    major_field_of_studies='Computer Science',
    positions='Data Science Intern',
    responsibilities='Developed predictive models, built data pipelines.',
    # Numeric and categorical fields
    Resume_Years_Exp=1,
    gpa=3.8,
    first_university='University of Illinois at Urbana-Champaign',
)

In [18]:
# Only run the demo when the loading cell produced a model. The check is an
# explicit `is not None` rather than a truthiness test: truthiness calls
# __bool__/__len__ on the object, which for some objects is False or raises,
# and `is not None` matches the check made inside recommend_top_n_jobs.
if final_model is not None:
    print("\n\n--- Getting Job Recommendations for Sample Resume ---")
    top_jobs = recommend_top_n_jobs(sample_resume, final_model, all_jobs, university_map, le, kmeans_model, tfidf_cluster, top_n=5)
    if top_jobs is not None:
        print("\nTop 5 Recommended Jobs:")
        # Format the output for better readability (note: this sets a
        # notebook-wide pandas display option).
        pd.options.display.float_format = '{:.4f}'.format
        print(top_jobs.to_string(index=False))



--- Getting Job Recommendations for Sample Resume ---
Processing new resume and preparing 28 candidate pairs...
Feature matrix created successfully.

Top 5 Recommended Jobs:
                                          job_position_name  P(High)  P(Medium)  P(Low)
                                            DevOps Engineer   0.3192     0.5301  0.1508
                               Executive/ Sr. Executive -IT   0.2283     0.5713  0.2004
                                                AI Engineer   0.2174     0.6677  0.1149
Intern (Generative AI Engineering - 2D/3D Image Generation)   0.2067     0.5324  0.2609
                            Management Trainee - Mechanical   0.1682     0.6222  0.2097


We can see that, out of the 28 candidate jobs, the model has suggested jobs that are related to the resume. DevOps Engineer ranks as the most suitable because the skill overlap is large in this case: Python, SQL, and AWS are very important skills in DevOps as well. The model also picked up that the resume is aimed at an intern position and suggested the Generative AI intern role too. That said, out of all the jobs, I think AI Engineer fits best.