# Data Preparation

In [3]:
# Load the raw resume dataset from data3.csv. Nothing is dropped or filtered in this
# cell: the applicant-name column is removed and the 'Best Match' == 1 filter is
# applied in the cells that follow.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
data3 = pd.read_csv('data3.csv')

data3.head(5)  # Preview the first 5 rows of the still-unfiltered DataFrame

Unnamed: 0,Job Applicant Name,Age,Gender,Race,Ethnicity,Resume,Job Roles,Job Description,Best Match
0,Daisuke Mori,29,Male,Mongoloid/Asian,Vietnamese,"Proficient in Injury Prevention, Motivation, N...",Fitness Coach,A Fitness Coach is responsible for helping cl...,0
1,Taichi Shimizu,31,Male,Mongoloid/Asian,Filipino,"Proficient in Healthcare, Pharmacology, Medica...",Physician,"Diagnose and treat illnesses, prescribe medica...",0
2,Sarah Martin,46,Female,White/Caucasian,Dutch,"Proficient in Forecasting, Financial Modelling...",Financial Analyst,"As a Financial Analyst, you will be responsibl...",0
3,Keith Hughes,43,Male,Negroid/Black,Caribbean,"Proficient in Budgeting, Supply Chain Optimiza...",Supply Chain Manager,A Supply Chain Manager oversees the entire sup...,1
4,James Davis,49,Male,White/Caucasian,English,"Proficient in Logistics, Negotiation, Procurem...",Supply Chain Manager,A Supply Chain Manager oversees the entire sup...,1


### Unnecessary columns, like applicant names, are removed as they are irrelevant to the job matching process.

In [None]:
# Remove the applicant-name column. errors='ignore' keeps this cell re-runnable:
# data3 is rebound to the result, so on a second run the column is already gone
# and a plain drop() would raise KeyError.
data3 = data3.drop(columns='Job Applicant Name', errors='ignore')

### We filter the dataset to include only resumes that are marked as a "Best Match," ensuring our analysis focuses on high-quality examples.

In [None]:
# Retain only the rows flagged as a best match ('Best Match' equal to 1).
data3 = data3.loc[data3['Best Match'].eq(1)]


# Job Category Generation

A predefined list of broad job categories is established to serve as our target labels.

In [4]:
# Target labels: the broad job categories each resume is matched against.
# The final entry is the catch-all bucket.
job_categories = [
    "Software Engineering / IT",
    "Data Science / AI / ML",
    "Marketing / Advertising / PR",
    "Sales / Business Development",
    "Finance / Accounting / Auditing",
    "Human Resources (HR) / Recruiting",
    "Healthcare / Medical",
    "Education / Teaching / Training",
    "Creative / Design / UX / UI",
    "Product Management",
    "Project / Program Management",
    "Customer Service / Support",
    "Operations / Supply Chain / Logistics",
    "Legal / Compliance / Regulatory",
    "Administration / Office Support",
    "Manufacturing / Engineering / Construction",
    "Retail / E-commerce / Buying / Merchandising",
    "Hospitality / Food / Travel",
    "Real Estate / Property Management",
    "Energy / Environment / Sustainability",
    "Consulting / Strategy",
    "Research / Science / R&D",
    "Others / Uncategorized",
]


 We utilize a pre-trained SentenceTransformer model (specifically 'all-MiniLM-L6-v2'). This model is crucial for converting text into numerical representations (embeddings) that capture semantic meaning. The core idea is to generate job categories based on the content of the resume descriptions, rather than predicting specific job roles directly. This approach provides more generalized and useful categories.

In [5]:
import torch

from sentence_transformers import SentenceTransformer, util

# Load the pre-trained 'all-MiniLM-L6-v2' SentenceTransformer once; this same `model`
# object is used later to encode both the job categories and the resume skills.
model = SentenceTransformer('all-MiniLM-L6-v2')  # Light, fast, and good



  from .autonotebook import tqdm as notebook_tqdm


 Each of the predefined job categories is converted into a numerical embedding using the loaded SentenceTransformer model. This allows us to compare them mathematically with resume embeddings.

In [6]:
# Embed every category label once, up front, and keep the result as a tensor.
# categorize_with_bert_debug compares each resume's averaged skill embedding
# against these category embeddings.
category_embeddings = model.encode(job_categories, convert_to_tensor=True)

# Skill Recognition and Categorization

Categorization Function: A function is developed to process a given resume's skills.

It parses the comma-separated skill string into a list of individual skills.

It then encodes these skills into numerical embeddings using the SentenceTransformer.

The average of these skill embeddings is computed to represent the overall skill profile of the resume.

Cosine Similarity for Matching: The cosine similarity is calculated between the resume's skill embedding and the embeddings of all predefined job categories. Cosine similarity measures the angle between two vectors, indicating how similar their directions are. A higher cosine similarity score indicates a stronger match.

The job category with the highest cosine similarity score is identified as the most probable match for the resume.

Synthetic Job Category Generation: This process effectively "synthetically" generates a job category for each resume by matching its skills to the most relevant predefined category.

Applying Categories to Data: The newly predicted job categories are then added as a new column to our dataset.

In [7]:
def categorize_with_bert_debug(skills_string, fallback_category="Others / Uncategorized"):
    """Map a comma-separated skills string to the closest predefined job category.

    Every skill is embedded with the notebook-level SentenceTransformer ``model``.
    The skill embeddings are averaged into a single vector, which is compared with
    ``category_embeddings`` by cosine similarity; the label of the highest-scoring
    category is returned.

    Parameters
    ----------
    skills_string : str
        Comma-separated skills, e.g. ``"Python, SQL, Cloud Computing"``.
        Surrounding whitespace and empty items are ignored.
    fallback_category : str, optional
        Label returned when no skill can be extracted from ``skills_string``.
        Defaults to ``"Others / Uncategorized"``, the catch-all entry of
        ``job_categories``, so the fallback is always one of the predefined labels.

    Returns
    -------
    str
        An entry of ``job_categories``, or ``fallback_category`` when the input is
        not a string (for example a missing value in a DataFrame column) or
        contains no skills.
    """
    # Missing cells reach .apply() as NaN/None rather than str; they have no skills.
    if not isinstance(skills_string, str):
        return fallback_category

    # Split on commas, trimming whitespace and discarding empty fragments.
    skills_list = [skill.strip() for skill in skills_string.split(',') if skill.strip()]
    if not skills_list:
        return fallback_category

    # One embedding per skill, averaged into a single profile vector.
    skill_embeddings = model.encode(skills_list, convert_to_tensor=True)
    # unsqueeze(0) restores the leading batch dimension removed by the mean.
    skill_embedding = torch.mean(skill_embeddings, dim=0).unsqueeze(0)

    # Similarity of the profile against every category embedding.
    similarities = util.cos_sim(skill_embedding, category_embeddings)

    best_match_idx = similarities.argmax().item()
    return job_categories[best_match_idx]
# Smoke-test the categoriser on one resume: the third row of data3 by position.
result = categorize_with_bert_debug(data3.iloc[2]['Resume'])
print(f"Predicted category: {result}")


Predicted category: Manufacturing / Engineering / Construction


In [9]:
# Label every resume with its closest predefined category. assign() returns a new
# frame instead of writing a column into data3 in place: data3 is the result of a
# boolean filter, and in-place column assignment on such a frame triggers pandas'
# SettingWithCopyWarning.
data3 = data3.assign(job_category=data3['Resume'].apply(categorize_with_bert_debug))


In [8]:
# Reload the categorised resumes from disk when a saved copy exists. No visible
# cell writes 'enhanced_resumes.csv', so on a fresh checkout the read would
# fail; in that case keep the frame categorised above and save it so later runs can
# load it.
try:
    data3 = pd.read_csv('enhanced_resumes.csv')
except FileNotFoundError:
    # NOTE(review): assumes 'enhanced_resumes.csv' is meant to hold data3 together
    # with its job_category column — confirm.
    # Reset to a 0..n-1 index so the frame matches what read_csv would return.
    data3 = data3.reset_index(drop=True)
    data3.to_csv('enhanced_resumes.csv', index=False)

# Top Job Role Prediction

Importing CountVectorizer: We import CountVectorizer from sklearn.feature_extraction.text. This tool converts a collection of text documents into a matrix of token counts, effectively representing text as numerical data.

Fitting CountVectorizer: The CountVectorizer is fitted to the 'Resume' column of our filtered dataset. This step builds a vocabulary of all unique words in the resumes (excluding common English stop words) and prepares the vectorizer to transform new text.

Calculating Cosine Similarity for Resumes: Cosine similarity is then applied to the vectorized resumes. This creates a similarity matrix where each entry represents the similarity between two resumes based on their word counts.

Prediction Function: A function is created to:

Take a user's input (e.g., a resume or a set of skills).

Vectorize this input using the previously fitted CountVectorizer.

Calculate the cosine similarity between the user's input vector and every existing resume vector in the count matrix. (The resume-to-resume similarity matrix computed above is not used for this step.)

Identify the top five unique job categories and their corresponding job descriptions that best match the user's input, based on these similarity scores.

The results are then sorted by their ranking (similarity score).

In [9]:
# Build a bag-of-words vocabulary from the resumes, with English stop words removed.
cv = CountVectorizer(stop_words='english')
# One row of token counts per resume, in the row order of data3; the prediction
# function relies on this when it maps a matrix row position back via data3.iloc.
count_matrix = cv.fit_transform(data3["Resume"])

In [10]:
# Pairwise cosine similarity between all resumes, displayed for inspection.
# NOTE(review): get_unique_job_categories computes its own similarities against
# count_matrix and does not read cosine_sim — confirm whether this matrix is still needed.
cosine_sim = cosine_similarity(count_matrix)
cosine_sim

array([[1.        , 0.786991  , 0.64684316, ..., 0.65673607, 0.52363494,
        0.52363494],
       [0.786991  , 1.        , 0.50097943, ..., 0.66759195, 0.53229065,
        0.53229065],
       [0.64684316, 0.50097943, 1.        , ..., 0.52807869, 0.5       ,
        0.52941176],
       ...,
       [0.65673607, 0.66759195, 0.52807869, ..., 1.        , 0.62709344,
        0.56108361],
       [0.52363494, 0.53229065, 0.5       , ..., 0.62709344, 1.        ,
        0.52941176],
       [0.52363494, 0.53229065, 0.52941176, ..., 0.56108361, 0.52941176,
        1.        ]], shape=(4850, 4850))

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_unique_job_categories(user_input, top_n=5):
    """Return the best-matching, mutually distinct job categories for a skills text.

    The input is vectorised with the fitted ``cv`` and scored against every resume
    row in ``count_matrix`` by cosine similarity. Resumes are then visited from
    most to least similar, and a (category, description) pair is kept only when
    neither its category nor its description has been kept already.

    Parameters
    ----------
    user_input : str
        Free text describing skills or a resume.
    top_n : int, optional
        Maximum number of pairs to return (default 5).

    Returns
    -------
    list[tuple[str, str]]
        ``(job_category, job_description)`` pairs ordered from best to worst match.
        The list may hold fewer than ``top_n`` pairs, and is empty when the input
        is blank, is not a string, or has no words in common with any resume.
    """
    # Nothing to match: blank / non-text input, or a non-positive limit.
    if not isinstance(user_input, str) or not user_input.strip() or top_n <= 0:
        return []

    # Project the input onto the vocabulary learned from the resumes.
    user_vector = cv.transform([user_input])

    # One similarity score per resume, in the row order of data3.
    similarities = cosine_similarity(user_vector, count_matrix).flatten()

    # Pull both columns out once; data3.iloc[idx] would build a whole row object
    # on every loop iteration.
    categories = data3['job_category'].to_numpy()
    descriptions = data3['Job Description'].to_numpy()

    seen_categories = set()
    seen_descriptions = set()
    results = []

    # Visit resumes from most to least similar.
    for idx in similarities.argsort()[::-1]:
        # Scores are in descending order, so once one is zero every remaining
        # resume shares no vocabulary with the input; returning those would
        # present arbitrary categories as matches.
        if similarities[idx] <= 0:
            break

        category = categories[idx]
        description = descriptions[idx]
        if category in seen_categories or description in seen_descriptions:
            continue

        seen_categories.add(category)
        seen_descriptions.add(description)
        results.append((category, description))

        if len(results) == top_n:
            break

    return results


## Testing and Validation

In [24]:
# Example query: a free-text, comma-separated list of skills.
user_input = "Python, Machine Learning, Data Analysis, SQL, Cloud Computing, Software Development, Project Management, Team Leadership"
output = get_unique_job_categories(user_input)
output  # list of (job_category, job_description) tuples, most similar resume first

[('Data Science / AI / ML',
  'As a Machine Learning Engineer, you will design and implement machine learning algorithms that allow systems to learn from data and improve over time. This role requires strong programming skills, an understanding of statistical modeling, and the ability to work with large datasets. You will collaborate with data scientists and software engineers to develop predictive models and automate decision-making processes. Machine Learning Engineers play a key role in developing AI systems that have applications in industries ranging from finance to healthcare, making your work essential for driving technological innovation.'),
 ('Project / Program Management',
  'As a Business Analyst, you will be responsible for assessing business processes and recommending strategies for improving efficiency and profitability. You will collect and analyze data, identify trends, and work with various departments to implement solutions that drive business success. This role requi