In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import os

# Input locations and the pretrained checkpoint shared by tokenizer and model
JOB_DESCRIPTIONS_CSV = r"C:\Users\Prahar\Downloads\job_descriptions.csv"
CV_DETAILS_CSV = r"C:\Users\Prahar\Downloads\CV_Details(1).csv"
PRETRAINED_NAME = "distilbert-base-uncased"

# Read the job descriptions and the CV details tables into DataFrames
job_descriptions_df = pd.read_csv(JOB_DESCRIPTIONS_CSV)
cv_details_df = pd.read_csv(CV_DETAILS_CSV)

# Instantiate the DistilBERT tokenizer and model from the same checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_NAME)
model = DistilBertModel.from_pretrained(PRETRAINED_NAME)


# Define a function to tokenize and embed text
def tokenize_and_embed(text):
    """Embed text with the module-level DistilBERT tokenizer and model.

    The embedding is the mean of the last hidden states over the real
    (non-padding) tokens of each input, as indicated by the tokenizer's
    attention mask.

    Args:
        text: A single string, or a list of strings embedded as one padded
            batch. Inputs longer than 512 tokens are truncated.

    Returns:
        A NumPy array of pooled embeddings with size-1 dimensions squeezed
        out: 1-D for a single string (or a one-element list), 2-D with one
        row per input for a longer list.
    """
    # The tokenizer call is the same for a single string and a list of
    # strings, so no type-based branching is needed.
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512,
        add_special_tokens=True,
    )

    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # Weight each token by its attention-mask value so that padding added when
    # batching unequal-length strings does not dilute the mean. For a single
    # string the mask is all ones and this equals a plain mean over tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    token_sum = (hidden_states * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return (token_sum / token_count).squeeze().numpy()


# Clean and preprocess job descriptions
job_descriptions_df["job_description"] = job_descriptions_df["job_description"].astype(
    str
)
job_descriptions_df["job_description"] = job_descriptions_df["job_description"].apply(
    lambda x: " ".join(x.split())
)  # Remove extra spaces and line breaks

# Tokenize and embed job descriptions (one embedding row per job description)
job_description_embeddings = np.vstack(
    [tokenize_and_embed(text) for text in job_descriptions_df["job_description"]]
)

# Clean and preprocess CV details
cv_details_df["Education"] = cv_details_df["Education"].astype(str)
cv_details_df["Skills and Highlights"] = cv_details_df["Skills and Highlights"].astype(
    str
)

# Concatenate relevant columns for CVs
cv_details_df["cv_text"] = (
    cv_details_df["Job Role"]
    + " "
    + cv_details_df["Education"]
    + " "
    + cv_details_df["Skills and Highlights"]
)

# Drop rows that contain a missing value in any column. The surviving rows keep
# their original index labels, so everything below addresses CVs by position.
cv_details_df = cv_details_df.dropna()

# Tokenize and embed CV details (row k of cv_embeddings is the k-th remaining CV)
cv_embeddings = np.vstack(
    [tokenize_and_embed(text) for text in cv_details_df["cv_text"]]
)

# Calculate cosine similarities: rows are job descriptions, columns are CVs
cosine_similarities = cosine_similarity(job_description_embeddings, cv_embeddings)

# Positional views of the columns used for reporting. np.argsort returns
# positions into cv_embeddings, which are NOT index labels once dropna() has
# removed rows; looking them up as labels would pick the wrong CV or raise
# KeyError.
job_ids = job_descriptions_df["job_id"].to_numpy()
cv_file_names = cv_details_df["File Name"].to_numpy()
cv_job_roles = cv_details_df["Job Role"].to_numpy()

# Rank CVs and list the top 5 for each job description
ranking_rows = []
for i, job_description in enumerate(job_descriptions_df["job_description"]):
    job_desc_id = job_ids[i]
    similarities = cosine_similarities[i]
    # Sort ascending, reverse to get most similar first, keep at most five
    top_cv_indices = np.argsort(similarities)[::-1][:5]
    top_cvs = [
        f"{cv_file_names[index]} - {cv_job_roles[index]}" for index in top_cv_indices
    ]
    ranking_rows.append(
        {
            "job_id": job_desc_id,
            "job_description": job_description,
            "top_cvs": top_cvs,
        }
    )

    # Print the file name of the best-matching CV for this job description
    pdf_file_name = os.path.basename(cv_file_names[top_cv_indices[0]])
    print(
        f"Processing job description {job_desc_id} using CV from file: {pdf_file_name}"
    )

# Build the result table once instead of growing it row by row
result_df = pd.DataFrame(
    ranking_rows, columns=["job_id", "job_description", "top_cvs"]
)

# Save or display the result_df
result_df.to_csv("job_cv_ranking.csv", index=False)
