In [17]:
# ---------- Setup ----------
import os
import sys

import nltk
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# The `src` package sits one folder above this notebook's working directory.
# Register that folder on sys.path exactly once: the membership check keeps
# repeated runs of this cell from stacking duplicate entries.
_src_parent_dir = os.path.abspath("..")
if _src_parent_dir not in sys.path:
    sys.path.append(_src_parent_dir)

# Download stopwords (only first time). Done before the project imports below
# in case any of them needs the corpus at import time (assumption -- the
# modules' contents are not visible from this notebook).
nltk.download("stopwords")

# ---------- Import your project modules ----------
# Single import group; the earlier duplicate imports of clean_text and
# analyze_resume_jd were redundant.
from src.preprocess import clean_text
from src.feature_extraction import get_embeddings
from src.analyzer import analyze_resume_jd

# ---------- Load Data ----------
def _find_data_file(filenames, keyword, directory):
    """Pick the data file whose name contains ``keyword`` (case-insensitive).

    Args:
        filenames: Iterable of file names found in ``directory``.
        keyword: Lower-case substring to look for in each file name.
        directory: Directory the names came from; used only in the error text.

    Returns:
        The alphabetically first matching file name, so the choice does not
        depend on the (unspecified) order in which os.listdir returns entries.

    Raises:
        FileNotFoundError: If no file name contains ``keyword``.
    """
    matches = sorted(name for name in filenames if keyword in name.lower())
    if not matches:
        raise FileNotFoundError(
            f"No file with {keyword!r} in its name found in {directory!r} "
            f"(found: {list(filenames)})"
        )
    return matches[0]


project_root = os.path.abspath("..")
data_dir = os.path.join(project_root, "data")

files = os.listdir(data_dir)
print("Found files:", files)

# Paths
resumes_path = _find_data_file(files, "resume", data_dir)
jobs_path = _find_data_file(files, "job", data_dir)

resumes = pd.read_csv(os.path.join(data_dir, resumes_path))
jobs = pd.read_csv(os.path.join(data_dir, jobs_path))

print("Resumes shape:", resumes.shape)
print("Jobs shape:", jobs.shape)

# ✅ Define correct column names
RESUME_COLUMN = "Resume_str"
JOBDESC_COLUMN = "jobdescription"

# Fail here, with a readable message, rather than with a KeyError further down
# if the CSVs do not carry the expected text columns.
assert RESUME_COLUMN in resumes.columns, (
    f"{RESUME_COLUMN!r} not in resume columns: {resumes.columns.tolist()}"
)
assert JOBDESC_COLUMN in jobs.columns, (
    f"{JOBDESC_COLUMN!r} not in job columns: {jobs.columns.tolist()}"
)

# ---------- Sample Pair ----------
# Use the first resume and the first job description as a single test pair.
sample_resume, sample_jd = (
    resumes[RESUME_COLUMN].iloc[0],
    jobs[JOBDESC_COLUMN].iloc[0],
)

# Only the leading 200 characters of each text are echoed.
preview = slice(None, 200)
print("Sample Resume:", sample_resume[preview], "...")
print("Sample JD:", sample_jd[preview], "...")

# Run the project's analyzer on the pair and print its return value as-is.
result = analyze_resume_jd(sample_resume, sample_jd)
print("One-to-One Match Result:", result)

# ---------- Rank All Resumes for First JD ----------
# One analyzer call per resume, all against the job description at position 0.
jd = jobs[JOBDESC_COLUMN].iloc[0]

results = []
for i, res in enumerate(resumes[RESUME_COLUMN]):
    score = analyze_resume_jd(res, jd)
    results.append(
        dict(
            Resume_Index=i,
            Similarity_Score=score["similarity_score"],
            Feedback=score["feedback"],
        )
    )

# Highest score first; after the reset, the new 0..n-1 index is the rank.
ranked = pd.DataFrame(results)
ranked = ranked.sort_values("Similarity_Score", ascending=False)
ranked = ranked.reset_index(drop=True)

print("\nTop 5 Resumes for JD #0:")
print(ranked.head(5))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mr.Aj_222\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Found files: ['jobs.csv', 'resumes.csv']
Resumes shape: (2484, 4)
Jobs shape: (22000, 14)
Sample Resume:          HR ADMINISTRATOR/MARKETING ASSOCIATE

HR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management.   Resp ...
Sample JD: Job Description   Send me Jobs like this Qualifications: - == > 10th To Graduation & Any Skill: - == > Basic Computer Knowledge Job Requirement : - == > System or Laptop Type of job: - == > Full Time  ...
One-to-One Match Result: {'similarity_score': 58.59, 'feedback': 'Needs improvement'}

Top 5 Resumes for JD #0:
   Resume_Index  Similarity_Score    Feedback
0          1718             73.19  Good match
1           313             71.72  Good match
2            78             71.69  Good match
3            40             71.37  Good match
4           246             71.07  Good match


## Ranking All Resumes vs One JD

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Cosine similarity (0-1 scale) above which a resume is labelled "Good match".
GOOD_MATCH_THRESHOLD = 0.7

# `target_jd` is not assigned anywhere in the visible notebook, so on a fresh
# kernel this cell used to die with NameError. Keep an existing value if one is
# already defined; otherwise fall back to the first job description.
# NOTE(review): confirm which JD this cell is meant to rank against and
# replace this guard with an explicit assignment.
try:
    target_jd
except NameError:
    target_jd = jobs[JOBDESC_COLUMN].iloc[0]

# 1. Precompute embeddings
resume_texts = resumes[RESUME_COLUMN].apply(clean_text).tolist()
job_text = clean_text(target_jd)

print("Encoding resumes...")
resume_embeddings = get_embeddings(resume_texts)
print("Encoding job description...")
job_embedding = get_embeddings([job_text])[0]  # single vector

# 2. Compute similarity in one go
# Cast to float64 before scaling/rounding: rounding the lower-precision values
# produced display artefacts such as 71.910004 instead of 71.91.
similarities = np.asarray(
    cosine_similarity(resume_embeddings, [job_embedding]), dtype=np.float64
).flatten()

# 3. Build results DataFrame
ranked = pd.DataFrame({
    "Resume_Index": range(len(resume_texts)),
    "Similarity_Score": (similarities * 100).round(2),
    "Feedback": [
        "Good match" if s > GOOD_MATCH_THRESHOLD else "Needs improvement"
        for s in similarities
    ],
}).sort_values(by="Similarity_Score", ascending=False).reset_index(drop=True)

print("Top 10 Resumes for JD:")
print(ranked.head(10))


Encoding resumes...
Encoding job description...
Top 10 Resumes for JD:
   Resume_Index  Similarity_Score           Feedback
0           649         71.910004         Good match
1           587         71.800003         Good match
2          1376         71.419998         Good match
3          1359         71.199997         Good match
4          1402         70.540001         Good match
5          1368         70.470001         Good match
6          1465         70.169998         Good match
7          1389         69.919998  Needs improvement
8          1447         69.629997  Needs improvement
9           608         69.050003  Needs improvement


In [None]:
# List the column names of both loaded DataFrames, resumes first.
resume_columns = resumes.columns.tolist()
print("Resume CSV columns:", resume_columns)
job_columns = jobs.columns.tolist()
print("Jobs CSV columns:", job_columns)


Resume CSV columns: ['ID', 'Resume_str', 'Resume_html', 'Category']
Jobs CSV columns: ['company', 'education', 'experience', 'industry', 'jobdescription', 'jobid', 'joblocation_address', 'jobtitle', 'numberofpositions', 'payrate', 'postdate', 'site_name', 'skills', 'uniq_id']
