In [None]:
## Task 8 – Match resumes to job descriptions with sentence embeddings

In [None]:
# ============================
#  Imports
# ============================
import os
import re
import zipfile

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm


In [4]:
# ============================
#  Unzip datasets
# ============================
# Zip archives to unpack and the shared folder they are unpacked into (edit if needed)
dataset1_path = "dataset_1.zip"
dataset2_path = "dataset_2.zip"
extract_path = "./data_task8"

# The target folder must exist before extraction; reruns are harmless
os.makedirs(extract_path, exist_ok=True)

# Unpack each archive into the same target folder
for archive_path in (dataset1_path, dataset2_path):
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(path=extract_path)

print("✅ Datasets extracted to:", extract_path)


✅ Datasets extracted to: ./data_task8


In [None]:
# ============================
#  Load data (fixed)
# ============================
def _read_csv_with_fallback(path):
    """Read a CSV as UTF-8, retrying with latin1 when decoding fails.

    Args:
        path: Location of the CSV file on disk.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    try:
        return pd.read_csv(path, encoding="utf-8")
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding="latin1")


# Walk the extraction folder and pick up the resumes / jobs CSVs by file name
resumes_df = None
jobs_df = None

for root, _, files in os.walk(extract_path):
    for f in files:
        name = f.lower()
        # Only CSV files can be parsed here; skip anything else in the archives
        if not name.endswith(".csv"):
            continue
        file_path = os.path.join(root, f)
        if "resume" in name:
            resumes_df = _read_csv_with_fallback(file_path)
            print("✅ Resumes dataset loaded:", f)
        elif "job" in name:
            jobs_df = _read_csv_with_fallback(file_path)
            print("✅ Jobs dataset loaded:", f)

# Fail with an actionable message instead of an AttributeError on None below
not_found = [
    label
    for label, frame in (("resumes", resumes_df), ("jobs", jobs_df))
    if frame is None
]
if not_found:
    raise FileNotFoundError(
        f"No CSV found under {extract_path!r} for: {', '.join(not_found)} "
        "(expected 'resume' / 'job' in the file name)"
    )

print("Resumes shape:", resumes_df.shape)
print("Jobs shape:", jobs_df.shape)


✅ Jobs dataset loaded: job_descriptions.csv


In [None]:
# ============================
# Detect important columns
# ============================
def detect_column(df, candidates):
    """Return the first entry of ``candidates`` that is a column of ``df``.

    Args:
        df: Object exposing a ``columns`` collection (e.g. a DataFrame).
        candidates: Column names to try, in priority order.

    Returns:
        The first candidate present in ``df.columns``.

    Raises:
        KeyError: If none of the candidates is present.
    """
    not_found = object()
    chosen = next((name for name in candidates if name in df.columns), not_found)
    if chosen is not_found:
        raise KeyError(f"None of {candidates} found in dataset!")
    return chosen

# Column that holds the resume text
resume_col = detect_column(
    resumes_df,
    ["Resume", "Resume_str", "CV", "Text"],
)

# Column that holds the job description
job_col = detect_column(
    jobs_df,
    [
        "Job Description",
        "JobDescription",
        "Description",
        "Responsibilities",
        "RoleDescription",
    ],
)

# Column that holds the job title
job_title_col = detect_column(
    jobs_df,
    ["Job Title", "Role", "Title", "Position"],
)

# Report which columns were picked
print("✅ Using resume column:", resume_col)
print("✅ Using job column:", job_col)
print("✅ Using job title column:", job_title_col)


In [None]:
# ============================
#  Skill extractor (simple rule-based)
# ============================
# Lower-case skill keywords, in the order they are reported
_SKILL_KEYWORDS = [
    "python", "java", "sql", "excel", "power bi", "tableau",
    "ml", "machine learning", "deep learning", "nlp", "c++",
    "aws", "azure", "spark", "hadoop", "keras", "pytorch", "tensorflow",
]


def _compile_skill_pattern(keyword):
    """Build a regex that matches ``keyword`` only as a whole token."""
    # Allow any run of whitespace between the words of a multi-word skill
    body = r"\s+".join(re.escape(part) for part in keyword.split())
    # Lookarounds instead of \b so that keywords ending in symbols ("c++") still work
    return re.compile(rf"(?<!\w){body}(?!\w)")


_SKILL_PATTERNS = [(kw, _compile_skill_pattern(kw)) for kw in _SKILL_KEYWORDS]


def extract_skills(text):
    """Return the known skill keywords mentioned in ``text``.

    Matching is case-insensitive and on whole tokens, so "excellent" does not
    count as "excel", "html" does not count as "ml", and "javascript" does not
    count as "java".

    Args:
        text: Free text to scan. Anything that is not a ``str`` yields ``[]``.

    Returns:
        A list of matched keywords (lower-case), in keyword-list order.
    """
    if not isinstance(text, str):
        return []
    text_lower = text.lower()
    return [kw for kw, pattern in _SKILL_PATTERNS if pattern.search(text_lower)]


In [None]:
# ============================
#  Embedding Model
# ============================
# Instantiate the sentence-transformers encoder from the "all-MiniLM-L6-v2" checkpoint.
# NOTE(review): the first run presumably downloads the weights, so network access may be needed — confirm.
model = SentenceTransformer("all-MiniLM-L6-v2")
print("✅ Model loaded")


In [None]:
# ============================
#  Precompute Embeddings
# ============================
# Missing cells become empty strings; a bare astype(str) would embed the literal text "nan"
resumes_texts = resumes_df[resume_col].fillna("").astype(str).tolist()

job_desc_text = jobs_df[job_col].fillna("").astype(str)
if "skills" in jobs_df.columns:
    # Append the skills column so it contributes to the job embedding
    job_skill_text = jobs_df["skills"].fillna("").astype(str)
    jobs_texts = (job_desc_text + " " + job_skill_text).tolist()
else:
    # No skills column: fall back to the description alone
    # (DataFrame.get("skills", "") would return a plain str here, which has no .astype)
    jobs_texts = job_desc_text.tolist()

# One embedding row per resume / per job, in DataFrame row order
resume_embeddings = model.encode(resumes_texts, batch_size=32, show_progress_bar=True)
job_embeddings = model.encode(jobs_texts, batch_size=32, show_progress_bar=True)

print("✅ Embeddings created:", resume_embeddings.shape, job_embeddings.shape)


In [None]:
# ============================
#  Matching & Ranking 
# ============================
# Rows are resumes, columns are jobs (same order as the embedding inputs)
similarity_matrix = cosine_similarity(resume_embeddings, job_embeddings)

results = []
N = 5  # top resumes per job

job_rows = tqdm(jobs_df.iterrows(), total=len(jobs_df), desc="Matching Jobs")
for j_pos, (j_idx, job) in enumerate(job_rows):
    # Index the matrix by row position: j_idx is an index *label* and only
    # coincides with the position when jobs_df has a default RangeIndex
    sims = similarity_matrix[:, j_pos]
    top_indices = np.argsort(sims)[::-1][:N]

    # The job's skills do not depend on the candidate, so compute them once per job;
    # str() keeps a missing (NaN) description from raising TypeError on concatenation
    jd_skills = set(extract_skills(str(job[job_col]) + " " + str(job.get("skills", ""))))

    for rank, r_idx in enumerate(top_indices, start=1):
        resume_row = resumes_df.iloc[r_idx]
        resume_text = resume_row[resume_col]

        # Cosine similarity expressed as a percentage (plain Python float)
        score = round(float(sims[r_idx]) * 100, 2)

        # Skill overlap between this resume and the job
        resume_skills = set(extract_skills(resume_text))
        matched = resume_skills & jd_skills
        missing = jd_skills - resume_skills

        results.append({
            "JobID": job.get("Job Id", j_idx),
            "JobRole": job.get(job_title_col, f"Job_{j_idx}"),
            "CandidateID": resume_row.get("CandidateID", r_idx),
            "ResumeSnippet": str(resume_text)[:150] + "...",
            "Score": score,
            # Sorted so the CSV is identical across runs (set order is not stable)
            "MatchedSkills": ", ".join(sorted(matched)),
            "MissingSkills": ", ".join(sorted(missing)),
            "Rank": rank
        })

top_matches_df = pd.DataFrame(results)
top_matches_df.to_csv("topN_job_matches.csv", index=False)

print(f"✅ Matching completed! Top {N} resumes saved to topN_job_matches.csv")
top_matches_df.head(10)


In [None]:
## Bonus – Precompute and save job embeddings

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import joblib

# NOTE: this cell rebinds `model` and `job_embeddings` from the earlier cells;
# rerun the embedding cell before rerunning the matching cell afterwards.

# Load job dataset
jobs_raw = pd.read_csv("dataset_2.csv")   # your job dataset file
text_cols = ['Job Title', 'Job Description', 'skills']

# Keep only complete rows; reset the index so row i lines up with embedding row i
jobs = jobs_raw[text_cols].dropna().reset_index(drop=True)

# .assign builds a new frame (avoids SettingWithCopyWarning on the filtered copy);
# astype(str) keeps the concatenation valid even if a column was parsed as non-string
jobs = jobs.assign(
    combined=(
        jobs['Job Title'].astype(str)
        + " " + jobs['Job Description'].astype(str)
        + " " + jobs['skills'].astype(str)
    )
)

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode job descriptions
job_embeddings = model.encode(jobs['combined'].tolist(), show_progress_bar=True)

# Save jobs + embeddings + model
joblib.dump(jobs, "jobs_dataset.pkl")
joblib.dump(job_embeddings, "job_embeddings.pkl")
model.save("resume_model")
print("✅ Jobs and embeddings saved!")
