In [2]:
import kagglehub
import pandas as pd
import os
import pandas as pd
from PyPDF2 import PdfReader
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [30]:
# Download datasets from kaggle
path_jd = kagglehub.dataset_download("vaghefi/indeed-jobs")
path_resumes = kagglehub.dataset_download("sauravsolanki/hire-a-perfect-machine-learning-engineer")

print(f"Job Descriptions Path: {path_jd}")
print(f"Resumes Path: {path_resumes}")


Job Descriptions Path: /Users/owaiskamdar/.cache/kagglehub/datasets/vaghefi/indeed-jobs/versions/1
Resumes Path: /Users/owaiskamdar/.cache/kagglehub/datasets/sauravsolanki/hire-a-perfect-machine-learning-engineer/versions/1


In [45]:
# define roots for jobs
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
RAW_JOBS_PATH = os.path.join(PROJECT_ROOT, "data/raw/jobs")

# Ensure directories exist
os.makedirs(RAW_JOBS_PATH, exist_ok=True)

# Load job descriptions CSV
job_descriptions = pd.read_csv(path_jd + "/indeed_jobs.csv")

# Save the CSV file in the correct directory
job_descriptions.to_csv(os.path.join(RAW_JOBS_PATH, "job_descriptions.csv"), index=False)

print(f"Job descriptions CSV saved at: {RAW_JOBS_PATH}/job_descriptions.csv")

# Preview dataset
job_descriptions.head()


Job descriptions CSV saved at: /Users/owaiskamdar/Desktop/resume_optimizer/lllm-resume-optimizer/data/raw/jobs/job_descriptions.csv


Unnamed: 0,title,description,city,state,zipcode,salary,company,rating,reviews
0,Data Scientist,About Live Objects\nLive Objects delivers cont...,Palo Alto,CA,94301.0,,LIVE OBJECTS,,
1,"Senior Data Scientist, FP&A","The Senior Data Scientist, FP&A role is primar...",Orrville,OH,44667.0,,The J. M. Smucker Company,3.8,393.0
2,BI Developer (Tableau),**U.S. Citizens and those authorized to work i...,Charlotte,NC,28202.0,"$97,000 a year",Vaco,3.7,272.0
3,Search & Information Retrieval Engineer / Scie...,"Summary\nPosted: Jul 29, 2020\nRole Number:200...",Santa Clara Valley,CA,95014.0,,Apple,4.2,9782.0
4,Machine Learning Engineer,"At Sisu, we're building a software platform th...",San Francisco,CA,,,Sisu,4.4,8.0


In [46]:
# path for resumes
RAW_RESUMES_PATH = os.path.join(PROJECT_ROOT, "data/raw/resumes")

os.makedirs(RAW_RESUMES_PATH, exist_ok=True)

# source folder where kagglehub stores pdfs
pdf_folder = path_resumes + "/HireAMLE/dataset/trainResumes"
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")]

#save pdfs
for pdf_file in tqdm(pdf_files, desc="Downloading PDFs to /data/raw/resumes"):
    source_path = os.path.join(pdf_folder, pdf_file)
    destination_path = os.path.join(RAW_RESUMES_PATH, pdf_file)
    
    if not os.path.exists(destination_path):  # Avoid redundant downloads
        with open(source_path, "rb") as src_file, open(destination_path, "wb") as dest_file:
            dest_file.write(src_file.read())

print(f"All resumes (PDFs) have been downloaded and saved to: {RAW_RESUMES_PATH}")

# Count number of PDFs
print(f"Total Resumes Downloaded: {len(pdf_files)}")


Downloading PDFs to /data/raw/resumes: 100%|██████████| 90/90 [00:00<00:00, 78333.13it/s]

All resumes (PDFs) have been downloaded and saved to: /Users/owaiskamdar/Desktop/resume_optimizer/lllm-resume-optimizer/data/raw/resumes
Total Resumes Downloaded: 90





In [47]:
# extract text from pdfs
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    try:
        reader = PdfReader(pdf_path)
        text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
        return text.strip()
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return None

# get list of pdfs
pdf_files = [f for f in os.listdir(RAW_RESUMES_PATH) if f.endswith(".pdf")]

# extract text from each PDF and store it in a DataFrame
resume_data = []
for pdf_file in tqdm(pdf_files, desc="Extracting text from PDFs"):
    pdf_path = os.path.join(RAW_RESUMES_PATH, pdf_file)
    resume_text = extract_text_from_pdf(pdf_path)
    resume_data.append({"filename": pdf_file, "resume_text": resume_text})

# convert to DataFrame
df_resumes = pd.DataFrame(resume_data)

# Save processed resumes
PROCESSED_RESUMES_PATH = os.path.join(PROJECT_ROOT, "data/processed/resumes")
os.makedirs(PROCESSED_RESUMES_PATH, exist_ok=True)
df_resumes.to_csv(os.path.join(PROCESSED_RESUMES_PATH, "processed_resumes.csv"), index=False)

# Preview extracted resumes
df_resumes.head()


Extracting text from PDFs: 100%|██████████| 90/90 [00:01<00:00, 51.93it/s]


Unnamed: 0,filename,resume_text
0,candidate_070.pdf,"Publicis Sapient\nDATA PROGRAMMER, JAN 2019 - ..."
1,candidate_064.pdf,DAMIAN ROSSIER\nPROFILE\nExpertise EDA modeler...
2,candidate_138.pdf,One97 Communications Limited \nData Scientist ...
3,candidate_104.pdf,Sanrachna SGT University \nDeputy Data Analyst...
4,candidate_110.pdf,BILL CLIFFORD\nPROFILE\nI am an experienced an...


In [48]:
# load job descriptions
job_descriptions = pd.read_csv(os.path.join(RAW_JOBS_PATH, "job_descriptions.csv"))

# format job descriptions
job_descriptions["job_description"] = "**TITLE**: " + job_descriptions["title"] + \
                                      " **DESCRIPTIONS** " + job_descriptions["description"] + \
                                      " **COMPANY** " + job_descriptions["company"]

# Drop unnecessary columns
job_descriptions = job_descriptions.drop(columns=["title", "description", "company", 
                                                  "city", "state", "zipcode", "salary", "rating", "reviews"])

# save processed job descriptions
PROCESSED_JOBS_PATH = os.path.join(PROJECT_ROOT, "data/processed/jobs")
os.makedirs(PROCESSED_JOBS_PATH, exist_ok=True)
job_descriptions.to_csv(os.path.join(PROCESSED_JOBS_PATH, "processed_job_descriptions.csv"), index=False)
job_descriptions.head()


Unnamed: 0,job_description
0,**TITLE**: Data Scientist **DESCRIPTIONS** Abo...
1,"**TITLE**: Senior Data Scientist, FP&A **DESCR..."
2,**TITLE**: BI Developer (Tableau) **DESCRIPTIO...
3,**TITLE**: Search & Information Retrieval Engi...
4,**TITLE**: Machine Learning Engineer **DESCRIP...


In [49]:
job_descriptions_sampled = job_descriptions.sample(n=2000, random_state=42).reset_index(drop=True)

# Preview sample
job_descriptions_sampled.head()


Unnamed: 0,job_description
0,**TITLE**: Product Manager - Computer Vision *...
1,**TITLE**: Machine Learning Software Engineer ...
2,**TITLE**: Data Engineer **DESCRIPTIONS** Data...
3,**TITLE**: Sr. Data Engineer **DESCRIPTIONS** ...
4,**TITLE**: Machine Learning Engineer **DESCRIP...


In [59]:
# load the processed resumes dataset
df_resumes = pd.read_csv(os.path.join(PROCESSED_RESUMES_PATH, "processed_resumes.csv"))

# check the number of resumes
num_resumes = len(df_resumes)
print(f"Total resumes available: {num_resumes}")

# 2000 rows
repeated_resumes = [df_resumes.iloc[i % num_resumes] for i in range(2000)]
df_resumes_expanded = pd.DataFrame(repeated_resumes).reset_index(drop=True)

# Preview repeated resumes
df_resumes_expanded.head()



Total resumes available: 90


Unnamed: 0,filename,resume_text
0,candidate_070.pdf,"Publicis Sapient\nDATA PROGRAMMER, JAN 2019 - ..."
1,candidate_064.pdf,DAMIAN ROSSIER\nPROFILE\nExpertise EDA modeler...
2,candidate_138.pdf,One97 Communications Limited \nData Scientist ...
3,candidate_104.pdf,Sanrachna SGT University \nDeputy Data Analyst...
4,candidate_110.pdf,BILL CLIFFORD\nPROFILE\nI am an experienced an...


In [60]:

# count rows
len(df_resumes_expanded)    


2000

In [61]:
# merge both
final_df = job_descriptions_sampled.copy()
final_df["resume_text"] = df_resumes_expanded["resume_text"].values  # Assign resumes

# Preview the merged dataset
final_df.head()



Unnamed: 0,job_description,resume_text
0,**TITLE**: Product Manager - Computer Vision *...,"Publicis Sapient\nDATA PROGRAMMER, JAN 2019 - ..."
1,**TITLE**: Machine Learning Software Engineer ...,DAMIAN ROSSIER\nPROFILE\nExpertise EDA modeler...
2,**TITLE**: Data Engineer **DESCRIPTIONS** Data...,One97 Communications Limited \nData Scientist ...
3,**TITLE**: Sr. Data Engineer **DESCRIPTIONS** ...,Sanrachna SGT University \nDeputy Data Analyst...
4,**TITLE**: Machine Learning Engineer **DESCRIP...,BILL CLIFFORD\nPROFILE\nI am an experienced an...


In [62]:

# count rows
len(final_df)    


2000

In [63]:
# Define final dataset path
FINAL_DATASET_PATH = os.path.join(PROJECT_ROOT, "data/final/matched_resumes_jobs.csv")

# Ensure the directory exists
os.makedirs(os.path.dirname(FINAL_DATASET_PATH), exist_ok=True)

# Save final dataset
final_df.to_csv(FINAL_DATASET_PATH, index=False)

print(f"Final dataset saved at: {FINAL_DATASET_PATH}")

# Display a sample
final_df.head()

Final dataset saved at: /Users/owaiskamdar/Desktop/resume_optimizer/lllm-resume-optimizer/data/final/matched_resumes_jobs.csv


Unnamed: 0,job_description,resume_text
0,**TITLE**: Product Manager - Computer Vision *...,"Publicis Sapient\nDATA PROGRAMMER, JAN 2019 - ..."
1,**TITLE**: Machine Learning Software Engineer ...,DAMIAN ROSSIER\nPROFILE\nExpertise EDA modeler...
2,**TITLE**: Data Engineer **DESCRIPTIONS** Data...,One97 Communications Limited \nData Scientist ...
3,**TITLE**: Sr. Data Engineer **DESCRIPTIONS** ...,Sanrachna SGT University \nDeputy Data Analyst...
4,**TITLE**: Machine Learning Engineer **DESCRIP...,BILL CLIFFORD\nPROFILE\nI am an experienced an...


In [66]:
final_df["resume_text"][0]



'Publicis Sapient\nDATA PROGRAMMER, JAN 2019 - TILL DATE\nSetting up processes for data management, template\nanalytical modules/deliverables; improving processes\nwith a focus on automation of data feeding to ML\nmodels.\nPublicis Sapient\nANALYST INTERN, OCT 2018 - JAN 2018\nWorked on air pollution related Disease Analysis.\nB.Tech (Electrical), NIST Kottam, 2019WORK EXPERIENCE\nEDUCATIONGRANT ABBERNS\nDATA MANAGER\nExpertise in identifying & developing innovative and out-of-the-box analytical solutions using suitable machine learning\nalgorithms and statistical methods to achieve actionable insights & optimize business decisions Skilled in assisting in the\nstrategic direction of the company by identifying opportunities in large, rich data sets and creating and implementing data-\ndriven strategies that fuel growth Capable in driving data-driven decisionmaking, stakeholder management, steering\nanalytics practice and deciding best-fit commercial model options for different sourcing 

In [67]:
final_df["job_description"][0]

"**TITLE**: Product Manager - Computer Vision **DESCRIPTIONS** Want to join a fun, creative company that is on the cutting edge of amazing technologies? NVIDIA is developing groundbreaking solutions in some of the world’s most exciting technology areas including Computer Vision, Virtual Reality, Artificial Intelligence, Deep Learning and Autonomous Vehicles.\nWe are looking for a Product Manager to join our NVIDIA Product Management team to define and develop products to grow our AI and Computer Vision business. As a Product Manager, you will mold NVIDIA state of the art AI and computer vision technology into next generation products. You will work with forward-thinking people in engineering, operations and marketing to lead every phase of product, from conception to obsolescence.\nWhat you'll be doing:\nCollect requirements to define future products, including competitive analysis and customer feedback.\nCollaborate with research, engineering, product, and campaign teams in driving th