In [8]:
import platform
print(platform. python_version())

3.13.2


### Some important notes here: the headlines begin with "steps" are required for user interactions

# 1. Environment Setup and Imports

So for this project, we need you to go to https://www.kaggle.com/datasets/saugataroyarghya/resume-dataset to download the dataset in this same file directory for ai training purpose

### imports

In [2]:
#required imports for running the notebook, use pip to download if there is any missing notebook
import os
import json
import re
import glob
import pandas as pd
import sys
from datasets import Dataset
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)

# Optional dependencies for PDF/DOCX parsing
try:
    import PyPDF2
except ImportError:
    PyPDF2 = None

try:
    import docx
except ImportError:
    docx = None

# 2. Data Collection and Preprocessing
   - Upload user's own resume and job description file
   - Data cleaning: Handling missing values, text preprocessing, and data labeling
   - Save processed data to data/processed/

In [3]:
class DataPreprocessor:
    """
    DataPreprocessor handles loading raw resumes in multiple formats,
    cleaning text, extracting structured sections, and saving processed output.
    """

    def __init__(self, input_dir: str, output_dir: str):
        """
        Args:
            input_dir: Directory path to scan by default or empty to prompt for paths
            output_dir: Directory where processed JSON will be written.
        """
        self.input_dir = input_dir
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def _collect_paths(self, default_dir: str) -> list:
        """
        Collect resume file paths from default_dir or via user input.
        Accepts CSV, PDF, DOCX, TXT. If input is not a valid path, performs a fuzzy search
        in the current working directory for matching filenames.
        """
        paths = []
        # 1) Scan default directory if it exists
        if os.path.isdir(default_dir):
            for ext in ("*.csv", "*.pdf", "*.docx", "*.txt"):
                paths.extend(glob.glob(os.path.join(default_dir, ext)))
        # 2) If still no files, prompt user with instructions
        if not paths:
            print(f"No resume files found in '{default_dir}'.")
            print("Please enter either:")
            print("  • An absolute or relative file path (e.g. C:\\Users\\Rick\\Desktop\\resume.pdf or ~/resumes/resume.pdf)")
            print("  • A filename to fuzzy-search your workspace (e.g. 'resume.pdf')")
            user_input = input("Enter directory path, file path, or filename: ")
            entries = [e.strip().strip('"\'') for e in user_input.split(',') if e.strip()]
            for entry in entries:
                entry_path = os.path.abspath(os.path.expanduser(entry))
                if os.path.isdir(entry_path):
                    for ext in ("*.csv", "*.pdf", "*.docx", "*.txt"):
                        paths.extend(glob.glob(os.path.join(entry_path, ext)))
                elif os.path.isfile(entry_path):
                    paths.append(entry_path)
                else:
                    fuzzy = glob.glob(f"**/{entry}", recursive=True)
                    if fuzzy:
                        print(f"Fuzzy match found for '{entry}': {fuzzy}")
                        paths.extend([os.path.abspath(p) for p in fuzzy])
                    else:
                        print(f"Warning: '{entry}' not found or matched. Skipping.")
        return paths

    def load_resumes(self) -> list:
        """
        Load resumes from collected paths.
        Returns a list of dicts: {'id': str, 'raw_text': str}.
        """
        paths = self._collect_paths(self.input_dir)
        if not paths:
            raise RuntimeError("No valid resume file paths provided. Aborting.")

        records = []
        for path in paths:
            ext = os.path.splitext(path)[1].lower()
            basename = os.path.basename(path)
            try:
                if ext == ".csv":
                    df = pd.read_csv(path)
                    if "resume_text" not in df.columns:
                        raise ValueError(f"CSV '{basename}' is missing the 'resume_text' column.")
                    for idx, row in df.iterrows():
                        records.append({
                            'id': row.get('id', f"{basename}_{idx}"),
                            'raw_text': str(row['resume_text'])
                        })
                elif ext == ".pdf":
                    if PyPDF2 is None:
                        print(f"Warning: PyPDF2 not installed; skipping PDF file '{basename}'.")
                        continue
                    text_pages = []
                    with open(path, "rb") as f:
                        reader = PyPDF2.PdfReader(f)
                        for page in reader.pages:
                            text_pages.append(page.extract_text() or '')
                    records.append({'id': basename, 'raw_text': '\n'.join(text_pages)})
                elif ext == ".docx":
                    if docx is None:
                        print(f"Warning: python-docx not installed; skipping DOCX file '{basename}'.")
                        continue
                    document = docx.Document(path)
                    paragraphs = [p.text for p in document.paragraphs]
                    records.append({'id': basename, 'raw_text': '\n'.join(paragraphs)})
                elif ext == ".txt":
                    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                        records.append({'id': basename, 'raw_text': f.read()})
                else:
                    print(f"Skipping unsupported file type: {basename}")
            except Exception as e:
                print(f"Error processing '{basename}': {e}")
        return records

    def clean_text(self, text: str) -> str:
        """
        Normalize text:
          - lowercase
          - remove non-ASCII characters
          - collapse whitespace
        """
        text = text.lower()
        text = re.sub(r"[^\x00-\x7f]", "", text)
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    def extract_sections(self, text: str) -> dict:
        """
        Placeholder for splitting text into sections like 'education', 'experience'.
        TODO: implement via regex or NLP-based heading detection.
        """
        return {}

    def process(self):
        """
        Full pipeline:
          1) Load resumes
          2) Clean text
          3) Extract sections
          4) Save processed data as JSON
        """
        records = self.load_resumes()
        processed = []
        for rec in records:
            clean = self.clean_text(rec['raw_text'])
            sections = self.extract_sections(clean)
            entry = {'id': rec['id'], 'clean_text': clean, **sections}
            processed.append(entry)

        out_path = os.path.join(self.output_dir, 'processed_resumes.json')
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(processed, f, ensure_ascii=False, indent=2)
        print(f"Processed {len(processed)} resumes → {out_path}")

## Step1: Ask User to upload their Resume using required file path

In [4]:
if __name__ == '__main__':
    input_dir = 'data/raw'
    output_dir = 'data/processed'
    dp = DataPreprocessor(input_dir, output_dir)
    dp.process()

No resume files found in 'data/raw'.
Please enter either:
  • An absolute or relative file path (e.g. C:\Users\Rick\Desktop\resume.pdf or ~/resumes/resume.pdf)
  • A filename to fuzzy-search your workspace (e.g. 'resume.pdf')


Enter directory path, file path, or filename:  "C:\Users\Rick\Desktop\5293\final_project_4\Jack_TotallyUnqualified_Resume.pdf"


Processed 1 resumes → data/processed\processed_resumes.json


In [5]:
# Cell 2: Input job description (supports .txt, .pdf, .docx, or URL)
import os
import sys
import glob

# Function to load job description from file or URL
def load_job_description(source: str) -> str:
    """
    Load job description text from a local file (.txt|.pdf|.docx) or URL.
    Performs fuzzy substring search if the direct path fails.
    """
    source = source.strip().strip('"\'')
    # URL
    if source.startswith(('http://', 'https://')):
        import requests
        r = requests.get(source)
        r.raise_for_status()
        return r.text
    # Local
    path = os.path.abspath(os.path.expanduser(source))
    if os.path.isfile(path):
        ext = os.path.splitext(path)[1].lower()
        if ext == '.txt':
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        elif ext == '.pdf':
            from PyPDF2 import PdfReader
            reader = PdfReader(path)
            return "\n".join([page.extract_text() or '' for page in reader.pages])
        elif ext == '.docx':
            import docx
            doc = docx.Document(path)
            return "\n".join([p.text for p in doc.paragraphs])
        else:
            raise ValueError(f"Unsupported extension: {ext}")
    # Fuzzy search
    name = os.path.basename(source)
    print(f"'{source}' not found; fuzzy searching for '*{name}*'...")
    matches = glob.glob(f"**/*{name}*", recursive=True)
    if matches:
        print(f"Found: {matches[0]}")
        return load_job_description(matches[0])
    raise FileNotFoundError(f"Source '{source}' not found.")

## Step2: Ask User to put their own job description with required format needed

In [6]:
print("\nEnter job description (.txt/.pdf/.docx path or URL):")
src = input()
job_description = load_job_description(src)


Enter job description (.txt/.pdf/.docx path or URL):


 "C:\Users\Rick\Desktop\5293\final_project_4\sample-job-description.pdf"


# 3. Model Selection and Training
   - Choose model architecture: GPT-2 from hugging face
   - Fine-tune the model using resume and job description data from Kaggle datasets
   - Implement prompt optimization strategies to refine output quality
   - Save the trained model to models/fine_tuned_llm.pth

## Step3: Ask User to enter their own hugging face token key

In [1]:
import os
import pandas as pd
from datasets import Dataset
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)


# 1) Prompt user for Hugging Face access token
hf_token = input("Enter your Hugging Face access token: ").strip()

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


Enter your Hugging Face access token:  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


In [2]:
# 1) Paths
DATA_PATH = "resume_data.csv"
OUTPUT_DIR = "flan-t5-base-resume"

# 2) Load and preprocess CSV
df = pd.read_csv(DATA_PATH, encoding="utf-8")
df.rename(columns=lambda x: x.lstrip("\ufeff"), inplace=True)

def assemble_resume(row):
    parts = []
    if pd.notna(row.get("career_objective")):
        parts.append(row["career_objective"])
    if pd.notna(row.get("skills")):
        parts.append("Skills: " + row["skills"])
    if pd.notna(row.get("responsibilities")):
        parts.append("Responsibilities: " + row["responsibilities"])
    return "\n".join(parts)

def assemble_jd(row):
    parts = []
    if pd.notna(row.get("job_position_name")):
        parts.append("Position: " + row["job_position_name"])
    if pd.notna(row.get("skills_required")):
        parts.append("Required Skills: " + row["skills_required"])
    if pd.notna(row.get("responsibilities.1")):
        parts.append("Responsibilities: " + row["responsibilities.1"])
    return "\n".join(parts)

df["resume_text"] = df.apply(assemble_resume, axis=1)
df["jd_text"]     = df.apply(assemble_jd, axis=1)
df = df[
    df["resume_text"].str.strip().astype(bool) &
    df["jd_text"].str.strip().astype(bool)
]

# 3) Build a Hugging Face Dataset
df["prompt"]     = df["jd_text"] + "\n\nPlease generate a resume based on the above JD:"
df["completion"] = df["resume_text"]
hf_ds = Dataset.from_pandas(df[["prompt", "completion"]])

In [3]:
small_ds = hf_ds.shuffle(seed=42).select(range(200))

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model     = AutoModelForSeq2SeqLM.from_pretrained(
    "google/flan-t5-base",
    device_map={"": "cpu"},
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True
)

def preprocess_fn(examples):
    inp = tokenizer(
        examples["prompt"],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    tgt = tokenizer(
        examples["completion"],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    inp["labels"] = tgt["input_ids"]
    return inp

tokenized_ds = small_ds.map(
    preprocess_fn,
    batched=True,
    remove_columns=["prompt", "completion"]
)

from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest"
)

from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="flan-t5-tinytest",
    per_device_train_batch_size=16,
    max_steps=50,
    logging_steps=10,
    save_steps=50,
    fp16=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    data_collator=data_collator,
)

trainer.train()

# Make sure this path matches the one you use in inference
OUTPUT_DIR = "flan-t5-base-resume"  


out = model.generate(**tokenizer(
    "Position: Data Analyst\nRequired Skills: Python, SQL\n\nPlease generate a resume:",
    return_tensors="pt"
))
print(tokenizer.decode(out[0], skip_special_tokens=True))

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: zz3237 (zz3237-columbia-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,3.2392
20,3.1852
30,2.6668
40,2.6486
50,2.5512


Skills: Data Analyst (Requirement: Python, SQL)


# 4. Generate modified Resume based on fine-tuned gpt model from above and generate a cover letter

In [14]:
# === Section 4: Inference on processed resumes + user’s JD ===

import os, json
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 1) Load the processed resumes JSON
proc_path = "data/processed/processed_resumes.json"
with open(proc_path, "r", encoding="utf-8") as f:
    processed = json.load(f)

if not processed:
    raise RuntimeError("No processed resumes found at " + proc_path)

# 2) Load your fine‐tuned Flan-T5 checkpoint
MODEL_DIR = "flan-t5-base-resume"
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model     = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR).to("cpu")
model.eval()

# 3) Prepare output folder
out_dir = "data/final_outputs"
os.makedirs(out_dir, exist_ok=True)

# 4) Loop through each processed resume
for rec in processed:
    rec_id       = rec["id"]
    clean_resume = rec["clean_text"]
    jd           = job_description   # from Section 2

    # ——— a) Rewrite & expand resume ———
    resume_prompt = (
        f"{jd}\n\n"
        "Here is my current resume:\n"
        f"{clean_resume}\n\n"
        "Please rewrite and expand my resume to better match the job description above:"
    )
    inputs = tokenizer(
        resume_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding="longest"
    )
    out_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=250,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    new_resume = tokenizer.decode(out_ids[0], skip_special_tokens=True)

    # ——— b) Generate cover letter ———
    cover_prompt = (
        f"{jd}\n\n"
        "Write a concise, enthusiastic cover letter highlighting my skills "
        "and experience as shown above, tailored to this job description:"
    )
    inputs2 = tokenizer(
        cover_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding="longest"
    )
    out2_ids = model.generate(
        inputs2.input_ids,
        attention_mask=inputs2.attention_mask,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    cover_letter = tokenizer.decode(out2_ids[0], skip_special_tokens=True)

    # ——— c) Save results ———
    with open(f"{out_dir}/{rec_id}_resume.txt",       "w", encoding="utf-8") as f:
        f.write(new_resume)
    with open(f"{out_dir}/{rec_id}_cover_letter.txt", "w", encoding="utf-8") as f:
        f.write(cover_letter)

    # remove this break to process *all* uploads; kept here to demo just the first
    break

print(f"Done. Outputs (one resume + cover letter) saved under {out_dir}/") 

Done. Outputs (one resume + cover letter) saved under data/final_outputs/


# Redo part 3 and 4 for changing training logic

### By precomputing embeddings for our JD–resume pairs and using FAISS to retrieve just three examples for a few-shot prompt to flan-t5-small, we eliminated all fine-tuning and achieve CPU-only, sub-second resume and cover letter generation.

In [3]:
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline

df = pd.read_csv("resume_data.csv", encoding="utf-8")
df.rename(columns=lambda x: x.lstrip("\ufeff"), inplace=True)

def assemble_resume(r):
    parts = []
    if pd.notna(r.get("career_objective")):
        parts.append(r["career_objective"])
    if pd.notna(r.get("skills")):
        parts.append("Skills: " + r["skills"])
    if pd.notna(r.get("responsibilities")):
        parts.append("Responsibilities: " + r["responsibilities"])
    return "\n".join(parts)

def assemble_jd(r):
    parts = []
    if pd.notna(r.get("job_position_name")):
        parts.append("Position: " + r["job_position_name"])
    if pd.notna(r.get("skills_required")):
        parts.append("Required Skills: " + r["skills_required"])
    if pd.notna(r.get("responsibilities.1")):
        parts.append("Responsibilities: " + r["responsibilities.1"])
    return "\n".join(parts)

df["resume_text"] = df.apply(assemble_resume, axis=1)
df["jd_text"]     = df.apply(assemble_jd, axis=1)

df = df[
    df["resume_text"].str.strip().astype(bool) &
    df["jd_text"].str.strip().astype(bool)
]

prompts     = df["jd_text"].tolist()
completions = df["resume_text"].tolist()

from transformers import AutoTokenizer, AutoModel

embed_model_name = "sentence-transformers/all-mpnet-base-v2"
embed_tok   = AutoTokenizer.from_pretrained(embed_model_name)
embed_model = AutoModel.from_pretrained(embed_model_name).eval()

def compute_embeddings(texts, batch_size=32):
    all_embs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i+batch_size]
        inputs = embed_tok(batch, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = embed_model(**inputs)
        # mean pooling
        embs = outputs.last_hidden_state.mean(dim=1)
        all_embs.append(embs.cpu().numpy())
    return np.vstack(all_embs)

prompts = df["jd_text"].tolist()
completions = df["resume_text"].tolist()

print(" Computing embeddings in batches…")
embs = compute_embeddings(prompts)

import faiss
dim   = embs.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embs)

print(f" Indexed {embs.shape[0]} examples. Ready for few-shot retrieval.")

 Computing embeddings in batches…
 Indexed 9544 examples. Ready for few-shot retrieval.


In [16]:
import os, json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

with open("data/processed/processed_resumes.json", "r", encoding="utf-8") as f:
    processed = json.load(f)

MODEL_DIR = "flan-t5-base-resume"
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model     = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR).to("cpu")
model.eval()

os.makedirs("data/final_outputs", exist_ok=True)

for rec in processed:
    rid  = rec["id"]
    res  = rec["clean_text"]
    jd   = job_description

    # a) resume
    prompt = (
      f"### JD:\n{jd}\n\n"
      "### Current Resume:\n" + res + "\n\n"
      "### Rewritten Resume:\n"
      "- Professional Summary (2–3 sentences)\n"
      "- Key Skills (bullet list)\n"
      "- Experience (bullets, quantifiable)\n"
      "- Education\n"
    )
    enc = tokenizer(prompt, return_tensors="pt", truncation=True,
                    max_length=512, padding="longest")
    out = model.generate(
      **enc, max_new_tokens=250,
      do_sample=True, temperature=0.7, top_p=0.9,
      pad_token_id=tokenizer.eos_token_id
    )
    new_resume = tokenizer.decode(out[0], skip_special_tokens=True)

    # b) Cover letter
    prompt2 = (
      f"### JD:\n{jd}\n\n"
      "### Resume:\n" + res + "\n\n"
      "### Cover Letter (3 paragraphs):\n"
      "1. Intro (position, company)\n"
      "2. Highlight 2–3 key achievements\n"
      "3. Closing (enthusiasm, next steps)\n"
    )
    enc2 = tokenizer(prompt2, return_tensors="pt", truncation=True,
                     max_length=512, padding="longest")
    out2 = model.generate(
      **enc2, max_new_tokens=300,
      do_sample=True, temperature=0.7, top_p=0.9,
      pad_token_id=tokenizer.eos_token_id
    )
    cover = tokenizer.decode(out2[0], skip_special_tokens=True)

    # c) Save
    with open(f"data/final_outputs/{rid}_resume.txt", "w", encoding="utf-8") as f:
        f.write(new_resume)
    with open(f"data/final_outputs/{rid}_cover_letter.txt", "w", encoding="utf-8") as f:
        f.write(cover)

    break

print("Done—check data/final_outputs/ for your new resumes & cover letters.")

Done—check data/final_outputs/ for your new resumes & cover letters.


In [17]:
for rec in processed:
    rid  = rec["id"]
    res  = rec["clean_text"]
    jd   = job_description

    resume_prompt = (
        "### Instruction:\n"
        "Rewrite and expand the candidate’s resume to match the job description below.\n"
        "Output **four** clearly-labeled sections. Do **not** include anything else.\n\n"

        "### Job Description:\n"
        f"{jd}\n\n"

        "### Current Resume:\n"
        f"{res}\n\n"

        "### Updated Resume:\n"
        "1) Professional Summary (2–3 sentences)\n"
        "2) Key Skills (bullet list, max 6 items)\n"
        "3) Experience (bullet list, focus on quantifiable achievements, max 5 items)\n"
        "4) Education (degree, institution, year)\n\n"
        "Begin your answer **immediately after** \"### Updated Resume:\""
    )

    inputs = tokenizer(
        resume_prompt,
        return_tensors="pt",
        truncation=True, max_length=512, padding="longest"
    ).to("cpu")
    out_ids = model.generate(
        **inputs,
        max_new_tokens=300,
        do_sample=True, temperature=0.7, top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    new_resume = tokenizer.decode(out_ids[0], skip_special_tokens=True)


    cover_prompt = (
        "### Instruction:\n"
        "Write a **three-paragraph** cover letter based on the job description & resume below.\n"
        "Do **not** include any preamble, explanations, or repeat the JD. Output **only** the letter.\n\n"

        "### Job Description:\n"
        f"{jd}\n\n"

        "### Resume:\n"
        f"{res}\n\n"

        "### Cover Letter:\n"
        "Paragraph 1: Introduce application, mention position & company.\n"
        "Paragraph 2: Highlight 2–3 relevant achievements from the resume.\n"
        "Paragraph 3: Express enthusiasm and next steps.\n\n"
        "Begin your answer **immediately after** \"### Cover Letter:\""
    )

    inputs2 = tokenizer(
        cover_prompt,
        return_tensors="pt",
        truncation=True, max_length=512, padding="longest"
    ).to("cpu")
    out2_ids = model.generate(
        **inputs2,
        max_new_tokens=350,
        do_sample=True, temperature=0.7, top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    cover_letter = tokenizer.decode(out2_ids[0], skip_special_tokens=True)


    print("=== UPDATED RESUME ===\n", new_resume)
    print("\n=== COVER LETTER ===\n", cover_letter)
    break

=== UPDATED RESUME ===
 i want to be a risk manager.

=== COVER LETTER ===
 monopoly)
