# Resume Filter Project GPT2.0 Version

### Author: 
Pengyu Tao pt2649, Zhou Zhou zz3237


In [1]:
import platform
print(platform. python_version())

3.13.2


In [2]:
!pip install openai==0.28

Collecting openai==0.28
  Using cached openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Using cached openai-0.28.0-py3-none-any.whl (76 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.78.1
    Uninstalling openai-1.78.1:
      Successfully uninstalled openai-1.78.1
Successfully installed openai-0.28.0


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-openai 0.3.13 requires openai<2.0.0,>=1.68.2, but you have openai 0.28.0 which is incompatible.


### Some important notes here: the headlines begin with "steps" are required for user interactions

# 1. Environment Setup and Imports

So for this project, we need you to go to https://www.kaggle.com/datasets/saugataroyarghya/resume-dataset to download the dataset in this same file directory for ai training purpose

### All imports required for this whole notebook, if you don't have it then use pip install

In [3]:
import os
import json
import re
import glob
import pandas as pd
import sys
import openai
import time
import faiss
import numpy as np
import asyncio
import aiohttp
import nest_asyncio
import torch
from transformers import Trainer, TrainingArguments, GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Optional dependencies for PDF/DOCX parsing
try:
    import PyPDF2
except ImportError:
    PyPDF2 = None

try:
    import docx
except ImportError:
    docx = None

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


# 2. Data Collection and Preprocessing
   - Cleaning the resume data that the user uploads
   - Data cleaning: Handling missing values, text preprocessing, and data labeling
   - Save processed data to data/processed/

In [4]:
class DataPreprocessor:
    """
    DataPreprocessor handles loading raw resumes in multiple formats,
    cleaning text, extracting structured sections, and saving processed output.
    """

    def __init__(self, input_dir: str, output_dir: str):
        """
        Args:
            input_dir: Directory path to scan by default or empty to prompt for paths
            output_dir: Directory where processed JSON will be written.
        """
        self.input_dir = input_dir
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def _collect_paths(self, default_dir: str) -> list:
        """
        Collect resume file paths from default_dir or via user input.
        Accepts CSV, PDF, DOCX, TXT. If input is not a valid path, performs a fuzzy search
        in the current working directory for matching filenames.
        """
        paths = []
        # 1) Scan default directory if it exists
        if os.path.isdir(default_dir):
            for ext in ("*.csv", "*.pdf", "*.docx", "*.txt"):
                paths.extend(glob.glob(os.path.join(default_dir, ext)))
        # 2) If still no files, prompt user with instructions
        if not paths:
            print(f"No resume files found in '{default_dir}'.")
            print("Please enter either:")
            print("  • An absolute or relative file path (e.g. C:\\Users\\Rick\\Desktop\\resume.pdf or ~/resumes/resume.pdf)")
            print("  • A filename to fuzzy-search your workspace (e.g. 'resume.pdf')")
            user_input = input("Enter directory path, file path, or filename: ")
            entries = [e.strip().strip('"\'') for e in user_input.split(',') if e.strip()]
            for entry in entries:
                entry_path = os.path.abspath(os.path.expanduser(entry))
                if os.path.isdir(entry_path):
                    for ext in ("*.csv", "*.pdf", "*.docx", "*.txt"):
                        paths.extend(glob.glob(os.path.join(entry_path, ext)))
                elif os.path.isfile(entry_path):
                    paths.append(entry_path)
                else:
                    fuzzy = glob.glob(f"**/{entry}", recursive=True)
                    if fuzzy:
                        print(f"Fuzzy match found for '{entry}': {fuzzy}")
                        paths.extend([os.path.abspath(p) for p in fuzzy])
                    else:
                        print(f"Warning: '{entry}' not found or matched. Skipping.")
        return paths

    def load_resumes(self) -> list:
        """
        Load resumes from collected paths.
        Returns a list of dicts: {'id': str, 'raw_text': str}.
        """
        paths = self._collect_paths(self.input_dir)
        if not paths:
            raise RuntimeError("No valid resume file paths provided. Aborting.")

        records = []
        for path in paths:
            ext = os.path.splitext(path)[1].lower()
            basename = os.path.basename(path)
            try:
                if ext == ".csv":
                    df = pd.read_csv(path)
                    if "resume_text" not in df.columns:
                        raise ValueError(f"CSV '{basename}' is missing the 'resume_text' column.")
                    for idx, row in df.iterrows():
                        records.append({
                            'id': row.get('id', f"{basename}_{idx}"),
                            'raw_text': str(row['resume_text'])
                        })
                elif ext == ".pdf":
                    if PyPDF2 is None:
                        print(f"Warning: PyPDF2 not installed; skipping PDF file '{basename}'.")
                        continue
                    text_pages = []
                    with open(path, "rb") as f:
                        reader = PyPDF2.PdfReader(f)
                        for page in reader.pages:
                            text_pages.append(page.extract_text() or '')
                    records.append({'id': basename, 'raw_text': '\n'.join(text_pages)})
                elif ext == ".docx":
                    if docx is None:
                        print(f"Warning: python-docx not installed; skipping DOCX file '{basename}'.")
                        continue
                    document = docx.Document(path)
                    paragraphs = [p.text for p in document.paragraphs]
                    records.append({'id': basename, 'raw_text': '\n'.join(paragraphs)})
                elif ext == ".txt":
                    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                        records.append({'id': basename, 'raw_text': f.read()})
                else:
                    print(f"Skipping unsupported file type: {basename}")
            except Exception as e:
                print(f"Error processing '{basename}': {e}")
        return records

    def clean_text(self, text: str) -> str:
        """
        Normalize text:
          - lowercase
          - remove non-ASCII characters
          - collapse whitespace
        """
        text = text.lower()
        text = re.sub(r"[^\x00-\x7f]", "", text)
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    def extract_sections(self, text: str) -> dict:
        """
        Placeholder for splitting text into sections like 'education', 'experience'.
        TODO: implement via regex or NLP-based heading detection.
        """
        return {}

    def process(self):
        """
        Full pipeline:
          1) Load resumes
          2) Clean text
          3) Extract sections
          4) Save processed data as JSON
        """
        records = self.load_resumes()
        processed = []
        for rec in records:
            clean = self.clean_text(rec['raw_text'])
            sections = self.extract_sections(clean)
            entry = {'id': rec['id'], 'clean_text': clean, **sections}
            processed.append(entry)

        out_path = os.path.join(self.output_dir, 'processed_resumes.json')
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(processed, f, ensure_ascii=False, indent=2)
        print(f"Processed {len(processed)} resumes → {out_path}")

## Step1: Ask User to upload their Resume using required file path

In [6]:
if __name__ == '__main__':
    input_dir = 'data/raw'
    output_dir = 'data/processed'
    dp = DataPreprocessor(input_dir, output_dir)
    dp.process()

No resume files found in 'data/raw'.
Please enter either:
  • An absolute or relative file path (e.g. C:\Users\Rick\Desktop\resume.pdf or ~/resumes/resume.pdf)
  • A filename to fuzzy-search your workspace (e.g. 'resume.pdf')


Enter directory path, file path, or filename:  "C:\Users\Rick\Desktop\5293\final project notebook running check\Jack_TotallyUnqualified_Resume.pdf"


Processed 1 resumes → data/processed\processed_resumes.json


## Step2: Ask User to enter their own Api Key

In [7]:
print("Note: This pipeline currently uses OpenAI. Please enter your OpenAI API key.")
api_key = input("API key: ")

Note: This pipeline currently uses OpenAI. Please enter your OpenAI API key.


API key:  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


In [8]:
# Input job description
import os
import sys
import glob

def load_job_description(source: str) -> str:
    """
    Load job description text from a local file (.txt|.pdf|.docx) or URL.
    Performs fuzzy substring search if the direct path fails.
    """
    source = source.strip().strip('"\'')

    if source.startswith(('http://', 'https://')):
        import requests
        r = requests.get(source)
        r.raise_for_status()
        return r.text

    path = os.path.abspath(os.path.expanduser(source))
    if os.path.isfile(path):
        ext = os.path.splitext(path)[1].lower()
        if ext == '.txt':
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        elif ext == '.pdf':
            from PyPDF2 import PdfReader
            reader = PdfReader(path)
            return "\n".join([page.extract_text() or '' for page in reader.pages])
        elif ext == '.docx':
            import docx
            doc = docx.Document(path)
            return "\n".join([p.text for p in doc.paragraphs])
        else:
            raise ValueError(f"Unsupported extension: {ext}")
    # Fuzzy search
    name = os.path.basename(source)
    print(f"'{source}' not found; fuzzy searching for '*{name}*'...")
    matches = glob.glob(f"**/*{name}*", recursive=True)
    if matches:
        print(f"Found: {matches[0]}")
        return load_job_description(matches[0])
    raise FileNotFoundError(f"Source '{source}' not found.")

## Step3: Ask User to put their own job description with required format needed

In [9]:
print("\nEnter job description (.txt/.pdf/.docx path):")
src = input()
job_description = load_job_description(src)


Enter job description (.txt/.pdf/.docx path or URL):


 "C:\Users\Rick\Desktop\5293\final project notebook running check\sample-job-description.pdf"


# 3. Model Selection and Training
   - Choose model architecture: GPT-2.0 via OpenAI API
   - Fine-tune the model using resume and job description data from Kaggle datasets
   - Implement prompt optimization strategies to refine output quality
   - Save the trained model to models/fine_tuned_llm.pth

In [10]:
import os
import pandas as pd
import json
import openai
import time

In [11]:
openai.api_key = api_key
try:
    response = openai.Embedding.create(
        input=["Test embedding"],
        model="text-embedding-ada-002"
    )
    print("API Key is working and embedding generated successfully.")
except Exception as e:
    print(f"API Key Error: {e}")

API Key is working and embedding generated successfully.


In [12]:
import os
import time
import json
import pandas as pd
import openai
import faiss
import numpy as np
import asyncio
import aiohttp
import nest_asyncio

openai.api_key = api_key

# Apply nest_asyncio to prevent RuntimeError in Jupyter
nest_asyncio.apply()

# Define Data Paths
data_path = 'resume_data.csv'
faiss_path = 'resume_index.faiss'
metadata_path = 'resume_metadata.csv'

# Data Preprocessing
df = pd.read_csv(data_path, encoding='utf-8')
df.rename(columns=lambda x: x.lstrip("\ufeff"), inplace=True)

def assemble_resume(row):
    parts = []
    if pd.notna(row.get("career_objective")):
        parts.append(row["career_objective"])
    if pd.notna(row.get("skills")):
        parts.append("Skills: " + row["skills"])
    if pd.notna(row.get("responsibilities")):
        parts.append("Responsibilities: " + row["responsibilities"])
    return "\n".join(parts)

def assemble_jd(row):
    parts = []
    if pd.notna(row.get("job_position_name")):
        parts.append("Position: " + row["job_position_name"])
    if pd.notna(row.get("skills_required")):
        parts.append("Required Skills: " + row["skills_required"])
    if pd.notna(row.get("responsibilities.1")):
        parts.append("Responsibilities: " + row["responsibilities.1"])
    return "\n".join(parts)

df['resume_text'] = df.apply(assemble_resume, axis=1)
df['job_description'] = df.apply(assemble_jd, axis=1)
df['resume_id'] = range(len(df))

# Async Embedding Generation
async def async_get_embedding(session, text):
    """ Asynchronous embedding generation using OpenAI API. """
    try:
        async with session.post(
            "https://api.openai.com/v1/embeddings",
            headers={"Authorization": f"Bearer {api_key}"},
            json={"input": text, "model": "text-embedding-ada-002"}
        ) as response:
            if response.status != 200:
                print(f"Error {response.status}: {await response.text()}")
                return np.zeros(1536)
            data = await response.json()
            return data['data'][0]['embedding']
    except Exception as e:
        print(f"Embedding Error: {e}")
        return np.zeros(1536)

async def process_embeddings(data, batch_size=20):
    embeddings = []
    start_time = time.time()

    # Async HTTP session
    async with aiohttp.ClientSession() as session:
        for start in range(0, len(data), batch_size):
            batch = data[start:start + batch_size]
            tasks = [async_get_embedding(session, text) for text in batch]
            batch_embeddings = await asyncio.gather(*tasks)
            embeddings.extend(batch_embeddings)

            # Progress logging
            elapsed = time.time() - start_time
            processed = start + len(batch)
            remaining = len(data) - processed
            est_remaining_time = (elapsed / processed) * remaining if processed else 0
            print(f"Processed {processed}/{len(data)} | Elapsed: {elapsed:.2f}s | Remaining: {est_remaining_time:.2f}s")
    
    return np.array(embeddings)

# Execute Async Embedding Generation
print("Starting async embedding generation...")
start_time = time.time()
embeddings = asyncio.run(process_embeddings(df['resume_text'].tolist(), batch_size=20))
print(f"Embedding generation completed in {time.time() - start_time:.2f} seconds.")

# Check for Zero Vectors
num_zero_vectors = np.sum(np.all(embeddings == 0, axis=1))
print(f"Number of zero vectors: {num_zero_vectors}")

# Build and Save FAISS Index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
faiss.write_index(index, faiss_path)
df.to_csv(metadata_path, index=False, encoding='utf-8')

print(f" FAISS index saved as: {faiss_path}")
print(f" Metadata saved as: {metadata_path}")

Starting async embedding generation...
Processed 20/9544 | Elapsed: 4.10s | Remaining: 1953.44s
Processed 40/9544 | Elapsed: 6.35s | Remaining: 1508.05s
Processed 60/9544 | Elapsed: 6.81s | Remaining: 1076.32s
Processed 80/9544 | Elapsed: 7.22s | Remaining: 854.31s
Processed 100/9544 | Elapsed: 8.07s | Remaining: 762.54s
Processed 120/9544 | Elapsed: 8.88s | Remaining: 697.74s
Processed 140/9544 | Elapsed: 9.31s | Remaining: 625.40s
Processed 160/9544 | Elapsed: 11.24s | Remaining: 659.44s
Processed 180/9544 | Elapsed: 11.80s | Remaining: 613.68s
Processed 200/9544 | Elapsed: 12.35s | Remaining: 576.89s
Processed 220/9544 | Elapsed: 12.92s | Remaining: 547.47s
Processed 240/9544 | Elapsed: 13.36s | Remaining: 518.11s
Processed 260/9544 | Elapsed: 13.81s | Remaining: 493.07s
Processed 280/9544 | Elapsed: 14.31s | Remaining: 473.33s
Processed 300/9544 | Elapsed: 14.73s | Remaining: 453.82s
Processed 320/9544 | Elapsed: 17.16s | Remaining: 494.65s
Processed 340/9544 | Elapsed: 17.64s | Re

In [13]:
import os
import pandas as pd
import json

# Define paths
data_path = 'resume_data.csv'
output_path = 'fine_tune_data.jsonl'

# Load data
df = pd.read_csv(data_path, encoding='utf-8')
df.rename(columns=lambda x: x.lstrip("\ufeff"), inplace=True)

# Function to assemble resume text
def assemble_resume(row):
    parts = []
    if pd.notna(row.get("career_objective")):
        parts.append(row["career_objective"])
    if pd.notna(row.get("skills")):
        parts.append("Skills: " + row["skills"])
    if pd.notna(row.get("responsibilities")):
        parts.append("Responsibilities: " + row["responsibilities"])
    return "\n".join(parts)

# Function to assemble job description text
def assemble_jd(row):
    parts = []
    if pd.notna(row.get("job_position_name")):
        parts.append("Position: " + row["job_position_name"])
    if pd.notna(row.get("skills_required")):
        parts.append("Required Skills: " + row["skills_required"])
    if pd.notna(row.get("responsibilities.1")):
        parts.append("Responsibilities: " + row["responsibilities.1"])
    return "\n".join(parts)

# Convert to JSONL format
with open(output_path, 'w', encoding='utf-8') as outfile:
    for _, row in df.iterrows():
        resume_text = assemble_resume(row)
        jd_text = assemble_jd(row)
        if resume_text.strip() and jd_text.strip():
            data = {
                "prompt": jd_text.strip() + "\n\nGenerate Resume：",
                "completion": " " + resume_text.strip()
            }
            json.dump(data, outfile, ensure_ascii=False)
            outfile.write("\n")

print(f"Data successfully saved to {output_path}")

Data successfully saved to fine_tune_data.jsonl


In [14]:
import os
import pandas as pd
import json
from transformers import Trainer, TrainingArguments, GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
import torch

# Define paths
data_path = 'resume_data.csv'
output_path = 'fine_tune_data.jsonl'

# Load data
df = pd.read_csv(data_path, encoding='utf-8')
df.rename(columns=lambda x: x.lstrip("\ufeff"), inplace=True)

# Function to assemble resume text
def assemble_resume(row):
    parts = []
    if pd.notna(row.get("career_objective")):
        parts.append(row["career_objective"])
    if pd.notna(row.get("skills")):
        parts.append("Skills: " + row["skills"])
    if pd.notna(row.get("responsibilities")):
        parts.append("Responsibilities: " + row["responsibilities"])
    return "\n".join(parts)

# Function to assemble job description text
def assemble_jd(row):
    parts = []
    if pd.notna(row.get("job_position_name")):
        parts.append("Position: " + row["job_position_name"])
    if pd.notna(row.get("skills_required")):
        parts.append("Required Skills: " + row["skills_required"])
    if pd.notna(row.get("responsibilities.1")):
        parts.append("Responsibilities: " + row["responsibilities.1"])
    return "\n".join(parts)

# Convert to JSONL format
with open(output_path, 'w', encoding='utf-8') as outfile:
    for _, row in df.iterrows():
        resume_text = assemble_resume(row)
        jd_text = assemble_jd(row)
        if resume_text.strip() and jd_text.strip():
            data = {
                "prompt": jd_text.strip() + "\n\n Generate a resume:",
                "completion": " " + resume_text.strip()
            }
            json.dump(data, outfile, ensure_ascii=False)
            outfile.write("\n")

print(f"Data successfully saved to {output_path}")

# Fine-Tuning GPT-2 using Hugging Face
MODEL_NAME = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Load dataset
dataset = load_dataset('json', data_files=output_path, split='train')

# Tokenize dataset
def tokenize_function(examples):
    full_texts = [p + " " + c for p, c in zip(examples['prompt'], examples['completion'])]
    tokenized_output = tokenizer(full_texts, padding="max_length", truncation=True, max_length=256)
    return tokenized_output

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8, 
    num_train_epochs=1,  
    logging_dir="./logs",
    logging_steps=100, 
    save_steps=100,  
    evaluation_strategy="no",
    warmup_steps=5,  
    learning_rate=5e-5, 
    fp16=True,  
    dataloader_num_workers=4 
)

# Load model
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tokenizer))

# Custom Trainer Class
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        if "labels" not in inputs:
            inputs["labels"] = inputs["input_ids"].clone()

        # Forward pass
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            labels=inputs["labels"]
        )

        # Loss
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

# Instantiate Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# Train model
trainer.train()

print("Fine-tuning completed!")

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
print("Model and tokenizer saved to ./fine_tuned_model")

Data successfully saved to fine_tune_data.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/9544 [00:00<?, ? examples/s]

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: zz3237 (zz3237-columbia-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,1.8218
200,1.0962
300,0.8328
400,0.7457
500,0.6798
600,0.6524
700,0.6217
800,0.5576
900,0.5335
1000,0.5112


Fine-tuning completed!
Model and tokenizer saved to ./fine_tuned_model


# 4. Generate modified Resume based on fine-tuned gpt model from above and generate a cover letter

In [15]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

class LocalResumeGenerator:
    def __init__(self, model_dir: str, device: str = 'cpu'):
        """
        Initialize the generator with a fine-tuned GPT-2 model.

        Args:
            model_dir: Path to the directory containing the fine-tuned model.
            device: 'cpu' or 'cuda'.
        """
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
        self.model     = GPT2LMHeadModel.from_pretrained(model_dir).to(device)
        self.device    = device

        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

    def _generate(self, prompt: str, max_new_tokens: int = 200) -> str:
        """
        Internal generation method using max_new_tokens while ensuring
        total length never exceeds model's context window.
        """
        max_window = self.tokenizer.model_max_length
        max_prompt_len = max_window - max_new_tokens

        inputs = self.tokenizer(
            prompt,
            return_tensors='pt',
            truncation=True,
            max_length=max_prompt_len
        ).to(self.device)

        output_ids = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=self.tokenizer.eos_token_id
        )
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)

    def improve_resume(self, clean_text: str, jd: str) -> str:
        """
        Rewrite the candidate's resume to align with the job description.
        """
        system_prompt = (
            "You are a professional career coach specializing in risk management roles. "
            "Rewrite the candidate’s resume to match the job description. "
            "Output format:\n"
            "1. Professional Summary (1–2 sentences)\n"
            "2. Key Skills (bullet list, max 6 items)\n"
            "3. Experience (bullet list, focus on quantifiable achievements, max 5 items)\n"
            "4. Education (degree, institution, year)\n"
            "Use a concise, professional tone."
        )
        user_prompt = (
            f"{system_prompt}\n\n"
            f"Original Resume:\n{clean_text}\n\n"
            f"Job Description:\n{jd}\n\n"
            "Rewritten Resume:"
        )
        return self._generate(user_prompt)

    def make_cover_letter(self, clean_text: str, jd: str) -> str:
        """
        Generate a targeted cover letter based on the resume and job description.
        """
        system_prompt = (
            "You are an expert cover letter writer for entry-level risk management positions. "
            "Write a three-paragraph cover letter as follows:\n"
            "1. Opening paragraph: introduce the application motive, mention the job title and company name;\n"
            "2. Middle paragraph: highlight 2–3 most relevant experiences from the resume and explain how they prepare the candidate for this role;\n"
            "3. Closing paragraph: express enthusiasm for the opportunity and indicate next steps politely.\n"
            "Keep the total length between 250 and 300 words, in a professional and enthusiastic tone."
        )
        user_prompt = (
            f"{system_prompt}\n\n"
            f"Resume:\n{clean_text}\n\n"
            f"Job Description:\n{jd}\n\n"
            "Cover Letter:"
        )
        return self._generate(user_prompt)

In [16]:
gen = LocalResumeGenerator("./fine_tuned_model")
with open('data/processed/processed_resumes.json','r',encoding='utf-8') as f:
    recs = json.load(f)

for rec in recs:
    imp = gen.improve_resume(rec['clean_text'], job_description)
    cov = gen.make_cover_letter(rec['clean_text'], job_description)

    out_dir = 'data/final_outputs_gpt2'
    os.makedirs(out_dir, exist_ok=True)
    with open(f"{out_dir}/{rec['id']}_resume.txt", 'w', encoding='utf-8') as f:
        f.write(imp)
    with open(f"{out_dir}/{rec['id']}_cover_letter.txt", 'w', encoding='utf-8') as f:
        f.write(cov)

print("Done. Generated resume and cover letter are saved in the same file path under folder: data/final_outputs")

Done. Generated resume and cover letter are saved in the same file path under folder: data/final_outputs
