In [1]:
%%capture
import json
import os
import re
from typing import List, Optional

import instructor
import marvin
import weave
from openai import OpenAI
from pydantic import BaseModel, ValidationError
from set_env import set_env
from weave import Dataset
set_env("OPENAI_API_KEY")

In [2]:
# Initialise Weave for the 'compare-output-structuring-models' project; the call's
# return value is left as the cell's last expression so the client repr is displayed.
weave.init('compare-output-structuring-models')

Logged in as Weights & Biases user: bassimfaizal.
View Weave data at https://wandb.ai/bassimfaizal/compare-output-structuring-models/weave


<weave.weave_client.WeaveClient at 0x107c147f0>

In [3]:
# Target schema that every extractor in this notebook tries to produce.
# Only `name` is required; every other field defaults to None so a resume that
# lacks that information still validates.
class ResumeInfo(BaseModel):
    name: str
    # Contact details (each optional).
    email: Optional[str] = None
    phone: Optional[str] = None
    location: Optional[str] = None
    linkedin: Optional[str] = None
    # Multi-valued sections: each is a list of free-text strings, or None when absent.
    education: Optional[List[str]] = None
    skills: Optional[List[str]] = None
    certifications: Optional[List[str]] = None
    projects: Optional[List[str]] = None
        

In [4]:
# Chat model used by VanillaOpenAI and InstructorModel, and written into marvin's
# settings by MarvinModel. (The info_accuracy scorer hard-codes its own model string.)
MODEL_NAME = "gpt-4o"
# System message sent by VanillaOpenAI.predict and InstructorModel.predict.
SYSTEM_PROMPT = "You are an assistant that extracts resume information in a structured format"

# Clients
# Plain OpenAI client: used by VanillaOpenAI.predict and by the info_accuracy scorer.
openai_client = OpenAI()
# A second, separate OpenAI client wrapped by instructor: used by InstructorModel.predict.
instructor_client = instructor.from_openai(OpenAI())

class VanillaOpenAI(weave.Model):
    """Resume extractor that calls the chat-completions API directly in JSON mode.

    The expected JSON shape is described in the prompt text; the reply is then
    validated against ``ResumeInfo``.

    Attributes:
        prompt_type: ``"zero_shot"`` (default) sends only the instructions;
            ``"few_shot"`` additionally appends one worked example.
    """
    prompt_type: str = "zero_shot"

    @property
    def json_structure(self):
        """Return the textual JSON template that is embedded in the prompt."""
        return """
        {
            "name": "string",
            "email": "string",
            "phone": "string",
            "location": "string",
            "linkedin": "string or null",
            "education": ["string"],
            "skills": ["string"],
            "certifications": ["string or null"],
            "projects": ["string or null"]
        }
        """

    def get_prompt(self, prompt: str) -> str:
        """Build the user message for ``prompt``.

        Args:
            prompt: The instruction + resume text to extract from.

        Returns:
            The extraction instructions followed by ``prompt``. When
            ``prompt_type`` is ``"few_shot"``, an example resume and its
            expected JSON are appended after that text.
        """
        base_prompt = f"""
        Extract the relevant information and format it as a JSON object.
        The JSON should be structured exactly as follows:

        {self.json_structure}

        Only include fields if information is available. If a field is not applicable or the information is not present, omit the field entirely.
        Ensure the output is a valid JSON object.

        {prompt}
        """

        if self.prompt_type == "few_shot":
            example_resume = "JULIA CHEN Software Engineer julia.chen@email.com (987) 654-3210 San Francisco, CA ..."
            example_output = """
            {
                "name": "Julia Chen",
                "email": "julia.chen@email.com",
                "phone": "(987) 654-3210",
                "location": "San Francisco, CA",
                "education": ["B.S. Computer Science, Stanford University, September 2008 - June 2012, Palo Alto, CA"],
                "skills": ["Go", "Kubernetes", "Python", "Flask", "Jenkins", "Prometheus", "Grafana", "Ruby on Rails"]
            }
            """
            # NOTE(review): the example ends up *after* the real resume in the
            # final message; kept as-is so existing results stay comparable.
            base_prompt = f"""
            {base_prompt}

            Here's an example:

            Input resume:
            {example_resume}

            Output JSON:
            {example_output}
            """

        return base_prompt

    @weave.op()
    def predict(self, prompt: str):
        """Extract resume information from ``prompt``.

        Args:
            prompt: The instruction + resume text to extract from.

        Returns:
            A ``ResumeInfo`` when the model's JSON validates against the schema;
            otherwise the raw decoded JSON value, so downstream scorers can
            record the row as invalid instead of the whole row erroring out.

        Raises:
            json.JSONDecodeError: If the model's reply is not valid JSON.
        """
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": self.get_prompt(prompt)}
        ]

        completion = openai_client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            response_format={"type": "json_object"},
            temperature=0.0
        )
        json_str = completion.choices[0].message.content

        # Decode once; a decoding failure propagates exactly as it did before.
        parsed = json.loads(json_str)
        try:
            return ResumeInfo(**parsed)
        except (ValidationError, TypeError):
            # Only schema mismatches fall back to the raw payload. A bare
            # `except:` here would also swallow KeyboardInterrupt/SystemExit.
            return parsed


class InstructorModel(weave.Model):
    """Resume extractor that lets instructor enforce the ``ResumeInfo`` schema."""

    @weave.op()
    def predict(self, prompt: str) -> ResumeInfo:
        """Extract resume information from ``prompt``.

        Args:
            prompt: The instruction + resume text to extract from.

        Returns:
            The object returned by the instructor-wrapped client for
            ``response_model=ResumeInfo``.
        """
        resume_info = instructor_client.chat.completions.create(
            model=MODEL_NAME,
            response_model=ResumeInfo,
            messages=[
                # System message first, matching VanillaOpenAI and the
                # conventional chat ordering (it previously followed the user turn).
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt}
            ],
        )
        return resume_info

class MarvinModel(weave.Model):
    """Resume extractor built on ``marvin.extract`` with ``ResumeInfo`` as target."""

    def __init__(self, name=None):
        super().__init__(name=name)
        # Module-wide marvin setting: point its chat completions at MODEL_NAME.
        marvin.settings.openai.chat.completions.model = MODEL_NAME

    @weave.op()
    def predict(self, prompt: str) -> ResumeInfo:
        """Run the extraction on ``prompt`` and return the first extracted item."""
        extracted = marvin.extract(prompt, target=ResumeInfo)
        return extracted[0]

# One instance per extraction strategy; `name` labels each model instance.
zero_shot_model = VanillaOpenAI(name="zero_shot_GPT-4o")
few_shot_model = VanillaOpenAI(name="few_shot_GPT-4o", prompt_type="few_shot")
instructor_model = InstructorModel(name="instructor_GPT-4o")
marvin_model = MarvinModel(name="marvin_GPT-4o")

print("✅ Weave models created")

✅ Weave models created


In [5]:
# Load every .txt resume from `folder_path` into `resumes`.
resumes = []
folder_path = "resumes-txt"

# os.listdir returns entries in arbitrary order; sorting keeps the dataset row
# order stable from one run (and one machine) to the next.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        # Explicit encoding so decoding does not depend on the platform default.
        with open(file_path, "r", encoding="utf-8") as f:
            resumes.append(f.read())

print(f"Read {len(resumes)} resume files.")

Read 30 resume files.


In [6]:
# Models to evaluate, in the order they will be run.
models = [
    zero_shot_model,
    few_shot_model,
    instructor_model,
    marvin_model,
]

# Wrap the raw resume texts as dataset rows, one {"resume": text} row per file.
resume_dataset = Dataset(
    rows=[dict(resume=text) for text in resumes],
    name="resume_dataset",
)

# Define scoring functions
@weave.op()
def is_valid_resume_info(model_output: dict) -> dict:
    """Score whether the model output is a ``ResumeInfo`` instance.

    Returns a dict with a single boolean under ``'is_valid'``.
    """
    parsed_ok = isinstance(model_output, ResumeInfo)
    return {'is_valid': parsed_ok}

# Removed explanations for scores to save on cost. 
@weave.op()
async def info_accuracy(resume: str, model_output: ResumeInfo):
    """Ask an LLM judge to rate the extraction on a 1-5 scale.

    Args:
        resume: The original resume text.
        model_output: The extractor's output. Usually a ``ResumeInfo``, but
            ``VanillaOpenAI.predict`` can return a plain decoded-JSON value
            when validation fails, so that case is handled too.

    Returns:
        ``{"accuracy_score": <int>}`` taken from the judge's JSON reply.

    Raises:
        KeyError: If the judge's reply has no ``"score"`` key.
    """
    accuracy_prompt = """
    Given a resume and the extracted information, evaluate the accuracy of the extracted information. 
    Provide a score from 1 to 5, where 5 means perfect extraction and 1 means completely incorrect or missing information.
    No points should be deducted if a field has a null value and the resume doesn't have that information.
    Points should be deducted if a field value is missing when present in the resume or the value is incorrect.
    
    Resume:
    {resume}
    
    Extracted Information:
    {extracted_info}
    
    Provide your evaluation in the following JSON format:
    {{
        "score": <int>
    }}
    """

    # A raw dict has no .json(); previously that raised AttributeError and the
    # scorer crashed on exactly the rows that had failed schema validation.
    if isinstance(model_output, ResumeInfo):
        extracted_info = model_output.json()
    else:
        extracted_info = json.dumps(model_output, default=str)

    prompt = accuracy_prompt.format(
        resume=resume,
        extracted_info=extracted_info
    )

    # NOTE(review): openai_client is the synchronous client, so this call
    # blocks inside an async scorer — confirm whether that is intended.
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )

    response_content = json.loads(response.choices[0].message.content)

    return {
        "accuracy_score": int(response_content["score"])
    }


# Define the preprocess_model_input function
def preprocess_model_input(row):
    """Turn a dataset row into the keyword arguments passed to ``predict``.

    The row's ``'resume'`` text is appended directly to a fixed instruction and
    returned under the ``'prompt'`` key.
    """
    instruction = 'Extract the data in a structured form from this resume:'
    return dict(prompt=instruction + row['resume'])

# Define the evaluation
# Same dataset, scorers and input preprocessing for every model; one trial per row.
evaluation = weave.Evaluation(
    name="resume_info_extraction_eval",
    dataset=resume_dataset,
    scorers=[is_valid_resume_info, info_accuracy],
    preprocess_model_input=preprocess_model_input,
    trials=1,
)

# Run evaluation for each model
async def run_evaluation():
    """Evaluate every entry of ``models`` in turn and collect the results.

    Returns:
        A dict mapping each model's ``name`` (or its class name when no name is
        set) to the value returned by ``evaluation.evaluate`` for that model.
    """
    results = {}
    for model in models:
        # Key by the instance name, not the class name: the zero-shot and
        # few-shot models are both VanillaOpenAI, so a class-name key let the
        # second result silently overwrite the first.
        key = getattr(model, "name", None) or model.__class__.__name__
        results[key] = await evaluation.evaluate(model)
    return results

# Run the evaluation for all models. The bare top-level `await` relies on the
# notebook kernel's support for awaiting in a cell; it is not valid in a plain script.
results = await run_evaluation()