In [1]:
%%capture
import weave
from weave import Dataset
from typing import List, Optional
import instructor
from openai import OpenAI
import marvin
import json
import re
import os
from pydantic import BaseModel
from set_env import set_env
set_env("OPENAI_API_KEY")

In [2]:
weave.init('compare-output-structuring-models')

Logged in as Weights & Biases user: bassimfaizal.
View Weave data at https://wandb.ai/le-khan-academy/compare-output-structuring-models/weave


<weave.weave_client.WeaveClient at 0x103b0b460>

In [3]:
class ResumeInfo(BaseModel):
    name: str
    email: Optional[str] = None
    phone: Optional[str] = None
    location: Optional[str] = None
    linkedin: Optional[str] = None
    education: Optional[List[str]] = None
    skills: Optional[List[str]] = None
    certifications: Optional[List[str]] = None
    projects: Optional[List[str]] = None
        

In [5]:
MODEL_NAME = "gpt-4o"
SYSTEM_PROMPT = "You are an assistant that extracts resume information in a structured format"

# Clients
openai_client = OpenAI()
instructor_client = instructor.from_openai(OpenAI())

class VanillaOpenAI(weave.Model):
    prompt_type: str = "zero_shot"

    @property
    def json_structure(self):
        return """
        {
            "name": "string",
            "email": "string",
            "phone": "string",
            "location": "string",
            "linkedin": "string or null",
            "education": ["string"],
            "skills": ["string"],
            "certifications": ["string or null"],
            "projects": ["string or null"]
        }
        """

    def get_prompt(self, prompt: str) -> str:
        base_prompt = f"""
        Extract the relevant information and format it as a JSON object.
        The JSON should be structured exactly as follows:

        {self.json_structure}

        Only include fields if information is available. If a field is not applicable or the information is not present, omit the field entirely.
        Ensure the output is a valid JSON object.

        {prompt}
        """

        if self.prompt_type == "few_shot":
            example_resume = "JULIA CHEN Software Engineer julia.chen@email.com (987) 654-3210 San Francisco, CA ..."
            example_output = """
            {
                "name": "JULIA CHEN",
                "email": "julia.chen@email.com",
                "phone": "(987) 654-3210",
                "location": "San Francisco, CA",
                "education": ["B.S. Computer Science, Stanford University, September 2008 - June 2012, Palo Alto, CA"],
                "skills": ["Go", "Kubernetes", "Python", "Flask", "Jenkins", "Prometheus", "Grafana", "Ruby on Rails"]
            }
            """
            base_prompt = f"""
            {base_prompt}

            Here's an example:

            Input resume:
            {example_resume}

            Output JSON:
            {example_output}
            """

        return base_prompt

    @weave.op()
    def predict(self, prompt: str):
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": self.get_prompt(prompt)}
        ]

        completion = openai_client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=0.0
        )
        json_str = self._parse_json(completion.choices[0].message.content)
        try:
            return ResumeInfo(**json.loads(json_str))
        except:
            return json.loads(json_str)

    @staticmethod
    def _parse_json(content: str) -> str:
        json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
        if json_match:
            return json_match.group(1)
        else:
            return content


class InstructorModel(weave.Model):
    @weave.op()
    def predict(self, prompt: str) -> ResumeInfo:
        resume_info = instructor_client.chat.completions.create(
            model=MODEL_NAME,
            response_model=ResumeInfo,
            messages=[
                {"role": "user", "content": prompt},
                {"role": "system", "content": SYSTEM_PROMPT}
            ],
        )
        return resume_info

class MarvinModel(weave.Model):
    def __init__(self, name=None):
        super().__init__(name=name)
        marvin.settings.openai.chat.completions.model = MODEL_NAME
    @weave.op()
    def predict(self, prompt: str) -> ResumeInfo:
        resume_info = marvin.extract(
            prompt,
            target=ResumeInfo
        )[0]
        return resume_info

zero_shot_model = VanillaOpenAI(name='zero_shot_GPT-4o')
few_shot_model = VanillaOpenAI(prompt_type="few_shot", name='few_shot_GPT-4o')
instructor_model = InstructorModel(name='instructor_GPT-4o')
marvin_model = MarvinModel(name='marvin_GPT-4o')

print("✅ Weave models created")

✅ Weave models created


In [6]:
resumes = []
folder_path = "resumes-txt"

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r") as f:
            resumes.append(f.read())

print(f"Read {len(resumes)} resume files.")

Read 30 resume files.


In [7]:
resumes = resumes[:5]

In [9]:
isinstance(zero_shot_model.predict(resumes[0]), ResumeInfo)

🍩 https://wandb.ai/le-khan-academy/compare-output-structuring-models/r/call/9ba6a3af-cb02-4857-8777-29d80497d20d


True

In [10]:
models = [zero_shot_model, few_shot_model, instructor_model, marvin_model]

# Create a dataset from the resumes list
resume_dataset = Dataset(
    name="resume_dataset",
    rows=[{"resume": resume} for resume in resumes]
)

# Define scoring functions
@weave.op()
def is_valid_resume_info(model_output: dict) -> dict:
    return {'is_valid': isinstance(model_output, ResumeInfo)}

@weave.op()
def completeness_score(model_output: dict) -> dict:
    if not isinstance(model_output, ResumeInfo):
        return {'completeness': 0}
    
    fields = [f for f in dir(model_output) if not f.startswith('_')]
    filled_fields = sum(1 for f in fields if getattr(model_output, f) is not None)
    return {'completeness': filled_fields / len(fields)}

# Define the preprocess_model_input function
def preprocess_model_input(row):
    return {'prompt': 'Extract the data in a structured form from this resume:' + row['resume']}

# Define the evaluation
evaluation = weave.Evaluation(
    name='resume_info_extraction_eval',
    dataset=resume_dataset,
    trials=1,
    scorers=[
        is_valid_resume_info,
        completeness_score
    ],
    preprocess_model_input=preprocess_model_input
)

# Run evaluation for each model
async def run_evaluation():
    results = {}
    for model in models:
        result = await evaluation.evaluate(model)
        results[model.__class__.__name__] = result
    return results

# Run the evaluation
results = await run_evaluation()

In [9]:
results

{'InstructorModel': {'is_valid_resume_info': {'is_valid': {'true_count': 2,
    'true_fraction': 1.0}},
  'completeness_score': {'completeness': {'mean': 0.9310344827586207}},
  'model_latency': {'mean': 6.416078448295593}}}