# Generating Synthetic Entities


In [1]:
import os
from getpass import getpass

from typing import List

from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field, conlist

## Setup


In [2]:
# Prompt for the key without echoing it, keep it in a module-level variable,
# and publish it through the process environment as well.
OPENAI_API_KEY = getpass("Enter your OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [3]:
# Chat model instance that is piped into the generation chain further down.
model = ChatOpenAI(model="gpt-4-turbo-preview")

## Define Pydantic Models

We'll use Pydantic to define our desired output schema, which will be enforced via a `parser` during the post-processing step of the LCEL chain.


In [4]:
from typing import Type


def factory_job_profile(n_skills: int = 3, n_postings: int = 3) -> Type[BaseModel]:
    """
    Build a JobProfile model class whose list fields have a fixed length.

    The list lengths have to be known when the class is defined, so the class
    is created inside this function with the requested sizes baked in.

    Args:
        n_skills (int, optional): Exact number of entries required in `skills`. Defaults to 3.
        n_postings (int, optional): Exact number of entries required in `relevant_postings`. Defaults to 3.

    Returns:
        Type[BaseModel]: The newly created JobProfile class.
    """
    # Setting min_items and max_items to the same value forces an exact length.
    skills_type = conlist(str, min_items=n_skills, max_items=n_skills)
    postings_type = conlist(str, min_items=n_postings, max_items=n_postings)

    # NOTE: no docstring on the model on purpose; the schema is generated from
    # this class and should contain only the field descriptions below.
    class JobProfile(BaseModel):
        title: str = Field(description="The title of the job")
        description: str = Field(
            description="A brief description of the job role and responsibilities"
        )
        skills: skills_type = Field(  # type: ignore
            description="A list of skills required for the job role"
        )
        relevant_postings: postings_type = Field(  # type: ignore
            description="A list of relevant job postings"
        )

    return JobProfile


# Edit these arguments to change how many skills and postings every generated
# job must list; the resulting class is used as the list item type below.
job_profile = factory_job_profile(n_skills=5, n_postings=5)


# Schema the model's reply is parsed into: a list of the JobProfile class built
# by `factory_job_profile` above.
# Documented with comments rather than a docstring: pydantic v1 copies a model's
# docstring into its JSON schema, which would alter the format instructions
# generated from this class.
class IndustryJobs(BaseModel):
    industry_jobs: List[job_profile]  # type: ignore


# A generated job list paired with the industry it was generated for.
class IndustryJobsProfile(BaseModel):
    industry_name: str
    industry_description: str
    # Holds the IndustryJobs wrapper rather than a bare list, so the serialized
    # form nests the `industry_jobs` key twice.
    industry_jobs: IndustryJobs


# The parser targets IndustryJobs (the job list only); the industry name and
# description are inputs to the prompt and are not part of the parsed output.
parser = PydanticOutputParser(pydantic_object=IndustryJobs)

## Configure Prompt Template


In [5]:
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# System message: sets the persona and the task. `{n_jobs}` is filled in per
# request. The trailing backslashes join the source lines, so the rendered
# prompt is a single paragraph.
SYSTEM_PROMPT = """You are a expert human resources professional with broad and deep knowledge of talent profiles across every industry. \
Your job is to generate a list of {n_jobs} diverse and popular Job Profiles that cover a range of functions, from foundational roles \
to innovative and emerging positions based on a provided industry name and description."""

# Human message: carries the parser's format instructions followed by the
# industry name and description to generate jobs for.
USER_PROMPT = """Generate a list of Job Profiles. There should be no duplicates.

{format_instructions}

Here is the new industry you need to generate jobs for:
Industry Name: {industry_name}
Industry Description: {industry_description}
Jobs Profiles:"""

# Two-message chat prompt (system, then human).
chat_template = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template(SYSTEM_PROMPT),
        HumanMessagePromptTemplate.from_template(USER_PROMPT),
    ],
    # Variables the caller must supply on every call.
    input_variables=["n_jobs", "industry_name", "industry_description"],
    # Bound once here from the parser, so callers never pass it.
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

Let's see what the actual prompt looks like:


In [6]:
# Sample input: used here to preview the rendered prompt and reused by later
# cells to run the chain.
example = {
    "n_jobs": 5,
    "industry_name": "Hospitality",
    "industry_description": "The hospitality industry is a broad category of fields within the service industry that includes lodging, event planning, theme parks, transportation, cruise line, and additional fields within the tourism industry.",
}
messages = chat_template.format_messages(**example)
# print() is called only for its side effect, so use a plain loop instead of a
# list comprehension; this avoids building (and echoing) a list of None values.
for message in messages:
    print(message.content)

You are a expert human resources professional with broad and deep knowledge of talent profiles across every industry. Your job is to generate a list of 5 diverse and popular Job Profiles that cover a range of functions, from foundational roles to innovative and emerging positions based on a provided industry name and description.
Generate a list of Job Profiles. There should be no duplicates.

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"industry_jobs": {"title": "Industry Jobs", "type": "array", "items": {"$ref": "#/definitions/JobProfile"}}}, "required": ["industry_job

[None, None]

## Run the Chain


In [7]:
# LCEL pipeline: render the prompt, call the chat model, then hand the reply to
# the parser configured for IndustryJobs.
chain = chat_template | model | parser
# Single run on the Hospitality example defined earlier.
out = chain.invoke(example)

Visualize the generated entities


In [8]:
# Show each job as four lines (title, description, skills, postings) followed
# by a blank separator line.
for job in out.industry_jobs:
    print(
        job.title,
        job.description,
        job.skills,
        job.relevant_postings,
        "",
        sep="\n",
    )

Hotel General Manager
Oversees all operations and day-to-day activities of a hotel to ensure guest satisfaction and profitability.
['Leadership', 'Customer Service', 'Financial Management', 'Problem Solving', 'Communication']
['Hotel General Manager at Hilton', 'General Manager at Marriott International', 'Resort General Manager at Hyatt', 'Hotel Manager at The Ritz-Carlton', 'Managing Director at Sheraton Hotels & Resorts']

Event Planner
Coordinates all aspects of professional meetings and events. They often choose meeting locations, arrange transportation, and coordinate other details.
['Organizational', 'Networking', 'Negotiation', 'Time Management', 'Creativity']
['Corporate Event Planner at Dream Events', 'Wedding Planner at Blissful Weddings', 'Event Coordinator at Global Conferences', 'Conference Manager at Event Solutions', 'Event Specialist at Creative Concepts']

Executive Chef
Responsible for the food operations in restaurants, hotels, casinos, or other venues that serve fo

## Combine Output


In [9]:
# Attach the industry metadata from the input to the parsed job list, since the
# parsed output itself only contains the jobs.
jobs_profile = IndustryJobsProfile(
    industry_name=example["industry_name"],
    industry_description=example["industry_description"],
    industry_jobs=out,
)
# Pretty-print the combined record as JSON.
print(jobs_profile.json(indent=2))

{
  "industry_name": "Hospitality",
  "industry_description": "The hospitality industry is a broad category of fields within the service industry that includes lodging, event planning, theme parks, transportation, cruise line, and additional fields within the tourism industry.",
  "industry_jobs": {
    "industry_jobs": [
      {
        "title": "Hotel General Manager",
        "description": "Oversees all operations and day-to-day activities of a hotel to ensure guest satisfaction and profitability.",
        "skills": [
          "Leadership",
          "Customer Service",
          "Financial Management",
          "Problem Solving",
          "Communication"
        ],
        "relevant_postings": [
          "Hotel General Manager at Hilton",
          "General Manager at Marriott International",
          "Resort General Manager at Hyatt",
          "Hotel Manager at The Ritz-Carlton",
          "Managing Director at Sheraton Hotels & Resorts"
        ]
      },
      {
        

## Experiment with different industries


In [10]:
# Industries to generate job profiles for: one name/description pair each.
examples = [
    {
        "industry_name": "Technology and Information Services",
        "industry_description": "This industry encompasses roles in software development, cybersecurity, data analysis, and information technology services, driven by rapid technological advancements and the need for digital transformation across sectors. It demands technical skill, creativity, and adaptability.",
    },
    {
        "industry_name": "Healthcare and Life Sciences",
        "industry_description": "Healthcare includes clinical roles and support positions in healthcare administration and health IT, while life sciences focus on research, development, and production of medical technologies. This industry improves health outcomes and is resilient to economic fluctuations.",
    },
    {
        "industry_name": "Financial Services and Fintech",
        "industry_description": "This sector includes banking, insurance, investment firms, and fintech, which integrates technology to improve financial services. It values analytical skills, regulatory knowledge, and innovation, adapting to new technologies and changing consumer expectations.",
    },
    {
        "industry_name": "Green Energy and Sustainability",
        "industry_description": "Focusing on renewable energy sources and sustainability, this sector is expanding with the global emphasis on combating climate change. It offers roles in engineering, environmental science, and policy work dedicated to a sustainable future.",
    },
    {
        "industry_name": "Creative and Digital Media",
        "industry_description": "Covering digital marketing, content creation, graphic design, and multimedia arts, this industry has grown with digital platforms transforming content creation and distribution. It suits those combining creative talent with digital tool proficiency to engage audiences.",
    },
]

# Prepend the requested job count so every entry carries all three prompt
# variables (n_jobs first, then the industry fields).
examples = [dict(n_jobs=10, **industry) for industry in examples]

In [11]:
from tqdm import tqdm
from langchain_core.runnables.base import RunnableSequence


def generate_job_profiles(
    chain: RunnableSequence, examples: List[dict]
) -> List[IndustryJobsProfile]:
    """
    Run the chain once per example and wrap each result with its industry info.

    Examples are processed one at a time, in order, with a progress bar.

    Args:
        chain (RunnableSequence): Chain invoked with each example as its input.
        examples (List[dict]): Chain inputs; each must contain the
            "industry_name" and "industry_description" keys.

    Returns:
        List[IndustryJobsProfile]: One profile per example, in input order.

    Note: The calls are blocking and sequential; this is a candidate for an
    async implementation.
    """

    def _profile_for(spec: dict) -> IndustryJobsProfile:
        # Invoke the chain first, then attach the metadata of the same input.
        generated = chain.invoke(spec)
        return IndustryJobsProfile(
            industry_name=spec["industry_name"],
            industry_description=spec["industry_description"],
            industry_jobs=generated,
        )

    return [_profile_for(spec) for spec in tqdm(examples)]

In [12]:
# One chain call per industry, run sequentially. The variable name's spelling
# is kept as is because the display cell below refers to it.
synethic_data = generate_job_profiles(chain, examples)

100%|██████████| 5/5 [04:36<00:00, 55.27s/it]


In [17]:
# Dump every generated profile as indented JSON, each followed by a dashed rule.
for jobs_profile in synethic_data:
    print(jobs_profile.json(indent=2), "----" * 10, sep="\n")

{
  "industry_name": "Technology and Information Services",
  "industry_description": "This industry encompasses roles in software development, cybersecurity, data analysis, and information technology services, driven by rapid technological advancements and the need for digital transformation across sectors. It demands technical skill, creativity, and adaptability.",
  "industry_jobs": {
    "industry_jobs": [
      {
        "title": "Software Engineer",
        "description": "Responsible for developing and maintaining software applications by applying engineering principles.",
        "skills": [
          "Programming",
          "System Design",
          "Debugging",
          "Software Testing",
          "Agile Methodologies"
        ],
        "relevant_postings": [
          "Junior Software Engineer at Tech Innovations Inc.",
          "Senior Software Engineer at Web Solutions LLC",
          "Software Engineer (Full Stack) at Creative Tech Studio",
          "Backend Softw