# Generating Synthetic Entities with `Outlines`


Plan: Given a domain/industry, need to generate synthetic entities that are comprised of:

1. First, given a domain/industry name and a description of that domain, generate a list of N possible job titles (along with job description)

IndustryJobs - Industry Name - Industry Description - Job Titles

2. Then for each job title/description generate, generate a job entity:

Job Entity - Job Title (str) - Job Description (str) - Associated Job Postings/Position (List[str]) - Job Skills (List[str])


In [1]:
import os
from enum import Enum
from getpass import getpass
from dataclasses import dataclass

from pydantic import BaseModel, conlist, constr

import transformers


import outlines

In [2]:
class Job(BaseModel):
    job_title: str
    job_description: str


class IndustryJobs(BaseModel):
    industry_name: str
    industry_description: str
    industry_jobs: conlist(Job, min_length=5, max_length=10)  # type: ignore

In [3]:
@outlines.prompt
def industry_jobs_prompt(name: str, description: str) -> IndustryJobs:
    """
    You are a expert human resources professional with broad, deep knowledge of talent profiles across every industry.
    Your job is to generate a list of diverse and popular job titles and corresponding descriptions that cover a range
    of functions, from foundational roles to innovative and emerging positions based on a provided industry name and description.

    Here is the new industry you need to generate jobs for:
    Industry Name: {{ name }}
    Industry Description: {{ description }}
    Jobs List:
    """

In [4]:
from outlines import models

model_id = "Qwen/Qwen1.5-7B-Chat"
# config = transformers.AutoConfig.from_pretrained(
#     model_id,
# )

model = outlines.models.transformers(
    model_id, device="cuda", model_kwargs={"device_map": "auto", "load_in_4bit": True}
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
model.device

device(type='cuda', index=0)

In [6]:
prompt = industry_jobs_prompt(
    name="Software Development",
    description="Software development is the process of conceiving, specifying, designing, programming, documenting, testing, and bug fixing involved in creating and maintaining applications, frameworks, or other software components.",
)

In [7]:
outlines.generate.json(model, IndustryJobs)(prompt)



IndustryJobs(industry_name='Software Development', industry_description='Software development is the process of creating and maintaining applications using coding skills and advanced technologies.', industry_jobs=[Job(job_title='Software Developer', job_description='Design and develop software applications, using programming languages such as Java, Python, or C++. Collaborate with cross-functional teams to ensure software quality, scalability, and functionality. Write clean, efficient, and well-documented code. Perform functional testing, debugging, and bug fixing.'), Job(job_title='Frontend Developer', job_description='Design and develop user interfaces for web and mobile applications using HTML, CSS, and JavaScript. Create responsive, intuitive, and visually appealing designs that improve user experience. Collaborate with backend developers to integrate frontend functionality with backend systems. Stay up-to-date with前端 technologies and web development best practices.'), Job(job_titl