demo/configs/headline_generation_improve.yml

custom_function: demo.headline_generation_detail.headline_generation
description: Generated experiment config
dataset:
  data_generators:
    openai_prompt_data_generator:
      chunk_size: 100000
      diversify: true
      # model_name specify the llm model , e.g. a16z-infra/llama-2-13b-chat:9dff94b1bed5af738655d4a7cbcdcde2bd503aa85c94334fe1f42af7f3dd5ee3
      model_name: gpt-4
      prompt:
          "Please provide a concrete and realistic test case as a dictionary for function invocation using the ** operator.
          Only include parameters, excluding description and name.
          Ensure it's succinct and well-structured.
          **Only provide the dictionary.**"
      input_function:
        description:
          "Given a tech startup business named [tech_startup_business], specializing in [business], and target_peopleing [target_people], generate a corresponding landing page headline."
        name: headline_generation_for_business
        parameters:
          tech_startup_business: str
          business: str
          target_people: str
      number_of_examples: 3
      output_path: null
  source_type: machine_generated


variations:
  - name: task
    variations:
      - instantiated_value: Generate landing page headline for {tech_startup_business}, company business is {business}, target_people is {target_people}
        value: Generate landing page headline for {tech_startup_business}, company business is {business}, target_people is {target_people}
        value_type: str
        variation_id: null

evaluators:
  - evaluator_type: individual
    metric_calculators:
      - method: AVERAGE
    name: openai_prompt_based_evaluator
    display_name: clear
    prompt: |-
      You are assessing a submitted answer on a given task based on a criterion. Here is the data:
      - Task: Given an tech startup business, generate one corresponding landing page headline
      - Does the headline clearly communicate what the startup does or what problem it solves?
        It should be immediately clear to anyone who reads the headline what the startup's purpose is.
        A lack of clarity can lead to confusion and may discourage potential users or investors.
      [Input]: company name: {tech_startup_business} , company business: {business} , target people: {target_people}
      [Result]: {raw_output}
      Answer the question by selecting one of the following options:
      A It fails to meet the criterion at all.
      B It somewhat meets the criterion, but there is significant room for improvement.
      C It meets the criterion to a satisfactory degree.
      D It meets the criterion very well.
      E It meets the criterion exceptionally well, with little to no room for improvement.
    choices: ["A", "B", "C", "D", "E"]
    # model_name specify the llm model , e.g. a16z-infra/llama-2-13b-chat:9dff94b1bed5af738655d4a7cbcdcde2bd503aa85c94334fe1f42af7f3dd5ee3
    model_name: gpt-4
    description: "evaluate the quality of the landing page headline"
    scale_description: "0-4"
    choice_scores:
      A: 0
      B: 1
      C: 2
      D: 3
      E: 4
  

  - evaluator_type: individual
    metric_calculators:
      - method: AVERAGE
    name: openai_prompt_based_evaluator
    prompt: |-
      You are assessing a submitted answer on a given task based on a criterion. Here is the data:
      - Task: Given an tech startup business, generate one corresponding landing page headline
      - Is the headline relevant to the target audience? The headline should speak directly to the
        startup's intended users or customers, highlighting the benefits or value proposition that 
        the startup offers.
      [Input]: company name: {tech_startup_business} , company business: {business} , target people: {target_people}
      [Result]: {raw_output}
      Answer the question by selecting one of the following options:
      A It fails to meet the criterion at all.
      B It somewhat meets the criterion, but there is significant room for improvement.
      C It meets the criterion to a satisfactory degree.
      D It meets the criterion very well.
      E It meets the criterion exceptionally well, with little to no room for improvement.
    display_name: relevance
    description: Is the headline relevant to the target audience?
    scale_description: "0-4"
    choices: ["A", "B", "C", "D", "E"]
    choice_scores:
      A: 0
      B: 1
      C: 2
      D: 3
      E: 4
  
  - evaluator_type: individual
    metric_calculators:
      - method: AVERAGE
    name: openai_prompt_based_evaluator
    prompt: |-
      You are assessing a submitted answer on a given task based on a criterion. Here is the data:
      - Task: Given an tech startup business, generate one corresponding landing page headline
      - Is the headline catchy or memorable? While it's important to be clear and relevant,
        a good headline should also be engaging and memorable. 
        This can help the startup stand out in a crowded market.
      [Input]: company name: {tech_startup_business} , company business: {business} , target people: {target_people}
      [Result]: {raw_output}
      Answer the question by selecting one of the following options:
      A It fails to meet the criterion at all.
      B It somewhat meets the criterion, but there is significant room for improvement.
      C It meets the criterion to a satisfactory degree.
      D It meets the criterion very well.
      E It meets the criterion exceptionally well, with little to no room for improvement.
    display_name: catchiness
    description: Is the headline catchy or memorable?
    scale_description: "0-4"
    choices: ["A", "B", "C", "D", "E"]
    choice_scores:
      A: 0
      B: 1
      C: 2
      D: 3
      E: 4


selection_strategy:
  ahp_selection:
    criteria:
      - "openai_prompt_based_evaluator: clear"
      - "openai_prompt_based_evaluator: relevance"
      - "openai_prompt_based_evaluator: catchiness"
      - average_token_usage
      - average_latency
    criteria_maximization:
      "openai_prompt_based_evaluator: clear": true
      "openai_prompt_based_evaluator: relevance" : true
      "openai_prompt_based_evaluator: catchiness" : true
      average_latency: false
      average_token_usage: false
    criteria_weights:
      "openai_prompt_based_evaluator: clear": 0.333
      "openai_prompt_based_evaluator: relevance" : 0.333
      "openai_prompt_based_evaluator: catchiness" : 0.333
      average_latency: 0.0
      average_token_usage: 0.0
    # normalize_func: "z-score"

improver:
  name: "optimize_by_prompt_improver"
  model_name: "gpt-4"
  max_iterations: 2
  improve_var: ["task"]
  head_meta_instruction: |
    Now you will help me generate a prompt which is used to generate a corresponding
    landing page headline according for a startup business named [tech_startup_business],
    specializing in [business], and target_peopleing [target_people].
    I already have some prompt and its evaluation results :
    
  end_meta_instruction: |
    Give me a new prompt that is different from all pairs above, and has a evaluation value higher than any of above.