In [1]:
# This notebook generates the training data for the HMM: sentences paired with per-word state labels.
# The text is produced by an LLM (GPT-4o when this was written; the model is selected by MODEL_NAME in src.config).

from langchain_openai import ChatOpenAI

from langchain_core.prompts import ChatPromptTemplate

from pydantic import BaseModel, Field

from langchain_core.output_parsers import JsonOutputKeyToolsParser

from typing import List

from src.config import NO_GENERATED_SAMPLES, LLM_TEMPERATURE, MODEL_NAME

# Chat model shared by the cells below; model name and sampling temperature
# both come from src.config rather than being hard-coded here.
llm = ChatOpenAI(
    model=MODEL_NAME,
    temperature=LLM_TEMPERATURE,
)

# Schema for a single labelled example; TrainingData holds a list of these.
# NOTE(review): the docstring and Field descriptions below are presumably forwarded
# to the model as part of the structured-output schema, so treat any edit to them as
# a prompt change — confirm against the langchain structured-output docs.
class TrainingInstance(BaseModel):
    """This is an instance of training data for the model.
    It contains a sentence and the state sequence for the sentence that will be used to train the hmm.
    """
    # The generated sentence, stored as one plain string.
    sentence: str = Field(description="The sentence that was generated by the model")
    # One state label per word (A, D, P or W per the description). Nothing in this
    # class enforces the alphabet or the length match; a later cell compares the
    # word count of `sentence` with len(state_sequence) and prints "Not matched".
    # NOTE(review): "same length as the sentence" is ambiguous (words vs characters)
    # and the recorded outputs contain mismatches — consider rewording.
    state_sequence: List[str] = Field(description="The state sequence for the sentence, each state will correspond to the word in the sentence. States can be A, D, P, W. It should be of the same length as the sentence.")

# Top-level schema: this is the class handed to llm.with_structured_output(...) when
# the chain is built, and the `.instances` attribute of the result is what the
# later cells read.
class TrainingData(BaseModel):
    """This will be the training data for the model.
    It will contain a list of training instances.
    """
    # All labelled examples returned by one chain invocation.
    instances: List[TrainingInstance] = Field(description="The training instances")

# System message sent to the LLM. `{instances}` is the only template variable; it is
# filled in at invoke time with the number of examples to generate.
system_prompt = """
You are a helpful assistant that generates training data showing the state sequence for a sentence.
- You should generate {instances} instances of training data.
- The training data will be used by a time table scheduler to extract fields like action, duration, preference, from a user's prompt, like "I want to play cricket for 3 hours this morning".
- The duration should be of today, not tomorrow or yesterday.
- Call the TrainingData tool in order to generate the training data.
- The number of possible states in the model are as follows:
    1) Action: It is shown by A, it is an action like "play cricket", "do homework", "read a book" etc.
    2) Duration: It is shown by D, it is a duration like "1 hour", "30 minutes", "3 hours" etc.
    3) Preference: It is shown by P, it is a preference like "morning", "evening", "weekends" etc.
    4) Word: It is shown by W, it is any word other than Action, Duration, Preference.
- Make sure to use different words in the sentences to have a diverse dataset.
"""

# The prompt consists of that single system message; no human turn is added.
prompt = ChatPromptTemplate.from_messages([("system", system_prompt)])

In [2]:
# Tools-output parser keyed on "instances". It is not part of the chain built below.
parser = JsonOutputKeyToolsParser(key_name="instances")

# Bind the TrainingData schema to the LLM, then pipe the prompt into it.
structured_llm = llm.with_structured_output(TrainingData)
chain = prompt | structured_llm

# One call to the model; the prompt's only variable is the requested sample count.
data = chain.invoke({"instances": NO_GENERATED_SAMPLES})

In [5]:
# Pull the list of generated examples out of the structured result.
instances = data.instances

# Bare last expression so the notebook displays how many examples came back.
len(instances)

20

In [4]:
# Sanity-check the generated data: each sentence should carry exactly one state per
# word. Mismatching examples are flagged but still printed so they can be inspected.
for instance in instances:
    # str.split() with no argument splits on any run of whitespace and never yields
    # empty strings, so doubled, leading or trailing spaces are not counted as extra
    # "words" (split(" ") would report a false mismatch for such sentences).
    word_count = len(instance.sentence.split())
    if word_count != len(instance.state_sequence):
        print("Not matched")
    print("Sentence: ", instance.sentence)
    print("State Sequence: ", instance.state_sequence)
    print("\n")

Sentence:  I need to cook dinner for 45 minutes tonight.
State Sequence:  ['W', 'W', 'W', 'A', 'A', 'W', 'D', 'D', 'P']


Not matched
Sentence:  Plan to run 5 kilometers this afternoon.
State Sequence:  ['W', 'W', 'A', 'A', 'D', 'D', 'P', 'P']


Sentence:  Set aside 2 hours to complete homework this evening.
State Sequence:  ['W', 'W', 'D', 'D', 'W', 'A', 'A', 'P', 'P']


Sentence:  Schedule a meeting for 1 hour in the early morning.
State Sequence:  ['W', 'W', 'A', 'W', 'D', 'D', 'W', 'W', 'P', 'P']


Sentence:  Devote time to meditate for half an hour tonight.
State Sequence:  ['W', 'W', 'W', 'A', 'W', 'D', 'D', 'D', 'P']


Sentence:  I want to draw for 2 hours.
State Sequence:  ['W', 'W', 'W', 'A', 'W', 'D', 'D']


Not matched
Sentence:  Let's read a novel for an hour this afternoon.
State Sequence:  ['W', 'W', 'A', 'W', 'A', 'W', 'D', 'D', 'P', 'P']


Not matched
Sentence:  I plan to work out for 90 minutes this morning.
State Sequence:  ['W', 'W', 'A', 'A', 'W', 'D', 'D', 'P', 'P'