In [18]:
import random
import json
from datasets import Dataset

def generate_example():
    topics = [
        "capital of a country", "currency of a country", "population of a city",
        "list of famous landmarks", "color conversion (RGB to Hex)", "simple math operation",
        "programming language feature", "animal classification", "famous quote author"
    ]

    selected_topic = random.choice(topics)

    if selected_topic == "capital of a country":
        country = random.choice(["Japan", "France", "Germany", "Italy", "Spain", "Canada", "Australia"])
        capital = {
            "Japan": "Tokyo", "France": "Paris", "Germany": "Berlin",
            "Italy": "Rome", "Spain": "Madrid", "Canada": "Ottawa", "Australia": "Canberra"
        }.get(country)
        input_text = f"What is the capital of {country}?"
        output_json = json.dumps({"question": f"capital of {country}", "answer": capital})
    elif selected_topic == "currency of a country":
        country = random.choice(["USA", "UK", "Eurozone", "India", "China"])
        currency = {
            "USA": "Dollar", "UK": "Pound Sterling", "Eurozone": "Euro",
            "India": "Rupee", "China": "Yuan"
        }.get(country)
        input_text = f"What is the currency of {country}?"
        output_json = json.dumps({"country": country, "currency": currency})
    elif selected_topic == "population of a city":
        city = random.choice(["New York", "London", "Shanghai", "Mumbai", "Cairo"])
        population = random.randint(5_000_000, 25_000_000)
        input_text = f"What is the estimated population of {city}?"
        output_json = json.dumps({"city": city, "population_estimate": population})
    elif selected_topic == "list of famous landmarks":
        location = random.choice(["France", "Italy", "Egypt", "India"])
        landmarks = {
            "France": ["Eiffel Tower", "Louvre Museum"], "Italy": ["Colosseum", "Leaning Tower of Pisa"],
            "Egypt": ["Pyramids of Giza", "Sphinx"], "India": ["Taj Mahal", "Gateway of India"]
        }.get(location)
        input_text = f"List two famous landmarks in {location}."
        output_json = json.dumps({"country": location, "landmarks": landmarks})
    elif selected_topic == "color conversion (RGB to Hex)":
        r, g, b = random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)
        hex_color = f"#{r:02x}{g:02x}{b:02x}"
        input_text = f"Convert RGB({r},{g},{b}) to Hex."
        output_json = json.dumps({"rgb": [r, g, b], "hex": hex_color})
    elif selected_topic == "simple math operation":
        num1 = random.randint(1, 100)
        num2 = random.randint(1, 100)
        operation = random.choice(["add", "subtract", "multiply"])
        if operation == "add":
            result = num1 + num2
            input_text = f"What is {num1} plus {num2}?"
        elif operation == "subtract":
            result = num1 - num2
            input_text = f"What is {num1} minus {num2}?"
        else:
            result = num1 * num2
            input_text = f"What is {num1} multiplied by {num2}?"
        output_json = json.dumps({"operation": operation, "operands": [num1, num2], "result": result})
    elif selected_topic == "programming language feature":
        lang = random.choice(["Python", "Java", "JavaScript"])
        feature = random.choice(["dynamic typing", "object-oriented", "functional support"])
        input_text = f"Describe {feature} in {lang}."
        output_json = json.dumps({"language": lang, "feature": feature, "description_hint": "Provide a brief explanation"})
    elif selected_topic == "animal classification":
        animal = random.choice(["Lion", "Eagle", "Dolphin"])
        classification = {
            "Lion": "Mammal", "Eagle": "Bird", "Dolphin": "Mammal"
        }.get(animal)
        input_text = f"Classify the animal: {animal}."
        output_json = json.dumps({"animal": animal, "classification": classification})
    elif selected_topic == "famous quote author":
        quote_snippet = random.choice(["To be or not to be", "I have a dream", "The only thing we have to fear is fear itself"])
        author = {
            "To be or not to be": "William Shakespeare",
            "I have a dream": "Martin Luther King Jr.",
            "The only thing we have to fear is fear itself": "Franklin D. Roosevelt"
        }.get(quote_snippet)
        input_text = f"Who said: '{quote_snippet}'?"
        output_json = json.dumps({"quote": quote_snippet, "author": author})
    else:
        input_text = "Default input."
        output_json = json.dumps({"status": "default", "message": "This is a default example."})

    return {"Input": input_text, "Output": output_json}

# Initialize an empty list to store the generated examples
generated_data = []

# Generate 2000 examples
for _ in range(2000):
    generated_data.append(generate_example())

# Convert the list of dictionaries to a Dataset object
dataset = Dataset.from_list(generated_data)

print("Larger dummy dataset created successfully!")
print(dataset)
print(f"First example: {dataset[0]}")

Larger dummy dataset created successfully!
Dataset({
    features: ['Input', 'Output'],
    num_rows: 2000
})
First example: {'Input': 'What is the currency of USA?', 'Output': '{"country": "USA", "currency": "Dollar"}'}


In [19]:
def format_instruction(example):
    system_prompt = "You are a JSON Extraction Engine. Your task is to extract information from the given input and provide it as a JSON object."
    input_text = example['Input']
    output_json = example['Output']

    # Construct the formatted string
    formatted_string = f"{system_prompt}\n### INPUT: {input_text}\n### OUTPUT: {output_json}\n"
    return {"text": formatted_string}

# Apply the function to the dataset
formatted_dataset = dataset.map(format_instruction)

# Print an example from the newly formatted dataset
print("First formatted example from the larger dataset:")
print(formatted_dataset[0]['text'])


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

First formatted example from the larger dataset:
You are a JSON Extraction Engine. Your task is to extract information from the given input and provide it as a JSON object.
### INPUT: What is the currency of USA?
### OUTPUT: {"country": "USA", "currency": "Dollar"}



In [20]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Define TrainingArguments for the larger dataset
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    max_steps=500,  # Adjusted for larger dataset
    learning_rate=2e-4,
    bf16=True,  # Use bfloat16 for compatibility
    logging_steps=1,
    output_dir="outputs_large_dataset"
)

# Instantiate SFTTrainer with the pre-configured model and formatted dataset
trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,
    args=training_args
)

print("SFTTrainer setup successfully for the larger dataset!")
print(trainer)

Adding EOS to train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

SFTTrainer setup successfully for the larger dataset!
<trl.trainer.sft_trainer.SFTTrainer object at 0x7c52f08621e0>


In [21]:
trainer.train()

print("Training completed for the larger dataset!")

Step,Training Loss
1,2.376481
2,2.008572
3,1.817588
4,2.064041
5,1.632018
6,1.658241
7,1.499886
8,1.525983
9,1.273475
10,1.047609


Training completed for the larger dataset!


In [22]:
trainer.model.save_pretrained("fine_tuned_model_large_dataset")
tokenizer.save_pretrained("fine_tuned_model_large_dataset")

print("Fine-tuned model and tokenizer saved successfully to 'fine_tuned_model_large_dataset' directory!")

Fine-tuned model and tokenizer saved successfully to 'fine_tuned_model_large_dataset' directory!


In [23]:
import torch
import re
import json # Import json to try parsing the extracted string

# 1. Prompt the user for input
user_input = input("Enter your natural language query: ")

# 2. Define the system_prompt
system_prompt = "You are a JSON Extraction Engine. Your task is to extract information from the given input and provide it as a JSON object."

# 3. Construct the full_prompt
full_prompt = f"{system_prompt}\n### INPUT: {user_input}\n### OUTPUT:"

print(f"\nInput Query: {user_input}")
print(f"Full Prompt Sent to Model:\n{full_prompt}")

# 4. Tokenize the full_prompt
inputs = tokenizer(full_prompt, return_tensors="pt", add_special_tokens=False)
input_ids = inputs.input_ids.to(model.device)
attention_mask = inputs.attention_mask.to(model.device)

# 5. Generate output using the fine-tuned model
generated_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=150,  # Increased max_new_tokens for potentially longer JSON
    pad_token_id=tokenizer.eos_token_id,
    do_sample=False, # For deterministic output given the training data
    num_beams=1 # Explicitly set for greedy decoding
)

# 6. Decode the generated tokens
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# 7. Extract the JSON part from the generated_text
output_prefix = f"{system_prompt}\n### INPUT: {user_input}\n### OUTPUT:"

# Find the index where the generated content starts after the prompt
start_index = generated_text.find(output_prefix)
if start_index != -1:
    json_output_with_extras = generated_text[start_index + len(output_prefix):].strip()

    # Attempt to find the first complete JSON object using regex
    match = re.search(r'\{.*?\}', json_output_with_extras, re.DOTALL) # re.DOTALL to match across newlines
    if match:
        extracted_json_str = match.group(0)
        try:
            # Attempt to parse to validate JSON
            extracted_json = json.loads(extracted_json_str)
            print("\nExtracted and Parsed JSON:")
            print(json.dumps(extracted_json, indent=2))
        except json.JSONDecodeError:
            print("\nExtracted (but not valid JSON) string:")
            print(extracted_json_str)
            print("Warning: Extracted string is not a valid JSON object.")
    else:
        extracted_json_str = json_output_with_extras
        print("\nNo complete JSON object found. Full generated output after prompt:")
        print(extracted_json_str)
else:
    print("\nCould not find the expected output prefix in the generated text.")
    print("Full generated text:")
    print(generated_text)

# 8. Print the generated_text (full output from the model)
print("\nFull Generated Text from Model:")
print(generated_text)


Enter your natural language query: Hi , I'm from bangalore

Input Query: Hi , I'm from bangalore
Full Prompt Sent to Model:
You are a JSON Extraction Engine. Your task is to extract information from the given input and provide it as a JSON object.
### INPUT: Hi , I'm from bangalore
### OUTPUT:

Could not find the expected output prefix in the generated text.
Full generated text:
You are a JSON Extraction Engine. Your task is to extract information from the given input and provide it as a JSON object.
### INPUT: Hi, I'm from bangalore
### OUTPUT: {"city": "bangalore", "country": "India"}
"""


Full Generated Text from Model:
You are a JSON Extraction Engine. Your task is to extract information from the given input and provide it as a JSON object.
### INPUT: Hi, I'm from bangalore
### OUTPUT: {"city": "bangalore", "country": "India"}
"""



In [24]:
import torch
import re
import json

# 1. Prompt the user for input
user_input = input("Enter your natural language query: ")

# 2. Define the system_prompt
system_prompt = "You are a JSON Extraction Engine. Your task is to extract information from the given input and provide it as a JSON object."

# 3. Construct the full_prompt for the model's input
# Normalize user input for prompt to match potential model normalization (e.g., remove extra spaces around commas)
processed_user_input = user_input.replace(' ,', ',').replace(' .', '.').strip()
full_prompt = f"{system_prompt}\n### INPUT: {processed_user_input}\n### OUTPUT:"

print(f"\nInput Query: {user_input}")
print(f"Processed Input for Prompt: {processed_user_input}")
print(f"Full Prompt Sent to Model:\n{full_prompt}")

# 4. Tokenize the full_prompt
inputs = tokenizer(full_prompt, return_tensors="pt", add_special_tokens=False)
input_ids = inputs.input_ids.to(model.device)
attention_mask = inputs.attention_mask.to(model.device)

# 5. Generate output using the fine-tuned model
generated_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=150,  # Increased max_new_tokens for potentially longer JSON
    pad_token_id=tokenizer.eos_token_id,
    do_sample=False, # For deterministic output given the training data
    num_beams=1 # Explicitly set for greedy decoding
)

# 6. Decode the generated tokens
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# 7. Extract the JSON part from the generated_text
# Find the last occurrence of '### OUTPUT:' in the generated text
output_delimiter_index = generated_text.rfind('### OUTPUT:')

if output_delimiter_index != -1:
    # Extract content after the last '### OUTPUT:'
    json_output_with_extras = generated_text[output_delimiter_index + len('### OUTPUT:'):].strip()

    # Attempt to find the first complete JSON object using regex
    match = re.search(r'\{.*?\}', json_output_with_extras, re.DOTALL) # re.DOTALL to match across newlines
    if match:
        extracted_json_str = match.group(0)
        try:
            # Attempt to parse to validate JSON
            extracted_json = json.loads(extracted_json_str)
            print("\nExtracted and Parsed JSON:")
            print(json.dumps(extracted_json, indent=2))
        except json.JSONDecodeError:
            print("\nExtracted (but not valid JSON) string:")
            print(extracted_json_str)
            print("Warning: Extracted string is not a valid JSON object.")
    else:
        extracted_json_str = json_output_with_extras
        print("\nNo complete JSON object found. Full generated output after prompt:")
        print(extracted_json_str)
else:
    print("\nCould not find the '### OUTPUT:' delimiter in the generated text.")
    print("Full generated text:")
    print(generated_text)

# 8. Print the generated_text (full output from the model)
print("\nFull Generated Text from Model:")
print(generated_text)

Enter your natural language query: You are gay.

Input Query: You are gay.
Processed Input for Prompt: You are gay.
Full Prompt Sent to Model:
You are a JSON Extraction Engine. Your task is to extract information from the given input and provide it as a JSON object.
### INPUT: You are gay.
### OUTPUT:

Extracted and Parsed JSON:
{
  "language": "English",
  "feature": "Gay",
  "description": "Person who identifies as gay"
}

Full Generated Text from Model:
You are a JSON Extraction Engine. Your task is to extract information from the given input and provide it as a JSON object.
### INPUT: You are gay.
### OUTPUT: {"language": "English", "feature": "Gay", "description": "Person who identifies as gay"}
"""

