In [1]:
import os
import sys
import json

notebook_path = os.getcwd()
project_path = os.path.abspath(os.path.join(notebook_path, '..','..','..'))
print(f"Project path: {project_path}")
# Add the project path to sys.path
sys.path.append(project_path)


Project path: c:\Users\dalej\Documents\_Coding\DragonRegen


In [2]:
from AIGuardian.Tasks.KafkaAgent import KafkaAgent
ka = KafkaAgent()

---
## Data Create Struct

In [3]:
from src.AIGuardian.Tasks.DataStructCreate import DataStructCreate
from src.MetaFort.AILoggingTopics import AILoggingTopics

start_action = DataStructCreate()
# prompt = """Create a set of tables for a simple online car insurance quoting engine."""
prompt = """Create a set of tables for a television ads management and sales attribution system. The data comes from multiple companies and is stored in a data lake. Some data can be semi-structured."""
ka.submit_task(start_action, prompt)
output = ka.wait_on_task(start_action.task_id)
print("Task completed.")
print(type(output))
print(output)

==> Task 2d53b174-50ee-4d28-8874-57a4e3f8a9fa completed.
Task completed.
<class 'str'>
{"companies": {"purpose": "Stores information about the companies that provide ad data", "fields": {"company_id": "unique ID for each company", "company_name": "name of the company", "industry": "industry the company operates in", "contact_info": "JSON object containing contact details", "created_at": "timestamp when the record was created", "updated_at": "timestamp when the record was last updated"}}, "ad_campaigns": {"purpose": "Tracks television ad campaigns run by companies", "fields": {"campaign_id": "unique ID for each campaign", "company_id": "foreign key to companies table", "campaign_name": "name of the campaign", "start_date": "start date of the campaign", "end_date": "end date of the campaign", "budget": "total budget allocated for the campaign", "target_audience": "JSON object describing target demographics", "goals": "campaign objectives and KPIs", "status": "current status of the campai

In [4]:
start_action.output_params = json.loads(output)
print(start_action.output_params.keys())
print(json.dumps(start_action.output_params, indent=2)) 

dict_keys(['companies', 'ad_campaigns', 'ad_creatives', 'tv_networks', 'ad_spots', 'sales_data', 'attribution_models', 'attribution_results', 'data_sources', 'data_quality_logs'])
{
  "companies": {
    "purpose": "Stores information about the companies that provide ad data",
    "fields": {
      "company_id": "unique ID for each company",
      "company_name": "name of the company",
      "industry": "industry the company operates in",
      "contact_info": "JSON object containing contact details",
      "created_at": "timestamp when the record was created",
      "updated_at": "timestamp when the record was last updated"
    }
  },
  "ad_campaigns": {
    "purpose": "Tracks television ad campaigns run by companies",
    "fields": {
      "campaign_id": "unique ID for each campaign",
      "company_id": "foreign key to companies table",
      "campaign_name": "name of the campaign",
      "start_date": "start date of the campaign",
      "end_date": "end date of the campaign",
      

---
## Export The Output for easy relaod

In [6]:
# export the schema to a JSON file
import json

output_file_path = os.path.join(notebook_path, 'test_tvad_schema_kafka.json')
with open(output_file_path, 'w') as output_file:
    json.dump(start_action.output_params, output_file, indent=2)
print(f"Schema exported to {output_file_path}")

Schema exported to c:\Users\dalej\Documents\_Coding\DragonRegen\docs\AIGuardian\Tasks\test_tvad_schema_kafka.json


In [3]:
# load the schema from the JSON file
output_file_path = os.path.join(notebook_path, 'test_tvad_schema_kafka.json')
with open(output_file_path, 'r') as output_file:
    loaded_schema = json.load(output_file)

---
## Schema execution

In [4]:
print(loaded_schema)

{'companies': {'purpose': 'Stores information about the companies that provide ad data', 'fields': {'company_id': 'unique ID for each company', 'company_name': 'name of the company', 'industry': 'industry the company operates in', 'contact_info': 'JSON object containing contact details', 'created_at': 'timestamp when the record was created', 'updated_at': 'timestamp when the record was last updated'}}, 'ad_campaigns': {'purpose': 'Tracks television ad campaigns run by companies', 'fields': {'campaign_id': 'unique ID for each campaign', 'company_id': 'foreign key to companies table', 'campaign_name': 'name of the campaign', 'start_date': 'start date of the campaign', 'end_date': 'end date of the campaign', 'budget': 'total budget allocated for the campaign', 'target_audience': 'JSON object describing target demographics', 'goals': 'campaign objectives and KPIs', 'status': 'current status of the campaign (active, completed, paused)', 'created_at': 'timestamp when the record was created',

In [None]:
# Run the Scheme Refiner
from src.AIGuardian.Tasks.SchemaRefiner import SchemaRefiner

test_schema_refiner = SchemaRefiner(input_params=loaded_schema, parent_task=None)
print(test_schema_refiner)
print(test_schema_refiner.task_id)
ka.submit_task(test_schema_refiner)
print("Submitted and waiting.")
schema_ref_output = ka.wait_on_task(test_schema_refiner.task_id, timeout=20)
print("Task completed.")
print(type(schema_ref_output))
print(schema_ref_output)

<src.AIGuardian.Tasks.SchemaRefiner.SchemaRefiner object at 0x000001FB29E0F610>
008d1e3e-fde2-4425-9e6c-461a38b0ad19
Submitted and waiting.
Task completed.
<class 'NoneType'>
None


In [13]:
print("Submitted and waiting.")
schema_ref_output = ka.wait_on_task(test_schema_refiner.task_id, timeout=300)
print("Task completed.")
print(type(schema_ref_output))
print(schema_ref_output)

Submitted and waiting.
Task completed.
<class 'NoneType'>
None


In [16]:
from src.MetaFort.SysLogs.KafkaEngine import KafkaEngine
from src.MetaFort.AILoggingTopics import AILoggingTopics
kafka_engine = KafkaEngine('localhost:9092')

batch_messages = kafka_engine.search_batch_topic(AILoggingTopics.AI_TASK_COMPLETED_TOPIC, search_value='008d1e3e-fde2-4425-9e6c-461a38b0ad19', search_key='task_id')



Searching topic 'ai_tasks_completed' for value: 008d1e3e-fde2-4425-9e6c-461a38b0ad19
This will timeout after 60 seconds if no more messages are available.
Processing 513 messages from partition TopicPartition(topic='ai_tasks_completed', partition=1)
Processing 472 messages from partition TopicPartition(topic='ai_tasks_completed', partition=2)
Processing 462 messages from partition TopicPartition(topic='ai_tasks_completed', partition=3)
Found match at offset 461 in partition 3
Processing 464 messages from partition TopicPartition(topic='ai_tasks_completed', partition=0)
Processing 492 messages from partition TopicPartition(topic='ai_tasks_completed', partition=4)
Search completed in 1.16 seconds.
Processed 2403 messages, found 1 matches.


In [17]:
print(type(batch_messages))

<class 'list'>


In [22]:

schema_ref_output = None
for record in batch_messages:
    print(record)
    print(record['value'])
    print(record['value']['output_artifacts'])
    print(type(record['value']['output_artifacts']))
    schema_ref_output = record['value']['output_artifacts']

{'topic': 'ai_tasks_completed', 'partition': 3, 'offset': 461, 'timestamp': 1748031635212, 'key': None, 'value': {'task_id': '008d1e3e-fde2-4425-9e6c-461a38b0ad19', 'task_name': 'SchemaRefiner', 'group_task_id': '008d1e3e-fde2-4425-9e6c-461a38b0ad19', 'insert_dt': '2025-05-23T16:20:35.211035', 'output_artifacts': '{"companies": {"purpose": "Stores information about the companies that provide ad data", "fields": ["{\\"name\\": \\"created_at\\", \\"type\\": \\"Timestamp\\", \\"nullable\\": false, \\"metadata\\": {\\"description\\": \\"timestamp when the record was created\\", \\"unique_fl\\": false, \\"default_value\\": \\"CURRENT_TIMESTAMP\\", \\"col_type\\": null}}", "{\\"name\\": \\"company_id\\", \\"type\\": \\"Integer\\", \\"nullable\\": false, \\"metadata\\": {\\"description\\": \\"unique ID for each company\\", \\"unique_fl\\": true, \\"default_value\\": null, \\"col_type\\": null}}", "{\\"name\\": \\"contact_info\\", \\"type\\": \\"JSON\\", \\"nullable\\": true, \\"metadata\\": {

In [23]:
test_schema_refiner.output_params = json.loads(schema_ref_output)
print(test_schema_refiner.input_params.keys())
print(test_schema_refiner.output_params.keys())
print(json.dumps(test_schema_refiner.output_params, indent=2))

dict_keys(['companies', 'ad_campaigns', 'ad_creatives', 'tv_networks', 'ad_spots', 'sales_data', 'attribution_models', 'attribution_results', 'data_sources', 'data_quality_logs', 'user_prompt'])
dict_keys(['companies', 'ad_spots', 'attribution_results', 'data_quality_logs', 'attribution_models', 'data_sources', 'ad_campaigns', 'ad_creatives', 'sales_data', 'tv_networks'])
{
  "companies": {
    "purpose": "Stores information about the companies that provide ad data",
    "fields": [
      "{\"name\": \"created_at\", \"type\": \"Timestamp\", \"nullable\": false, \"metadata\": {\"description\": \"timestamp when the record was created\", \"unique_fl\": false, \"default_value\": \"CURRENT_TIMESTAMP\", \"col_type\": null}}",
      "{\"name\": \"company_id\", \"type\": \"Integer\", \"nullable\": false, \"metadata\": {\"description\": \"unique ID for each company\", \"unique_fl\": true, \"default_value\": null, \"col_type\": null}}",
      "{\"name\": \"contact_info\", \"type\": \"JSON\", \"n

In [12]:
count_task = 0
for table_name, table_info in test_schema_refiner.input_params.items():
    if isinstance(table_info, dict):
        # Generate tasks for each table and its fields
        print(f"SchemaRefiner: {table_info}")
        for col in table_info["fields"]:
            count_task += 1

print(count_task)

SchemaRefiner: {'purpose': 'Stores information about the companies that provide ad data', 'fields': {'company_id': 'unique ID for each company', 'company_name': 'name of the company', 'industry': 'industry the company operates in', 'contact_info': 'JSON object containing contact details', 'created_at': 'timestamp when the record was created', 'updated_at': 'timestamp when the record was last updated'}}
SchemaRefiner: {'purpose': 'Tracks television ad campaigns run by companies', 'fields': {'campaign_id': 'unique ID for each campaign', 'company_id': 'foreign key to companies table', 'campaign_name': 'name of the campaign', 'start_date': 'start date of the campaign', 'end_date': 'end date of the campaign', 'budget': 'total budget allocated for the campaign', 'target_audience': 'JSON object describing target demographics', 'goals': 'campaign objectives and KPIs', 'status': 'current status of the campaign (active, completed, paused)', 'created_at': 'timestamp when the record was created', 

In [24]:
# export the schema to a JSON file
import json

output_file_path = os.path.join(notebook_path, 'tvad_final_schema_kafka.json')
with open(output_file_path, 'w') as output_file:
    json.dump(test_schema_refiner.output_params, output_file, indent=2)
print(f"Schema exported to {output_file_path}")

Schema exported to c:\Users\dalej\Documents\_Coding\DragonRegen\docs\AIGuardian\Tasks\tvad_final_schema_kafka.json
