## MySQL code

In [21]:
from src.mysql import get_all_table_names, get_table_definition, get_knowledge_base_schema_for_table, query_random_rows

import os

# Connection settings handed to the src.mysql helpers.
# Each value can be overridden through an environment variable so that
# credentials do not have to live in the notebook. The literals are only
# fallbacks for the local development database.
# NOTE(review): the fallback password is still committed here; drop the
# fallback (and rotate the password) once MYSQL_PASSWORD is set everywhere.
db_connection_params = {
    'host': os.environ.get('MYSQL_HOST', 'localhost'),
    'port': int(os.environ.get('MYSQL_PORT', '3306')),
    'user': os.environ.get('MYSQL_USER', 'daver'),
    'password': os.environ.get('MYSQL_PASSWORD', 'pizzatime'),
    'database': os.environ.get('MYSQL_DATABASE', 'daver_db'),
}

## Ollama Code

In [119]:
from ollama import ChatResponse
from ollama import Client


# Name of the model requested in every chat call.
model = "deepseek-r1:8b"

# Extra generation options forwarded with each chat call; currently empty.
# Example of a setting that could go here: "temperature": 0.2
options = {}

# Ollama client pointed at the server on the LAN.
# For a server on this machine use host='http://localhost:11434' instead.
client = Client(
    host='http://192.168.178.38:11434',
    headers={'Content-Type': 'application/json'},
)

In [120]:
# JSON Schema describing one column entry of the knowledge base.
# Only 'name' and 'context' are mandatory; 'default' may be a string or null.
_field_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "type": {"type": "string"},
        "nullable": {"type": "string"},
        "key": {"type": "string"},
        "default": {"type": ["string", "null"]},
        "extra": {"type": "string"},
        "context": {"type": "string"},
    },
    "required": ["name", "context"],
}

# JSON Schema for the whole reply: table name, table-level context and the
# list of column entries. It is passed as `format=` to the chat call.
schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "context": {"type": "string"},
        "fields": {"type": "array", "items": _field_schema},
    },
    "required": ["name", "context", "fields"],
}

In [121]:
# Minimal, fully filled-in knowledge-base document for a one-column table.
# It has the same shape as `schema` and is used as the worked example in the prompt.
demo_json = dict(
    name="demo",
    context="A one-column demo table.",
    fields=[
        dict(
            name="id",
            type="int",
            nullable="NO",
            key="PRI",
            default=None,
            extra="auto_increment",
            context="Primary identifier; monotonically increasing. Often only one digit",
        ),
    ],
)

In [122]:
# System prompt that frames the task for the model.
#
# Fixes compared with the previous version:
# * No trailing comma after the closing brace. The comma turned this into a
#   1-tuple containing the dict, so the list of chat messages started with a
#   tuple instead of a message dict.
# * Adjacent string literals are concatenated without a separator, so the
#   missing space ("knowledge base" + "about") and the missing newline
#   ("...another LLM." + "• Never...") are now written explicitly.
# * The rule list no longer forbids JSON, which contradicted the instruction
#   to return ONLY a JSON object.
# * Typo "Fill in you knowledge base" -> "your".
system_message = {
    "role": "system",
    "content": (
        "You are a data analyst.\n"
        "Your ONLY task is to write a concise, human-readable knowledge base "
        "about the table represented by the columns and rows that you are shown.\n"
        "You will be given a sample of rows from the table to extract insights from. Don't make any comment about the number of rows you were given.\n"
        "When making numerical statements, consider this to be a sample of the entire table, therefore any statements you make can only be relative to the sample.\n"
        "Your output should be readable as knowledge base by another LLM.\n"
        "• Never output code or markdown fences.\n"
        "• Never describe the JSON format itself — only the database facts.\n"
        "• Some columns might be in Binary Format, or other formats that are not human readable. If you can distinguish the format, describe it, otherwise ignore it.\n"
        "If you are tempted to write code, STOP.\n"
        "Return ONLY a JSON object that matches the given schema. Fill in your knowledge base about the table in the top level 'context' field.\n"
        "Fill in the 'context' field in each of the 'fields' objects with your knowledge base about the column.\n"
        "Do not leave the 'context' field empty!"
    ),
}

In [123]:
# Follow-up instruction injected into the conversation when a reply comes back
# with an empty 'context'. Each line of the text ends with a newline.
_reprimand_lines = (
    "⚠️  Your previous reply violated the formatting rules.\n",
    "You MUST follow these constraints:\n",
    "• Do not leave the 'context' field empty. You must fill it with your knowledge base about the table and the columns.\n",
)

reprimand = dict(role="system", content="".join(_reprimand_lines))

In [None]:
# Two worked input/output pairs in the knowledge-base format.
# The *_user dicts are the inputs (every 'context' blank); the *_assistant
# dicts are the same documents with every 'context' filled in.
#
# The missing defaults are written as Python `None`. The previous version used
# the JavaScript/JSON literal `null`, which is an undefined name in Python and
# made this cell fail with NameError.

# --- Example A: book inventory -------------------------------------------------
example_A_user = {
  "name": "book_inventory",
  "context": "",
  "fields": [
    {"name": "isbn",       "type": "char(13)",     "nullable": "NO",  "key": "PRI", "default": None, "extra": "",            "context": ""},
    {"name": "title",      "type": "varchar(255)", "nullable": "NO",  "key": "",    "default": None, "extra": "",            "context": ""},
    {"name": "author",     "type": "varchar(255)", "nullable": "YES", "key": "",    "default": None, "extra": "",            "context": ""},
    {"name": "pub_year",   "type": "year",         "nullable": "YES", "key": "",    "default": None, "extra": "",            "context": ""},
    {"name": "in_stock",   "type": "int",          "nullable": "NO",  "key": "",    "default": 0,    "extra": "",            "context": ""}
  ]
}
# The table-level context no longer claims "titles are unique": that
# contradicted the 'title' column note, which says duplicates are possible.
example_A_assistant = {
  "name": "book_inventory",
  "context": "One row per title kept in the warehouse. Primary key is the 13-digit ISBN.",
  "fields": [
    {"name": "isbn",       "type": "char(13)",     "nullable": "NO",  "key": "PRI", "default": None, "extra": "", "context": "Global book identifier used as the primary key."},
    {"name": "title",      "type": "varchar(255)", "nullable": "NO",  "key": "",    "default": None, "extra": "", "context": "Human-readable title; duplicates possible across editions."},
    {"name": "author",     "type": "varchar(255)", "nullable": "YES", "key": "",    "default": None, "extra": "", "context": "Main author(s); blank for works in the public domain."},
    {"name": "pub_year",   "type": "year",         "nullable": "YES", "key": "",    "default": None, "extra": "", "context": "Four-digit publication year; helps with edition matching."},
    {"name": "in_stock",   "type": "int",          "nullable": "NO",  "key": "",    "default": 0,    "extra": "", "context": "Physical copies currently available for sale."}
  ]
}

# --- Example B: IoT sensor readings --------------------------------------------
example_B_user = {
  "name": "iot_sensor_data",
  "context": "",
  "fields": [
    {"name": "sensor_id",   "type": "varchar(40)", "nullable": "NO",  "key": "PRI", "default": None, "extra": "", "context": ""},
    {"name": "ts_utc",      "type": "timestamp",   "nullable": "NO",  "key": "PRI", "default": None, "extra": "", "context": ""},
    {"name": "temperature", "type": "float",       "nullable": "YES", "key": "",    "default": None, "extra": "", "context": ""},
    {"name": "humidity",    "type": "float",       "nullable": "YES", "key": "",    "default": None, "extra": "", "context": ""},
    {"name": "battery_mv",  "type": "smallint",    "nullable": "YES", "key": "",    "default": None, "extra": "", "context": ""}
  ]
}
example_B_assistant = {
  "name": "iot_sensor_data",
  "context": "Time-series readings streamed from edge devices; composite primary key (sensor_id, ts_utc).",
  "fields": [
    {"name": "sensor_id",   "type": "varchar(40)", "nullable": "NO",  "key": "PRI", "default": None, "extra": "", "context": "UUID assigned to each deployed sensor."},
    {"name": "ts_utc",      "type": "timestamp",   "nullable": "NO",  "key": "PRI", "default": None, "extra": "", "context": "ISO timestamp in UTC when the packet was recorded."},
    {"name": "temperature", "type": "float",       "nullable": "YES", "key": "",    "default": None, "extra": "", "context": "Degrees Celsius; may be NULL if probe is faulty."},
    {"name": "humidity",    "type": "float",       "nullable": "YES", "key": "",    "default": None, "extra": "", "context": "Relative humidity percentage."},
    {"name": "battery_mv",  "type": "smallint",    "nullable": "YES", "key": "",    "default": None, "extra": "", "context": "Remaining battery in millivolts; values <3100 signal low-power state."}
  ]
}

In [124]:
import json

# Input side of the demo: the same document as `demo_json`, but with the
# table-level and column-level 'context' blanked out.
# Previously the user turn and the assistant turn carried the identical JSON,
# so the demo showed the model "echo the input" rather than "fill in context".
# NOTE(review): assumes the real request also arrives with empty 'context'
# values (as the example_*_user dicts in this notebook do) - confirm against
# get_knowledge_base_schema_for_table.
_demo_input = {
    **demo_json,
    "context": "",
    "fields": [{**field, "context": ""} for field in demo_json["fields"]],
}

few_show_examples = [
    # Tiny demo so the model sees the pattern: blank contexts in, filled contexts out
    {"role": "user",      "content": "Describe this table:\n" + json.dumps(_demo_input)},
    {"role": "assistant", "content": json.dumps(demo_json)},   # ✔ good answer
]

In [125]:
# Knowledge-base skeleton for the 'person' table; it is JSON-serialised into the prompt further down.
person_schema = get_knowledge_base_schema_for_table(
    'person',
    db_connection_params,
)

In [126]:
# Random sample of rows from 'person' (100 requested) used as evidence for the model.
person_rows_df = query_random_rows(
    'person',
    100,
    db_connection_params,
)

Retrieved 100 rows from person


In [129]:
# Column headers of the sampled rows, in frame order.
columns = list(person_rows_df.columns)

# Send at most 100 rows so the prompt does not overflow the context window.
max_rows = min(100, len(person_rows_df))
rows = [person_rows_df.iloc[row_idx].tolist() for row_idx in range(max_rows)]

# Conversation sent to the model, assembled in order:
# system prompt, few-shot demo, column-name preamble, schema, then one user message per row.
messages = [system_message]
messages.extend(few_show_examples)
messages.append(
    {
        "role": "assistant",
        "content": (
            f"Knowledge base for table {'person'}:\n"
            "• Analyzing table structure and data patterns.\n"
            f"These are the column names: {columns}\n"
        ),
    }
)
messages.append(
    {
        "role": "user",
        "content": "This is the table schema definition:\n" + json.dumps(person_schema),
    }
)
messages.extend(
    {"role": "user", "content": f"This is row {row_no}: {row_values}"}
    for row_no, row_values in enumerate(rows)
)

def _is_blank(text) -> bool:
    """Return True when `text` is not a string or holds only whitespace."""
    return not isinstance(text, str) or not text.strip()


def _reply_lacks_context(reply: dict) -> bool:
    """Return True when the table-level or any column-level 'context' is blank."""
    if _is_blank(reply.get('context')):
        return True
    return any(_is_blank(field.get('context')) for field in reply.get('fields', []))


# Ask the model for the knowledge base, retrying up to 10 times while any
# 'context' comes back blank. The loop stops at the first complete reply;
# `response` / `response_json` then hold that reply.
for i in range(10):
    print(f"Iteration {i}")
    response = client.chat(
        model=model,
        messages=messages,
        stream=False,
        options=options,
        format=schema,
    )

    response_json = json.loads(response.message.content)
    print(response_json)

    if not _reply_lacks_context(response_json):
        break

    # Extend the transcript exactly ONCE per rejected reply. The previous
    # version rebuilt it once per empty column, stacking several reprimands and
    # several copies of the same reply for a single bad answer.
    # Layout: first message, reprimand, the rest of the conversation, the
    # rejected reply, then the last message again (presumably so the
    # conversation does not end on the assistant turn).
    messages = [
        messages[0],
        reprimand,
        *messages[1:],
        {"role": "assistant", "content": response.message.content},
        messages[-1],
    ]
else:
    # Every attempt was rejected; say so rather than ending silently with an
    # incomplete `response_json`.
    print("No reply with every 'context' filled in after 10 attempts; keeping the last reply.")


Iteration 0
{'name': 'person', 'context': '', 'fields': [{'name': 'id', 'context': '', 'type': 'int', 'nullable': 'NO', 'key': 'PRI', 'default': None, 'extra': 'auto_increment'}, {'name': 'full_name', 'context': '', 'type': 'varchar(500)', 'nullable': 'YES', 'key': '', 'default': None, 'extra': ''}, {'name': 'gender', 'context': '', 'type': 'varchar(10)', 'nullable': 'YES', 'key': '', 'default': None, 'extra': ''}, {'name': 'height', 'context': '', 'type': 'int', 'nullable': 'YES', 'key': '', 'default': None, 'extra': ''}, {'name': 'weight', 'context': '', 'type': 'int', 'nullable': 'YES', 'key': '', 'default': None, 'extra': ''}]}
Iteration 1
{'name': 'person', 'context': '', 'fields': [{'name': 'id', 'context': '', 'type': 'int', 'nullable': 'NO', 'key': 'PRI', 'default': None, 'extra': 'auto_increment'}, {'name': 'full_name', 'context': '', 'type': 'varchar(500)', 'nullable': 'YES', 'key': '', 'default': None, 'extra': ''}, {'name': 'gender', 'context': '', 'type': 'varchar(10)', 'n

In [115]:
# Show the raw text of the most recent model reply (the unparsed JSON string).
# NOTE(review): this cell's execution count (115) is lower than the cells above it,
# so the output shown below may come from an earlier run - re-run after the loop cell.
print(response.message.content)

{

"name": "person",
"context": "",
"fields": [
    {
        "name": "id",
        "context": "Primary key; monotonically increasing integer.",
        "type": "int"
    },
    {
        "name": "full_name", "context": "The full name of the person, which may include middle names or nicknames in parentheses. Some entries have multiple parts due to aliases or titles."
    },
    {
        "name": "gender",
        "context": "",
        "type": "varchar(10)"
    },
    {
        "name": "height", "context": "Height of the person, stored as an integer representing centimeters. Some entries are zero which might indicate missing data.",
        "type": "int"
    },
    {
        "name": "weight", "context": "Weight of the person, stored as an integer representing kilograms. Some entries are zero which might indicate missing or placeholder data.",
        "type": "int"
    }
]
}

  
