In [50]:
!pip install psycopg
!pip install ollama
!pip install tqdm
!pip install jupyter ipywidgets

Collecting psycopg
  Using cached psycopg-3.2.9-py3-none-any.whl.metadata (4.5 kB)
Using cached psycopg-3.2.9-py3-none-any.whl (202 kB)
Installing collected packages: psycopg
Successfully installed psycopg-3.2.9
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1
Collecting jupyter
  Using cached jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting ipywidgets
  Using cached ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting notebook (from jupyter)
  Using cached notebook-7.4.3-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-console (from jupyter)
  Using cached jupyter_console-6.6.3-py3-none-any.whl.metadata (5.8 kB)
Collecting nbconvert (from jupyter)
  Using cached nbconvert-7.16.6-py3-none-any.whl.metadata (8.5 kB)
Collecting ipykernel (from jupyter)
  Using cached ipykernel-6.29.5-py3-none-any.whl.metadata (6.3

In [68]:
# connect to the database
import psycopg
from tqdm.notebook import tqdm

conn = psycopg.connect(
    host="localhost",
    user="postgres",
    password="pasta",
    dbname="production",
    port=5432
)

# create a cursor
cursor = conn.cursor()


response =cursor.execute("""
            SELECT table_name 
            FROM information_schema.tables 
            WHERE table_schema = 'public' 
            AND table_type = 'BASE TABLE'
        """)

tables =response.fetchall()

results = {}

# For each table, get up to 100 random entries
limit = 100
for table in tqdm(tables):
    table_name = table[0]
    print(f"Querying table: {table_name}")
    
    try:
        # Get random entries from the table
        cursor.execute(f"""
            SELECT * FROM {table_name} 
            ORDER BY RANDOM() 
            LIMIT {limit}
        """)
        
        # Get column names
        column_names = [desc[0] for desc in cursor.description]
        
        # Fetch the data
        rows = cursor.fetchall()
        
        # Store results
        results[table_name] = {
            'columns': column_names,
            'data': rows,
            'row_count': len(rows)
        }
        
        print(f"  Retrieved {len(rows)} rows from {table_name}")
        
    except Exception as e:
        print(f"  Error querying table {table_name}: {e}")
        continue

  0%|          | 0/67 [00:00<?, ?it/s]

Querying table: databasechangelog
  Retrieved 100 rows from databasechangelog
Querying table: spatial_ref_sys
  Retrieved 100 rows from spatial_ref_sys
Querying table: databasechangeloglock
  Retrieved 1 rows from databasechangeloglock
Querying table: vin_equipment
  Retrieved 100 rows from vin_equipment
Querying table: vin_app_version
  Retrieved 2 rows from vin_app_version
Querying table: vin_address
  Retrieved 5 rows from vin_address
Querying table: vin_equipment_task_work_log
  Retrieved 100 rows from vin_equipment_task_work_log
Querying table: vin_equipment_attachment
  Retrieved 0 rows from vin_equipment_attachment
Querying table: vin_gps_tracking_segment
  Retrieved 100 rows from vin_gps_tracking_segment
Querying table: vin_company_module_config
  Retrieved 100 rows from vin_company_module_config
Querying table: vin_company
  Retrieved 83 rows from vin_company
Querying table: vin_dosage_rule
  Retrieved 100 rows from vin_dosage_rule
Querying table: vin_generic_field_value
  Ret

In [69]:
# get first value of results
table_names = list(results.keys())
r1 = results[table_names[0]]
r2 = results['vin_company']

columns = r2['columns']
rows = r2['data']

In [73]:
columns

['id',
 'deleted',
 'identifier',
 'name',
 'registration_number',
 'address_id',
 'settings_id',
 'location']

In [71]:
from ollama import ChatResponse
from ollama import Client

client = Client(
  host='http://localhost:11434',
  headers={'Content-Type': 'application/json'},
)


messages = [
    # 1) Hard rules (this is what stops it from spewing code)
    {
        "role": "system",
        "content": (
            "You are a data analyst.\n"
            "Your ONLY task is to write a concise, human-readable knowledge base "
            "about the table represented by the columns and rows that you are shown.\n"
            "You will be given a sample of 100 rows from the table to extract insights from. Dont make any comment about the number of rows you were given.\n"
            "WHen making numerical statements, consider this to be a sample of the entire table.\n"
            "Your output should be readable as knowledge base by another LLM."
            "• Never output code, markdown fences, or JSON.\n"
            "• Never describe the JSON format itself — only the database facts.\n"
            "• Some columns might be in Binary Format, or other formats that are not human readable. If you can distinguish the format, describe it, otherwise ignore it.\n"
            "If you are tempted to write code, STOP."
        )
    },
    {
        "role": "assistant",
        "content": (
            "Knowledge base:\n"
            "• The table has a column **id** (uuid primary key).\n"
            "• Each row represents a single record.\n"
        )
    },
    # 3) Your real sample — the model must now produce the KB for it
    {
        "role": "user",
        "content": f"These are the column names: {columns}"
    },
]

messages.extend([{
        "role": "user",
        "content": f"This is the {i}th row: {row}"
    } for i, row in enumerate(rows)])
response: ChatResponse = client.chat(
    # model="llama3.2",
    model="deepseek-r1:8b",
    messages=messages,
    stream=False,
    # format={"type": "string"},
    options={
        "temperature": 0.2,
        # "top_k": 10,
        # "top_p": 0.8,
        # "max_tokens": 1000,
        # "stop": ["```"]
    },
)

# print message with max line length, add newlines every 100 characters
content = response.message.content
lines = []
current_line = ""

for word in content.split():
    if len(current_line + " " + word) > 100:
        lines.append(current_line)
        current_line = word
    else:
        current_line = (current_line + " " + word).strip()

if current_line:
    lines.append(current_line)

print("\n".join(lines))

<think> Okay, let's start by understanding the user's query. They provided a dataset of 82 rows from
what seems to be a database or log file containing UUIDs and some structured data about vineyards in
Austria. The assistant's task is to create a thought process that mimics how they would approach
analyzing this data. First, I need to parse through the given examples. Each row has eight elements:
two UUIDs, then maybe names, address parts, phone numbers, and coordinates. Wait, actually looking
at the data again, it seems like each entry is structured with a tuple of values. The first element
is always a UUIDv4 for the winery ID, followed by another UUIDv4 as an external reference (like a
contact or location). Then there are name fields that sometimes have missing values represented as
None or empty strings. Hmm, I notice some patterns here. There's a mix of German and Austrian names
in the names field, which makes sense given the context—likely related to wineries in Austria. The
addre