In [6]:
!pip install ollama
!pip install tqdm
!pip install jupyter ipywidgets
!pip install mysql-connector-python
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl (5.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m76.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
[2K   [90m━━━━━━━━━━━━━━

## MySQL code

In [7]:
import os
import random
import textwrap

import mysql.connector
import pandas as pd

# Smoke-test the database connection: open it, report the server version, then close it.
# Each setting can be overridden through a MYSQL_* environment variable so that real
# credentials never have to be written into the notebook; the fallbacks are the local
# development values this notebook was written against.
cursor = None
connection = None

try:
    connection = mysql.connector.connect(
        host=os.environ.get('MYSQL_HOST', 'localhost'),
        port=int(os.environ.get('MYSQL_PORT', 3306)),
        user=os.environ.get('MYSQL_USER', 'daver'),
        password=os.environ.get('MYSQL_PASSWORD', 'pizzatime'),
        database=os.environ.get('MYSQL_DATABASE', 'daver_db'),
    )
    if connection.is_connected():
        db_info = connection.server_info
        print("Connected to MySQL Server version ", db_info)

    cursor = connection.cursor(dictionary=True)

except mysql.connector.Error as err:
    print("Error while connecting to MySQL", err)
finally:
    # Release the cursor and the connection even when connecting failed part-way.
    if cursor:
        cursor.close()
    if connection and connection.is_connected():
        connection.close()
        print("MySQL connection is closed")

Connected to MySQL Server version  8.4.5
MySQL connection is closed


In [8]:
def get_all_table_names(connection_params):
    """
    List the tables of the database described by ``connection_params``.

    Opens a short-lived connection, runs ``SHOW TABLES`` and closes the
    connection again before returning.

    Args:
        connection_params (dict): Keyword arguments forwarded to
            ``mysql.connector.connect``

    Returns:
        list: Table names; an empty list when a MySQL error occurs (the error
        is printed rather than raised)
    """
    conn = None
    cur = None

    try:
        conn = mysql.connector.connect(**connection_params)
        cur = conn.cursor()
        cur.execute("SHOW TABLES")
        # Each result row is a tuple whose first element is the table name.
        return [row[0] for row in cur.fetchall()]
    except mysql.connector.Error as err:
        print(f"Error getting table names: {err}")
        return []
    finally:
        # Runs on every path above, so nothing is left open.
        if cur:
            cur.close()
        if conn and conn.is_connected():
            conn.close()

In [9]:
def query_random_rows(table_name, num_rows, connection_params):
    """
    Query up to ``num_rows`` random rows from a specified table.

    Args:
        table_name (str): Name of the table to query. It is quoted as a MySQL
            identifier (embedded backticks are doubled), so unusual names
            cannot break out of the statement.
        num_rows (int): Number of random rows to retrieve. Values that cannot
            be converted to an integer, or that are not positive, yield an
            empty DataFrame without contacting the database.
        connection_params (dict): Keyword arguments forwarded to
            ``mysql.connector.connect``

    Returns:
        pandas.DataFrame: DataFrame with results, all values converted with
        ``astype(str)`` (so missing values are rendered as text as well).
        Empty when the table is empty, nothing was requested, or a MySQL
        error occurred (the error is printed rather than raised).
    """
    # Validate the row count before opening a connection: it is interpolated
    # into the LIMIT clause, so it must be a plain positive integer.
    try:
        limit = int(num_rows)
    except (TypeError, ValueError):
        print(f"Invalid num_rows for table {table_name}: {num_rows!r}")
        return pd.DataFrame()

    if limit <= 0:
        print(f"No data retrieved from {table_name}")
        return pd.DataFrame()

    # Identifiers cannot be sent as bound parameters; MySQL's rule for a
    # backtick-quoted identifier is to double any backtick inside it.
    quoted_table = "`" + str(table_name).replace("`", "``") + "`"

    connection = None
    cursor = None
    df = pd.DataFrame()

    try:
        connection = mysql.connector.connect(**connection_params)
        cursor = connection.cursor(dictionary=True)

        # First check if table exists and has data
        cursor.execute(f"SELECT COUNT(*) as count FROM {quoted_table}")
        row_count = cursor.fetchone()['count']

        if row_count == 0:
            print(f"Table {table_name} is empty")
            return pd.DataFrame()

        # Limit the request to the rows that actually exist
        actual_rows = min(limit, row_count)

        # ORDER BY RAND() shuffles the whole table server-side before LIMIT applies
        cursor.execute(
            f"SELECT * FROM {quoted_table} ORDER BY RAND() LIMIT {actual_rows}"
        )
        results = cursor.fetchall()

        if results:
            # The dictionary cursor returns one dict per row -> one column per key
            df = pd.DataFrame(results).astype(str)
            print(f"Retrieved {len(df)} rows from {table_name}")
        else:
            print(f"No data retrieved from {table_name}")

    except mysql.connector.Error as err:
        print(f"Error querying table {table_name}: {err}")
    finally:
        if cursor:
            cursor.close()
        if connection and connection.is_connected():
            connection.close()

    return df

In [10]:
def collect_all_table_data(num_rows_per_table=100, connection_params=None):
    """
    Iterate through all tables and collect random rows from each

    Args:
        num_rows_per_table (int): Number of random rows to collect from each table
        connection_params (dict, optional): Keyword arguments for
            ``mysql.connector.connect``. When omitted, the settings are read
            from the MYSQL_HOST, MYSQL_PORT, MYSQL_USER, MYSQL_PASSWORD and
            MYSQL_DATABASE environment variables, falling back to the local
            development values this notebook was written against.

    Returns:
        dict: Dictionary with table names as keys and DataFrames as values
    """
    if connection_params is None:
        # Environment overrides keep real credentials out of the notebook.
        connection_params = {
            'host': os.environ.get('MYSQL_HOST', 'localhost'),
            'port': int(os.environ.get('MYSQL_PORT', 3306)),
            'user': os.environ.get('MYSQL_USER', 'daver'),
            'password': os.environ.get('MYSQL_PASSWORD', 'pizzatime'),
            'database': os.environ.get('MYSQL_DATABASE', 'daver_db'),
        }

    # Get all table names
    table_names = get_all_table_names(connection_params)
    print(f"Found {len(table_names)} tables in database")

    # One sampled DataFrame per table, in the order the tables were listed
    results = {}
    for table_name in table_names:
        print(f"\nQuerying table: {table_name}")
        results[table_name] = query_random_rows(
            table_name, num_rows_per_table, connection_params
        )

    return results

In [11]:
# Sample up to 100 random rows from every table; the result maps table name -> DataFrame.
all_table_data = collect_all_table_data(num_rows_per_table=100)

Found 11 tables in database

Querying table: city
Retrieved 42 rows from city

Querying table: competitor_event
Retrieved 100 rows from competitor_event

Querying table: event
Retrieved 100 rows from event

Querying table: games
Retrieved 51 rows from games

Querying table: games_city
Retrieved 52 rows from games_city

Querying table: games_competitor
Retrieved 100 rows from games_competitor

Querying table: medal
Retrieved 4 rows from medal

Querying table: noc_region
Retrieved 100 rows from noc_region

Querying table: person
Retrieved 100 rows from person

Querying table: person_region
Retrieved 100 rows from person_region

Querying table: sport
Retrieved 66 rows from sport


In [15]:
# Preview each sampled table: a heading line followed by its first five rows.
for table_name in all_table_data:
    df = all_table_data[table_name]
    print(f"\nTable: {table_name}")
    display(df.head())
    print("\n")



Table: city


Unnamed: 0,id,city_name
0,38,Sapporo
1,17,Torino
2,21,Squaw Valley
3,2,London
4,30,Melbourne





Table: competitor_event


Unnamed: 0,event_id,competitor_id,medal_id
0,162,319,4
1,70,87449,4
2,11,12939,4
3,437,88381,4
4,265,98365,4





Table: event


Unnamed: 0,id,sport_id,event_name
0,298,2,Alpine Skiing Women's Combined
1,606,26,Freestyle Skiing Men's Ski Cross
2,67,6,Athletics Men's 4 x 100 metres Relay
3,627,48,"Shooting Men's Small Bore-Rifle, Standing, 50 ..."
4,496,18,Cross Country Skiing Men's 10/10 kilometres Pu...





Table: games


Unnamed: 0,id,games_year,games_name,season
0,32,1956,1956 Winter,Winter
1,24,1964,1964 Winter,Winter
2,49,1932,1932 Winter,Winter
3,29,1988,1988 Summer,Summer
4,11,1980,1980 Winter,Winter





Table: games_city


Unnamed: 0,games_id,city_id
0,41,38
1,46,41
2,43,39
3,25,23
4,44,40





Table: games_competitor


Unnamed: 0,id,games_id,person_id,age
0,127536,37,96177,18
1,83755,12,63347,24
2,179108,20,134766,27
3,120826,38,91212,27
4,157450,1,118503,29





Table: medal


Unnamed: 0,id,medal_name
0,4,
1,1,Gold
2,2,Silver
3,3,Bronze





Table: noc_region


Unnamed: 0,id,noc,region_name
0,102,IVB,"Virgin Islands, British"
1,149,NGR,Nigeria
2,223,VNM,Vietnam (pre)
3,189,STP,Sao Tome and Principe
4,214,UNK,Unknown





Table: person


Unnamed: 0,id,full_name,gender,height,weight
0,89337,"Thomas Charles ""Tom"""" O'Rourke""",M,0,0
1,80869,Musa Khozh-Akhmatovich Mogushkov,M,170,66
2,135015,"John Luther Zimmerman, IV",M,183,83
3,50180,Tomas Georg Hoszek,M,194,85
4,47252,Christian Hein,M,183,78





Table: person_region


Unnamed: 0,person_id,region_id
0,74996,64
1,40066,193
2,2677,155
3,42563,186
4,37930,70





Table: sport


Unnamed: 0,id,sport_name
0,24,Figure Skating
1,56,Synchronized Swimming
2,59,Tennis
3,7,Badminton
4,3,Alpinism






In [71]:
from ollama import ChatResponse
from ollama import Client

# Client for the Ollama server at localhost:11434; requests are sent with a JSON content type.
client = Client(
    host="http://localhost:11434",
    headers={"Content-Type": "application/json"},
)


# NOTE(review): `columns` and `rows` are not defined anywhere in the visible part of this
# notebook; they must already exist in the kernel when this cell runs. Confirm which cell
# creates them so the notebook survives Restart & Run All.
messages = [
    # 1) Hard rules (this is what stops it from spewing code)
    {
        "role": "system",
        "content": (
            "You are a data analyst.\n"
            "Your ONLY task is to write a concise, human-readable knowledge base "
            "about the table represented by the columns and rows that you are shown.\n"
            "You will be given a sample of 100 rows from the table to extract insights from. Dont make any comment about the number of rows you were given.\n"
            "WHen making numerical statements, consider this to be a sample of the entire table.\n"
            # The trailing newline keeps the first bullet on its own line instead of
            # gluing it onto this sentence.
            "Your output should be readable as knowledge base by another LLM.\n"
            "• Never output code, markdown fences, or JSON.\n"
            "• Never describe the JSON format itself — only the database facts.\n"
            "• Some columns might be in Binary Format, or other formats that are not human readable. If you can distinguish the format, describe it, otherwise ignore it.\n"
            "If you are tempted to write code, STOP."
        )
    },
    # 2) Pre-written assistant turn (presumably an example of the expected answer format)
    {
        "role": "assistant",
        "content": (
            "Knowledge base:\n"
            "• The table has a column **id** (uuid primary key).\n"
            "• Each row represents a single record.\n"
        )
    },
    # 3) Your real sample — the model must now produce the KB for it
    {
        "role": "user",
        "content": f"These are the column names: {columns}"
    },
]

# Append one user message per sampled row, numbered from 0.
messages.extend([
    {
        "role": "user",
        "content": f"This is the {i}th row: {row}"
    }
    for i, row in enumerate(rows)
])
# Request a single, non-streamed completion for the conversation built above.
response: ChatResponse = client.chat(
    model="deepseek-r1:8b",
    messages=messages,
    stream=False,
    options={"temperature": 0.2},
)

# Print the reply wrapped to at most 100 characters per line.
# Every line of the reply is wrapped on its own so the model's paragraph and bullet
# breaks are kept (splitting the whole reply on whitespace would merge them into one blob).
content = response.message.content or ""  # an empty reply prints nothing instead of crashing

lines = []
for paragraph in content.splitlines():
    # Collapse runs of whitespace, then wrap without splitting long words or hyphenated
    # terms; a blank source line stays a blank output line.
    wrapped = textwrap.wrap(
        " ".join(paragraph.split()),
        width=100,
        break_long_words=False,
        break_on_hyphens=False,
    )
    lines.extend(wrapped or [""])

print("\n".join(lines))

<think> Okay, let's start by understanding the user's query. They provided a dataset of 82 rows from
what seems to be a database or log file containing UUIDs and some structured data about vineyards in
Austria. The assistant's task is to create a thought process that mimics how they would approach
analyzing this data. First, I need to parse through the given examples. Each row has eight elements:
two UUIDs, then maybe names, address parts, phone numbers, and coordinates. Wait, actually looking
at the data again, it seems like each entry is structured with a tuple of values. The first element
is always a UUIDv4 for the winery ID, followed by another UUIDv4 as an external reference (like a
contact or location). Then there are name fields that sometimes have missing values represented as
None or empty strings. Hmm, I notice some patterns here. There's a mix of German and Austrian names
in the names field, which makes sense given the context—likely related to wineries in Austria. The
addre