In [8]:
import json
from google import genai
from google.genai import types

In [2]:
client = genai.Client()

In [None]:
mock_ddl = {
    "table_name": "country",
    "database": "analytics_db",
    "layer": "raw",
    "purpose": "Master list of countries and their demographics.",
    
    "schema": [
        { "name": "Code", "type": "string", "nullable": False },
        { "name": "Name", "type": "string", "nullable": False },
        { "name": "Continent", "type": "string", "nullable": True },
        { "name": "Population", "type": "int", "nullable": True },
        { "name": "SurfaceArea", "type": "float", "nullable": True },
        { "name": "IndepYear", "type": "int", "nullable": True }
    ],
    "row_count": 239,
    "column_stats": {
        "Continent": {
            "null_pct": 0.0,
            "distinct_count": 7,
            "top_values": [
                 {"value": "Asia", "count": 51},
                 {"value": "Africa", "count": 58},
                 {"value": "Europe", "count": 46},
                 {"value": "North America", "count": 37},
                 {"value": "South America", "count": 14},
                 {"value": "Oceania", "count": 28},
                 {"value": "Antarctica", "count": 5}
            ]
        },
        "Population": {
             "min": 0,
             "max": 1400000000,
             "p95": 50000000
        },
        "IndepYear": {
            "min": 870,
            "max": 2023, 
            "null_pct": 0.15 
        }
    },
    
    "job_summary": {
        "job_name": "ingest_country_reference",
        "inputs": ["s3_raw_csv"],
        "filters": ["exclude where Code is null"]
    }
}

In [None]:
payload = f"""
You are a Senior Data Engineer Copilot. 
I will provide you with a JSON context object describing a data table.
Your goal is to generate a Data Quality Contract that balances **Strictness** (for clean data) with **Flexibility** (to avoid false alarms).
### INPUT CONTEXT:
{json.dumps(mock_ddl, indent=2)}

### INSTRUCTIONS:
Please note that the below instructions are only a guide and you can deviate from them if you see fit and if it is a necessary data quality check or 
# a necessary post load test. and keep the rules and test together if they belong to the same table in the respective JSON object.
# Think longer and more carefully before generating the rules and tests and make sure to cover all the necessary checks and you are not missing any edge cases at all.
# post generating the rules and tests, review your own rules. Did you miss any security checks (PII)? Did you miss any logical consistency checks? If so, add them.

1. **Analyze Schema & Logic (Hard Rules):**
   - Columns marked `nullable: False` must have `not_null` checks.
   - **CRITICAL**: For String columns that are `nullable: False`, ALSO check that they are not empty strings or whitespace only.
   - Numeric columns (Population, SurfaceArea) must generally be >= 0.
   - **Severity:** "ERROR" (Fundamental data issues).
   - **Action:** "FAIL_JOB" (if critical) or "DROP_ROW".
2. **Analyze Patterns & Types (Pattern Rules):**
   - Look at column names. If they imply a standard format (e.g., "Code", "Email", "IP", "UUID"), generate a `regex` rule.
   - Example: "Code" usually implies a fixed length or specific character set (e.g., ISO 3-letter code).
3. **Analyze Stats (Soft Rules & Buffers):**
   - Use `column_stats` to define expected ranges, BUT apply a **20% Buffer** to handle Data Drift.
   - If `distinct_count` is low (< 20) and stable (like Continent), create an `allowed_values` list.
   - **Severity:** "WARNING".
   - **Action:** "WARN".
4. **Cross-Column & Advanced Logic:**
   - Look for relationships. Does one column's value depend on another? (e.g., "If `IndepYear` is not null, `GovernmentForm` should not be 'Colony'").
   - If no obvious cross-column logic exists, skip this.
5. **Post-Load Tests (The Safety Net):**
   - Generate SQL tests for:
     - **Uniqueness**: Check Primary Keys (or composite keys) for duplicates.
     - **Completeness**: Row count > 0.
     - **Freshness/Future**: Ensure dates (like `IndepYear`) are not in the future relative to `CURRENT_DATE` (avoid hardcoded years).
   - These run AFTER the load.
6. **Output Format:**
   Return ONLY valid JSON in this structure:
   {{
      "data_quality": {{
          "rules": [
              {{ 
                "column": "col_name", 
                "rule_type": "not_null|not_empty|min|max|allowed_values|regex|custom_sql", 
                "condition": "value or list or sql_expression", 
                "severity": "ERROR|WARNING",
                "action": "DROP_ROW|FAIL_JOB|WARN",
                "description": "Reasoning..."
              }}
          ]
      }},
      "tests": [
          {{ "name": "test_name", "sql": "SELECT ...", "description": "..." }}
      ],
      "docs_markdown": "# Table Documentation..."
   }}
"""

In [45]:
response = client.models.generate_content(model='gemini-2.5-pro', contents=payload, 
config=types.GenerateContentConfig(response_mime_type="application/json" ))

In [46]:
print(response.text)

{
  "data_quality": {
    "rules": [
      {
        "column": "Code",
        "rule_type": "pk",
        "condition": null,
        "severity": "ERROR",
        "action": "FAIL_JOB",
        "description": "Column Code is the primary key and must be unique for each country."
      },
      {
        "column": "Code",
        "rule_type": "not_null",
        "condition": null,
        "severity": "ERROR",
        "action": "FAIL_JOB",
        "description": "Country code must not be null."
      },
      {
        "column": "Code",
        "rule_type": "not_empty",
        "condition": null,
        "severity": "ERROR",
        "action": "DROP_ROW",
        "description": "Country code must not be an empty or whitespace-only string."
      },
      {
        "column": "Code",
        "rule_type": "regex",
        "condition": "^[A-Z]{3}$",
        "severity": "ERROR",
        "action": "DROP_ROW",
        "description": "Country code must be a 3-letter uppercase ISO code."
      },
   