
# PostgreSQL テーブル格納状況チェック（NeuralForecast 分析テーブル）

このノートブックは、PostgreSQL 上のテーブルに **データが格納されているか** を確認します。  
対象は `nf_%` プレフィックス、または `model_profile` / `dataset_profile` / `training_state` / `weight_statistics` / `model_complexity` / `model_diagnosis` / `parameter_sensitivity` / `optimization_suggestions` です。

- 接続情報は環境変数（`PGHOST`, `PGPORT`, `PGDATABASE`, `PGUSER`, `PGPASSWORD`）または `DATABASE_URL` を利用します。
- 任意で `TABLE_GLOB`（例: `nf_%`）を変更可能。
- 主要機能：テーブル列挙、件数、先頭サンプル、`VARCHAR(100)` の長さ超過チェック。


In [1]:

import os, sys
from typing import List, Tuple
import pandas as pd

# まず SQLAlchemy で接続を試み、無ければ psycopg2 を使う
engine = None
conn = None
use_sqlalchemy = True

DATABASE_URL = os.getenv("DATABASE_URL")
if not DATABASE_URL:
    host = os.getenv("PGHOST", "127.0.0.1")
    port = os.getenv("PGPORT", "5432")
    db   = os.getenv("PGDATABASE", "postgres")
    user = os.getenv("PGUSER", "postgres")
    pw   = os.getenv("PGPASSWORD", "z")
    DATABASE_URL = f"postgresql+psycopg2://{user}:{pw}@{host}:{port}/{db}"  # SQLAlchemy形式

TABLE_GLOB = os.getenv("TABLE_GLOB", "nf_%")

print(f"[info] 接続先: {DATABASE_URL}")
print(f"[info] 検索パターン: {TABLE_GLOB}")


[info] 接続先: postgresql+psycopg2://postgres:z@127.0.0.1:5432/postgres
[info] 検索パターン: nf_%


In [2]:

# 接続確立
try:
    from sqlalchemy import create_engine, text
    engine = create_engine(DATABASE_URL, pool_pre_ping=True)
    with engine.connect() as cx:
        cx.execute(text("SELECT 1"))
    print("[ok] SQLAlchemy経由で接続成功")
except Exception as e:
    print(f"[warn] SQLAlchemy接続に失敗: {e}\n-> psycopg2直接続にフォールバックします。")
    use_sqlalchemy = False

if not use_sqlalchemy:
    try:
        import psycopg2
        import psycopg2.extras as ex
        # DATABASE_URL が postgresql:// の場合に対応
        if DATABASE_URL.startswith("postgresql"):
            import urllib.parse as up
            # 粗めのパース（必要に応じて調整）
            # 例: postgresql://user:pass@host:port/dbname
            url = DATABASE_URL.replace("postgresql+psycopg2://", "postgresql://")
            u = up.urlparse(url)
            conn = psycopg2.connect(
                host=u.hostname or os.getenv("PGHOST", "127.0.0.1"),
                port=u.port or int(os.getenv("PGPORT", "5432")),
                dbname=(u.path or "/postgres").lstrip("/"),
                user=u.username or os.getenv("PGUSER", "postgres"),
                password=u.password or os.getenv("PGPASSWORD", ""),
            )
        else:
            # 環境変数ベース
            conn = psycopg2.connect(
                host=os.getenv("PGHOST", "127.0.0.1"),
                port=int(os.getenv("PGPORT", "5432")),
                dbname=os.getenv("PGDATABASE", "postgres"),
                user=os.getenv("PGUSER", "postgres"),
                password=os.getenv("PGPASSWORD", ""),
            )
        print("[ok] psycopg2経由で接続成功")
    except Exception as e:
        print(f"[error] psycopg2接続にも失敗: {e}")
        raise


[ok] SQLAlchemy経由で接続成功


In [9]:

from typing import Dict

def q(sql: str, params: Dict=None) -> pd.DataFrame:
    params = params or {}
    if use_sqlalchemy:
        from sqlalchemy import text
        with engine.begin() as cx:
            return pd.read_sql(text(sql), cx, params=params)
    else:
        import pandas as pd
        with conn.cursor() as cur:
            cur.execute(sql, params)
            cols = [d.name for d in cur.description] if cur.description else []
            rows = cur.fetchall() if cur.description else []
        return pd.DataFrame(rows, columns=cols)


In [10]:

# 1) テーブル列挙
tables = q("""
SELECT table_schema, table_name
FROM information_schema.tables
WHERE table_schema NOT IN ('pg_catalog','information_schema')
  AND (table_name ILIKE %(glob)s OR table_name IN
      ('model_profile','dataset_profile','training_state','weight_statistics',
       'model_complexity','model_diagnosis','parameter_sensitivity','optimization_suggestions'))
ORDER BY table_schema, table_name
""", {"glob": TABLE_GLOB})
tables


UnboundLocalError: cannot access local variable 'pd' where it is not associated with a value

In [6]:

# 2) 各テーブルの件数
counts_list = []
for _, r in tables.iterrows():
    sch, name = r['table_schema'], r['table_name']
    df = q(f'SELECT COUNT(*) AS rows FROM "{sch}"."{name}"')
    counts_list.append((f"{sch}.{name}", int(df.iloc[0,0])))
pd.DataFrame(counts_list, columns=["table", "rows"]).sort_values("table").reset_index(drop=True)


NameError: name 'tables' is not defined

In [7]:

# 3) 代表的テーブルのサンプル先頭5件
samples = {}
for tname in ['model_profile','dataset_profile','training_state','weight_statistics',
              'model_complexity','model_diagnosis','parameter_sensitivity','optimization_suggestions']:
    hit = tables[tables['table_name'] == tname]
    if not hit.empty:
        sch = hit.iloc[0]['table_schema']
        samples[tname] = q(f'SELECT * FROM "{sch}"."{tname}" LIMIT 5')
samples  # セルを実行すると dict[str, DataFrame] の形で出力されます


NameError: name 'tables' is not defined

In [None]:

# 4) VARCHAR(100) の長さ超過チェック
violations = q("""
WITH targets AS (
  SELECT c.table_schema, c.table_name, c.column_name, c.character_maximum_length
  FROM information_schema.columns c
  JOIN information_schema.tables t
    ON c.table_schema=t.table_schema AND c.table_name=t.table_name
  WHERE c.table_schema NOT IN ('pg_catalog','information_schema')
    AND t.table_type='BASE TABLE'
    AND c.data_type='character varying'
    AND c.character_maximum_length IS NOT NULL
    AND c.character_maximum_length <= 100
    AND (c.table_name ILIKE %(glob)s OR c.table_name IN
        ('model_profile','dataset_profile','training_state','weight_statistics',
         'model_complexity','model_diagnosis','parameter_sensitivity','optimization_suggestions'))
)
SELECT t.table_schema, t.table_name, t.column_name, t.character_maximum_length AS limit,
       (SELECT MAX(length(COALESCE((%s)::text, ''))) FROM (SELECT * FROM "%s"."%s") s) AS max_len
FROM targets t
ORDER BY t.table_schema, t.table_name, t.column_name
""".replace('%s', '{col}').replace('%s', '{sch}').replace('%s', '{tbl}'), {"glob": TABLE_GLOB})
# 上のクエリは動的列参照がSQL標準で難しいため、そのままだとプレースホルダが展開されません。
# シンプルな代替として、Python側で1列ずつ測る関数を用意します。
violations = []

for _, r in q("""
SELECT c.table_schema, c.table_name, c.column_name, c.character_maximum_length AS limit
FROM information_schema.columns c
JOIN information_schema.tables t
  ON c.table_schema=t.table_schema AND c.table_name=t.table_name
WHERE c.table_schema NOT IN ('pg_catalog','information_schema')
  AND t.table_type='BASE TABLE'
  AND c.data_type='character varying'
  AND c.character_maximum_length IS NOT NULL
  AND c.character_maximum_length <= 100
  AND (c.table_name ILIKE %(glob)s OR c.table_name IN
      ('model_profile','dataset_profile','training_state','weight_statistics',
       'model_complexity','model_diagnosis','parameter_sensitivity','optimization_suggestions'))
ORDER BY c.table_schema, c.table_name, c.ordinal_position
""", {"glob": TABLE_GLOB}).iterrows():
    sch, tbl, col, limit = r['table_schema'], r['table_name'], r['column_name'], int(r['limit'])
    try:
        df = q(f'SELECT MAX(length(COALESCE("{col}"::text, ''))) AS max_len, '
               f'COUNT(*) FILTER (WHERE length(COALESCE("{col}"::text, '')) > {limit}) AS over_limit_rows '
               f'FROM "{sch}"."{tbl}"')
        max_len = int(df.iloc[0]['max_len']) if pd.notna(df.iloc[0]['max_len']) else 0
        over = int(df.iloc[0]['over_limit_rows']) if pd.notna(df.iloc[0]['over_limit_rows']) else 0
        violations.append((f"{sch}.{tbl}", col, limit, max_len, over))
    except Exception as e:
        violations.append((f"{sch}.{tbl}", col, limit, None, f"error: {e}"))

pd.DataFrame(violations, columns=["table","column","limit","max_len","over_limit_rows"])


In [8]:
%%bash

# 事前に接続情報を環境変数で用意（例）
export PGHOST=127.0.0.1
export PGPORT=5432
export PGDATABASE=postgres
export PGUSER=postgres
export PGPASSWORD=your_password

# 実行（エラー時停止）
psql "host=$PGHOST port=$PGPORT dbname=$PGDATABASE user=$PGUSER password=$PGPASSWORD" \
  -v ON_ERROR_STOP=1 -f nf_pg_check.sql


=== 対象テーブルの列挙（nf_* または分析テーブル名） ===
 table_schema |         table_name          
--------------+-----------------------------
 public       | nf_calibration
 public       | nf_ckpt
 public       | nf_config
 public       | nf_dataset_profile
 public       | nf_dm_test
 public       | nf_drift
 public       | nf_eval_interval
 public       | nf_eval_point
 public       | nf_forecasts
 public       | nf_hparams
 public       | nf_metrics
 public       | nf_model
 public       | nf_model_complexity
 public       | nf_model_diagnosis
 public       | nf_model_profile
 public       | nf_models
 public       | nf_optimization_suggestions
 public       | nf_parameter_sensitivity
 public       | nf_pkl
 public       | nf_predictions
 public       | nf_quantiles
 public       | nf_residual_stats
 public       | nf_run
 public       | nf_runs
 public       | nf_series
 public       | nf_training_state
 public       | nf_weight_statistics
(27 �s)


=== 各テーブルの行数（存在するものだ���） ===
         table       