In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json

In [None]:
import os
import json
import pandas as pd
import numpy as np

INPUT_DIR = "data"
OUTPUT_DIR = "metadata"

os.makedirs(OUTPUT_DIR, exist_ok=True)

def clean_nan(value):
    """
    Convert null, None, NaT to string 'NaN'.
    Keep other values unchanged.
    """
    if pd.isna(value):
        return "NaN"
    return value

# List CSV files only
csv_files = [f for f in os.listdir(INPUT_DIR) if f.endswith(".csv")]

for file in csv_files:
    file_path = os.path.join(INPUT_DIR, file)
    df = pd.read_csv(file_path)

    # Apply null cleaning
    df_clean = df.applymap(clean_nan)

    table_name = file.replace(".csv", "")

    metadata = {
        "table": table_name,
        "description": "",
        "columns": {col: "" for col in df_clean.columns},
        "example_rows": df_clean.head(3).to_dict(orient="records")
    }

    output_path = os.path.join(OUTPUT_DIR, f"{table_name}.json")

    # Save with UTF-8 + pretty formatting
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    print(f"Generated metadata for: {file} ‚Üí {output_path}")

print("\nSelesai! Semua metadata sudah di-generate dan null telah diubah menjadi string 'NaN'.")


Generated metadata for: ref_mkt_seki_exchange.csv ‚Üí metadata/ref_mkt_seki_exchange.json
Generated metadata for: ref_mkt_seki_interest.csv ‚Üí metadata/ref_mkt_seki_interest.json
Generated metadata for: ref_mkt_seki_savings.csv ‚Üí metadata/ref_mkt_seki_savings.json
Generated metadata for: ref_mkt_seki_ihk.csv ‚Üí metadata/ref_mkt_seki_ihk.json
Generated metadata for: ref_mkt_seki_pareto_terpisah.csv ‚Üí metadata/ref_mkt_seki_pareto_terpisah.json
Generated metadata for: ref_mkt_seki_indeks_harga.csv ‚Üí metadata/ref_mkt_seki_indeks_harga.json
Generated metadata for: ref_mkt_seki_investasi.csv ‚Üí metadata/ref_mkt_seki_investasi.json
Generated metadata for: ref_mkt_seki_transaksi_berjalan_internasional.csv ‚Üí metadata/ref_mkt_seki_transaksi_berjalan_internasional.json
Generated metadata for: ref_mkt_seki_indonesia_ringkasan.csv ‚Üí metadata/ref_mkt_seki_indonesia_ringkasan.json
Generated metadata for: ref_mkt_seki_devisa.csv ‚Üí metadata/ref_mkt_seki_devisa.json
Generated metadata for

  df_clean = df.applymap(clean_nan)
  df_clean = df.applymap(clean_nan)
  df_clean = df.applymap(clean_nan)
  df_clean = df.applymap(clean_nan)
  df_clean = df.applymap(clean_nan)
  df_clean = df.applymap(clean_nan)
  df_clean = df.applymap(clean_nan)
  df_clean = df.applymap(clean_nan)
  df_clean = df.applymap(clean_nan)
  df_clean = df.applymap(clean_nan)
  df_clean = df.applymap(clean_nan)
  df_clean = df.applymap(clean_nan)
  df_clean = df.applymap(clean_nan)


In [5]:
import pandas as pd
import json
import ollama
import os

def process_single_file(file_path, output_folder):
    """
    Fungsi helper untuk memproses satu file CSV menjadi JSON metadata.
    """
    filename = os.path.basename(file_path)
    base_name = os.path.splitext(filename)[0] # Hilangkan ekstensi .csv
    output_path = os.path.join(output_folder, f"{base_name}.json")

    print(f"\nüìÇ Memproses: {filename}...")

    # 1. BACA CSV DAN BUAT SAMPEL
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        print(f"   ‚ùå Gagal membaca file {filename}: {e}")
        return

    # Logika Sampling (Pinggir - Tengah - Pinggir)
    total_rows = len(df)
    if total_rows >= 4:
        indices = [0, int(total_rows / 3), int(total_rows * 2 / 3), total_rows - 1]
    else:
        indices = list(range(total_rows))
    
    sample_df = df.iloc[indices]
    sample_rows = sample_df.where(pd.notnull(sample_df), None).to_dict(orient='records')

    # Persiapan Konteks Prompt
    columns = df.columns.tolist()
    context_row = sample_rows[0] if sample_rows else {}

    # 2. GENERATE DESKRIPSI DENGAN OLLAMA
    print(f"   ü§ñ Mengirim ke Ollama (qwen2.5:7b)...")
    
    prompt = f"""
    Anda adalah Data Engineer. Buat metadata JSON untuk tabel ini.
    
    NAMA TABEL: {base_name}
    KOLOM: {', '.join(columns)}
    CONTOH DATA: {json.dumps(context_row)}

    INSTRUKSI:
    1. Buat 'description' (deskripsi tabel).
    2. Buat 'columns' (key: nama kolom, value: penjelasan kolom dalam Bahasa Indonesia).
    3. Output STRICTLY JSON valid.
    
    FORMAT JSON:
    {{
      "table": "{base_name}",
      "description": "...",
      "columns": {{ "kolom1": "penjelasan...", ... }}
    }}
    """

    try:
        response = ollama.chat(
            model='qwen2.5:7b',
            messages=[{'role': 'user', 'content': prompt}],
            format='json',
            options={'temperature': 0.2}
        )
        ai_output = json.loads(response['message']['content'])
    except Exception as e:
        print(f"   ‚ùå Error Ollama pada file {filename}: {e}")
        return

    # 3. GABUNGKAN HASIL
    final_metadata = {
        "table": ai_output.get("table", base_name),
        "description": ai_output.get("description", "Deskripsi otomatis."),
        "columns": ai_output.get("columns", {}),
        "example_rows": sample_rows
    }

    # 4. SIMPAN KE FILE JSON
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(final_metadata, f, indent=2, ensure_ascii=False)
    
    print(f"   ‚úÖ Disimpan: {output_path}")


def generate_metadata_from_folder(input_folder_path, output_folder_path):
    """
    Fungsi utama untuk iterasi folder.
    """
    # Pastikan folder output ada, jika tidak buat baru
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)
        print(f"üìÅ Membuat folder output: {output_folder_path}")

    # Cari semua file CSV
    files = [f for f in os.listdir(input_folder_path) if f.endswith('.csv')]
    
    if not files:
        print("‚ö†Ô∏è Tidak ada file CSV ditemukan di folder input.")
        return

    print(f"Menemukan {len(files)} file CSV. Memulai proses...\n" + "="*40)

    # Loop setiap file
    for file in files:
        full_path = os.path.join(input_folder_path, file)
        process_single_file(full_path, output_folder_path)
        
    print("\n" + "="*40 + "\nüéâ Selesai memproses semua file.")

# --- KONFIGURASI DAN EKSEKUSI ---
if __name__ == "__main__":
    # Tentukan folder input (tempat CSV berada)
    INPUT_FOLDER = './data/SEKI-v2'  
    
    # Tentukan folder output (tempat JSON akan disimpan)
    OUTPUT_FOLDER = './data/metadata-SEKI-v2' 

    generate_metadata_from_folder(INPUT_FOLDER, OUTPUT_FOLDER)

Menemukan 13 file CSV. Memulai proses...

üìÇ Memproses: ref_mkt_seki_exchange.csv...
   ü§ñ Mengirim ke Ollama (qwen2.5:7b)...
   ‚úÖ Disimpan: ./data/metadata-SEKI-v2/ref_mkt_seki_exchange.json

üìÇ Memproses: ref_mkt_seki_interest.csv...
   ü§ñ Mengirim ke Ollama (qwen2.5:7b)...
   ‚úÖ Disimpan: ./data/metadata-SEKI-v2/ref_mkt_seki_interest.json

üìÇ Memproses: ref_mkt_seki_savings.csv...
   ü§ñ Mengirim ke Ollama (qwen2.5:7b)...
   ‚úÖ Disimpan: ./data/metadata-SEKI-v2/ref_mkt_seki_savings.json

üìÇ Memproses: ref_mkt_seki_ihk.csv...
   ü§ñ Mengirim ke Ollama (qwen2.5:7b)...
   ‚úÖ Disimpan: ./data/metadata-SEKI-v2/ref_mkt_seki_ihk.json

üìÇ Memproses: ref_mkt_seki_pareto_terpisah.csv...
   ü§ñ Mengirim ke Ollama (qwen2.5:7b)...
   ‚úÖ Disimpan: ./data/metadata-SEKI-v2/ref_mkt_seki_pareto_terpisah.json

üìÇ Memproses: ref_mkt_seki_indeks_harga.csv...
   ü§ñ Mengirim ke Ollama (qwen2.5:7b)...
   ‚úÖ Disimpan: ./data/metadata-SEKI-v2/ref_mkt_seki_indeks_harga.json

üìÇ M

In [6]:
import os
import pandas as pd
import sqlite3

INPUT_DIR = "data"
DB_PATH = "database.db"

# Buat database
conn = sqlite3.connect(DB_PATH)

# Loop semua CSV
for file in os.listdir(INPUT_DIR):
    if file.endswith(".csv"):
        path = os.path.join(INPUT_DIR, file)
        df = pd.read_csv(path)

        # Nama tabel = nama file tanpa .csv
        table_name = file.replace(".csv", "")

        # Migrasi ke SQLite
        df.to_sql(table_name, conn, if_exists="replace", index=False)

        print(f"[OK] Imported {file} ‚Üí table '{table_name}'")

conn.close()
print("\nSelesai! Semua CSV sudah dimigrasikan ke database.db")


[OK] Imported ref_mkt_seki_exchange.csv ‚Üí table 'ref_mkt_seki_exchange'
[OK] Imported ref_mkt_seki_interest.csv ‚Üí table 'ref_mkt_seki_interest'
[OK] Imported ref_mkt_bps_jumlah_penduduk_by_usia.csv ‚Üí table 'ref_mkt_bps_jumlah_penduduk_by_usia'
[OK] Imported ref_mkt_bps_jumlah_tenaga_kesehatan.csv ‚Üí table 'ref_mkt_bps_jumlah_tenaga_kesehatan'
[OK] Imported ref_mkt_bps_produk_domestik_reg_bruto.csv ‚Üí table 'ref_mkt_bps_produk_domestik_reg_bruto'
[OK] Imported ref_mkt_bps_jumlah_penduduk.csv ‚Üí table 'ref_mkt_bps_jumlah_penduduk'
[OK] Imported ref_mkt_seki_savings.csv ‚Üí table 'ref_mkt_seki_savings'
[OK] Imported ref_mkt_seki_ihk.csv ‚Üí table 'ref_mkt_seki_ihk'
[OK] Imported ref_mkt_seki_pareto_terpisah.csv ‚Üí table 'ref_mkt_seki_pareto_terpisah'
[OK] Imported ref_mkt_bps_jumlah_ibuhamil.csv ‚Üí table 'ref_mkt_bps_jumlah_ibuhamil'
[OK] Imported ref_mkt_bps_persentase_bayi_asi_eksklusif.csv ‚Üí table 'ref_mkt_bps_persentase_bayi_asi_eksklusif'
[OK] Imported ref_mkt_bps_umr.cs

In [7]:
import sqlite3
conn = sqlite3.connect("database.db")
cur = conn.cursor()

cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cur.fetchall()

print("Tables in DB:")
for t in tables:
    print("-", t[0])

conn.close()


Tables in DB:
- ref_mkt_seki_exchange
- ref_mkt_seki_interest
- ref_mkt_bps_jumlah_penduduk_by_usia
- ref_mkt_bps_jumlah_tenaga_kesehatan
- ref_mkt_bps_produk_domestik_reg_bruto
- ref_mkt_bps_jumlah_penduduk
- ref_mkt_seki_savings
- ref_mkt_seki_ihk
- ref_mkt_seki_pareto_terpisah
- ref_mkt_bps_jumlah_ibuhamil
- ref_mkt_bps_persentase_bayi_asi_eksklusif
- ref_mkt_bps_umr
- ref_mkt_seki_indeks_harga
- ref_mkt_seki_investasi
- ref_mkt_bps_gini_ratio
- ref_mkt_seki_transaksi_berjalan_internasional
- ref_mkt_seki_indonesia_ringkasan
- ref_mkt_bps_jumlah_balita
- ref_mkt_seki_devisa
- ref_mkt_bps_angka_kelahiran
- ref_mkt_bps_inflasi_nasional
- ref_mkt_bps_jumlah_pns
- ref_mkt_seki_inflasi
- ref_mkt_seki_export_import
- ref_mkt_bps_pengeluaran_per_kapita
- ref_mkt_seki_pdb


In [8]:
import sqlite3

conn = sqlite3.connect("database.db")
cur = conn.cursor()

table = "nama_tabel"
cur.execute(f"PRAGMA table_info({table});")
print(cur.fetchall())

conn.close()


[]


# BPS SEKI AGENT

In [2]:
from langchain_ollama import OllamaLLM

user_llm = OllamaLLM(model="qwen2.5:7b-latest", temperature=0)
sql_llm = OllamaLLM(model="qwen-coder:latest", temperature=0)


In [3]:
import json, os

def load_metadata(path="metadata"):
    metas = {}
    for f in os.listdir(path):
        if f.endswith(".json"):
            with open(os.path.join(path, f), "r", encoding="utf-8") as fh:
                metas[f.replace(".json","")] = json.load(fh)
    return metas

metadata = load_metadata()


In [4]:
import datetime
import json

def log_event(event_type, data, file="logs.txt"):
    entry = {
        "timestamp": datetime.datetime.now().isoformat(),
        "event": event_type,
        "data": data
    }
    with open(file, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")


In [5]:
from langchain_core.prompts import ChatPromptTemplate

SQL_TEMPLATE = """
You are an expert SQLite SQL generator.
User question: {question}

Use ONLY the following table metadata:
{metadata}

User region = "{region}"
User leveldata = "{leveldata}"

Rules:
- If table has 'access_column', ALWAYS filter by region.
- Only return pure SQL without explanation.
- No comments, no markdown, no natural language.

SQL:
"""

sql_prompt = ChatPromptTemplate.from_template(SQL_TEMPLATE)
sql_chain = sql_prompt | sql_llm


In [6]:
import re

forbidden = ["drop", "delete", "update", "insert", "alter"]

def is_safe_sql(sql):
    low = sql.lower()
    return low.startswith("select") and all(x not in low for x in forbidden)

In [7]:
def enforce_region(sql, access_column, region):
    low = sql.lower()
    if access_column.lower() not in low:
        return sql  # no region needed

    if f"{access_column.lower()} =" in low or "where" in low:
        return sql

    return sql + f" WHERE {access_column} LIKE '{region}%'"


In [8]:
import sqlite3
import pandas as pd

def execute_sql(sql, db_path="database.db"):
    conn = sqlite3.connect(db_path)
    df = pd.read_sql_query(sql, conn)
    conn.close()
    return df


In [9]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

def simple_forecast(df, date_col, val_col, periods=3):
    df = df.dropna().sort_values(date_col)
    X = np.arange(len(df)).reshape(-1,1)
    y = df[val_col].astype(float).values

    model = LinearRegression().fit(X, y)
    future_idx = np.arange(len(df), len(df)+periods).reshape(-1,1)
    preds = model.predict(future_idx)

    future_dates = pd.date_range(
        start=df[date_col].iloc[-1], periods=periods+1, freq="M"
    )[1:]

    return pd.DataFrame({date_col: future_dates, "prediction": preds})


In [10]:
from langgraph.graph import StateGraph, END

class State(dict):
    pass

workflow = StateGraph(State)

# Node 1: Router
def router_node(state):
    q = state["question"].lower()

    if any(k in q for k in ["prediksi", "forecast", "ramal"]):
        state["intent"] = "forecast"
    elif any(k in q for k in ["berapa", "tampilkan", "list", "total", "select"]):
        state["intent"] = "sql"
    else:
        state["intent"] = "clarify"

    log_event("router", state)
    return state

workflow.add_node("router", router_node)
workflow.set_entry_point("router")


<langgraph.graph.state.StateGraph at 0x713ab111fb00>

In [11]:
def planner_node(state):
    intent = state["intent"]

    if intent == "forecast":
        state["next"] = "forecast_agent"
    elif intent == "sql":
        state["next"] = "sql_agent"
    else:
        state["next"] = "clarify_agent"

    log_event("planner", state)
    return state

workflow.add_node("planner", planner_node)
workflow.add_edge("router", "planner")


<langgraph.graph.state.StateGraph at 0x713ab111fb00>

In [12]:
def sql_agent_node(state):
    table_meta = metadata  # pakai semua dulu
    sql = sql_chain.run(
        question=state["question"],
        metadata=json.dumps(table_meta),
        region=state["region"],
        leveldata=state["leveldata"]
    )

    log_event("sql_raw", sql)

    if not is_safe_sql(sql):
        state["error"] = "unsafe sql"
        state["next"] = END
        return state

    # region enforcement
    for t, m in metadata.items():
        access_col = m.get("access_column")
        if access_col:
            sql = enforce_region(sql, access_col, state["region"])

    log_event("sql_final", sql)

    df = execute_sql(sql)
    state["result"] = df
    state["next"] = END
    return state

workflow.add_node("sql_agent", sql_agent_node)
workflow.add_edge("planner", "sql_agent")


<langgraph.graph.state.StateGraph at 0x713ab111fb00>

In [13]:
def forecast_agent_node(state):
    # For forecasting, first fetch the timeseries
    sql = "SELECT year, value FROM some_table"
    df = execute_sql(sql)

    log_event("forecast_sql", sql)
    log_event("forecast_data", df.head().to_dict())

    fc = simple_forecast(df, "year", "value", periods=3)
    state["result"] = fc
    state["next"] = END
    return state

workflow.add_node("forecast_agent", forecast_agent_node)
workflow.add_edge("planner", "forecast_agent", condition=lambda s: s["next"]=="forecast_agent")


TypeError: StateGraph.add_edge() got an unexpected keyword argument 'condition'

In [None]:
def clarify_agent_node(state):
    reply = user_llm(f"Pertanyaanmu kurang jelas: {state['question']}")
    state["result"] = reply
    state["next"] = END
    log_event("clarify", state)
    return state

workflow.add_node("clarify_agent", clarify_agent_node)
workflow.add_edge("planner", "clarify_agent", condition=lambda s: s["next"]=="clarify_agent")


In [None]:
graph = workflow.compile()


In [None]:
question = input("Pertanyaan user: ")

state = {
    "question": question,
    "region": "RM III JABAR",
    "leveldata": "2_KABUPATEN_JAWA_BARAT"
}

result_state = graph.invoke(state)
result_state["result"]
