# JSONファイルからRAG用のベクトルDBを作る

LangChain を使って RAG を試してみた #AI - Qiita
> https://qiita.com/tinymouse/items/4d359674f6b2494bb22d

LLMアプリケーション開発のためのLangChain 後編⑤ 外部ドキュメントのロード、分割及び保存 - qiita
> https://qiita.com/utanesuke/items/6efc03eca94f7de3b9cd#json-%E3%83%AD%E3%83%BC%E3%83%80%E3%83%BC


## install

In [None]:
!pip install unsloth langchain langchain_community langchain-huggingface sentence-transformers transformers accelerate chromadb

In [None]:
!pip install 

## imports

In [1]:
## main models
# need to import unsloth 1st
from unsloth import FastLanguageModel
import transformers
from sentence_transformers import SentenceTransformer
import torch

## langchain (need to make RAG)
from langchain_huggingface import HuggingFacePipeline
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.document_loaders import JSONLoader
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma

import os
import gc
import chromadb
import json
from pathlib import Path
import pprint


## prepare tokenizer & model

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    max_seq_length = 40960,
    load_in_4bit = True,            # 4bit uses much less memory
    load_in_8bit = False,           # A bit more accurate, uses 2x memory
)

## prepare pipeline

In [None]:
pipe = transformers.pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1024,
    dtype=torch.float16
)

llm = HuggingFacePipeline(
    pipeline = pipe
)

## load json datas

In [2]:
# need to avoid unicode escape & marge instruction, output
loader = JSONLoader(
    file_path="./jvn_results_merged.json",
#    jq_schema=".[]",
    jq_schema=".[] | .instruction, .output",
    text_content=False
)
docs_raw = loader.load()

docs = []
for i in range(0, len(docs_raw), 2):
    inst = docs_raw[i].page_content
    out = docs_raw[i+1].page_content
    inst_meta = docs_raw[i].metadata["seq_num"]
    out_meta = docs_raw[i+1].metadata["seq_num"]

    # page_content に instruction と output をまとめる
    content = (
        "Instruction: "
        f"{inst}\n"
        "Output: "
        f"{out}\n"
    )

    # メタデータとして元データを保持（任意）
    metadata = {
        "instruction": inst_meta,
        "output": out_meta
    }
    docs.append(Document(page_content=content, metadata=metadata))

pprint.pprint(docs[1])

Document(metadata={'instruction': 3, 'output': 4}, page_content='Instruction: SQL インジェクションの脆弱性の例を教えて\nOutput: carmelogarcia の Simple Leave Manager In PHP With Source Code における SQL インジェクションの脆弱性が発表された。A flaw has been found in code-projects Simple Leave Manager 1.0. This vulnerability affects unknown code of the file /user.php. This manipulation of the argument table causes sql injection. Remote exploitation of the attack is possible. The exploit has been published and may be used. この脆弱性はcarmelogarcia\nSimple Leave Manager In PHP With Source Code 1.0に影響を及ぼす。\n')


In [None]:
# # ひとまずこれで試してみる
# file_path = "./jvn_results_merged.json"
# docs = json.loads(Path(file_path).read_text())

## prepare embedding model

In [3]:
embeddings = SentenceTransformerEmbeddings(
    model_name="./infloat_multilingual-e5-large"
)

  embeddings = SentenceTransformerEmbeddings(


## make DB

In [None]:
# Error no longer appear
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings
)

In [None]:
print("1. "+docs_sim[0].page_content)
print("--------------------------------------------------\n")
print(docs_sim[0])

## save DB

In [None]:
# # if not reduce DB-LOCK, enable these
# os.environ["OMP_NUM_THREADS"] = "1"
# os.environ["OPENBLAS_NUM_THREADS"] = "1"
# os.environ["MKL_NUM_THREADS"] = "1"

In [None]:
# if appear -wal, -shm, -jourbal, the above has been completed
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory="./chromadb"   # ★ 保存先
)

# データベースを明示的にディスクへ書き出す
vectorstore.persist()
vectorstore = None
gc.collect()
del vectorstore

## load DB

In [None]:
vectorstore = Chroma(
    persist_directory="./chromadb",
    embedding_function=embeddings
)

## search DB

In [None]:
content_base = """
あなたは与えられたソースコードから脆弱性を発見するホワイトハッカーです。
回答の際は以下の3つを行いなさい。
1. 脆弱性の指摘
2. 放置することによるリスクの提示
3. 脆弱性の解消方法の提示

出力の際は以下の3つに注意しなさい。
1. 日本語で出力
2. 明快かつ簡潔に
3. マークダウン形式を使用

また出力の際にはコンテキストの内容を元に生成しなさい
以下は100面ダイスを振るアプリのソースコードです。
このコードにはインジェクションの脆弱性は存在しますか？
---
<?php
// php_d100_roller.php
// Simple 100-sided dice roller application (single file)
// Usage: Place this file on your web server and open it in a browser.
// For local testing: run `php -S localhost:8000` and visit http://localhost:8000/php_d100_roller.php

session_start();

// Keep roll history in session (max 100 entries)
if (!isset($_SESSION['d100_history'])) {
    $_SESSION['d100_history'] = [];
}

$errors = [];
$results = [];
$total = 0;
$count = 1;

if ($_SERVER['REQUEST_METHOD'] === 'POST') {
    // Get roll count (clamped between 1–100)
    $count = isset($_POST['count']) ? intval($_POST['count']) : 1;
    if ($count < 1) $count = 1;
    if ($count > 100) $count = 100;

    // Optional label
    $label = isset($_POST['label']) ? trim($_POST['label']) : '';

    // Perform rolls
    for ($i = 0; $i < $count; $i++) {
        $roll = random_int(1, 100);
        $results[] = $roll;
        $total += $roll;
    }

    // Add to history (newest first)
    $entry = [
        'time' => date('Y-m-d H:i:s'),
        'count' => $count,
        'label' => $label,
        'results' => $results,
        'total' => $total,
    ];

    array_unshift($_SESSION['d100_history'], $entry);
    if (count($_SESSION['d100_history']) > 100) {
        $_SESSION['d100_history'] = array_slice($_SESSION['d100_history'], 0, 100);
    }
}

$history = $_SESSION['d100_history'];
?>
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>100-sided Dice Roller</title>
<style>
    body { font-family: system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; padding: 20px; }
    .box { max-width: 820px; margin: 0 auto; }
    input[type="number"] { width: 80px; }
    .badge { display:inline-block; padding:6px 10px; margin:4px; border-radius:6px; background:#eee; }
    .roll { font-weight:700; }
    .history { margin-top:20px; }
    .card { padding:12px; border:1px solid #ddd; border-radius:8px; margin-bottom:12px; }
    .muted { color:#666; font-size:0.9rem; }
</style>
</head>
<body>
<div class="box">
    <h1>100-sided Dice Roller</h1>
    <form method="post" action="<?php echo htmlspecialchars($_SERVER['PHP_SELF']); ?>">
    <label>Number of rolls (1–100): <input type="number" name="count" value="<?php echo htmlspecialchars($count); ?>" min="1" max="100"></label>
    &nbsp;
    <label>Label (optional): <input type="text" name="label" value=""></label>
    &nbsp;
    <button type="submit">Roll</button>
    </form>

    <?php if (!empty($results)): ?>
    <div class="card">
        <div class="muted">Timestamp: <?php echo htmlspecialchars($entry['time']); ?></div>
        <h2>Results</h2>
        <div>
        <?php foreach ($results as $i => $r): ?>
            <span class="badge roll">#<?php echo $i+1; ?>: <?php echo $r; ?></span>
        <?php endforeach; ?>
        </div>
        <p>Total: <strong><?php echo $total; ?></strong> / Average: <strong><?php echo count($results) ? round($total / count($results), 2) : 0; ?></strong></p>
        <?php if ($entry['label'] !== ''): ?><p>Label: <?php echo htmlspecialchars($entry['label']); ?></p><?php endif; ?>
    </div>
    <?php endif; ?>

    <div class="history">
    <h2>History (last <?php echo count($history); ?> rolls)</h2>
    <?php if (empty($history)): ?>
        <p class="muted">No rolls yet.</p>
    <?php else: ?>
        <?php foreach ($history as $idx => $h): ?>
        <div class="card">
            <div class="muted"><?php echo htmlspecialchars($h['time']); ?> — Rolls: <?php echo $h['count']; ?><?php if ($h['label'] !== ''): ?> — Label: <?php echo htmlspecialchars($h['label']); ?><?php endif; ?></div>
            <div style="margin-top:8px;">
            <?php foreach ($h['results'] as $i => $r): ?>
                <span class="badge"><?php echo $r; ?></span>
            <?php endforeach; ?>
            </div>
            <p style="margin-top:8px;">Total: <?php echo $h['total']; ?> / Average: <?php echo count($h['results']) ? round($h['total'] / count($h['results']), 2) : 0; ?></p>
        </div>
        <?php endforeach; ?>
    <?php endif; ?>
    </div>

    <div style="margin-top:20px;" class="muted">
    <p>Note: Uses <code>random_int(1, 100)</code> for cryptographically secure random number generation. The session keeps up to 100 history entries.</p>
    </div>
</div>
</body>
</html>
"""

In [None]:
docs_sim = vectorstore.similarity_search(query=query, k=5)
print(docs_sim[0])

## add context for prompt

In [None]:
# make message from template
context = ""
for loop in range(len(docs_sim)):
    context = context+(f"context_{loop+1}. {docs_sim[loop].page_content}")

messages = [{"role" : "user", "content" : content_base +"\n---\ncontext: "+ context}]

print(messages)

In [None]:
# infer
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = False, # Disable thinking
)

from transformers import TextStreamer
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 4096, # Increase for longer outputs!
    temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)