In [1]:
#1. 파일 불러오기
#2. 각 줄을 하나의 document로
#3. 만약 길이 길면 호단위로 자르기

In [1]:
import json
import re
import os
from pathlib import Path
import glob

In [2]:
try:
  from langchain_core.documents import Document
except ImportError:
  !pip install langchain_core
  from langchain_core.documents import Document

try:
  from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
  !pip install langchain_text_splitters
  from langchain_text_splitters import RecursiveCharacterTextSplitter



In [3]:
# Splitter applied only to records whose text is longer than one chunk.
# chunk_size / chunk_overlap are presumably measured in characters (the
# splitter's default length function) — TODO confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1400,
    chunk_overlap=200,
    # Candidate split points, from the coarsest (the sentence ending "다." and
    # the proviso marker "다만") down to single characters ("").
    # NOTE(review): depending on the splitter's keep_separator default, a
    # matched "다." may end up at the start of the following piece rather than
    # at the end of its own sentence — verify on a long provision.
    separators = ["다.", "다만", ".", ",", " ", ""]
)

In [4]:
# Build one Document per JSONL record; records longer than 1400 characters are
# split into several Documents that share the record's metadata.
documents = []

# path = "./drive/MyDrive/rag/preprocessing_rag/법인세법 시행규칙.jsonl"
# glob.glob() returns paths in arbitrary order, so sort them: the document order
# (and therefore the preview printed below) is then the same on every run.
for file_path in sorted(glob.glob("/content/drive/MyDrive/rag/preprocessing_rag/*.jsonl")):
  with open(file_path, 'r', encoding="utf-8") as f:
    for no, line in enumerate(f, 1):
      line = line.strip()
      if not line:
        continue
      data = json.loads(line)
      content = data.get('chunk_text')
      if not isinstance(content, str):
        # Report where the bad record is instead of failing later with a bare
        # KeyError / TypeError that carries no file or line information.
        raise ValueError(f"{file_path}:{no}: 'chunk_text' is missing or is not a string")

      # Every field except the text itself is carried along as metadata.
      metadata = dict(data)
      metadata.pop("chunk_text")

      metadata['조문_index'] = metadata.pop("chunk_index")
      metadata['file_line_no'] = no  # 1-based line number inside its own file

      # Same limit as the chunk_size configured on text_splitter.
      if len(content) > 1400:
        pieces = text_splitter.split_text(content)
        for i, p in enumerate(pieces, 1):
          # A fresh dict per piece, so the Documents never share one metadata object.
          piece_metadata = dict(metadata, piece_index=i, piece_count=len(pieces))
          documents.append(Document(page_content=p, metadata=piece_metadata))
      else:
        metadata['piece_index'] = 1
        metadata['piece_count'] = 1
        documents.append(Document(page_content=content, metadata=metadata))


# Preview the last ten documents as a sanity check of text and metadata.
for i, document in enumerate(documents[-10:], 1):
  print(f"청크 {i}번")
  print(f"청크 내용 {document.page_content}")
  print('-'*30)
  print(f"메타데이터 {document.metadata}")
  print("\n")

print(len(documents))

청크 1번
청크 내용 ⑥영 제173조제2항제2호에 규정하는 양도소득금액계산명세서는 국세청장이 정하는 바에 의한다. <개정 2000.4.3>
------------------------------
메타데이터 {'law_name': '소득세법 시행규칙', 'law_id': '007507', '공포일자': '20260102', '시행일자': '20260102', '조문번호': '103', '조문가지번호': None, '조문제목': '양도소득세 관련서식', 'level': '항', '항번호': '⑥', '호번호': None, '목번호': None, 'source': '법령정보센터', '조문_index': 7, 'file_line_no': 1181, 'piece_index': 1, 'piece_count': 1}


청크 2번
청크 내용 ⑦ 삭제 <2016.3.16>
------------------------------
메타데이터 {'law_name': '소득세법 시행규칙', 'law_id': '007507', '공포일자': '20260102', '시행일자': '20260102', '조문번호': '103', '조문가지번호': None, '조문제목': '양도소득세 관련서식', 'level': '항', '항번호': '⑦', '호번호': None, '목번호': None, 'source': '법령정보센터', '조문_index': 8, 'file_line_no': 1182, 'piece_index': 1, 'piece_count': 1}


청크 3번
청크 내용 ⑧영 제175조의2제4항의 규정에 의한 물납신청은 별지 제86호서식에 의하며, 동조제5항의 규정에 의한 물납결정상황통지는 별지 제87호서식에 의한다. <신설 1996.3.30, 1997.4.23, 2015.3.13>
------------------------------
메타데이터 {'law_name': '소득세법 시행규칙', 'law_id': '007507', '공포일자': '20260102', '시행일자': '20

In [5]:
try:
  from langchain_openai import OpenAIEmbeddings
except ImportError:
  !pip install langchain_openai
  from langchain_openai import OpenAIEmbeddings

try:
  import chromadb
except ImportError:
  !pip install chromadb
  import chromadb

try:
  from openai import OpenAI
except ImportError:
  !pip install openai
  from openai import OpenAI

In [6]:
# Recreate an empty on-disk Chroma store and an empty "tax_law" collection.
# WARNING: this wipes whatever is stored under local_path.
import os, shutil

local_path = "/content/chromadb"
# db_path = "/content/drive/MyDrive/rag/chromadb"
# db_backup = "/content/drive/MyDrive/rag/old_chromadb"

# shutil.move(db_path, db_backup)

# Remove any previous store, then recreate the (now empty) directory.
if os.path.exists(local_path):
  shutil.rmtree(local_path)
os.makedirs(local_path, exist_ok = True)

client = chromadb.PersistentClient(path=local_path)

# Best-effort removal of a leftover collection; every error is ignored here.
# NOTE(review): the directory was just recreated, so this presumably always
# fails with a "collection does not exist" error — confirm, and consider
# catching only that exception type so real failures are not hidden.
try:
  client.delete_collection(name="tax_law")
except Exception as e:
  pass

collection = client.get_or_create_collection(name = "tax_law")

# Number of items currently in the collection (0 for a freshly created one).
print(collection.count())

0


In [7]:
def none_case(value, name):
  """Return `value` as a stripped string, or the placeholder ``NO_<name>``.

  The placeholder is returned when `value` is None, an empty string, or a
  string that is empty after stripping whitespace, so that an id built from the
  result never contains an empty segment.

  Args:
    value: the metadata value to render (any type; converted with str()).
    name: field name used to build the placeholder.

  Returns:
    The stripped string form of `value`, or ``f"NO_{name}"`` when it is blank.
  """
  text = "" if value is None else str(value).strip()
  return text if text else f"NO_{name}"

def doc_id(metadata):
  """Build the Chroma id string for one chunk from its metadata dict.

  Layout of the result:
    <law_id>:<조문번호>_<조문가지번호>_<항번호>_<호번호>_<목번호>:j<조문_index>:p<piece_index>:l<file_line_no>

  Each of the five hierarchy fields is rendered through none_case(), which
  supplies a named placeholder for values it treats as absent.
  """
  hierarchy_fields = ("조문번호", "조문가지번호", "항번호", "호번호", "목번호")
  key = "_".join(none_case(metadata.get(field), field) for field in hierarchy_fields)

  # file_line_no, 조문_index and piece_index all start at 1
  return "{}:{}:j{}:p{}:l{}".format(
      metadata.get("law_id"),
      key,
      metadata.get("조문_index"),
      metadata.get("piece_index"),
      metadata.get("file_line_no"),
  )

# Parallel lists (text, metadata, id) in the same order as `documents`.
documents_content = list(map(lambda d: d.page_content, documents))
documents_metadatas = list(map(lambda d: d.metadata, documents))
documents_ids = list(map(lambda d: doc_id(d.metadata), documents))


In [8]:
# OpenAI client and embedding model shared by embedding() below.
from google.colab import userdata

# Model name sent with every embeddings request.
embedding_model = "text-embedding-3-small"

# The API key is looked up under 'OPENAI_API_KEY' via Colab's userdata
# instead of being hard-coded in the notebook.
openai_ = OpenAI(api_key = userdata.get('OPENAI_API_KEY'))

def embedding(doc):
  """Embed text with the OpenAI embeddings endpoint.

  Args:
    doc: the request input; batch_upsert passes a list of strings.

  Returns:
    A list of embedding vectors, one per input, in input order.
    An empty list or tuple returns [] without calling the API.
  """
  # Nothing to embed: skip the request instead of sending an empty input.
  if isinstance(doc, (list, tuple)) and len(doc) == 0:
    return []

  response = openai_.embeddings.create(model=embedding_model, input=doc)
  # Every returned item carries the index of the input it belongs to; order by
  # it so vector i is guaranteed to correspond to input i.
  ordered = sorted(response.data, key=lambda item: item.index)
  return [item.embedding for item in ordered]


In [9]:
def batch_upsert(documents, metadatas, ids, collection, batch_size=500):
  """Embed the documents and upsert them into `collection` in batches.

  Args:
    documents: list of texts to embed and store.
    metadatas: list of metadata dicts, parallel to `documents`.
    ids: list of ids, parallel to `documents`.
    collection: target collection; must provide `id`, `name` and `upsert(...)`.
    batch_size: number of documents embedded and upserted per request.

  Returns:
    The total number of documents sent to `collection.upsert`.

  Raises:
    ValueError: if `batch_size` is not positive, or if the three lists differ
      in length.
  """
  # A negative step would make range() empty and silently upsert nothing.
  if batch_size < 1:
    raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
  # Catch misaligned inputs before any embedding request is made.
  if not (len(documents) == len(metadatas) == len(ids)):
    raise ValueError(
        "documents, metadatas and ids must have the same length, got "
        f"{len(documents)}, {len(metadatas)}, {len(ids)}"
    )

  print(collection.id, collection.name)
  total = 0
  for start in range(0, len(documents), batch_size):
    stop = start + batch_size
    batch_docs = documents[start:stop]

    batch_embeddings = embedding(batch_docs)
    collection.upsert(
        documents=batch_docs,
        metadatas=metadatas[start:stop],
        ids=ids[start:stop],
        embeddings=batch_embeddings,
    )
    total += len(batch_docs)

  return total


In [10]:
# Embed every chunk and upsert it into the collection; the cell output is the
# number of documents that were sent (the function's return value).
batch_upsert(documents_content, documents_metadatas, documents_ids, collection)

ba5aec34-1aaa-4b07-9630-68554296d81d tax_law


14850

In [11]:
# Show which collection `collection` refers to; batch_upsert prints the same
# id/name pair, so the two outputs can be compared.
print("current collection id:", collection.id, "name:", collection.name)

current collection id: ba5aec34-1aaa-4b07-9630-68554296d81d name: tax_law


In [12]:
# Number of items stored in the collection after the upsert.
print(collection.count())

# Fetch the entry for the last generated id; `res` is reused by the inspection
# cells that follow.
res = collection.get(ids=documents_ids[-1])
print(res.keys())

14850
dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas'])


In [13]:
# Total ids vs. distinct ids: the two numbers are equal only when every id is unique.
# NOTE(review): upsert is presumably keyed by id, so colliding ids would have
# overwritten each other — consider running this check before the upsert.
print(len(documents_ids), len(set(documents_ids)))

14850 14850


In [14]:
# First 200 characters of the last fetched document's text, then its stored metadata.
print(res["documents"][-1][:200])
print(res["metadatas"][-1])


⑮ 영 제178조의12제4항에 따른 납부유예신청서는 별지 제105호서식에 따른다. <신설 2017.3.10, 2019.3.20, 2021.3.16, 2022.3.18, 2024.12.31>
{'조문제목': '양도소득세 관련서식', 'piece_count': 1, 'source': '법령정보센터', '공포일자': '20260102', 'piece_index': 1, 'level': '항', '항번호': '⑮', '조문_index': 16, 'file_line_no': 1190, 'law_id': '007507', 'law_name': '소득세법 시행규칙', '조문번호': '103', '시행일자': '20260102'}


In [15]:
# The stored id next to its metadata, to compare the id segments with the
# metadata fields they were built from.
print(res["ids"][-1])
print(res["metadatas"][-1])

007507:103_NO_조문가지번호_⑮_NO_호번호_NO_목번호:j16:p1:l1190
{'조문제목': '양도소득세 관련서식', 'piece_count': 1, 'source': '법령정보센터', '공포일자': '20260102', 'piece_index': 1, 'level': '항', '항번호': '⑮', '조문_index': 16, 'file_line_no': 1190, 'law_id': '007507', 'law_name': '소득세법 시행규칙', '조문번호': '103', '시행일자': '20260102'}


In [16]:
drive_path = "/content/drive/MyDrive/rag/chromadb_backup"

os.makedirs(drive_path, exist_ok = True)
!rsync -a --delete /content/chromadb/ /content/drive/MyDrive/rag/chromadb_backup/