In [None]:
%pip install python-docx

# 1. 문서 내용 읽기

- python-docx

In [None]:
## Load the Word file with python-docx; the path is relative to the notebook's working directory
from docx import Document

document = Document('law_1.docx')
document

In [None]:
## List the attribute and method names available on the loaded document object
print(dir(document))

In [None]:
## Paragraph objects exposed by the document
document.paragraphs

In [None]:
## Check which container type `paragraphs` returns
type(document.paragraphs)

In [None]:
## Text of the first paragraph
document.paragraphs[0].text

In [None]:
## Text of the second paragraph
document.paragraphs[1].text

In [None]:
## Preview the first five paragraphs (indices 0-4), each prefixed with its position
for index, paragraph in enumerate(document.paragraphs[:5]):
    print(f'{index}. {paragraph.text}')

In [None]:
## Merge the paragraphs that were read into a single document string.
## Paragraph texts are joined with a newline so that the last word of one
## paragraph is not fused with the first word of the next one.
full_text = '\n'.join(paragraph.text for paragraph in document.paragraphs)

full_text

# 2. 문서 분할(쪼개기)

In [None]:
%pip install tiktoken

In [None]:
## Look up the tiktoken encoding associated with the 'gpt-4o' model name
import tiktoken

encoding_model = tiktoken.encoding_for_model('gpt-4o')
encoding_model

In [None]:
## Tokenize the merged document text and print the resulting token ids
encoding = encoding_model.encode(full_text)
print(encoding)

In [None]:
## Decode single token ids (9516, 11734, 5537) back to text, one per cell
## (presumably ids picked from the printed encoding above; verify against the output)
decoding = encoding_model.decode([9516])
decoding

In [None]:
decoding = encoding_model.decode([11734])
decoding

In [None]:
decoding = encoding_model.decode([5537])
decoding

In [None]:
## Total number of tokens in the document
len(encoding)

In [None]:
def split_text(full_text, chunk_size, model_name='gpt-4o'):
    """Split text into consecutive chunks of at most ``chunk_size`` tokens.

    The text is tokenized with the tiktoken encoding for ``model_name``, the
    token sequence is cut every ``chunk_size`` tokens, and each slice is
    decoded back to a string with the same encoder.

    Args:
        full_text: The text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        model_name: Model name used to look up the tiktoken encoding.

    Returns:
        A list of chunk strings in document order. The last chunk may hold
        fewer than ``chunk_size`` tokens; empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.

    Note:
        Chunks are cut on token boundaries, not character boundaries, so a
        character that spans several tokens may be split between two chunks
        and decode lossily at the edges (see the tiktoken ``decode`` docs).
    """
    if chunk_size <= 0:
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size!r}')

    # Encode and decode with the same local encoder so the function does not
    # depend on a notebook-level variable defined in another cell.
    encoder_model = tiktoken.encoding_for_model(model_name)
    total_encoding = encoder_model.encode(full_text)

    return [
        encoder_model.decode(total_encoding[start:start + chunk_size])
        for start in range(0, len(total_encoding), chunk_size)
    ]



In [None]:
## Split the merged text into chunks of up to 1500 tokens each
chunk_list = split_text(full_text, 1500)
chunk_list

In [None]:
## Number of chunks produced
len(chunk_list)

# 3. 벡터 데이터베이스에 저장

In [None]:
%pip install chromadb

In [None]:
## Configure the embedding model
import os
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

## The API key is read from the OPENAI_API_KEY environment variable (None when it is unset).
## NOTE(review): `OPEN_API_KEY` and `oepnai_embedding` look like misspellings of
## "OPENAI"/"openai"; the names are kept because a later cell references them.
OPEN_API_KEY = os.getenv('OPENAI_API_KEY')
oepnai_embedding = OpenAIEmbeddingFunction(
    api_key=OPEN_API_KEY, 
    model_name='text-embedding-3-large'
)

In [None]:
## Create the Chroma client
## (presumably an in-memory client whose data is not persisted - confirm in the chromadb docs)
import chromadb
chroma_client = chromadb.Client()

In [None]:
collection_name = 'law_collection'

## Fetch the collection with this name, or create it if it does not exist yet
## (per the method name), attaching the embedding function configured above
law_collection = chroma_client.get_or_create_collection(
    collection_name, 
    embedding_function=oepnai_embedding
)

In [None]:
## Build the string ids to store alongside the chunks: '0', '1', ... in chunk order
id_list = [str(position) for position in range(len(chunk_list))]

## Check the lengths: there must be exactly one id per chunk
len(id_list), len(chunk_list)

In [None]:
## Store every chunk in the collection under its matching id
## (embeddings are presumably computed by the collection's embedding function at this point - confirm)
## NOTE(review): re-running this cell re-submits the same ids; check how Chroma treats duplicate ids
law_collection.add(documents=chunk_list, ids=id_list)

In [None]:
## Display the collection object
law_collection

In [None]:
## List the collections that exist in the Chroma client
collections = chroma_client.list_collections()
collections

# 4. 질문이 있으면, 벡터 데이터베이스에서 유사도 검색

In [None]:
## User question used for both retrieval and answer generation
query = '전세사기피해자 금융지원에 대해 설명해주세요.'

In [None]:
## Similarity search: ask the collection for 3 results for the question
retrieved_doc = law_collection.query(query_texts=query, n_results=3)
retrieved_doc

In [None]:
## The cells below walk through the structure of the query result step by step
type(retrieved_doc)

In [None]:
type(retrieved_doc['documents'])

In [None]:
len(retrieved_doc['documents'])

In [None]:
retrieved_doc['documents']

In [None]:
## NOTE(review): same check as two cells above
type(retrieved_doc['documents'])

In [None]:
## 'documents' is indexed twice below: [0] selects an inner container
## (presumably the results for the first/only query text - confirm in the Chroma docs)
type(retrieved_doc['documents'][0])

In [None]:
len(retrieved_doc['documents'][0])

In [None]:
## First retrieved chunk
retrieved_doc['documents'][0][0]

# 5. 질문 -> 답변

In [None]:
## Retrieved chunks available as context for the answer
retrieved_doc['documents'][0]

In [None]:
from openai import OpenAI

## Client is created without arguments (credentials presumably come from the environment - confirm)
client = OpenAI()

## Ask the model the question, supplying only the first retrieved chunk as [context].
## The whitespace inside the triple-quoted prompt is sent to the model as written.
response = client.responses.create(
    model="gpt-4o",
    instructions=f'''당신은 전세사기피해 법률 전문가입니다. 
        [context]를 참고하여 사용자의 질문에 답변해주세요.
        [context]
        {retrieved_doc['documents'][0][0]}
        ''',
    input=query,
)

print(response.output_text)

In [None]:
## Reach the answer text through the structured output instead of `output_text`
response.output[0].content[0].text

In [None]:
## Display the full response object
response

In [None]:
## Ask the question again, this time supplying every retrieved chunk as context
from openai import OpenAI
client = OpenAI()

## Join the retrieved chunks into plain text before building the prompt.
## Interpolating the list itself would embed its Python repr (brackets, quotes,
## escaped newlines) in the prompt instead of the document text.
context_text = '\n\n'.join(retrieved_doc['documents'][0])

response = client.responses.create(
    model="gpt-4.1",
    input=[
        {"role": "developer",
         "content": f'''당신은 전세사기피해 법률 전문가입니다. 
[context]를 참고하여 사용자의 질문에 답변해주세요.
[context]
{context_text}'''},
        {"role": "user", "content": query}
    ]
)

print(response.output_text)