In [17]:
from bs4 import BeautifulSoup
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document

In [None]:
# HTML content pages to index, grouped by their folder under
# ./templates/contents. The comprehension expands each (folder, pages) group
# into full relative paths, preserving the listed order.
mainPaths = [
    f'./templates/contents/{folder}/{page}.html'
    for folder, pages in (
        ('contents_1', (
            '귀어귀촌개념',
            '귀어귀촌준비절차',
            '어업면허허가',
            '어업정의',
        )),
        ('contents_2', (
            '귀어정책지원교육',
            '양식장및어선임대사업',
            '어촌이사혜택',
            '청년어촌정착지원금',
        )),
        ('contents_5', (
            '어업종류',
            '낚시및해루질_금어기',
            '낚시및해루질_낚시금지구역',
            '낚시및해루질_종류별낚시설명및초보자팁',
            '낚시및해루질_지역별낚시명소',
            '낚시및해루질_지역별낚시용품점',
        )),
    )
    for page in pages
]

# Extract (title, text chunk) rows from every HTML page in `mainPaths`.
# Each row is [title, chunk, folder, page]; the rows are collected in
# `contentList` and turned into the DataFrame `df` at the end.
file_path = mainPaths
contentList = []

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,     # chunk length
    chunk_overlap=50,   # overlap between consecutive chunks
    separators=["\n\n", "\n", ".", " ", ""],  # split priority, tried in order
)

for path in file_path:
    with open(path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    # The parent folder name and the file name without extension are stored
    # with every row so a chunk can be traced back to its page.
    folder = os.path.basename(os.path.dirname(path))
    page = os.path.splitext(os.path.basename(path))[0]
    soup = BeautifulSoup(html_content, 'html.parser')

    for section in soup.select('[name="section"]'):
        titles = section.select('[name="content_title"]')
        contents = section.select('[name="content_text"]')

        if not titles or not contents:
            # Report the path itself; the file object is already closed here
            # and would only print its repr.
            print(f"{path}의 제목 또는 내용이 없습니다.")
            continue

        # Pair titles with contents positionally. zip() stops at the shorter
        # list, so sections with unequal counts (e.g. one title but several
        # content elements) still contribute their matched pairs instead of
        # being reported as missing.
        for title_tag, content_tag in zip(titles, contents):
            title = title_tag.get_text(strip=True)
            # NOTE(review): get_text(strip=True) joins text nodes without a
            # separator, so the "\n\n" / "\n" separators above may never
            # match - confirm whether a separator should be passed here.
            content = content_tag.get_text(strip=True)
            for chunk in splitter.split_text(content):
                contentList.append([title, chunk, folder, page])

df = pd.DataFrame(contentList, columns=["title", "content", "folder", "page"])
df



Unnamed: 0,title,content,folder,page
0,지역별 낚시용품점,낚시를 하기 위해서는 여러가지 낚시에 필요한 장비들이 있습니다.이곳에 지역별로 몇몇...,contents_5,낚시및해루질_지역별낚시용품점
1,1. 경기·인천·서울,1. 아오맥스(평택시 청북읍)낚시대 개발·제조·판매031-663-78982. 런커 ...,contents_5,낚시및해루질_지역별낚시용품점
2,2. 부산·울산·경상,1. 유승조구산업(부산 사하구)낚시바늘 제조2. 긱스(부산 수영구)장비 및 콘텐츠 ...,contents_5,낚시및해루질_지역별낚시용품점
3,2. 부산·울산·경상,. 충청도1. 새서울낚시(당진시 송악읍)종합 낚시용품점041-357-58052. 당...,contents_5,낚시및해루질_지역별낚시용품점
4,3. 전라도,"1. 여수만물상낚시(여수시 이순신광장로 67 (중앙동))선상낚시, 문어·쭈꾸미·갑오...",contents_5,낚시및해루질_지역별낚시용품점
5,4. 충청도,1. 새서울낚시(당진시 송악읍)종합 낚시용품점041-357-58052. 당진낚시할인...,contents_5,낚시및해루질_지역별낚시용품점
6,5. 제주도,1. 탐라낚시(제주도 이도2동 413-1)종합 낚시 용품점064-721-78262....,contents_5,낚시및해루질_지역별낚시용품점


In [19]:
# 2. Build Document objects: one per DataFrame row, with the chunk text as
# the page content and title / folder / page carried along as metadata.
documents = [
    Document(
        page_content=text,
        metadata={"title": title, "folder": folder, "page": page},
    )
    for title, text, folder, page in zip(
        df['title'], df['content'], df['folder'], df['page']
    )
]

# Embed the documents with the Korean sentence-embedding model and build a
# FAISS index from them.
embedding_model = HuggingFaceEmbeddings(
    model_name="jhgan/ko-sroberta-multitask",
)
vectorstore = FAISS.from_documents(
    documents=documents,
    embedding=embedding_model,
)

# Persist the index to disk.
vectorstore.save_local("chatbot/vector_store/contents")