In [None]:
import os
import sys
from pathlib import Path

# The notebook is expected to run from a folder one level below the repository
# root; put that root on sys.path (once) so `src.*` imports resolve.
repo_dir = Path.cwd().parents[0]
repo_dir_posix = repo_dir.as_posix()
if repo_dir_posix not in sys.path:
    sys.path.append(repo_dir_posix)

# Load PDF test

In [None]:
from src.store import load_pdfs, load_pdfs_use_pypdfium2

In [None]:
def load_pdfs_test(pdfs_path):
    """Load the PDFs found directly under ``pdfs_path`` and print their pages.

    Only directory entries whose name ends with ``.pdf`` (case-sensitive,
    non-recursive) are passed to ``load_pdfs_use_pypdfium2``.

    Args:
        pdfs_path: Directory to scan for PDF files.

    Returns:
        The list returned by ``load_pdfs_use_pypdfium2``. Each item is indexed
        here by ``"file_name"``, ``"first_page"`` and ``"other_pages"``; the
        page objects expose ``metadata["page_number"]`` and ``page_content``.
    """
    pdf_file_paths = [
        os.path.join(pdfs_path, entry)
        for entry in os.listdir(pdfs_path)
        if entry.endswith(".pdf")
    ]
    loaded = load_pdfs_use_pypdfium2(pdf_file_paths)

    for record in loaded:
        print("====== ファイル名:", record["file_name"], "======")

        # The first page is stored separately from the remaining pages.
        print("[1ページ目の情報]")
        first = record["first_page"]
        print("  page_number:", first.metadata["page_number"])
        print("  content:", first.page_content)

        print("[2ページ目以降]")
        for page in record["other_pages"]:
            print("  page_number:", page.metadata["page_number"])
            print("  content:", page.page_content)

    return loaded

In [None]:
# Run the loader check against <repo>/data/documents; the returned list is the cell output.
load_pdfs_test(repo_dir / "data" / "documents")

# Preprocessing

In [None]:
from src.store import format_texts_with_gpt4o

In [None]:
# Source PDFs live in <repo>/data/documents.
pdfs_path = repo_dir / "data" / "documents"

# Non-recursive listing; only names ending in ".pdf" (case-sensitive) are kept.
pdf_file_paths = [
    os.path.join(pdfs_path, name) for name in os.listdir(pdfs_path) if name.endswith(".pdf")
]

pdf_data_list = load_pdfs_use_pypdfium2(pdf_file_paths)

# Hand the loaded PDFs to the formatter, with output_dir set to <repo>/data/formatted_texts.
format_texts_with_gpt4o(
    pdf_data_list,
    output_dir=repo_dir / "data" / "formatted_texts",
)

# FAQ

In [None]:
from src.store import load_pdfs, load_pdfs_use_pypdfium2
from src.store import summarize_formatted_texts_with_gpt4o, summarize_texts_with_gpt4o

In [None]:
# Re-load the PDFs from <repo>/data/documents so this section runs on its own.
pdfs_path = repo_dir / "data" / "documents"
pdf_file_paths = [
    os.path.join(pdfs_path, name) for name in os.listdir(pdfs_path) if name.endswith(".pdf")
]
pdf_data_list = load_pdfs_use_pypdfium2(pdf_file_paths)

# Second positional argument is the <repo>/data/summarized_texts directory.
summarize_texts_with_gpt4o(
    pdf_data_list,
    repo_dir / "data" / "summarized_texts",
)

# Extract words

In [None]:
from src.store import extract_words_with_llmchain

In [None]:
# CSV inputs are read from <repo>/data/summarized_texts (non-recursive, ".csv" suffix only).
csv_dir = repo_dir / "data" / "summarized_texts"
csv_file_paths = [
    os.path.join(csv_dir, name) for name in os.listdir(csv_dir) if name.endswith(".csv")
]

# The second argument is passed as a plain string path to <repo>/data/words.json.
extract_words_with_llmchain(
    csv_file_paths,
    str(repo_dir / "data" / "words.json"),
)

# Vectorize

## Process JSON data

In [None]:
from src.store import process_and_build_json_vector_store

In [None]:
# Directory where the "words" vector store is saved (passed on as a plain string).
vector_store_dir = str(repo_dir.joinpath("data", "vectorize_dir", "words"))
collection_name = "words"

# Set to True to run the build; left False so that re-running the notebook skips it.
vectorize_json = False
if vectorize_json:
    process_and_build_json_vector_store(
        # Resolved from repo_dir (the same data/words.json that the
        # "Extract words" step targets) instead of a machine-specific
        # absolute path, so the notebook works from any checkout location.
        json_file=str(repo_dir.joinpath("data", "words.json")),
        vector_store_dir=vector_store_dir,
        collection_name=collection_name)

In [None]:
from src.retrieve import retrieve_vector_store

In [None]:
query = "東洋エンジニアリングの独自開発のスケジュール最適化システムの名前は？"
# Only the first 20 characters of the question are sent to the retriever.
shortened_query = query[:20]

# Look up the three best matches in the "words" store, with similarity scores.
results = retrieve_vector_store(
    query=shortened_query,
    vector_store_dir=vector_store_dir,
    collection_name=collection_name,
    top_k=3,
    calculate_score=True,
)

print("[JSONベクトルストア] 検索結果:")
for i, result in enumerate(results, start=1):
    # One hit per group of three lines: content, source file name, similarity.
    print(
        f"{i}: コンテンツ: {result['content']}",
        f"   ファイル名: {result['file_name']}",
        f"   類似度スコア: {result['similarity']}",
        sep="\n",
    )

## Build bi-vector stores

In [None]:
from src.store import build_bi_vector_stores
from src.retrieve import retrieve_vector_store

In [None]:
# Inputs: every ".csv" directly under <repo>/data/summarized_texts.
csv_dir = repo_dir / "data" / "summarized_texts"
csv_file_paths = [
    os.path.join(csv_dir, name) for name in os.listdir(csv_dir) if name.endswith(".csv")
]

# All bi-FAQ stores share this output root.
output_dir = repo_dir / "data" / "vectorize_dir" / "bi-faq"

# Set to True to run the build; left False so that re-running the notebook skips it.
vectorize_bi_faq = False
if vectorize_bi_faq:
    for csv_file_path in csv_file_paths:
        build_bi_vector_stores(csv_file=csv_file_path, output_dir=output_dir)

In [None]:
query = "4℃ホールディングスの2024年2月29日現在の連結での従業員数は何名か。"

# Resolve the store location from repo_dir instead of a machine-specific
# absolute path, so the lookup works from any checkout location. It sits under
# the same data/vectorize_dir/bi-faq root that the build step uses as output_dir.
vector_store_dir = str(
    repo_dir.joinpath("data", "vectorize_dir", "bi-faq", "1_summarized", "faq_question")
)

# No collection_name is passed here, so the retriever's default applies.
results = retrieve_vector_store(
    query=query,
    vector_store_dir=vector_store_dir,
    top_k=3,
    calculate_score=True
)

# NOTE(review): the label below still says "JSON vector store" although this
# cell queries the bi-faq/faq_question store; it is left unchanged here.
print("[JSONベクトルストア] 検索結果:")
for i, result in enumerate(results, start=1):
    print(f"{i}: コンテンツ: {result['content']}")
    print(f"   ファイル名: {result['file_name']}")
    print(f"   類似度スコア: {result['similarity']}")

In [None]:
from src.store import vectorize_faqs_from_csv

In [None]:
# Directory holding the formatted text files.
formatted_texts_dir = repo_dir / "data" / "formatted_texts"
# Directory where the vector store is saved.
vector_store_dir = repo_dir / "data" / "vectorize_dir" / "ver1"
# Chroma collection name.
collection_name = "pdf_pages_ver1"

# Both directories are handed over as plain strings; collection_name already is one.
retriever = vectorize_faqs_from_csv(
    formatted_texts_dir=str(formatted_texts_dir),
    vector_store_dir=str(vector_store_dir),
    collection_name=collection_name,
)