# Question - Answering with Retrieval

본 대회의 과제는 중앙정부 재정 정보에 대한 **검색 기능**을 개선하고 활용도를 높이는 질의응답 알고리즘을 개발하는 것입니다. <br>이를 통해 방대한 재정 데이터를 일반 국민과 전문가 모두가 쉽게 접근하고 활용할 수 있도록 하는 것이 목표입니다. <br><br>
베이스라인에서는 평가 데이터셋만을 활용하여 source pdf 마다 Vector DB를 구축한 뒤 langchain 라이브러리와 llama-2-ko-7b 모델을 사용하여 RAG 프로세스를 통해 추론하는 과정을 담고 있습니다. <br>( train_set을 활용한 훈련 과정은 포함하지 않으며, test_set  에 대한 추론만 진행합니다. )

# Download Library

In [1]:
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install transformers[torch] -U

!pip install datasets
!pip install langchain
!pip install langchain_community
!pip install PyMuPDF
!pip install sentence-transformers
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple/
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.3
Collecting transformers[torch]
  Downloading transformers-4.43.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.43.3-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: tr

# Import Library

In [2]:
import os
import unicodedata

import torch
import pandas as pd
from tqdm import tqdm
import fitz  # PyMuPDF

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig
)
from accelerate import Accelerator

# Langchain 관련
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

2024-08-04 03:31:55.865116: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-04 03:31:55.865288: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-04 03:31:56.039654: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
!pip install kiwipiepy rank_bm25 openai tiktoken

Collecting kiwipiepy
  Downloading kiwipiepy-0.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting openai
  Downloading openai-1.38.0-py3-none-any.whl.metadata (22 kB)
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting kiwipiepy-model<0.19,>=0.18 (from kiwipiepy)
  Downloading kiwipiepy_model-0.18.0.tar.gz (34.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.7/34.7 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Downloading kiwipiepy-0.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading rank_bm25-0.2.2-py3-none-any.whl (8

In [4]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [5]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.2-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading 

# Vector DB

In [6]:
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from konlpy.tag import Kkma, Okt
from kiwipiepy import Kiwi

kiwi = Kiwi()
kkma = Kkma()
okt = Okt()

In [7]:
def kiwi_tokenize(text):
    return [token.form for token in kiwi.tokenize(text)]

def kkma_tokenize(text):
    return [token for token in kkma.morphs(text)]

def okt_tokenize(text):
    return [token for token in okt.morphs(text)]

In [10]:
# def process_pdf(file_path, chunk_size=1500, chunk_overlap=200):
#     """PDF 텍스트 추출 후 chunk 단위로 나누기"""
#     # PDF 파일 열기
#     doc = fitz.open(file_path)
#     text = ''
#     # 모든 페이지의 텍스트 추출
#     for page in doc:
#         text += page.get_text()
#     # 텍스트를 chunk로 분할
#     splitter = RecursiveCharacterTextSplitter(
#         chunk_size=chunk_size,
#         chunk_overlap=chunk_overlap
#     )
#     chunk_temp = splitter.split_text(text)
#     # Document 객체 리스트 생성
#     chunks = [Document(page_content=t) for t in chunk_temp]
#     return chunks

import pdfplumber
from langchain.schema import Document

import os
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def process_pdf(file_path, chunk_size=200, chunk_overlap=20):
        """PDF를 페이지마다 청크로 나누고 메타데이터에 파일 이름 추가"""
        # 파일 이름 추출
        file_name = os.path.basename(file_path)
        
        # PDF 파일 열기
        pdf = pdfplumber.open(file_path)
        all_chunks = []
        
        # 페이지별로 처리
        for page_number, page in enumerate(pdf.pages):
            text = page.extract_text()
            if text:
                # 페이지별 텍스트 청크로 분할
                splitter = RecursiveCharacterTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap
                )
                chunk_temp = splitter.split_text(text)
                
                # Document 객체 리스트 생성 (파일 이름과 페이지 번호를 메타데이터에 포함)
                page_chunks = [Document(page_content=t, metadata={"Source": file_name[:-4], "page": page_number}) for t in chunk_temp]
                all_chunks.extend(page_chunks)
        
        pdf.close()  # PDF 파일 닫기
        return all_chunks


def create_vector_db(chunks, model_path="jhgan/ko-sroberta-multitask"):
    """FAISS DB 생성"""
    # 임베딩 모델 설정
    model_kwargs = {'device': 'cpu'}
    encode_kwargs = {'normalize_embeddings': True}
    embeddings = HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
    # FAISS DB 생성 및 반환
    db = FAISS.from_documents(chunks, embedding=embeddings)
    return db

def normalize_path(path):
    """경로 유니코드 정규화"""
    return unicodedata.normalize('NFC', path)


def process_pdfs_from_dataframe(df, base_directory):
    """딕셔너리에 pdf명을 키로해서 DB, retriever 저장"""
    pdf_databases = {}
    unique_paths = df['Source_path'].unique()
    
    for path in tqdm(unique_paths, desc="Processing PDFs"):
        # 경로 정규화 및 절대 경로 생성
        normalized_path = normalize_path(path)
        full_path = os.path.normpath(os.path.join(base_directory, normalized_path.lstrip('./'))) if not os.path.isabs(normalized_path) else normalized_path
        
        pdf_title = os.path.splitext(os.path.basename(full_path))[0]
        print(f"Processing {pdf_title}...")
        
        # PDF 처리 및 벡터 DB 생성
        chunks = process_pdf(full_path)
#         bm25 = BM25Retriever.from_documents(chunks)
#         kiwi_bm25 = BM25Retriever.from_documents(chunks, preprocess_func=kiwi_tokenize,  search_kwargs={'k': 20})
        kkma_bm25 = BM25Retriever.from_documents(chunks, preprocess_func=kkma_tokenize,  search_kwargs={'k': 20})
        okt_bm25 = BM25Retriever.from_documents(chunks, preprocess_func=okt_tokenize,  search_kwargs={'k': 20})
        db = create_vector_db(chunks)
        faiss = db.as_retriever(search_kwargs={'k': 20})
        
        # Retriever 생성
        retriever = EnsembleRetriever(
                    retrievers=[okt_bm25, faiss],  # 사용할 검색 모델의 리스트
                    weights=[0.3, 0.7],  # 각 검색 모델의 결과에 적용할 가중치
                    search_type="mmr",  # 검색 결과의 다양성을 증진시키는 MMR 방식을 사용
                    search_kwargs={'k': 20, 'fetch_k': 20}, 
                )
        
        
        # 결과 저장
        pdf_databases[pdf_title] = {
                'db': db,
                'retriever': retriever
        }
    return pdf_databases

# DB 생성

In [9]:
# # Train과 Test CSV 파일 모두 로드
# df_train = pd.read_csv('/kaggle/input/pdf-files/train.csv')
# df_test = pd.read_csv('/kaggle/input/pdf-files/test.csv')

# # 두 데이터프레임 합치기
# df_combined = pd.concat([df_train, df_test], ignore_index=True)

# # 중복된 Source_path 제거 (같은 PDF가 train과 test에 모두 있을 경우)
# df_combined = df_combined.drop_duplicates(subset=['Source_path'])

# base_directory = '/kaggle/input/pdf-files' # Your Base Directory
# pdf_databases = process_pdfs_from_dataframe(df_combined, base_directory)

In [None]:
%pip install --upgrade --quiet  sentence-transformers > /dev/null

In [None]:
# base_directory = '/kaggle/input/pdf-files' # Your Base Directory
# df = pd.read_csv('/kaggle/input/pdf-files/test.csv')
# pdf_databases = process_pdfs_from_dataframe(df, base_directory)
# # pdf_databases = process_pdfs_from_dataframe(df, base_directory)

In [12]:
# import pandas as pd

# # df_train = pd.read_csv('/kaggle/input/pdf-files/train.csv')
# df_test = pd.read_csv('/kaggle/input/pdf-files/test.csv')

# def group_pdfs(df):
#     """비슷한 PDF 파일을 그룹화"""
#     groups = {}
#     for _, row in df.iterrows():
#         source_path = row['Source_path']
#         group_key = source_path.split('_')[0]  # 파일 이름의 특정 부분을 그룹 키로 사용 (예: 'group1_file1.pdf' -> 'group1')
#         if group_key not in groups:
#             groups[group_key] = []
#         groups[group_key].append(source_path)
#     return groups

# df_combined = pd.concat([df_train, df_test], ignore_index=True)
# df_combined = df_combined.drop_duplicates(subset=['Source_path'])
# pdf_groups = group_pdfs(df_combined)
# pdf_groups

{'./train': ['./train_source/1-1 2024 주요 재정통계 1권.pdf',
  './train_source/2024 나라살림 예산개요.pdf',
  './train_source/재정통계해설.pdf',
  './train_source/국토교통부_전세임대(융자).pdf',
  './train_source/고용노동부_청년일자리창출지원.pdf',
  './train_source/고용노동부_내일배움카드(일반).pdf',
  './train_source/보건복지부_노인일자리 및 사회활동지원.pdf',
  './train_source/중소벤처기업부_창업사업화지원.pdf',
  './train_source/보건복지부_생계급여.pdf',
  './train_source/국토교통부_소규모주택정비사업.pdf',
  './train_source/국토교통부_민간임대(융자).pdf',
  './train_source/고용노동부_조기재취업수당.pdf',
  './train_source/2024년도 성과계획서(총괄편).pdf',
  './train_source/조세지출_연계관리.pdf',
  './train_source/재정융자사업.pdf',
  './train_source/월간 나라재정 2023년 12월호.pdf'],
 './test': ['./test_source/중소벤처기업부_혁신창업사업화자금(융자).pdf',
  './test_source/보건복지부_부모급여(영아수당) 지원.pdf',
  './test_source/보건복지부_노인장기요양보험 사업운영.pdf',
  './test_source/산업통상자원부_에너지바우처.pdf',
  './test_source/국토교통부_행복주택출자.pdf',
  './test_source/재정조정제도.pdf',
  './test_source/핵심재정사업성과관리.pdf',
  './test_source/재정성과관리제도.pdf',
  './test_source/우발부채.pdf']}

In [14]:
import pickle
import os

def save_databases(pdf_databases, save_dir):
    """벡터 데이터베이스와 retriever 저장"""
    os.makedirs(save_dir, exist_ok=True)
    for pdf_title, data in pdf_databases.items():
        db_path = os.path.join(save_dir, f"{pdf_title}_db.pkl")
        retriever_path = os.path.join(save_dir, f"{pdf_title}_retriever.pkl")
        
        # DB 저장
        data['db'].save_local(db_path)
        
        # Retriever 저장
        with open(retriever_path, 'wb') as f:
            pickle.dump(data['retriever'], f)
        
    print(f"Databases and retrievers saved in {save_dir}")
    
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

def load_databases(load_dir, model_path="jhgan/ko-sroberta-multitask"):
    """저장된 벡터 데이터베이스와 retriever 로드"""
    pdf_databases = {}
    
    # 임베딩 모델 설정
    model_kwargs = {'device': 'cpu'}
    encode_kwargs = {'normalize_embeddings': True}
    embeddings = HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
    
    for filename in os.listdir(load_dir):
        if filename.endswith("_db.pkl"):
            pdf_title = filename[:-7]  # Remove "_db.pkl"
            db_path = os.path.join(load_dir, filename)
            retriever_path = os.path.join(load_dir, f"{pdf_title}_retriever.pkl")
            
            # DB 로드 (allow_dangerous_deserialization 파라미터 추가)
            db = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
            
            # Retriever 로드
            with open(retriever_path, 'rb') as f:
                retriever = pickle.load(f)
            
            pdf_databases[pdf_title] = {
                'db': db,
                'retriever': retriever
            }
    
    print(f"Loaded {len(pdf_databases)} databases from {load_dir}")
    return pdf_databases


# # # 데이터베이스 생성 후 저장
# Train과 Test CSV 파일 모두 로드

df_test = pd.read_csv('/kaggle/input/pdf-files/test.csv')

base_directory = '/kaggle/input/pdf-files' # Your Base Directory
pdf_databases = process_pdfs_from_dataframe(df_test, base_directory)

save_dir = '/kaggle/working/'
save_databases(pdf_databases, save_dir)

# 나중에 데이터베이스 로드
pdf_databases = load_databases(save_dir)

Processing PDFs:   0%|          | 0/9 [00:00<?, ?it/s]

Processing 중소벤처기업부_혁신창업사업화자금(융자)...


  warn_deprecated(


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/744 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/585 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Processing PDFs:  11%|█         | 1/9 [01:01<08:14, 61.80s/it]

Processing 보건복지부_부모급여(영아수당) 지원...


Processing PDFs:  22%|██▏       | 2/9 [01:12<03:41, 31.69s/it]

Processing 보건복지부_노인장기요양보험 사업운영...


Processing PDFs:  33%|███▎      | 3/9 [01:33<02:41, 26.99s/it]

Processing 산업통상자원부_에너지바우처...


Processing PDFs:  44%|████▍     | 4/9 [02:03<02:21, 28.20s/it]

Processing 국토교통부_행복주택출자...


Processing PDFs:  56%|█████▌    | 5/9 [02:20<01:36, 24.13s/it]

Processing 재정조정제도...


Processing PDFs:  67%|██████▋   | 6/9 [03:19<01:48, 36.02s/it]

Processing 핵심재정사업성과관리...


Processing PDFs:  78%|███████▊  | 7/9 [04:22<01:29, 44.66s/it]

Processing 재정성과관리제도...


Processing PDFs:  89%|████████▉ | 8/9 [05:05<00:44, 44.26s/it]

Processing 우발부채...


Processing PDFs: 100%|██████████| 9/9 [06:00<00:00, 40.06s/it]


Databases and retrievers saved in /kaggle/working/
Loaded 9 databases from /kaggle/working/


In [None]:
# # save_dir = '/kaggle/working/'
# # # save_databases(pdf_databases, save_dir)
# import pickle

# def load_databases(load_dir, model_path="paraphrase-multilingual-mpnet-base-v2"):
#     """저장된 벡터 데이터베이스와 retriever 로드"""
#     pdf_databases = {}
    
#     # 임베딩 모델 설정
#     model_kwargs = {'device': 'cpu'}
#     encode_kwargs = {'normalize_embeddings': True}
#     embeddings = HuggingFaceEmbeddings(
#         model_name=model_path,
#         model_kwargs=model_kwargs,
#         encode_kwargs=encode_kwargs
#     )
    
#     for filename in os.listdir(load_dir):
#         if filename.endswith("_db.pkl"):
#             pdf_title = filename[:-7]  # Remove "_db.pkl"
#             db_path = os.path.join(load_dir, filename)
#             retriever_path = os.path.join(load_dir, f"{pdf_title}_retriever.pkl")
            
#             # DB 로드 (allow_dangerous_deserialization 파라미터 추가)
#             db = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
            
#             # Retriever 로드
#             with open(retriever_path, 'rb') as f:
#                 retriever = pickle.load(f)
            
#             pdf_databases[pdf_title] = {
#                 'db': db,
#                 'retriever': retriever
#             }
    
#     print(f"Loaded {len(pdf_databases)} databases from {load_dir}")
#     return pdf_databases

# save_dir = '/kaggle/working/'
# # # 나중에 데이터베이스 로드
# pdf_databases = load_databases(save_dir)

# MODEL Import

In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# from huggingface_hub import login

# # 인증 토큰 설정
# login(token='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')

# def setup_llm_pipeline():
#     # 모델 ID 
#     model_id = "MLP-KTLim/llama-3-Korean-Bllossom-8B"

#     # 토크나이저 로드 및 설정
#     tokenizer = AutoTokenizer.from_pretrained(model_id)
#     tokenizer.use_default_system_prompt = False

#     # 모델 로드
#     model = AutoModelForCausalLM.from_pretrained(
#         model_id,
#         torch_dtype=torch.float16,  # 16비트 부동소수점 사용
#         device_map="auto",
#         trust_remote_code=True )

#     # HuggingFacePipeline 객체 생성
#     text_generation_pipeline = pipeline(
#         model=model,
#         tokenizer=tokenizer,
#         task="text-generation",
#         temperature=0.4,
#         do_sample= True,
#         return_full_text=False,
#         max_new_tokens=512,
#         repetition_penalty=1.2,  # 반복 억제
#         no_repeat_ngram_size=3,  # n-gram 반복 방지
#         num_beams=4,  # beam search 사용
#         early_stopping=True,  
#     )

#     hf = HuggingFacePipeline(pipeline=text_generation_pipeline)

#     return hf


In [None]:
# GPU 메모리 비우기
import gc
del llm
gc.collect()
torch.cuda.empty_cache()

In [6]:
def setup_llm_pipeline():
    # 4비트 양자화 설정
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # 모델 ID 
    model_id = "MLP-KTLim/llama-3-Korean-Bllossom-8B"
#     token='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'
    # 토크나이저 로드 및 설정
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = False

    # 모델 로드 및 양자화 설정 적용
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True )

    # HuggingFacePipeline 객체 생성
    text_generation_pipeline = pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        temperature=0.2,
        do_sample= True,
        return_full_text=False,
        max_new_tokens=512,
        repetition_penalty=1.2,  # 반복 억제
        no_repeat_ngram_size=3,  # n-gram 반복 방지
#         num_beams=4,  # beam search 사용
#         early_stopping=True,  
    )


    hf = HuggingFacePipeline(pipeline=text_generation_pipeline)

    return hf

In [7]:
# LLM 파이프라인
llm = setup_llm_pipeline()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  warn_deprecated(


In [16]:
!pip install groq

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [24]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("GROQ_API_KEY")
secret_value_0

'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'

In [25]:
import os
from groq import Groq

client = Groq(
    api_key=secret_value_0,
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "always answer with korean only ",
            "role": "user",
            "content": "아이브 안유진의 키는 몇 cm? ",
        }
    ],
    model="llama3-70b-8192",
    temperature=1,
    max_tokens=1024,
    top_p=1,
    stream=True,
    stop=None,
)


for chunk in chat_completion:
    print(chunk.choices[0].delta.content or "", end="")

😊

According to various sources, including her profile on the IVE official website and other online platforms, An Yujin's height is 163 cm (5 feet 4 inches).

# Langchain 을 이용한 추론

In [None]:

    # 소스 문자열 정규화
source = normalize_string(df['Source'][0])
question = df['Question'][0]
print(question,'요게 질문')
# 정규화된 키로 데이터베이스 검색
normalized_keys = {normalize_string(k): v for k, v in pdf_databases.items()}
retriever = normalized_keys[source]['retriever']

In [None]:
source

In [None]:
normalized_keys[source]['retriever']

In [None]:
retriever | format_docs

In [None]:
question

In [None]:
from langchain_core.prompts import format_document
retrieved_docs = retriever.invoke(question)
print(retrieved_docs)

print("--------")


reordering = LongContextReorder()
reordered_docs = reordering.transform_documents(retrieved_docs )
print(reordered_docs)
# print("\n\n".join(doc.page_content for doc in reordered_docs))

DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(
    template="{page_content} [source: {soruce}]"
)

def combine_documents(
    docs,  # 문서 목록
    # 문서 프롬프트 (기본값: DEFAULT_DOCUMENT_PROMPT)
    document_prompt=DEFAULT_DOCUMENT_PROMPT,
    document_separator="\n",  # 문서 구분자 (기본값: 두 개의 줄바꿈)
):
    # context 에 입력으로 넣기 위한 문서 병합
    doc_strings = [
        f"[{i}] {format_document(doc, document_prompt)}" for i, doc in enumerate(docs)
    ]  # 각 문서를 주어진 프롬프트로 포맷팅하여 문자열 목록 생성
    return document_separator.join(
        doc_strings
    )  # 포맷팅된 문서 문자열을 구분자로 연결하여 반환

combined = combine_documents(reordered_docs, document_separator="\n")
print(combined)
# retriever = normalized_keys[source]['retriever']
# rag_chain = (
#         retriever | format_docs )
# formatted_docs = rag_chain.invoke(question)
# print(formatted_docs)

In [None]:
question = df['Question'][0]

In [None]:
retriever.invoke(question)

In [None]:
import langchain
langchain.debug = True

print()
    
rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt)
rag_chain.invoke(question)

In [28]:
df = pd.read_csv('/kaggle/input/pdf-files/test.csv')

In [None]:
retriever

In [None]:
 prompt = PromptTemplate.from_template(
        """You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. 
    Answer in Korean.

    #Question: 
    {question} 
    #Context: 
    {context} 

    #Answer:"""
    )


In [33]:
import getpass
import os

os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

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 ························································


In [34]:
%pip install -qU langchain-groq

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [51]:
from langchain_groq import ChatGroq

llm = ChatGroq(
    model="llama-3.1-70b-versatile",
    temperature=0.2,
    max_tokens=1024,
    timeout=None,
    stop=None,
    max_retries=2,
)

In [52]:
from langchain_community.document_transformers import LongContextReorder
from langchain.prompts import ChatPromptTemplate
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda
from langchain_core.prompts import format_document
from langchain_core.prompts import ChatPromptTemplate

def normalize_string(s):
    """유니코드 정규화"""
    return unicodedata.normalize('NFC', s)

# 기본 문서 프롬프트를 생성합니다. (source, metadata 등을 추가할 수 있습니다)
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(
    template="{page_content} [source: {Source}]"
)

def combine_documents(
    docs,  # 문서 목록
    # 문서 프롬프트 (기본값: DEFAULT_DOCUMENT_PROMPT)
    document_prompt=DEFAULT_DOCUMENT_PROMPT,
    document_separator="\n",  # 문서 구분자 (기본값: 두 개의 줄바꿈)
):
    # context 에 입력으로 넣기 위한 문서 병합
    doc_strings = [
        f"[{i}] {format_document(doc, document_prompt)}" for i, doc in enumerate(docs)
    ]  # 각 문서를 주어진 프롬프트로 포맷팅하여 문자열 목록 생성
    return document_separator.join(
        doc_strings
    )  # 포맷팅된 문서 문자열을 구분자로 연결하여 반환


def reorder_documents(docs):
    # 재정렬
    reordering = LongContextReorder()
    reordered_docs = reordering.transform_documents(docs)
    combined = combine_documents(reordered_docs, document_separator="\n")
    return combined




def format_docs(docs):
    """검색된 문서들을 하나의 문자열로 포맷팅"""
        # docs가 리스트가 아닌 경우 (예: Retriever 객체)
    reordering = LongContextReorder()
    reordered_docs = reordering.transform_documents(docs)
    return "\n\n".join(doc.page_content for doc in reordered_docs)

import re

def remove_html_tags(text):
    """HTML 태그를 제거하는 함수"""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)    

def clean_output(output):
    # "질문:" 이후의 텍스트만 반환하고 HTML 태그 제거
    if "Answer:" in output:
        output = output.split("Answer:")[-1].strip()
    return remove_html_tags(output)

# 결과를 저장할 리스트 초기화
results = []
normalized_keys = {normalize_string(k): v for k, v in pdf_databases.items()}
# DataFrame의 각 행에 대해 처리
for _, row in tqdm(df.iterrows(), total=len(df), desc="Answering Questions"):
    # 소스 문자열 정규화
    source = normalize_string(row['Source'])
    question = row['Question']
    # 정규화된 키로 데이터베이스 검색
    normalized_keys = {normalize_string(k): v for k, v in pdf_databases.items()}
    retriever = normalized_keys[source]['retriever']
    
    # RAG 체인 구성
#     prompt = PromptTemplate.from_template(
#        template = """Given this text extracts:
#     {context}

#     -----
#     Please answer the following question:
#     {question}

#     Answer in the following languages: {language}
#     """
#     )
    


    prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant that answers question with {context}.",
        ),
        ("human", "Please answer the following question: {question}. Think step by step. Answer in the following languages: {language}"),
    ]
    )

    # RAG 체인 정의
    rag_chain = (
    {
        "context": itemgetter("question")
        | retriever
        | RunnableLambda(reorder_documents),  # 질문을 기반으로 문맥을 검색합니다.
        "question": itemgetter("question"),  # 질문을 추출합니다.
        "language": itemgetter("language"),  # 답변 언어를 추출합니다.
    }
    | prompt  # 프롬프트 템플릿에 값을 전달합니다.
    | llm
    | StrOutputParser()  # 모델의 출력을 문자열로 파싱합니다.
    )

    # 답변 추론
    print(f"Question: {question}")
    full_response = rag_chain.invoke({"question": question, "language": "KOREAN"})

    # 실제 답변만 추출
    actual_answer = clean_output(full_response)
    print(f"Answer: {actual_answer}\n")

    # 결과 저장
    results.append({
        "Source": row['Source'],
        "Source_path": row['Source_path'],
        "Question": question,
        "Answer": actual_answer  # 실제 답변만 저장
    })


Answering Questions:   0%|          | 0/98 [00:00<?, ?it/s]

Question: 2022년 혁신창업사업화자금(융자)의 예산은 얼마인가요?


Answering Questions:   0%|          | 0/98 [00:00<?, ?it/s]


NotFoundError: Error code: 404 - {'error': {'message': 'The model `llama-3.1-405b-reasoning` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'code': 'model_not_found'}}

In [None]:
pdf_databases.items()

In [None]:
from langchain.schema import Document
from langchain_community.document_transformers import LongContextReorder
import unicodedata
import re

def normalize_string(s):
    """유니코드 정규화"""
    return unicodedata.normalize('NFC', s)

def format_docs(docs):
    """검색된 문서들을 하나의 문자열로 포맷팅하고 소스를 포함"""
    reordering = LongContextReorder()
    reordered_docs = reordering.transform_documents(docs)
    return "\n\n".join(f"Source: {source}\n{doc.page_content}" for doc in reordered_docs)

def remove_html_tags(text):
    """HTML 태그를 제거하는 함수"""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def clean_output(output):
    """"질문:" 이후의 텍스트만 반환하고 HTML 태그 제거"""
    if "답변만 작성하세요:" in output:
        output = output.split("답변만 작성하세요:")[-1].strip()
    return remove_html_tags(output)

# 결과를 저장할 리스트 초기화
results = []
normalized_keys = {normalize_string(k): v for k, v in pdf_databases.items()}

# DataFrame의 각 행에 대해 처리
for _, row in tqdm(df.iterrows(), total=len(df), desc="Answering Questions"):
    # 소스 문자열 정규화
    source = normalize_string(row['Source'])
    question = row['Question']
    print(question, '요게 질문')

    # 정규화된 키로 데이터베이스 검색
    retriever = normalized_keys[source]['retriever']
    
    # RAG 체인 구성
    prompt = PromptTemplate.from_template(
    """당신은 사용자들의 질문과 문맥을 받아 답변을 도와주는 지능형 어시스턴트입니다. 
    반드시 다음의 문맥 조각들만 사용하여 질문에 답변하세요. 단계별로 생각한 후 답변하세요.

    답변을 가짜로 만들어내지 마세요:
     - 만약 문맥에서 질문의 답을 결정할 수 없다면 "그 질문에 대한 답을 결정할 수 없습니다."라고 하세요.
     - 문맥이 비어 있으면 "그 질문에 대한 답을 모릅니다."라고 하세요.

    답변은 반드시 한국어로 하세요. 설명은 필요 없습니다.
    
    예시 1:
    질문 : 2024년도 국세수입 중 일반회계 내국세수입은 몇 조원인가요?
    답변 : 2024년도 일반회계 내국세수입은 321.6조원입니다.
    
    예시 2:
    질문 : 2024년도 세외수입 규모와 구성은 어떤가요?
    답변 : 2024년 세외수입은 일반회계에서 11.2조원, 특별회계에서 17.0조원으로 나타났습니다.


    #문맥: 
    {context}

    #질문:
    {question}

    #답변만 작성하세요:"""
)


    # RAG 체인 정의
    rag_chain = (
    {
        "context": itemgetter("question")
        | faiss
        | RunnableLambda(reorder_documents),  # 질문을 기반으로 문맥을 검색합니다.
        "question": itemgetter("question"),  # 질문을 추출합니다.
        "language": itemgetter("language"),  # 답변 언어를 추출합니다.
    }
    | prompt  # 프롬프트 템플릿에 값을 전달합니다.
#     | ChatOpenAI()  # 언어 모델에 프롬프트를 전달합니다.
#     | StrOutputParser()  # 모델의 출력을 문자열로 파싱합니다.
)

    # 답변 추론
    print(f"Question: {question}")
    full_response = rag_chain.invoke(question)

    # 실제 답변만 추출
    actual_answer = clean_output(full_response)
    print(f"Answer: {actual_answer}\n")

    # 결과 저장
    results.append({
        "Source": row['Source'],
        "Source_path": row['Source_path'],
        "Question": question,
        "Answer": actual_answer  # 실제 답변만 저장
    })


# Submission

In [50]:
# 제출용 샘플 파일 로드
submit_df = pd.read_csv("/kaggle/input/pdf-files/sample_submission.csv")

# 생성된 답변을 제출 DataFrame에 추가
submit_df['Answer'] = [item['Answer'] for item in results]
submit_df['Answer'] = submit_df['Answer'].fillna("데이콘")     # 모델에서 빈 값 (NaN) 생성 시 채점에 오류가 날 수 있음 [ 주의 ]

# 결과를 CSV 파일로 저장
submit_df.to_csv("./baseline_submission_ensembel_405.csv", encoding='UTF-8-sig', index=False)