In [3]:
%%capture
!pip install -U transformers accelerate trl bitsandbytes pyarrow peft
!pip install langchain langchain-community langchain_core pymupdf sentence_transformers faiss-gpu pypdf tabula-py
!pip install rank_bm25

In [7]:
pip list show

Package                          Version
-------------------------------- ---------------------
absl-py                          1.4.0
accelerate                       0.33.0
aiohappyeyeballs                 2.4.0
aiohttp                          3.10.5
aiosignal                        1.3.1
alabaster                        0.7.16
albucore                         0.0.13
albumentations                   1.4.14
altair                           4.2.2
annotated-types                  0.7.0
anyio                            3.7.1
argon2-cffi                      23.1.0
argon2-cffi-bindings             21.2.0
array_record                     0.5.1
arviz                            0.18.0
asn1crypto                       1.5.1
astropy                          6.1.2
astropy-iers-data                0.2024.8.26.0.31.57
astunparse                       1.6.3
async-timeout                    4.0.3
atpublic                         4.1.0
attrs                            24.2.0
audioread              

# 환경세팅
- 현재 파일 경로들이 상대 경로로 지정되어 있음 -- 마무리 작업 때는 절대 경로로 변경
- 클래스화는 제출할 때 적용. 아직 실험이 안 끝난 Prompt Engineering, Inference는 모듈화 진행 X (모델, 프롬프트 고정되면 진행)

In [4]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    Gemma2ForCausalLM
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
from langchain_community.embeddings import HuggingFaceEmbeddings

import os, torch
import pandas as pd
import numpy as np
import os
import json
from tqdm import tqdm

In [None]:
from getpass import getpass

from huggingface_hub import login

# Read the token without echoing it, so the secret is never rendered in the
# cell's input prompt or stored in the notebook when it is saved or shared.
hf_token = getpass("hugging face token 입력 :")
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
import pandas as pd

# Load the test set; later cells read its SAMPLE_ID, Source and Question columns.
test = pd.read_csv('./data/test.csv')

In [None]:
# Load the JSON file that stores the per-document option values.
with open("pdf_opt.json", mode="r", encoding="utf-8") as opt_file:
    pdfs_opt = json.load(opt_file)

In [None]:
# Create the tokenizer for the generation checkpoint; it is also handed to
# pdf_to_chunk when the retrievers are built.
base_model_url = "rtzr/ko-gemma-2-9b-it"
tokenizer = AutoTokenizer.from_pretrained(base_model_url)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Load the document-embedding model (BAAI/bge-m3) on the GPU.
embedding_model = 'BAAI/bge-m3'
model_kwargs = {'device':'cuda'}
# Embeddings are left un-normalized.
# NOTE(review): if the vector store built inside chunk_to_retriever scores by
# inner product, normalization changes the ranking — confirm the intended metric.
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
    )

  warn_deprecated(


# 1. Retriever (문서 별로 처리)
- 문서 전처리(replace 등) 하는 내용 추가해줘야 함 (pdf_to_chunk)
- rerank도 문서별로 몇 개는 적용하고 몇 개는 적용 안 할지? (따로 적용할거면 pdfs_opt.json에 추가해두기)
- source 별로 retriever dict에 저장해두고 사용해야 함

In [None]:
from pdf_to_retriever import pdf_to_chunk, chunk_to_retriever

In [None]:
%%time
# 전체 문서 retriever 일괄 생성
pdf_database = {}
source_path = './data/test_source/'

for source in test.Source.unique() :
    pdf_path = source_path + source + '.pdf'
    chunk_documents = pdf_to_chunk(pdf_path, pdfs_opt, tokenizer)
    retriever = chunk_to_retriever(chunk_documents, pdf_path, pdfs_opt, embeddings)

    # 결과 저장
    pdf_database[source] = retriever

Processing 「FIS 이슈&포커스」 22-2호 《재정성과관리제도》.pdf...
ensemble retriever 생성
rerank retriever 생성
Processing 「FIS 이슈 & 포커스」(신규) 통권 제1호 《우발부채》.pdf...
ensemble retriever 생성
rerank retriever 생성
CPU times: user 5.93 s, sys: 376 ms, total: 6.3 s
Wall time: 7.15 s


# 2. Model Load

In [None]:
# Same checkpoint id that the tokenizer cell assigned earlier; re-declared here
# for the model-loading section.
base_model_url = "rtzr/ko-gemma-2-9b-it" # kogemma

In [None]:
# bitsandbytes settings: 4-bit loading, NF4 quantization type, double
# quantization enabled, bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Re-creates the tokenizer from the same checkpoint id used by the earlier
# tokenizer cell.
tokenizer = AutoTokenizer.from_pretrained(base_model_url)

In [None]:
# Load the 4-bit quantized generation model onto GPU 0.
# - torch_dtype follows the quantization compute dtype, so the non-quantized
#   modules and the 4-bit compute path share one dtype instead of mixing
#   float16 with bfloat16.
# - trust_remote_code is not passed: Gemma2ForCausalLM is a concrete class and
#   transformers reports that the flag is ignored outside the Auto classes.
model = Gemma2ForCausalLM.from_pretrained(
    base_model_url,
    quantization_config=quantization_config,
    device_map={"": 0},
    torch_dtype=quantization_config.bnb_4bit_compute_dtype,
)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

# 2. Prompt Engineering

In [5]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [None]:
# Instruction prompt (Korean) with {context} and {question} placeholders.
# The text ends with "답변:" followed by a newline and four spaces; the
# inference cell splits the decoded output on exactly that marker, so the
# ending of this string and that split must be kept in sync.
prompt_text = """
    주어진 정보를 바탕으로 주어진 질문에 대해 답변을 생성하세요.
    질문의 주어를 포함해 완성된 문장으로 대답해주세요.
    모든 답변은 격식체, 존댓말로 완성된 문장으로 대답해주세요.
    관련된 문서 내용은 모두 반영해 대답해주세요.
    수치, 값은 문서에 나온 표현을 활용해 답변해주세요.
    주어진 질문 외 추가 질문을 생성하지 마세요.  :

    문맥: {context}
    질문: {question}
    답변:
    """
# Label placed in the header of each retrieved passage by format_docs.
context_prompt = '문서'

In [None]:
# TODO: adjust the PromptTemplate per model.
# Wraps prompt_text so format_docs can fill in the context and question.
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_text
)

In [None]:
# TODO: revisit the template and check what the maximum value of k is.
def format_docs(query, pdf_nm, pdfs_opt, context_prompt):
    """Build the full generation prompt for one question.

    Looks up the retriever registered for ``pdf_nm`` in ``pdf_database``,
    fetches the passages relevant to ``query`` and renders them, each under a
    numbered ``< {context_prompt} N >`` header, into ``prompt_template``.

    Args:
        query: The question text; used for retrieval and as the prompt question.
        pdf_nm: Key of the source document in ``pdf_database``.
        pdfs_opt: Accepted for interface compatibility; not used here.
        context_prompt: Label written into each passage header.

    Returns:
        The formatted prompt produced by ``prompt_template.format``.
    """
    retrieved = pdf_database[pdf_nm].get_relevant_documents(query, consider_metadata=False)

    # One numbered section per retrieved passage, numbering starts at 1.
    sections = [
        f"""
        < {context_prompt} {rank} >
        {passage.page_content}

        """
        for rank, passage in enumerate(retrieved, start=1)
    ]

    return prompt_template.format(context="".join(sections), question=query)

# 3. Inference

## Make Answer
- 모델 답변 별로 전처리 방식 달라질 수 있음
- pdfs_opt input으로 안 넣고 만들어보기

In [None]:
def make_answer(query, pdf_nm, pdfs_opt, context_prompt, max_new_tokens=300):
    """Generate the model's text for one question.

    Args:
        query: The question text.
        pdf_nm: Key of the source document (a value of ``test.Source``).
        pdfs_opt: Per-document options, forwarded to ``format_docs``.
        context_prompt: Label for the retrieved-passage headers, forwarded to
            ``format_docs``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first sequence returned by ``model.generate``, decoded with
        special tokens removed. The caller isolates the answer by splitting
        this text on the prompt's answer marker.
    """
    prompt = format_docs(query, pdf_nm, pdfs_opt, context_prompt)

    # Tokenize once and place the batch on the device the model was loaded to.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # max_new_tokens bounds only the generated part. Deriving max_length from a
    # second, untruncated tokenization of the prompt cost an extra pass and
    # gave the wrong budget whenever the prompt had been truncated.
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, num_return_sequences=1)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
def _extract_answer(generated_text):
    """Cut the answer sentence out of the decoded text returned by make_answer."""
    # Keep what follows the last occurrence of the prompt's answer marker.
    answer = generated_text.split("답변:\n    ")[-1]
    # Drop any follow-up question the model appended after its answer.
    answer = answer.split('질문')[0]
    # Strip before taking the first line: a completion that begins with a
    # newline would otherwise be reduced to an empty answer.
    return answer.strip().split('\n')[0].strip()


results = []
for _, row in tqdm(test.iterrows(), total=len(test), desc="Answering Questions"):
    source = row['Source']
    query = row['Question']

    answer = _extract_answer(make_answer(query, source, pdfs_opt, context_prompt))

    # Printed for manual inspection while experimenting.
    print(f'Question: {query}')
    print(f'Answer: {answer}')

    # Collect one record per question.
    results.append({
        'SAMPLE_ID': row['SAMPLE_ID'],
        'Source' : source,
        'Question': query,
        'Answer': answer
    })

In [None]:
results_df = pd.DataFrame(results)
# Placeholder file name ("file name to save"); replace before running.
save_nm = '저장 파일명.csv'
# to_csv does not create missing directories; make sure the target exists so
# the answers from the inference run are not lost at save time.
os.makedirs('./results', exist_ok=True)
results_df.to_csv('./results/'+save_nm, index=False, encoding='utf-8-sig')

### 제출용 답안 생성

In [None]:
# The submission file keeps only the sample id and the answer.
submit_df = results_df[['SAMPLE_ID', 'Answer']]
# to_csv does not create missing directories; create the target if needed.
os.makedirs('./submits', exist_ok=True)
submit_df.to_csv('./submits/'+'[Submit] '+save_nm, index=False, encoding='utf-8-sig')