In [None]:
pip install streamlit

In [2]:
%%writefile chatbot.py

import os
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, DataFrameLoader
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.tools.retriever import create_retriever_tool
from langchain.prompts import ChatPromptTemplate
import tempfile
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.agents import create_tool_calling_agent, AgentExecutor
import pandas as pd
# Set KMP_DUPLICATE_LIB_OK for this process before any index is built.
# NOTE(review): presumably a workaround for the duplicate-OpenMP-runtime abort
# that can occur when FAISS is loaded — confirm it is still required.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# NOTE(review): no .env file or API key is loaded in this cell; the OpenAI key
# is presumably expected to already be present in the environment — confirm.

# Folder settings
folder_path = "./data"  # folder that holds the files to analyze
# Splitter used by the loader functions in this file: 1000-character chunks
# with a 100-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

def load_pdf_with_metadata(file_path):
    """Load one PDF, split it into chunks and tag each chunk with its origin.

    The PDF is read with ``PyPDFLoader`` and split with the module-level
    ``text_splitter``. Every resulting chunk gets ``metadata["source"]`` set
    to the file's base name, and ``metadata["page"]`` is filled with
    ``"Unknown"`` when the loader did not provide a page.

    Args:
        file_path: Path of the PDF file to load.

    Returns:
        The list of chunked documents.
    """
    chunks = PyPDFLoader(file_path).load_and_split(text_splitter)
    source_name = os.path.basename(file_path)
    for chunk in chunks:
        meta = chunk.metadata
        meta["source"] = source_name
        # Keep an existing page value; only fill the gap when it is missing.
        meta.setdefault("page", "Unknown")
    return chunks

def load_excel_with_metadata(file_path):
    """Load every sheet of an Excel workbook as chunked, tagged documents.

    Each sheet is read into a DataFrame whose first column becomes the
    document text (the remaining columns are handled by ``DataFrameLoader``).
    The rows are split with the module-level ``text_splitter`` and every
    chunk is tagged with ``source`` (file base name), ``sheet_name`` and
    ``cell_range``.

    Args:
        file_path: Path of the ``.xlsx`` / ``.xls`` workbook.

    Returns:
        A list with the chunks of all non-empty sheets, in sheet order.
    """
    documents = []
    source_name = os.path.basename(file_path)

    # Open the workbook once and reuse the handle for every sheet; the
    # context manager guarantees the file is closed even if a sheet fails.
    with pd.ExcelFile(file_path) as xls:
        for sheet_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name)

            # A sheet without rows or columns yields no documents, and
            # df.columns[0] would raise IndexError on a column-less sheet.
            if df.empty:
                continue

            # Replace NaN cells with empty strings, matching the CSV loader.
            df = df.fillna("")

            loader = DataFrameLoader(df, page_content_column=df.columns[0])
            sheet_docs = loader.load_and_split(text_splitter)

            # NOTE(review): the range is built from the last column's *label*
            # and the row count, not from a spreadsheet column letter.
            cell_range = f"A1:{df.columns[-1]}{len(df)}"
            for doc in sheet_docs:
                doc.metadata["source"] = source_name
                doc.metadata["sheet_name"] = sheet_name
                doc.metadata["cell_range"] = cell_range
            documents.extend(sheet_docs)

    return documents


import pandas as pd
import os
import chardet

def load_csv_with_metadata(file_path):
    """Load a CSV file as chunked documents tagged with their origin.

    The file's encoding is detected with ``chardet`` from its raw bytes, the
    CSV is read with that encoding, NaN cells are replaced by empty strings,
    and the first column is used as the document text. Chunks are produced
    with the module-level ``text_splitter`` and tagged with ``source`` (file
    base name) and ``cell_range``.

    Args:
        file_path: Path of the CSV file.

    Returns:
        A new list containing the chunked documents.
    """
    # Detect the encoding from the raw bytes of the whole file.
    with open(file_path, "rb") as raw:
        guess = chardet.detect(raw.read())
    encoding = guess["encoding"]

    # Read with the detected encoding and blank out missing values.
    frame = pd.read_csv(file_path, encoding=encoding)
    frame.fillna("", inplace=True)

    content_column = frame.columns[0]
    chunks = DataFrameLoader(
        frame, page_content_column=content_column
    ).load_and_split(text_splitter)

    source_name = os.path.basename(file_path)
    cell_range = f"A1:{frame.columns[-1]}{len(frame)}"
    for chunk in chunks:
        chunk.metadata["source"] = source_name
        chunk.metadata["cell_range"] = cell_range

    return list(chunks)



# # CSV 문서 로드 함수
# def load_csv_with_metadata(file_path):
#     documents = []
#     df = pd.read_csv(file_path)
#     loader = DataFrameLoader(df, page_content_column=df.columns[0])
#     csv_docs = loader.load_and_split(text_splitter)
#     for doc in csv_docs:
#         doc.metadata["source"] = os.path.basename(file_path)
#         doc.metadata["cell_range"] = f"A1:{df.columns[-1]}{len(df)}"  # 추가 셀 범위 정보
#     documents.extend(csv_docs)
#     return documents

def load_documents_from_folder(folder_path):
    """Load every supported document found directly inside a folder.

    Files are dispatched by extension: ``.pdf`` to ``load_pdf_with_metadata``,
    ``.xlsx`` / ``.xls`` to ``load_excel_with_metadata`` and ``.csv`` to
    ``load_csv_with_metadata``. Anything else is ignored. Sub-folders are not
    traversed.

    Args:
        folder_path: Directory to scan.

    Returns:
        A list with the documents of all supported files, in file-name order.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sort so the resulting
    # document order is reproducible between runs.
    for file_name in sorted(os.listdir(folder_path)):
        # Excel creates "~$name.xlsx" lock files while a workbook is open;
        # they are not real workbooks and cannot be parsed.
        if file_name.startswith("~$"):
            continue

        file_path = os.path.join(folder_path, file_name)
        # Compare case-insensitively so "REPORT.PDF" is not silently skipped.
        lowered = file_name.lower()
        if lowered.endswith(".pdf"):
            documents.extend(load_pdf_with_metadata(file_path))
        elif lowered.endswith((".xlsx", ".xls")):
            documents.extend(load_excel_with_metadata(file_path))
        elif lowered.endswith(".csv"):
            documents.extend(load_csv_with_metadata(file_path))
    return documents


def chat_with_agent(user_input, agent_executor):
    """Send one user message to the agent and return its textual answer.

    Args:
        user_input: The text to pass to the agent as ``input``.
        agent_executor: Object exposing ``invoke(dict) -> dict`` whose result
            contains an ``"output"`` key (an ``AgentExecutor`` in this file).

    Returns:
        The value stored under ``"output"`` in the executor's result.
    """
    # Use invoke() rather than calling the executor object directly; the
    # direct-call style is deprecated in current LangChain releases.
    result = agent_executor.invoke({"input": user_input})
    return result["output"]

def get_session_history(session_ids):
    """Return the chat history stored for a session id, creating it on demand.

    Histories live in ``st.session_state.session_history``; a fresh
    ``ChatMessageHistory`` is stored there the first time an id is seen.

    Args:
        session_ids: Key identifying the conversation.

    Returns:
        The history object associated with ``session_ids``.
    """
    histories = st.session_state.session_history
    try:
        return histories[session_ids]
    except KeyError:
        # First request for this id: register an empty history and hand it out.
        histories[session_ids] = ChatMessageHistory()
        return histories[session_ids]

def print_messages():
    """Render every message stored in ``st.session_state["messages"]``.

    Each entry is drawn in a chat bubble for its ``role`` with its
    ``content`` written inside, in stored order.
    """
    transcript = st.session_state["messages"]
    for entry in transcript:
        bubble = st.chat_message(entry['role'])
        bubble.write(entry['content'])


# Streamlit re-executes this script on every user interaction. Building the
# index at plain module level therefore re-loads and re-embeds every document
# for each chat message. st.cache_resource keeps one copy per server process.
# NOTE: the cache is keyed on the folder path only; restart the app (or clear
# the cache) after changing the files in the folder.
@st.cache_resource(show_spinner=False)
def _build_retrieval_resources(source_folder):
    """Load all documents in ``source_folder`` and build the search tool.

    Args:
        source_folder: Directory passed to ``load_documents_from_folder``.

    Returns:
        A tuple ``(documents, vector_store, retriever, retriever_tool)``.

    Raises:
        ValueError: If no documents could be loaded from the folder.
    """
    docs = load_documents_from_folder(source_folder)
    if not docs:
        # Fail early with a readable message instead of erroring somewhere
        # inside index construction on an empty document list.
        raise ValueError(
            f"No PDF, Excel or CSV documents could be loaded from {source_folder!r}"
        )

    # Build the FAISS index from the loaded chunks.
    index = FAISS.from_documents(docs, OpenAIEmbeddings())
    doc_retriever = index.as_retriever()

    # Tool definition.
    # NOTE(review): name and description mention only PDFs although Excel and
    # CSV files are indexed too — confirm whether the wording should change.
    tool = create_retriever_tool(
        doc_retriever,
        name="pdf_search",
        description="Use this tool to search information from the pdf document"
    )
    return docs, index, doc_retriever, tool


# Same module-level names as before, now backed by the cached build.
all_docs, vector, retriever, retriever_tool = _build_retrieval_resources(folder_path)
print(f"Number of documents loaded: {len(all_docs)}")

def main():
    """Render the chat page and answer at most one user message per script run.

    Configures the page, initializes the per-session stores
    (``messages`` for display, ``session_history`` for agent memory), builds a
    tool-calling agent around the module-level ``retriever_tool`` and, when
    the user submitted a message, sends it to the agent together with the
    earlier turns of the conversation. Finally the whole transcript is drawn.
    """
    # Page setup
    st.set_page_config(page_title="AI 비서", layout="wide", page_icon="🤖")

    st.image('Cute_Robot_Tractor_with_Label.png', width=500)
    st.markdown('---')
    st.title("안녕하세요! RAG를 활용한 'AICC' 입니다")  # opening title

    # Session initialization
    if "messages" not in st.session_state:
        st.session_state["messages"] = []

    if "session_history" not in st.session_state:
        st.session_state["session_history"] = {}

    tools = [retriever_tool]

    # LLM configuration
    llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

    # Prompt definition
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """
                You are a friendly and professional expert with over 10 years of experience at AICC. Your role is to respond to internal staff inquiries in a professional yet approachable manner. All responses should be written in Korean.

                You specialize in resolving customer issues, providing product and service information, offering technical support, and guiding staff through various processes. Your current role involves assisting internal staff in handling customer inquiries effectively, leveraging given data to promptly provide solutions.

                Analyze the given DataFrame (df) containing the columns Inquiry Type, Customer Name, Inquiry Content, and Received Date. Categorize each inquiry into one of the following categories: [Product Information], [Service Issue], [Technical Support], or [Other]. Additionally, summarize the main points of the customer inquiry and suggest an appropriate solution for internal staff.
                Please always include emojis in your responses with a friendly tone.
                When the chat begins, first introduce yourself and your goal, and then request information about the type of data you will be processing or any additional details you might need. If the keywords provided by the user do not match the predefined categories, clarify the user's intent, verify if the keywords are suitable, and request clarification if necessary.

                Responses should follow the format below:
                    
                "Your name is `AICC 도움이`. Please introduce yourself at the beginning of the conversation." 

                #FORMAT

                    * 문의 유형
                    -

                    * 문의 답변
                    -
                    -
                    -              
                    
                """

            ),
            ("placeholder", "{chat_history}"),
            ("human", "{input}"),
            ("placeholder", "{agent_scratchpad}"),
        ]
    )

    # Create the agent (create_tool_calling_agent instead of initialize_agent)
    agent = create_tool_calling_agent(llm, tools, prompt)

    # Define the AgentExecutor
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

    # Handle user input
    user_input = st.chat_input('질문이 무엇인가요?')

    if user_input:
        session_history = get_session_history("default_session")

        # Pass earlier turns through the prompt's {chat_history} placeholder
        # instead of pasting a stringified list of dicts into the user's
        # message. A copy is passed so the turn being answered is not part of
        # its own history.
        result = agent_executor.invoke(
            {
                "input": user_input,
                "chat_history": list(session_history.messages),
            }
        )
        response = result["output"]

        # Add the messages to the display transcript
        st.session_state["messages"].append({"role": "user", "content": user_input})
        st.session_state["messages"].append({"role": "assistant", "content": response})

        # Record the turn as proper chat messages (not raw dicts) so the
        # history can be fed back to the agent on the next turn.
        session_history.add_user_message(user_input)
        session_history.add_ai_message(response)

    # Draw the conversation
    print_messages()



# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


Overwriting chatbot.py


In [None]:
pip install chardet PyMuPDFLoader



Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement PyMuPDFLoader (from versions: none)
ERROR: No matching distribution found for PyMuPDFLoader


In [None]:
%%writefile chatbot.py

import os
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.tools.retriever import create_retriever_tool
from langchain.prompts import ChatPromptTemplate
from langchain.agents import create_tool_calling_agent, AgentExecutor
from dotenv import load_dotenv

# Set KMP_DUPLICATE_LIB_OK for this process.
# NOTE(review): presumably a workaround for the duplicate-OpenMP-runtime abort
# that can occur when FAISS is loaded — confirm it is still required.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# Copy the OpenAI API key from Streamlit secrets into the process environment.
os.environ['OPENAI_API_KEY'] = st.secrets["OPENAI_API_KEY"]
# NOTE(review): load_dotenv is imported above but never called, so no .env
# file is actually loaded in this cell.

# main() calls this on every Streamlit rerun (i.e. for every chat message).
# Caching keeps one retriever tool per distinct list of paths so the PDFs are
# parsed and embedded only once per server process.
@st.cache_resource
def load_pdf_files(pdf_paths):
    """Build a retriever tool over the given PDF files.

    All PDFs are loaded with ``PyPDFLoader``, split into 1000-character
    chunks with a 200-character overlap, embedded with ``OpenAIEmbeddings``
    into a FAISS index, and exposed as a retriever tool named ``pdf_search``.

    Args:
        pdf_paths: Iterable of PDF file paths.

    Returns:
        The retriever tool wrapping the FAISS index.

    Raises:
        ValueError: If the PDFs produced no text chunks to index.
    """
    all_documents = []
    for pdf_path in pdf_paths:
        # Load each file with PyPDFLoader
        all_documents.extend(PyPDFLoader(pdf_path).load())

    # Text splitter configuration
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_docs = text_splitter.split_documents(all_documents)
    if not split_docs:
        # Fail early with a readable message instead of erroring somewhere
        # inside index construction on an empty document list.
        raise ValueError("No text could be extracted from the given PDF files")

    # Build the FAISS index
    vector = FAISS.from_documents(split_docs, OpenAIEmbeddings())
    retriever = vector.as_retriever()

    # Tool definition
    retriever_tool = create_retriever_tool(
        retriever,
        name="pdf_search",
        description="Use this tool to search for information within the PDF document about tractors."
    )
    return retriever_tool

def chat_with_agent(user_input, agent_executor):
    """Send one user message to the agent and return its textual answer.

    Args:
        user_input: The text to pass to the agent as ``input``.
        agent_executor: Object exposing ``invoke(dict) -> dict`` whose result
            contains an ``"output"`` key (an ``AgentExecutor`` in this file).

    Returns:
        The value stored under ``"output"`` in the executor's result.
    """
    # Use invoke() rather than calling the executor object directly; the
    # direct-call style is deprecated in current LangChain releases.
    result = agent_executor.invoke({"input": user_input})
    return result["output"]

def print_messages():
    """Render every message stored in ``st.session_state["messages"]``.

    Each entry is drawn in a chat bubble for its ``role`` with its
    ``content`` written inside, in stored order.
    """
    transcript = st.session_state["messages"]
    for entry in transcript:
        bubble = st.chat_message(entry['role'])
        bubble.write(entry['content'])

def main():
    """Render the chat page and answer at most one user message per script run.

    Configures the page, initializes the ``messages`` transcript in session
    state, builds a tool-calling agent around the ``pdf_search`` tool and,
    when the user submitted a message, sends it to the agent together with
    the earlier turns of the conversation. Finally the transcript is drawn.
    """
    # Page setup
    st.set_page_config(page_title="대동 AICC 도움이", layout="wide", page_icon="🤖")

    st.image('Cute_Robot_Tractor_with_Label.png', width=600)
    st.markdown('---')
    st.title("안녕하세요! '대동 AICC 도움이' 입니다")  # opening title

    # Session initialization
    if "messages" not in st.session_state:
        st.session_state["messages"] = []

    # PDF files to index (paths are specified directly)
    pdf_docs = ['./data/대형_트랙터.pdf', './data/중소_트랙터.pdf', './data/중형_트랙터.pdf', './data/대동_트랙터_정보.pdf']

    if pdf_docs:
        pdf_search = load_pdf_files(pdf_docs)
        tools = [pdf_search]

        # LLM configuration
        llm = ChatOpenAI(model_name="gpt-4", temperature=0)

        # Prompt configuration
        prompt = ChatPromptTemplate.from_messages(
            [
                ("system",
                 "Be sure to answer in Korean. You are a helpful assistant. "
                 "Make sure to use the `pdf_search` tool for searching information from the PDF document. "
                 "Please always include emojis in your responses with a friendly tone. "
                 "Your name is `대동 AICC 도움이`. Please introduce yourself at the beginning of the conversation."
                 ''' 
                You are a very friendly chatbot called `대동 AICC 도움이`. 😊 \n\n
                Always start with a greeting and provide a brief introduction about yourself. \n\n
                Always respond in a friendly tone and use emojis. You are highly knowledgeable about Daedong tractors and provide clear, helpful information about them. When answering customer inquiries, always follow the format below:
                Our company is `대동`. Our competitors include `LS엠트로` and `TYM`. For questions about product comparisons with competitors, always highlight the strengths of `대동` at the end of your response and recommend `대동` tractor products.
                # Format
                **[문의 내역]** Customer's question\n\n
                **[문의 유형]** Category of the customer's question \n\n
                **[답변]** Provide a clear and visually appealing response to customer inquiries. Use lists or tables to organize the information effectively.
                 '''),
                ("placeholder", "{chat_history}"),
                ("human", "{input} \n\n Be sure to include emoji in your responses."),
                ("placeholder", "{agent_scratchpad}"),
            ]
        )

        # Create the agent
        agent = create_tool_calling_agent(llm, tools, prompt)

        # Define the AgentExecutor
        agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

        # Handle user input
        user_input = st.chat_input('질문이 무엇인가요?')

        if user_input:
            # The prompt declares a {chat_history} placeholder; fill it from
            # the stored transcript so the agent sees earlier turns instead of
            # treating every message as the start of a new conversation.
            # Transcript roles are mapped to the prompt's message types.
            role_to_type = {"user": "human", "assistant": "ai"}
            chat_history = [
                (role_to_type[entry["role"]], entry["content"])
                for entry in st.session_state["messages"]
            ]

            result = agent_executor.invoke(
                {"input": user_input, "chat_history": chat_history}
            )
            response = result["output"]

            # Add the messages to the session transcript
            st.session_state["messages"].append({"role": "user", "content": user_input})
            st.session_state["messages"].append({"role": "assistant", "content": response})

        # Draw the conversation
        print_messages()

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


Overwriting chatbot.py


In [7]:
pip install chardet

Note: you may need to restart the kernel to use updated packages.


