In [None]:
pip install -q PyPDF2


In [None]:
import os

import gradio as gr
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [None]:
# Handles to the currently selected Chroma store and its retriever.
# Both stay None until set_active_vectorstore() assigns them.
vectorstore = retriever = None

In [None]:
# Name of the folder that holds the vector databases. The helper functions in
# this notebook address the same folder through the literal "vector_db".
VECTOR_DB_PATH = "vector_db"
# Create the folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(VECTOR_DB_PATH, exist_ok=True)

In [None]:
# Chat model name handed to ChatOpenAI further down.
MODEL = "gpt-4o-mini"
# NOTE(review): the functions visible in this notebook take `db_name` as a
# parameter or rebind it locally, so this module-level value looks unused — confirm.
db_name = "vector_db"
# Load variables from a .env file; override=True presumably lets .env values
# replace ones already present in the environment — verify against python-dotenv.
load_dotenv(override=True)
# Re-export the key; when it is missing, a placeholder string is stored instead,
# so a forgotten key does not fail here. NOTE(review): it presumably surfaces
# later as an authentication error on the first OpenAI call — confirm.
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
# Embedding function shared by every Chroma store opened below.
embeddings = OpenAIEmbeddings()

In [None]:
def chunk_doc(embedding_ready, file_path, chunk_size=1000, chunk_overlap=200):
    """Split extracted text into overlapping chunks tagged with their source file.

    Args:
        embedding_ready: Full text extracted from the uploaded file.
        file_path: Path of the source file; only its base name is stored.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        A list of Document chunks whose metadata carries ``doc_name``.
    """
    source_name = os.path.basename(file_path)
    doc = Document(page_content=embedding_ready, metadata={"doc_name": source_name})

    # The plain CharacterTextSplitter only cuts on blank lines, so text without
    # any (PDF pages joined with '\n', the single-line JSON built from a CSV)
    # came back as one oversized chunk. The recursive splitter falls back from
    # paragraphs to lines, words and finally characters to honour chunk_size.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # split_documents copies the parent document's metadata onto every chunk,
    # so each chunk already carries "doc_name".
    return text_splitter.split_documents([doc])

In [None]:
def add_to_db(chunks, db_name):
    """Store `chunks` in the Chroma database "vector_db/<db_name>".

    An existing directory is opened and extended; otherwise a new store is
    created from the chunks. The number of chunks is printed afterwards.
    """
    target_dir = os.path.join("vector_db", db_name)

    if not os.path.exists(target_dir):
        # First upload for this name: build the store directly from the chunks.
        Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=target_dir)
    else:
        store = Chroma(persist_directory=target_dir, embedding_function=embeddings)
        store.add_documents(chunks)

    print(len(chunks))

In [None]:
def get_subdirectories(path="."):
    """Return the names of the entries directly under `path` that are directories."""
    found = []
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            found.append(entry)
    return found
def refresh_db_list():
    """Build a Gradio update whose choices are the sub-directories of "vector_db"."""
    available_dbs = get_subdirectories("vector_db")
    return gr.update(choices=available_dbs)

In [None]:
def set_active_vectorstore(selected_db):
    """Make "vector_db/<selected_db>" the active store and return a status message.

    On success the module-level `vectorstore` and `retriever` are replaced.
    An empty selection or a missing directory leaves them untouched.
    """
    global vectorstore, retriever

    if selected_db:
        db_dir = os.path.join("vector_db", selected_db)

        if os.path.exists(db_dir):
            vectorstore = Chroma(persist_directory=db_dir, embedding_function=embeddings)
            retriever = vectorstore.as_retriever()
            return f"Active VectorStore set to: {selected_db}"

        return f"Database '{selected_db}' does not exist."

    return "No database selected."

In [None]:
# Path of the most recently uploaded file: get_file_from_user() sets it and
# process_user_file() reads it.
user_file = None

def get_file_extension(file_path):
    """Return the lower-cased file extension including the dot, e.g. '.txt'."""
    _, suffix = os.path.splitext(file_path)
    return suffix.lower()

def get_file_size_kb(file_path):
    """Return the file size in KB, rounded to two decimal places."""
    return round(os.path.getsize(file_path) / 1024, 2)

def detect_file_type(extension):
    """Map a file extension to a human-readable file type label."""
    known_types = {
        '.txt': 'Text file',
        '.pdf': 'PDF file',
        '.md': 'Markdown file',
        '.csv': 'CSV file'
    }
    if extension in known_types:
        return known_types[extension]
    return 'Unknown file type'

def get_file_from_user(file, base_name_input, existing_db_selector):
    """Remember the uploaded file and return a short description of it.

    The description lists the file type, its size in KB and the database name
    taken from the text box or, when that is blank, from the dropdown.
    Returns "No file uploaded." when `file` is None.
    """
    global user_file

    if file is None:
        return "No file uploaded."

    # Database choice: the typed name wins; fall back to the dropdown when it is blank.
    typed_name = base_name_input.strip() if base_name_input else ""
    db_name = typed_name if typed_name else existing_db_selector

    file_path = file.name  # the uploaded file object carries its path in .name
    user_file = file_path  # stored globally so process_user_file can reach it

    file_type = detect_file_type(get_file_extension(file_path))
    file_size_kb = get_file_size_kb(file_path)
    shown_db = db_name if db_name else 'No database specified'

    return f"File type: {file_type}\nFile size: {file_size_kb} KB\nDatabase name: {shown_db}"


def _load_text_for_embedding(file_path, extension, file_type):
    """Read a supported file and return ``(text, info)``.

    When the file cannot be used, ``text`` is None and ``info`` holds the
    message to show to the user.
    """
    if extension in ('.txt', '.md'):
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read(), f"File ready for embedding. Type: {file_type}."

    if extension == '.csv':
        # Imported lazily so pandas is only needed when a CSV is uploaded.
        import pandas as pd
        return pd.read_csv(file_path).to_json(), "CSV file converted to JSON for embedding."

    if extension == '.pdf':
        try:
            from PyPDF2 import PdfReader
            reader = PdfReader(file_path)
            # extract_text() may yield nothing for image-only pages; treat that
            # as an empty page instead of failing on None + str.
            text = ''.join((page.extract_text() or '') + '\n' for page in reader.pages)
        except Exception as e:
            return None, f"Error processing PDF: {str(e)}"
        return text, "PDF text extracted for embedding."

    return None, "Unsupported file type for embedding."


def process_user_file(base_name_input, existing_db_selector):
    """Extract text from the uploaded file and add it to a vector database.

    Args:
        base_name_input: Name typed for a new database; wins when not blank.
        existing_db_selector: Dropdown value used when no name was typed.

    Returns:
        A status message: either an explanation of why nothing was processed or
        a confirmation naming the database the file was added to.
    """
    typed_name = base_name_input.strip() if base_name_input else ""
    db_name = typed_name or existing_db_selector

    # The dropdown starts on a placeholder label that is not a real database;
    # treat it as "nothing selected" instead of creating a store with that name.
    if not db_name or db_name == "-- Choose vector DB --":
        return "No database name provided."

    if user_file is None:
        return "No file to process. Please upload a file first."

    file_size_kb = get_file_size_kb(user_file)
    if file_size_kb > 10240:  # 10 MB = 10240 KB
        return f"File size: {file_size_kb} KB. Max file size is 10MB."

    extension = get_file_extension(user_file)
    file_type = detect_file_type(extension)

    embedding_ready, info = _load_text_for_embedding(user_file, extension, file_type)
    if embedding_ready is None:
        return info

    # Nothing to embed (empty file, PDF without a text layer): say so instead of
    # reporting a successful upload.
    if not embedding_ready.strip():
        return "No text could be extracted from the file. Nothing was added to the database."

    chunks = chunk_doc(embedding_ready, user_file)
    add_to_db(chunks, db_name)
    return f"{info}\nFile added to VectorDB: {db_name}"





In [None]:
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)


def chat_with_retrieval(query):
    """Answer `query` with a retrieval chain over the currently active store.

    Returns a hint message instead when no vector store has been selected yet.
    """
    if retriever is None:
        return "No VectorStore selected. Please select a database first."

    # A chain is assembled on every call so it binds whichever retriever is
    # active right now; the module-level `memory` is passed to each of them.
    chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)
    response = chain.invoke({"question": query})
    return response['answer']

In [None]:
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# The chain cannot be built while this cell runs: `retriever` is still None
# until a database is accepted in the UI, and a chain built here would never
# see the retriever chosen later. chat() therefore builds it on demand.
conversation_chain = None


def chat(question, history):
    """Gradio ChatInterface callback: answer `question` from the active database.

    Args:
        question: The user's latest message.
        history: Chat history supplied by Gradio; unused here because the
            conversation is tracked in the module-level `memory`.

    Returns:
        The chain's answer, or a hint message when no database is active yet.
    """
    global conversation_chain

    if retriever is None:
        return "No VectorStore selected. Please select a database first."

    # Rebuilt per call so a database switch in the UI takes effect immediately.
    conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)
    result = conversation_chain.invoke({"question": question})
    return result["answer"]

In [None]:
def _refresh_both_db_lists():
    """Return one choices-update for each of the two database dropdowns."""
    return refresh_db_list(), refresh_db_list()


with gr.Blocks() as ui:
    gr.Markdown("## Load your files and perform semantic search with chatAI")

    # Upload area: file picker on the left, file info and target database on the right.
    with gr.Row(equal_height=True):
        with gr.Column():
            file_input = gr.File(label="Upload your file")
        with gr.Column():
            file_output = gr.Textbox(label="File info", interactive=False)
            with gr.Column():
                with gr.Row():
                    base_name_input = gr.Textbox(label="Give name new database")
                    existing_db_selector = gr.Dropdown(
                        label="Or choose existing",
                        choices=get_subdirectories("vector_db"),
                        value="-- Choose vector DB --",
                        interactive=True,
                        allow_custom_value=True
                    )
                embed_btn = gr.Button("Add to VectorBD")

    # Selection of the database the chat should query.
    with gr.Row(equal_height=True):
        with gr.Column():
            db_selector = gr.Dropdown(
                label="Choose database",
                choices=get_subdirectories("vector_db"),
                value="-- Choose vector DB --",
                interactive=True,
                allow_custom_value=True
            )
            with gr.Row():
                accept_button = gr.Button("Accept")
                refresh_button = gr.Button("Refresh DB List")

    with gr.Row(equal_height=True):
        with gr.Column():
            gr.ChatInterface(chat, type="messages")

    file_input.change(get_file_from_user, inputs=[file_input, base_name_input, existing_db_selector], outputs=[file_output])

    # Embedding may create a new database, so both dropdowns are reloaded once
    # the upload handler has finished.
    embed_btn.click(
        process_user_file,
        inputs=[base_name_input, existing_db_selector],
        outputs=[file_output],
    ).then(_refresh_both_db_lists, outputs=[existing_db_selector, db_selector])

    accept_button.click(set_active_vectorstore, inputs=[db_selector], outputs=[file_output])

    # Previously only the chat dropdown was refreshed; the "Or choose existing"
    # dropdown kept the list from the moment the UI was built.
    refresh_button.click(_refresh_both_db_lists, outputs=[existing_db_selector, db_selector])

ui.launch()