<a href="https://colab.research.google.com/github/aghadavood/persian-qa-system/blob/main/gradio_chat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain
!pip install openai
!pip install  --upgrade gradio
!pip install faiss-cpu
!pip install tiktoken
!pip install ijson



In [None]:
!pip install langchain-community # install the missing module
!pip install transformers
!pip install sentence-transformers




In [None]:
import os
import time
import random
import hashlib
import pickle
from pathlib import Path
from tqdm import tqdm
import ijson
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
import gradio as gr
import openai





In [None]:
!pip install --upgrade openai migrate
from openai import OpenAI



In [None]:


# Never hard-code the API key in the notebook: a shared or committed notebook would leak it.
# Prefer the OPENAI_API_KEY environment variable; fall back to an interactive prompt whose
# input is not echoed and is not stored in the notebook.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
if not OPENAI_API_KEY:
    from getpass import getpass
    OPENAI_API_KEY = getpass("OpenAI API key: ")

# Shared client used by generate_answer() for chat completions.
client = OpenAI(api_key=OPENAI_API_KEY)


In [None]:
# Colab-only helper for exposing Google Drive inside the runtime's filesystem.
from google.colab import drive

# Location of the JSON corpus inside the mounted Drive.
json_file_path = '/content/drive/My Drive/corpus-engineering.json'

# Mount Drive so the path above becomes readable.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Embedding model loaded through HuggingFaceEmbeddings (a local model, per the original note),
# shared by index building and querying.
embeddings = HuggingFaceEmbeddings(
    model_name="distiluse-base-multilingual-cased-v2",
)

# Chunker for Persian text: split on newlines, the Persian comma (،), the Persian
# semicolon (؛) and the full stop, producing chunks of up to 1000 characters that
# overlap by 50 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    separators=['\n', '،', '؛', '.'],
)


  embeddings = HuggingFaceEmbeddings(model_name="distiluse-base-multilingual-cased-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def prepare_document(item):
    """Render one corpus record as labelled Persian text and split it into chunks.

    Args:
        item: Mapping with the keys ``id``, ``title``, ``abstract``,
            ``FirstSubject`` and ``SecondSubject`` (a missing key raises KeyError).

    Returns:
        Whatever ``text_splitter.create_documents`` produces for the rendered text.
    """
    # One "label: value" line per field, in the order the fields should appear.
    labelled_fields = (
        ("شناسه", item['id']),
        ("عنوان", item['title']),
        ("چکیده", item['abstract']),
        ("موضوع اول", item['FirstSubject']),
        ("موضوع دوم", item['SecondSubject']),
    )
    rendered = "\n".join(f"{label}: {value}" for label, value in labelled_fields)
    return text_splitter.create_documents([rendered])


In [None]:
def process_documents(file_path, batch_size=1000):
    """Stream the JSON corpus and yield its prepared documents in batches.

    The file is parsed incrementally with ``ijson`` (prefix ``'item'``), so the
    whole corpus is never held in memory at once.

    Args:
        file_path: Path of the JSON file to read.
        batch_size: Minimum number of documents to accumulate before yielding.
            A batch can be larger, because one record may contribute several documents.

    Yields:
        Lists of documents as returned by ``prepare_document``; the final list
        holds whatever is left over and may be shorter than ``batch_size``.
    """
    pending = []
    with open(file_path, 'rb') as source:
        for record in ijson.items(source, 'item'):
            pending += prepare_document(record)
            if len(pending) < batch_size:
                continue
            yield pending
            pending = []
        # Flush the remainder that never reached batch_size.
        if pending:
            yield pending


In [None]:
def _file_md5(path, chunk_size=1024 * 1024):
    """Return the hex MD5 digest of the file at `path`, reading it in fixed-size chunks."""
    digest = hashlib.md5()
    with open(path, 'rb') as stream:
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def get_vector_db(file_path, force_rebuild=False):
    """Load the cached FAISS index for `file_path`, or build and cache a new one.

    The cache lives in a local ``embeddings`` folder and is keyed by the MD5 of
    the file's contents, so a changed corpus gets a fresh index.

    Args:
        file_path: Path of the JSON corpus, passed on to ``process_documents``.
        force_rebuild: When True, ignore any cached index and rebuild it.

    Returns:
        The FAISS vector store (loaded from cache or newly built).

    Raises:
        ValueError: If ``process_documents`` yields no batches, i.e. there is
            nothing to index.
    """
    embeddings_folder = Path("embeddings")
    embeddings_folder.mkdir(exist_ok=True)

    # Hash in chunks: the corpus is streamed everywhere else, so it should not be
    # loaded into memory in one piece just to compute the cache key.
    file_hash = _file_md5(file_path)
    faiss_index_path = embeddings_folder / f"{file_hash}.faiss"
    index_metadata_path = embeddings_folder / f"{file_hash}_metadata.pkl"

    if not force_rebuild and faiss_index_path.exists() and index_metadata_path.exists():
        print(f"Loading existing FAISS index from {faiss_index_path}...")
        # NOTE(review): both calls below unpickle data from disk, which can execute
        # arbitrary code. Only acceptable while the `embeddings` folder contains
        # files written by this notebook itself.
        # The keyword is `allow_dangerous_deserialization`; the previous spelling
        # (`..._serialization`) never enabled loading.
        vector_db = FAISS.load_local(
            str(faiss_index_path),
            embeddings,
            allow_dangerous_deserialization=True,
        )
        with open(index_metadata_path, 'rb') as f:
            vector_db.docstore._dict = pickle.load(f)
        return vector_db

    print("Building new FAISS index...")
    vector_db = None
    for batch in process_documents(file_path):
        if vector_db is None:
            # The first batch creates the store; later batches are appended to it.
            vector_db = FAISS.from_documents(batch, embeddings)
        else:
            vector_db.add_documents(batch)

    if vector_db is None:
        # Without this guard an empty corpus fails below with an opaque AttributeError.
        raise ValueError(f"No documents found in {file_path}; cannot build a FAISS index.")

    print(f"Saving FAISS index to {faiss_index_path}...")
    vector_db.save_local(str(faiss_index_path))
    with open(index_metadata_path, 'wb') as f:
        pickle.dump(vector_db.docstore._dict, f)

    return vector_db


In [None]:
def generate_answer(query, contents, max_retries=5):
    """Ask the chat model to answer `query` in Persian using `contents` as context.

    Failed requests are retried with exponential backoff plus random jitter.

    Args:
        query: The user's question.
        contents: Retrieved text the answer should be based on.
        max_retries: Maximum number of request attempts.

    Returns:
        The model's answer text, or a Persian apology message once every attempt
        has failed (also when ``max_retries`` is less than 1, so the function
        never returns None).
    """
    fallback = "متاسفانه خطایی رخ داد. لطفاً دوباره تلاش کنید."
    prompt = f"به پرسش زیر پاسخ دهید. پاسخ باید به زبان فارسی و بر اساس اطلاعات داده شده باشد:\n\nپرسش: {query}\n\nاطلاعات:\n{contents}"
    messages = [
        {"role": "system", "content": "You are a helpful assistant that answers questions in Persian based on the provided information."},
        {"role": "user", "content": prompt}
    ]
    last_error = None
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",  # Or "gpt-4" if you have access
                messages=messages,
                max_tokens=500,
                temperature=0.7,
                n=1,
                stop=None,
            )
            return response.choices[0].message.content
        except Exception as e:
            last_error = e
            # Back off before the next attempt; there is nothing to wait for after the last one.
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt + random.random())

    if last_error is not None:
        print(f"An error occurred: {last_error}")
    return fallback


In [None]:
# Obtain the vector database once, up front, so every query reuses the same instance.
vector_db = get_vector_db(file_path=json_file_path)


Building new FAISS index...
Saving FAISS index to embeddings/7cee48aa58c44b086d4aeff192ce17b2.faiss...


In [None]:
def gradio_interface(query):
    """Answer a user question: retrieve the closest chunks, then ask the model.

    Args:
        query: The question text typed into the UI.

    Returns:
        The generated answer, or a short Persian prompt asking for a question
        when `query` is empty or only whitespace.
    """
    # A blank query would still cost an embedding search and an LLM call, so stop early.
    if not query or not query.strip():
        return "لطفاً پرسش خود را وارد کنید."

    # Retrieve the two most similar chunks and join them into one context string.
    results = vector_db.similarity_search(query, k=2)
    contents = "\n".join(doc.page_content for doc in results)

    return generate_answer(query, contents)


In [None]:
# Build the two text widgets first so the Interface call below stays short.
_question_box = gr.Textbox(
    label="پرسش",
    placeholder="سوال خود را اینجا بنویسید...",
    lines=2,
)
_answer_box = gr.Textbox(label="پاسخ")

# Single-input, single-output Gradio app backed by gradio_interface().
iface = gr.Interface(
    title="سیستم پرسش و پاسخ علمی فارسی",
    description="سوالات علمی خود را به فارسی بپرسید و پاسخ دریافت کنید.",
    fn=gradio_interface,
    inputs=_question_box,
    outputs=_answer_box,
)


In [None]:
# Start the web UI only when this code runs as the main program.
# debug=True keeps the cell running so errors and logs stay visible.
if __name__ == "__main__":
    iface.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://d33024fffb1ef29926.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
