**Load the dataset**

In [None]:
# Download the `haithemhermessi/sanad-dataset` archive from Kaggle with the Kaggle CLI.
# The unzip cell that follows expects the result to be `sanad-dataset.zip` in the working directory.
!kaggle datasets download -d haithemhermessi/sanad-dataset

Dataset URL: https://www.kaggle.com/datasets/haithemhermessi/sanad-dataset
License(s): U.S. Government Works
Downloading sanad-dataset.zip to /content
 96% 63.0M/65.7M [00:04<00:00, 21.4MB/s]
100% 65.7M/65.7M [00:04<00:00, 16.0MB/s]


In [None]:
import zipfile

# Extract every member of the downloaded archive into ./dataset_directory
# (the target directory is created by extractall if it does not exist yet).
with zipfile.ZipFile('sanad-dataset.zip', mode='r') as archive:
    archive.extractall(path='dataset_directory')


In [None]:
!pip install langchain langchain langchain_community langchain-google-genai langchain_experimental sentence-transformers langchain_chroma langchainhub

Collecting langchain
  Downloading langchain-0.2.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_community
  Downloading langchain_community-0.2.12-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-1.0.10-py3-none-any.whl.metadata (3.8 kB)
Collecting langchain_experimental
  Downloading langchain_experimental-0.0.64-py3-none-any.whl.metadata (1.7 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-0.1.3-py3-none-any.whl.metadata (1.5 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting langchain-core<0.3.0,>=0.2.32 (from langchain)
  Downloading langchain_core-0.2.35-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)

In [None]:
import os
from langchain.schema import Document

In [None]:
def read_and_create_documents(file_path):
    """Read one UTF-8 text file and wrap its content in a ``Document``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``Document`` whose ``page_content`` is the full file text and whose
        ``metadata`` records the originating path under the ``"source"`` key.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()

    source_info = {"source": file_path}
    return Document(page_content=contents, metadata=source_info)

# Iterate through directories and files
def process_directory(directory):
    """Build a Document for every ``.txt`` file found one level below ``directory``.

    ``directory`` is scanned for sub-directories (one per category); each
    ``.txt`` file directly inside a sub-directory is loaded with
    ``read_and_create_documents``. Files sitting directly in ``directory`` and
    files with other extensions are ignored.

    Both listings are sorted because ``os.listdir`` returns entries in
    arbitrary order; sorting makes the order of the returned list reproducible
    across runs and machines, which matters for any later positional slicing.

    Args:
        directory: Root folder that contains the category sub-directories.

    Returns:
        A list of documents ordered by category name, then by file name.
    """
    documents = []
    for category in sorted(os.listdir(directory)):
        category_path = os.path.join(directory, category)
        if not os.path.isdir(category_path):
            continue
        for filename in sorted(os.listdir(category_path)):
            if filename.endswith('.txt'):
                file_path = os.path.join(category_path, filename)
                documents.append(read_and_create_documents(file_path))
    return documents

In [None]:
# Root folder produced by the unzip step; it holds one sub-directory per category.
dataset_directory = '/content/dataset_directory'
all_documents = process_directory(dataset_directory)

# Fail fast when nothing was loaded (e.g. the archive layout is not
# <dataset_directory>/<category>/*.txt); otherwise every later cell would run on an empty corpus.
assert all_documents, f"No .txt documents found under {dataset_directory}"

len(all_documents)

45500

In [None]:
# Sanity check: display the first loaded Document (its metadata and page content).
all_documents[0]

Document(metadata={'source': '/content/dataset_directory/Culture/0213.txt'}, page_content='استضاف بيت الشعر مساء أمس الأول الشاعر يوسف أبولوز في أمسية حضرها حشد من جمهور الشعر والإعلاميين. قدم الأمسية الزميل حسام ميرو بمداخلة نقدية عن تجربة يوسف أبولوز الشعرية أوضح فيها فرادة صوت أبولوز في المشهد الشعري الفلسطيني حيث لم يقع في أفخاخ الايديولوجيا التي غالباً ما تقضي على الشعر وجمالياته لمصلحة الشعار العام. ومن يقرأ أعماله منذ صباح الكاتيوشا أيها المخيم حتى مجموعته الأخيرة خط الهزلاج يدرك أن هذا الشاعر مسكون بتراث حي ومتجدد هو تراث الطبيعة.ومن قصيدة قافية مكررة من ديوان خط الهزلاج قرأ أبولوز مقاطع تعبر عن مواجهة مفتوحة وعلنية بين الذات الطامحة إلى الحرية في مطلقها وواقع طافح بالمرارة والألم حيث قال:وأنا،أكتب السيرة.. الله يرحمني لا الأناس،لقد صغت حرفين من أصل حرفولونت ثيابي بهذا الدم الحلو.. دم الغزال،الذي شمني ثم طاف،وبعد.. ارتجل:الضحى للغزالات،ثم دماءك فيهاهنا في العروق كلام وماء،سماء من الأرجوان،وطين له لذعة الحب، ثمة امرأةبلغت سن يأسينيأس المرايا.. ويأسي.وفي مجموعة من القصائد القصيرة

**Encode the data into a vector database**

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from tqdm import tqdm

# Folder where Chroma writes its on-disk store.
# NOTE(review): this path is on Google Drive, but no cell in this notebook mounts Drive —
# confirm it is mounted before running, otherwise the store presumably lands on local disk.
persist_directory = "/content/drive/MyDrive/Electro_Pi_Capston_2/vectorstore/"

# Multilingual sentence-embedding model used to encode the articles.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Index every loaded document rather than a fixed-size prefix: slicing the corpus to
# the first 41666 items silently left the remaining documents unsearchable.
# Documents are added in moderate batches so each insert call stays small and tqdm
# can report progress on this long-running step.
INDEX_BATCH_SIZE = 5000

vectorstore = Chroma(embedding_function=embedding_model, persist_directory=persist_directory)
for batch_start in tqdm(range(0, len(all_documents), INDEX_BATCH_SIZE), desc="Indexing documents"):
    vectorstore.add_documents(all_documents[batch_start:batch_start + INDEX_BATCH_SIZE])

In [None]:
# Save the vectorstore in drive (i.e. under the `persist_directory` it was created with).
# NOTE(review): recent Chroma releases reportedly persist automatically when a
# persist_directory is set, and `persist()` may be deprecated or missing depending on the
# installed version — confirm against the pinned packages.
vectorstore.persist()

In [None]:
# Expose the vector store as a retriever: similarity search returning the 10 closest documents per query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10})

**LLM Setup**

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from google.colab import userdata

# Read the Gemini API key from Colab's secret store instead of hard-coding it in the notebook.
google_api_key = userdata.get('google_api_key')

# Chat model used to generate the answers.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    max_tokens=500,
    google_api_key=google_api_key,
)

**Prompt Setup**

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message (Arabic). English gloss: "You are an assistant for question-answering
# tasks. Use the following pieces of retrieved context to answer the question concisely."
# It ends with a blank line followed by the `{context}` placeholder, which is presumably
# filled with the retrieved documents by the chain built from this prompt.
system_prompt = (
"أنت مساعد لمهام الإجابة على الأسئلة."
" استخدم الأجزاء التالية من السياق المسترجع للإجابة علي السؤال في ايجاز "
"\n\n"
"{context}"
)

# Two-message chat prompt: the system instructions above, then the user's question.
# `{input}` matches the key passed to `rag_chain.invoke({"input": ...})`.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:
# Chain that feeds documents plus the question through `prompt` into `llm`.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Full RAG pipeline: `retriever` supplies the documents, `question_answer_chain` produces the answer.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Smoke test. The Arabic question asks whether the House of Poetry hosted the poet
# Yousef Abu Loz the evening before last — a fact stated in the first loaded document.
response = rag_chain.invoke({"input": "هل استضاف بيت الشعر مساء أمس الأول الشاعر يوسف أبولوز ؟"})
# The chain returns a dict; the generated text is under the "answer" key.
print(response["answer"])

نعم، استضاف بيت الشعر مساء أمس الأول الشاعر يوسف أبولوز. 



**Load the Vector database from drive**

In [None]:
!pip install gdown

In [None]:
# `os` is imported here as well so this section also runs in a fresh session
# where the dataset-loading cells (which import it) were skipped.
import os

import gdown

# Google Drive file id of the persisted Chroma SQLite file and where to store it locally.
# NOTE(review): only chroma.sqlite3 is fetched; confirm the persisted store does not also
# need the index sub-directory Chroma writes next to it.
file_id = "1-PE7eJwdjrUVYx7YFAtwE7g1frYvKAKc"
downloaded_file_path = "/content/chroma.sqlite3"

# Construct the Google Drive download URL
gdrive_url = f"https://drive.google.com/uc?id={file_id}"

# Download the file using gdown. Keep the return value: it is the output path on success
# and None when the download did not succeed, whereas a bare existence check could be
# satisfied by a stale file left over from an earlier run.
download_result = gdown.download(gdrive_url, downloaded_file_path, quiet=False)

# Ensure the file was downloaded correctly
if download_result and os.path.exists(downloaded_file_path):
    print(f"File downloaded successfully: {downloaded_file_path}")
else:
    print("File download failed.")


In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Directory that holds the downloaded chroma.sqlite3 file.
persist_directory = "/content"

# Same embedding model that was used when the store was built, so query vectors
# live in the same space as the stored ones.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)

# Open the persisted Chroma store from `persist_directory`.
vectorstore = Chroma(
    embedding_function=embedding_model,
    persist_directory=persist_directory,
)

# Similarity retriever returning the 10 closest documents per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10},
)

**Making UI using Gradio**

In [None]:
import gradio as gr
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Define the function to process the input and return the outputs
def qa_system(question):
    """Answer ``question`` with the RAG chain and report the documents it used.

    Args:
        question: The question text typed into the Gradio textbox.

    Returns:
        A ``(answer, retrieved_content)`` tuple: the answer generated by
        ``rag_chain`` and the page contents of the retrieved documents joined
        by blank lines. For a blank question a short prompt to enter one is
        returned instead and the chain is not invoked.
    """
    # Do not spend a retrieval + LLM call on an empty submission.
    if not question or not question.strip():
        return "Please enter a question.", ""

    response = rag_chain.invoke({"input": question})

    # The retrieval chain returns the documents it handed to the LLM under "context".
    # Reusing them avoids a second embedding/search round-trip and guarantees the
    # documents shown are exactly the ones the answer was based on.
    retrieved_docs = response["context"]
    retrieved_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
    return response["answer"], retrieved_content

# Rebuild the RAG chain from the retriever that is bound right now. The
# "Load the Vector database from drive" section rebinds `retriever`; a chain created
# before that would keep querying the old store, and in a fresh session that skipped
# the indexing section `rag_chain` would not exist at all. `llm` and `prompt` come from
# the LLM and prompt setup cells.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Create the Gradio interface with 1 input and 2 outputs
interface = gr.Interface(
    fn=qa_system,
    inputs="text",
    outputs=[
        gr.Textbox(label="Answer to the Question"),
        gr.Textbox(label="Most Similar Documents"),
    ],
    title="Question-Answering System",
    description="Enter a question to get relevant articles and the best answer."
)

# Start the web UI (in Colab this prints a temporary public share link).
interface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://62e2899e9b88b3654f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


