In [9]:
!pip uninstall -y langchain langchain-core langchain-community langchain-text-splitters langgraph langgraph-prebuilt langgraph-checkpoint langchain-classic


Found existing installation: langchain 0.1.16
Uninstalling langchain-0.1.16:
  Successfully uninstalled langchain-0.1.16
Found existing installation: langchain-core 0.1.52
Uninstalling langchain-core-0.1.52:
  Successfully uninstalled langchain-core-0.1.52
Found existing installation: langchain-community 0.0.33
Uninstalling langchain-community-0.0.33:
  Successfully uninstalled langchain-community-0.0.33
Found existing installation: langchain-text-splitters 0.0.1
Uninstalling langchain-text-splitters-0.0.1:
  Successfully uninstalled langchain-text-splitters-0.0.1
Found existing installation: langgraph 1.0.1
Uninstalling langgraph-1.0.1:
  Successfully uninstalled langgraph-1.0.1
Found existing installation: langgraph-prebuilt 1.0.1
Uninstalling langgraph-prebuilt-1.0.1:
  Successfully uninstalled langgraph-prebuilt-1.0.1
Found existing installation: langgraph-checkpoint 3.0.0
Uninstalling langgraph-checkpoint-3.0.0:
  Successfully uninstalled langgraph-checkpoint-3.0.0
Found existing 

In [10]:
!pip install "langchain==0.1.16" \
              "langchain-core==0.1.52" \
              "langchain-community==0.0.33" \
              "langchain-text-splitters==0.0.1" \
              transformers sentence-transformers faiss-cpu --quiet


In [5]:
# ==============================================
# 📘 Retrieval-Augmented Generation (RAG) Arabic
# ==============================================

!pip install langchain langchain-community faiss-cpu sentence-transformers \
             transformers accelerate bitsandbytes --quiet

import pandas as pd
from langchain_community.document_loaders import DataFrameLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFaceHub

# ==============================================
# 1️⃣  Load your dataset (each row = one chunk)
# ==============================================
# Replace with your CSV path
csv_path = "/content/vat_instructions_dataset.csv"

df = pd.read_csv(csv_path)
df = df.dropna(subset=["text_arabic_ocr"]).reset_index(drop=True)
df = df.rename(columns={"text_arabic_ocr": "text"})

# ==============================================
# 2️⃣  Split long text into manageable chunks
# ==============================================
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = []
for i, row in df.iterrows():
    chunks = splitter.split_text(str(row["text"]))
    for ch in chunks:
        texts.append(ch)

print(f"✅ Prepared {len(texts)} text chunks for indexing")

# ==============================================
# 3️⃣  Create embeddings (Arabic-friendly model)
# ==============================================
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

vectordb = FAISS.from_texts(texts, embeddings)
vectordb.save_local("vat_faiss_index")
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# ==============================================
# 4️⃣  Choose a multilingual instruct LLM
# ==============================================
# Hugging Face Hub token
import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

llm = HuggingFaceHub(
    repo_id="tiiuae/falcon-7b-instruct",
    model_kwargs={"temperature": 0.3, "max_new_tokens": 256}
)

# ==============================================
# 5️⃣  Build the RAG QA chain
# ==============================================
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

# ==============================================
# 6️⃣  Ask questions interactively
# ==============================================
query = "ما هي الحالات التي يتم فيها إعفاء السلع من ضريبة القيمة المضافة؟"
result = rag_chain.invoke(query)

print("\n🔹 الجواب:")
print(result["result"])
print("\n📄 المقاطع المستخدمة:")
for i, doc in enumerate(result["source_documents"], 1):
    snippet = doc.page_content[:200].replace("\n", " ")
    print(f"{i}. {snippet}...")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/566.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m563.2/566.1 kB[0m [31m15.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Prepared 252 text chunks for indexing


AttributeError: 'InferenceClient' object has no attribute 'post'

In [14]:
# =========================================
# ✅ Minimal Arabic RAG – no LangChain
# =========================================
!pip install -q faiss-cpu sentence-transformers transformers pandas

import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import numpy as np

# ---------- 1️⃣ Load dataset ----------
csv_path = "/content/vat_instructions_dataset.csv"  # change if needed
df = pd.read_csv(csv_path).dropna(subset=["text_arabic_ocr"])
docs = df["text_arabic_ocr"].tolist()
print(f"Loaded {len(docs)} documents")

# ---------- 2️⃣ Create embeddings index ----------
embedder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
embeddings = embedder.encode(docs, convert_to_numpy=True, normalize_embeddings=True)
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
print(f"FAISS index built with {index.ntotal} entries")

# ---------- 3️⃣ Load lightweight multilingual model ----------
model_id = "google/mt5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
gen = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)

# ---------- 4️⃣ Define RAG function ----------
def ask(question, top_k=3):
    """Answer ``question`` using the most similar documents as context.

    The question is embedded with the module-level ``embedder``, the ``top_k``
    nearest entries are looked up in the module-level FAISS ``index``, the
    matching ``docs`` are joined into a context block, and the resulting
    Arabic prompt is passed to the module-level ``gen`` pipeline.

    Args:
        question: The user question (text).
        top_k: Number of neighbours requested from the index.

    Returns:
        The ``generated_text`` of the first pipeline result.
    """
    q_emb = embedder.encode([question], normalize_embeddings=True)
    _, idxs = index.search(q_emb, top_k)
    # FAISS reports missing neighbours as -1 (e.g. when top_k exceeds the number
    # of indexed vectors). Without this filter, docs[-1] would silently inject
    # the last document into the context.
    retrieved = [docs[i] for i in idxs[0] if i >= 0]
    context = "\n".join(retrieved)
    prompt = f"السؤال: {question}\n\nاستعن بالنص التالي للإجابة:\n{context}\n\nالإجابة:"
    return gen(prompt)[0]["generated_text"]

# ---------- 5️⃣ Try a question ----------
# Echo the question first, then run retrieval + generation and show the answer.
q = "ما هي الحالات التي يتم فيها إعفاء السلع من ضريبة القيمة المضافة؟"
print("🔹 السؤال:", q)
answer_text = ask(q)
print("🔸 الإجابة:", answer_text)


Loaded 85 documents
FAISS index built with 85 entries


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


🔹 السؤال: ما هي الحالات التي يتم فيها إعفاء السلع من ضريبة القيمة المضافة؟
🔸 الإجابة: <extra_id_0> - ٠٠٠ جنيه.


In [None]:
# NOTE: this variant was marked by its author as not runnable on Colab because of GPU limits.
# It builds a FAISS-backed RetrievalQA chain around a locally loaded causal LM.
import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# ---- Dataset: keep rows that have OCR text and expose it as "text" ----
csv_path = "/content/vat_instructions_dataset.csv"
df = pd.read_csv(csv_path)
df = df.dropna(subset=["text_arabic_ocr"])
df = df.reset_index(drop=True)
df = df.rename(columns={"text_arabic_ocr": "text"})

# ---- Chunking: 500-character windows with 100 characters of overlap ----
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = []
for t in df["text"]:
    texts.extend(splitter.split_text(str(t)))
print(f"✅ Prepared {len(texts)} chunks")

# ---- Embeddings and vector store; the retriever returns the 3 best chunks ----
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
vectordb = FAISS.from_texts(texts, embeddings)
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# ---- Generator model, loaded locally and wrapped for LangChain ----
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

gen_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
)
llm = HuggingFacePipeline(pipeline=gen_pipe)

# ---- Retrieval-augmented QA chain that also returns its source passages ----
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

# ---- Run one Arabic query and print the answer with passage previews ----
query = "ما هي الحالات التي يتم فيها إعفاء السلع من ضريبة القيمة المضافة؟"
result = rag_chain.invoke(query)

print("\n🔹 الجواب:")
print(result["result"])
print("\n📄 المقاطع المستخدمة:")
for i, doc in enumerate(result["source_documents"], start=1):
    preview = doc.page_content[:200]
    print(f"{i}. {preview}...\n")


In [2]:
!pip install -U langchain langchain-community langchain-text-splitters sentence-transformers faiss-cpu transformers accelerate bitsandbytes --quiet


In [4]:
# =====================================================
# 🔥 Arabic RAG with FAISS + Mistral-7B-Instruct
# =====================================================
!pip install -U langchain langchain-community sentence-transformers faiss-cpu \
               transformers accelerate bitsandbytes --quiet

import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# -----------------------------------------------------
# 1️⃣ Load your dataset
# -----------------------------------------------------
csv_path = "/content/vat_instructions_dataset_ocr.csv"
df = pd.read_csv(csv_path).dropna(subset=["text_arabic_ocr"]).reset_index(drop=True)
df = df.rename(columns={"text_arabic_ocr": "text"})

# -----------------------------------------------------
# 2️⃣ Split long pages into short chunks
# -----------------------------------------------------
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = [ch for t in df["text"] for ch in splitter.split_text(str(t))]
print(f"✅ Prepared {len(texts)} chunks")

# -----------------------------------------------------
# 3️⃣ Build the FAISS vector index (multilingual embeddings)
# -----------------------------------------------------
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
vectordb = FAISS.from_texts(texts, embeddings)
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# -----------------------------------------------------
# 4️⃣ Load a strong multilingual instruction model
# -----------------------------------------------------
model_id = "mistralai/Mistral-7B-Instruct-v0.2"   # ~13 GB, needs GPU
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",              # uses GPU automatically
    torch_dtype="auto",
    load_in_8bit=True               # halves VRAM usage if bitsandbytes available
)

gen_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.3,
    do_sample=False
)
llm = HuggingFacePipeline(pipeline=gen_pipe)

# -----------------------------------------------------
# 5️⃣ Build Retrieval-Augmented QA chain
# -----------------------------------------------------
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

# -----------------------------------------------------
# 6️⃣ Ask a question in Arabic
# -----------------------------------------------------
query = "ما هي الحالات التي يتم فيها إعفاء السلع من ضريبة القيمة المضافة؟"
result = rag_chain.invoke(query)

print("\n🔹 الجواب:")
print(result["result"])
print("\n📄 المقاطع المستخدمة:")
for i, doc in enumerate(result["source_documents"], 1):
    snippet = doc.page_content[:200].replace("\n", " ")
    print(f"{i}. {snippet}...")


ModuleNotFoundError: No module named 'langchain.chains'