In [1]:
# ============================
# Install required packages
# ============================
!pip install langchain                       # Core LangChain framework (chains, prompts, integrations)
!pip install PyPDF2                          # PDF text extraction
!pip install faiss-cpu                       # FAISS vector index (CPU build)
!pip install tiktoken                        # Tokenization utilities (not strictly needed but commonly useful)
!pip install langchain-community             # Community integrations for LangChain (embeddings, LLM wrappers, stores)
!pip install -q transformers accelerate bitsandbytes sentence-transformers  # HF models, speedups, 4-bit quant, sentence embeddings


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m70.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-n

In [2]:

# ============================
# Import required libraries
# ============================
from PyPDF2 import PdfReader                  # Read and parse PDF files
from langchain.text_splitter import CharacterTextSplitter  # Split raw text into manageable chunks
from langchain.vectorstores import FAISS      # Vector store (if this import fails, use community import below)
# from langchain_community.vectorstores import FAISS  # <- Alternative import if needed
from langchain.chains.question_answering import load_qa_chain  # Simple QA chain ("stuff" type)
from langchain.prompts import PromptTemplate   # Custom prompt templates for LangChain chains

from transformers import (                     # HF Transformers to load/run the ALLaM 7B model
    AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
)
from langchain_community.llms import HuggingFacePipeline     # Wrap HF pipeline into a LangChain LLM
from langchain_community.embeddings import HuggingFaceEmbeddings  # Open-source embeddings (E5, etc.)

from google.colab import drive                 # Mount Google Drive (optional for file I/O)


In [3]:
# ============================
# Provide the path of the PDF file
# ============================
pdfreader = PdfReader('/content/sample_data/ai-principles.pdf')  # Load the target PDF from /content

In [4]:
# ============================
# Extract text from PDF pages
# ============================
raw_text = ''                                  # Accumulator for all extracted text
for i, page in enumerate(pdfreader.pages):     # Iterate over pages
    content = page.extract_text()              # Extract text from the current page
    if content:                                # Append only if extraction succeeded
        raw_text += content


In [5]:
# ============================
# Split the text into smaller chunks
# ============================
# This prevents exceeding token limits for the model and improves retrieval granularity
text_splitter = CharacterTextSplitter(
    separator = "\n",                          # Split on newline boundaries
    chunk_size = 800,                          # Each chunk target size (in characters)
    chunk_overlap  = 200,                      # Overlap to maintain context continuity
    length_function = len,                     # Use Python 'len' to measure length
)
texts = text_splitter.split_text(raw_text)     # Produce a list of chunked strings

print(f"Number of text chunks: {len(texts)}")  # Quick sanity check on chunk count

Number of text chunks: 123


In [6]:
# ============================
# Create open-source embeddings (supports Arabic)
# ============================
# E5 models work best when prefixing "passage:" for documents and "query:" for user questions
texts_for_index = [f"passage: {t}" for t in texts]  # Prepare chunks for E5 format
embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-base",     # Multilingual (Arabic-friendly) embedding model
    encode_kwargs={"normalize_embeddings": True}    # L2-normalize vectors for cosine similarity
)

  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [7]:
# ============================
# Create FAISS vector store from texts
# ============================
document_search = FAISS.from_texts(texts_for_index, embeddings)  # Build the FAISS index from embeddings

print("Vector store created successfully.")     # Confirm index creation


Vector store created successfully.


In [8]:
# ============================
# Load ALLaM-AI/ALLaM-7B-Instruct-preview model
# ============================
model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"    # The open-source Arabic/English instruct model

# Configure 4-bit quantization to save memory (recommended for Colab T4)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                         # Enable 4-bit loading
    bnb_4bit_quant_type="nf4",                 # NF4 quantization scheme (better accuracy)
    bnb_4bit_use_double_quant=True,            # Double quantization for memory/accuracy balance
    bnb_4bit_compute_dtype="bfloat16"          # Compute in bfloat16 on supported GPUs
)


In [9]:
# Load tokenizer and model from Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)  # Tokenizer with custom chat template support
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",                         # Let HF infer the best device placement (GPU/CPU)
    quantization_config=bnb_config,            # Apply 4-bit quantization config
    torch_dtype="bfloat16",                    # Model compute dtype (matches compute dtype above)
    trust_remote_code=True,                    # Allow custom modeling code provided by the repo
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.03G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [10]:
# Create a text generation pipeline (HF)
gen_pipe = pipeline(
    "text-generation",                         # Task: causal LM generation
    model=model,                               # The loaded ALLaM model
    tokenizer=tokenizer,                       # Matched tokenizer
    max_new_tokens=512,                        # Upper bound on generated tokens per call
    do_sample=True,                            # Enable sampling for natural responses
    temperature=0.2,                           # Low temperature for more deterministic outputs
    repetition_penalty=1.05,                   # Light penalty to reduce verbatim repetition
)

Device set to use cuda:0


In [11]:
# Wrap the HF pipeline in a LangChain LLM interface
llm = HuggingFacePipeline(pipeline=gen_pipe)    # Allows using this model inside LangChain chains


  llm = HuggingFacePipeline(pipeline=gen_pipe)    # Allows using this model inside LangChain chains


In [12]:
# ============================
# Create QA chain with custom prompt
# ============================
prompt_tmpl = PromptTemplate(
    input_variables=["context", "question"],   # Variables LangChain will inject
    template=(
        "أنت مساعد أكاديمي. أجب عن السؤال بدقة وبالعربية الفصحى "
        "اعتمادًا فقط على المقاطع المعطاة.\n"
        "إن لم تجد الإجابة في المقاطع، قل: «لا يظهر ذلك في الوثيقة».\n\n"
        "السؤال: {question}\n\n"
        "المقاطع:\n{context}\n\n"
        "الإجابة:"
    ),
)

chain = load_qa_chain(                         # Build a simple 'stuff' QA chain
    llm,                                       # Our ALLaM-based LLM
    chain_type="stuff",                        # Concatenate retrieved docs then prompt the LLM
    prompt=prompt_tmpl                         # Use our Arabic instruction prompt
)


stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  chain = load_qa_chain(                         # Build a simple 'stuff' QA chain


In [13]:
# ============================
# Helper function to search and answer
# ============================
def ask_question(query, k=4):                  # Small helper for convenience
    docs = document_search.similarity_search(  # Retrieve top-k relevant chunks from FAISS
        f"query: {query}", k=k                 # Prefix "query:" for E5 alignment
    )
    answer = chain.run(                        # Run the QA chain with docs + question
        input_documents=docs, question=query
    )
    print(f"Q: {query}")                       # Pretty print the result
    print(f"A: {answer}\n")


In [16]:
# ============================
# Example queries
# ============================
ask_question("كيف يتم انتهاك خصوصية المستخدم؟")           # Example 1
ask_question("كيف يتم التحقق من ان النموذج لايوجد فيه انتهاك ؟")               # Example 2



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Q: كيف يتم انتهاك خصوصية المستخدم؟
A: أنت مساعد أكاديمي. أجب عن السؤال بدقة وبالعربية الفصحى اعتمادًا فقط على المقاطع المعطاة.
إن لم تجد الإجابة في المقاطع، قل: «لا يظهر ذلك في الوثيقة».

السؤال: كيف يتم انتهاك خصوصية المستخدم؟

المقاطع:
passage: واﺧﺘﺒﺎرﻫﺎ ﻟﻠﻮﺻﻮل إﻟﻰ ﻧﺘﺎﺋﺞ ﻣﺤﺪدة.
 ﻳﺴﺘﻬﻠﻚ أو ﻳﺴـــﺘﺨﺪم اﻟﺴـــﻠﻊ أو اﻟﺨﺪﻣﺎت اﻟﺘﻲ أي ﺷـــﺨﺺ ذي ﺻﻔـــﺔ ﻃﺒﻴﻌﻴـــﺔ أو اﻋﺘﺒﺎرﻳـــﺔ 
ﺗﻨﺘﺠﻬﺎ أﻧﻈﻤﺔ اﻟﺬﻛﺎء اﻻﺻﻄﻨﺎﻋﻲ.
اﻟﻔﺮد اﻟﺬي ﺗﺘﻌﻠﻖ ﺑﻪ اﻟﺒﻴﺎﻧﺎت اﻟﺸﺨﺼﻴﺔ.
 ﻳﺠﻌﻞ اﻟﺘﻌﺮف ﻋﻠﻴﻪ ﻣﻤﻜﻨﺎً ﺑﺼﻔﺔ ﻣﺒﺎﺷـﺮة أو ﻏﻴﺮ أن ﻳـﺆدي إﻟـﻰ ﻣﻌﺮﻓـﺔ اﻟﻔـﺮد ﻋﻠﻰ وﺟﻪ اﻟﺘﺤﺪﻳﺪ، أو ﻛﻞ ﺑﻴﺎن -ﻣﻬﻤﺎ ﻛﺎن ﻣﺼﺪره أو ﺷﻜﻠﻪ- ﻣﻦ ﺷﺄﻧﻪ 
 ﻣﺒﺎﺷـﺮة، وﻣـﻦ ذﻟـﻚ: اﻻﺳـﻢ، ورﻗـﻢ اﻟﻬﻮﻳـﺔ 
 اﻟﺸـﺨﺼﻴﺔ، واﻟﻌﻨﺎوﻳـﻦ، وأرﻗـﺎم اﻟﺘﻮاﺻﻞ، وأرﻗﺎم 
 اﻟﺮُّﺧـﺺ واﻟﺴـﺠﻼت واﻟﻤﻤﺘﻠـﻜﺎت اﻟﺸـﺨﺼﻴﺔ، 
 وأرﻗـﺎم اﻟﺤﺴـﺎﺑﺎت اﻟﺒﻨﻜﻴـﺔ واﻟﺒﻄﺎﻗﺎت اﻻﺋﺘﻤﺎﻧﻴﺔ، 
 وﺻـﻮر اﻟﻔـﺮد اﻟﺜﺎﺑﺘـﺔ أو اﻟﻤﺘﺤﺮﻛـﺔ، وﻏﻴـﺮ ذﻟـﻚ ﻣﻦ 
اﻟﺒﻴﺎﻧﺎت ذات اﻟﻄﺎﺑﻊ اﻟﺸﺨﺼﻲ. ﻛﻞ ﺑﻴــﺎن ﺷــﺨﺼﻲ ﻳﺘﻀﻤــﻦ اﻹﺷــﺎرة إﻟــﻰ أﺻــﻞ
 اﻟﺒﻴﺎﻧــﺎت اﻷﻣﻨﻴــﺔ واﻟﺠﻨﺎﺋﻴــﺔ، أو ﺑﻴﺎﻧــﺎت اﻟﺴــﻤﺎت اﻟﺪﻳﻨــﻲ، أو اﻟﻔﻜــﺮي، أو اﻟﺴﻴﺎﺳــﻲ. وﻛﺬﻟــﻚ اﻟﻔــﺮد اﻟﻌﺮﻗــ