In [1]:
import os
import subprocess

# Define the package folder
package_folder = 'Packages'

# Create the folder if it doesn't exist
os.makedirs(package_folder, exist_ok=True)

# List of external packages
packages = [
    'chromadb',
    'deep_translator',
    'docx',
    'pandas',
    'pdfplumber',
    'tiktoken',
    'transformers'
]

# Step 1: Download the packages into the folder
print("Downloading packages...")
for package in packages:
    print(f"Downloading {package}...")
    subprocess.run([
        'pip', 'download', package, '--dest', package_folder
    ])

# Step 2: Install the packages one by one from the local folder
print("\nInstalling packages from local folder...")
for package in packages:
    print(f"Installing {package} from local folder...")
    subprocess.run([
        'pip', 'install', '--no-index', '--find-links', package_folder, package
    ])

print("\nAll packages installed successfully!")

Downloading packages...
Downloading chromadb...
Downloading deep_translator...
Downloading docx...
Downloading pandas...
Downloading pdfplumber...
Downloading tiktoken...
Downloading transformers...

Installing packages from local folder...
Installing chromadb from local folder...
Installing deep_translator from local folder...
Installing docx from local folder...
Installing pandas from local folder...
Installing pdfplumber from local folder...
Installing tiktoken from local folder...
Installing transformers from local folder...

All packages installed successfully!


In [6]:
from docx import Document
import pandas as pd

def extract_text_from_docx(file_path):
    doc = Document(file_path)
    return '\n'.join([para.text for para in doc.paragraphs if para.text.strip()])
import pdfplumber
import contextlib

def extract_text_from_pdf(file_path):
    text = ''
    with contextlib.redirect_stderr(None):  # 🚫 يخفي التحذيرات
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + '\n'
    return text

def extract_text_from_excel(file_path):
    text = ''
    xls = pd.ExcelFile(file_path)
    for sheet_name in xls.sheet_names:
        df = xls.parse(sheet_name)
        text += f'\n\nSheet: {sheet_name}\n'
        text += df.to_string(index=False)
    return text

In [9]:
extract_text_from_docx("../data/Dataset summaries and citations.docx")
extract_text_from_docx("../data/M.Sc. Applied Psychology.docx")
extract_text_from_docx("../data/Stats.docx")
extract_text_from_excel("../data/Loan amortisation schedule1.xlsx")
extract_text_from_excel("../data/Loan analysis.xlsx")
extract_text_from_excel("../data/party budget1.xlsx")
extract_text_from_pdf("../data/new-approaches-and-procedures-for-cancer-treatment.pdf")
extract_text_from_pdf("../data/Ocean_ecogeochemistry_A_review.pdf")
extract_text_from_pdf("../data/The_Plan_of_the_Giza_Pyramids.pdf")
extract_text_from_pdf("../data/The-Alchemist.pdf")


  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  for idx, row in parser.parse():
  warn(f"Print area cannot be set to Defined name: {defn.value}.")




In [23]:
import tiktoken

def chunk_text(text, chunk_size=500, overlap=50):
    # اختيار Tokenizer المناسب
    encoding = tiktoken.get_encoding("cl100k_base")
    
    # تحويل النص إلى Tokens
    tokens = encoding.encode(text)
    
    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk = tokens[i:i + chunk_size]
        decoded_chunk = encoding.decode(chunk)
        chunks.append(decoded_chunk)
    
    return chunks

In [25]:
def create_chunks_with_metadata(text, source_name, page_number=None):
    chunks = chunk_text(text)
    chunk_data = []
    for idx, chunk in enumerate(chunks):
        chunk_info = {
            "source": source_name,
            "page_number": page_number if page_number is not None else "N/A",
            "chunk_number": idx + 1,
            "text": chunk
        }
        chunk_data.append(chunk_info)
    return chunk_data

In [27]:
text = extract_text_from_docx("../data/Dataset summaries and citations.docx")
chunked_data = create_chunks_with_metadata(text, source_name="Dataset summaries and citations.docx")

In [31]:
for chunk in chunked_data[:3]:
    print(chunk)
    print("="*80)

{'source': 'Dataset summaries and citations.docx', 'page_number': 'N/A', 'chunk_number': 1, 'text': 'Table 1. Description of studies included in the meta-analysis. Full article citations are listed after the table.\nCitation List\nAcuña E., A. A., Pastenes V., C., & Villalobos G., L. (2017). Carbon Sequestration and Photosynthesis in Newly Established Turfgrass Cover in Central Chile. Agronomy Journal, 109(2), 397–405. https://doi.org/10.2134/agronj2016.05.0257\nBraun, R. C., & Bremer, D. J. (2019). Carbon Sequestration in Zoysiagrass Turf under Different Irrigation and Fertilization Management Regimes. Agrosystems, Geosciences & Environment, 2(1), 1–8. https://doi.org/10.2134/age2018.12.0060\nCampbell, C., Seiler, J., Wiseman, P., Strahm, B., & Munsell, J. (2014). Soil Carbon Dynamics in Residential Lawns Converted from Appalachian Mixed Oak Stands. Forests, 5(3), 425–438. https://doi.org/10.3390/f5030425\nCarley, D. S., Goodman, D., Sermons, S., Shi, W., Bowman, D., Miller, G., & Ruf

In [35]:
#Read all Document in Data Folder + Crunck them 
import os
import json
from pathlib import Path
from docx import Document
import pdfplumber
import pandas as pd
import tiktoken
import contextlib
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

# ==== Step 1: Functions to extract text ====

def extract_text_from_docx(file_path):
    doc = Document(file_path)
    return '\n'.join([para.text for para in doc.paragraphs if para.text.strip()])

def extract_text_from_pdf(file_path):
    text = ''
    with contextlib.redirect_stderr(None):  # suppress warnings
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + '\n'
    return text

def extract_text_from_excel(file_path):
    text = ''
    xls = pd.ExcelFile(file_path)
    for sheet_name in xls.sheet_names:
        text += f'\n\nSheet: {sheet_name}\n'
        df = xls.parse(sheet_name)

        # Clean up dataframe
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        df.dropna(how='all', inplace=True)
        df.dropna(axis=1, how='all', inplace=True)
        df.fillna('', inplace=True)

        text += df.to_string(index=False)
    return text

# ==== Step 2: Chunking function ====

def chunk_text(text, chunk_size=500, overlap=50):
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk = tokens[i:i + chunk_size]
        decoded_chunk = encoding.decode(chunk)
        chunks.append(decoded_chunk)
    return chunks

def create_chunks_with_metadata(text, source_name, page_number=None):
    chunks = chunk_text(text)
    chunk_data = []
    for idx, chunk in enumerate(chunks):
        chunk_info = {
            "source": source_name,
            "page_number": page_number if page_number is not None else "N/A",
            "chunk_number": idx + 1,
            "text": chunk
        }
        chunk_data.append(chunk_info)
    return chunk_data

# ==== Step 3: Process all files in a folder ====

def process_all_files(input_folder, output_json):
    all_chunks = []

    # loop through files
    for file_name in os.listdir(input_folder):
        file_path = os.path.join(input_folder, file_name)

        if file_name.lower().endswith('.docx'):
            text = extract_text_from_docx(file_path)
        elif file_name.lower().endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
        elif file_name.lower().endswith(('.xlsx', '.xlsm', '.xls')):
            text = extract_text_from_excel(file_path)
        else:
            print(f"Skipping unsupported file: {file_name}")
            continue

        chunks = create_chunks_with_metadata(text, source_name=file_name)
        all_chunks.extend(chunks)
        print(f"✅ Processed {file_name} - {len(chunks)} chunks")

    # save all chunks to JSON
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(all_chunks, f, ensure_ascii=False, indent=4)

    print(f"\n🎉 All done! Data saved to {output_json}")

# ==== Step 4: Run the pipeline ====

# Specify your folder and output path
input_folder = "../data"
output_json = "processed_chunks.json"

process_all_files(input_folder, output_json)

✅ Processed Dataset summaries and citations.docx - 5 chunks
✅ Processed Loan amortisation schedule1.xlsx - 1 chunks
✅ Processed Loan analysis.xlsx - 1 chunks
✅ Processed M.Sc. Applied Psychology.docx - 54 chunks
✅ Processed new-approaches-and-procedures-for-cancer-treatment.pdf - 27 chunks
✅ Processed Ocean_ecogeochemistry_A_review.pdf - 123 chunks
✅ Processed party budget1.xlsx - 1 chunks
✅ Processed Stats.docx - 4 chunks
✅ Processed The-Alchemist.pdf - 124 chunks
✅ Processed The_Plan_of_the_Giza_Pyramids.pdf - 21 chunks

🎉 All done! Data saved to processed_chunks.json


In [37]:
#download noamic Library and Install it
import os
import subprocess

# Step 1: Create Packages folder
package_folder = 'Packages'
os.makedirs(package_folder, exist_ok=True)

# Step 2: Download nomic and its dependencies
print("📦 Downloading 'nomic' and dependencies...")
subprocess.run(['pip', 'download', 'nomic', '--dest', package_folder])

# Step 3: Install nomic from local Packages folder
print("⚙️ Installing 'nomic' from local folder...")
subprocess.run(['pip', 'install', '--no-index', '--find-links', package_folder, 'nomic'])

print("✅ 'nomic' installed successfully from local packages!")


📦 Downloading 'nomic' and dependencies...
⚙️ Installing 'nomic' from local folder...
✅ 'nomic' installed successfully from local packages!


In [41]:
import json
import chromadb
from chromadb.utils import embedding_functions

def build_vector_database(json_file_path, db_path="./chromadb", collection_name="dr_x_research"):
    # ===== Step 1: Load processed chunks from JSON =====
    try:
        with open(json_file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"❌ Error loading JSON: {e}")
        return

    # ===== Step 2: Prepare embedding function =====
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

    # ===== Step 3: Setup ChromaDB client =====
    try:
        client = chromadb.PersistentClient(path=db_path)

        # Since your version does not support passing embedding_function here, we create collection simply
        collection = client.get_or_create_collection(name=collection_name)

    except Exception as e:
        print(f"❌ Error setting up ChromaDB client: {e}")
        return

    # ===== Step 4: Prepare data =====
    try:
        ids = [f"chunk_{i}" for i in range(len(data))]
        texts = [chunk["text"] for chunk in data]
        metadatas = [
            {
                "source": chunk["source"],
                "page_number": chunk["page_number"],
                "chunk_number": chunk["chunk_number"]
            }
            for chunk in data
        ]
    except Exception as e:
        print(f"❌ Error preparing data: {e}")
        return

    # ===== Step 5: Generate embeddings manually and add to collection =====
    try:
        print("🧩 Generating embeddings, please wait...")
        embeddings = embedding_func(texts)

        print("💾 Adding to vector database...")
        collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=texts,
            metadatas=metadatas
        )

        print(f"🎉 Done! Added {len(texts)} chunks to the vector database at {db_path}")

    except Exception as e:
        print(f"❌ Error adding data to collection: {e}")

In [43]:
build_vector_database("processed_chunks.json")

🧩 Generating embeddings, please wait...
💾 Adding to vector database...
🎉 Done! Added 361 chunks to the vector database at ./chromadb


In [45]:
import chromadb
from chromadb.utils import embedding_functions

def query_vector_database(user_query, db_path="./chromadb", collection_name="dr_x_research", top_k=5):
    # إعداد نموذج التضمين (نفس الذي استخدمناه سابقًا)
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

    # إعداد ChromaDB client
    client = chromadb.PersistentClient(path=db_path)
    collection = client.get_or_create_collection(name=collection_name)

    # تحويل السؤال إلى embedding
    query_embedding = embedding_func([user_query])

    # البحث في قاعدة البيانات
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=top_k
    )

    # استخراج النصوص والميتا داتا
    documents = results['documents'][0]
    metadatas = results['metadatas'][0]

    # طباعة النتائج
    print("\n📄 Top retrieved chunks:")
    for idx, doc in enumerate(documents):
        meta = metadatas[idx]
        print(f"\n--- Chunk {meta['chunk_number']} from {meta['source']} ---")
        print(doc)

    # إرجاع النتائج لاستخدامها لاحقًا في مرحلة التوليد
    return documents


In [47]:
def generate_answer(context_chunks, user_question):
    # دمج السياق مع السؤال
    context = "\n\n".join(context_chunks)
    prompt = f"""أنت مساعد ذكي. استخدم المعلومات التالية للإجابة على السؤال.
    
المعلومات:
{context}

السؤال:
{user_question}

الإجابة:"""

    # الآن نمرر الـ prompt إلى نموذج LLM المحلي
    # مثال مع LLaMA أو أي نموذج:
    # response = model.generate(prompt)

    # للتجربة مؤقتًا حتى ندمج LLaMA
    response = "📌 (هنا سيتم توليد الإجابة بواسطة نموذج LLaMA لاحقًا.)"

    print("\n🤖 الإجابة النهائية:")
    print(response)

    return response


In [49]:
user_question = "ما هو ملخص دراسات الدكتور X حول المحيطات؟"

# خطوة 1: البحث في ChromaDB
retrieved_chunks = query_vector_database(user_question)

# خطوة 2: توليد الإجابة باستخدام LLM
generate_answer(retrieved_chunks, user_question)



📄 Top retrieved chunks:

--- Chunk 40 from The-Alchemist.pdf ---
 on our side, and do as much to help it as it’s doing to
help us. It’s called the principle of favorability. Or beginner’s luck.”
The merchant was silent for a few moments. Then he said, “The
Prophet gave us the Koran, and left us just five obligations to satisfy
during our lives. The most important is to believe only in the one true
God. The others are to pray five times a day, fast during Ramadan,
and be charitable to the poor.”
He stopped there. His eyes filled with tears as he spoke of the
Prophet. He was a devout man, and, even with all his impatience, he
wanted to live his life in accordance with Muslim law.
“What’s the fifth obligation?” the boy asked.
“Two days ago, you said that I had never dreamed of travel,” the
merchant answered. “The fifth obligation of every Muslim is a
pilgrimage. We are obliged, at least once in our lives, to visit the holy
city of Mecca.
“Mecca is a lot farther away than the Pyramids. Wh

'📌 (هنا سيتم توليد الإجابة بواسطة نموذج LLaMA لاحقًا.)'

In [51]:
import os
import subprocess

# Define the packages list
packages = ['transformers', 'torch', 'sentencepiece']

# Define the folder to store the packages
package_folder = 'Packages'
os.makedirs(package_folder, exist_ok=True)

# Step 1: Download all packages
print("📦 Downloading packages...")
for package in packages:
    print(f"Downloading {package}...")
    subprocess.run(['pip', 'download', package, '--dest', package_folder], check=True)

# Step 2: Install all packages locally, one by one
print("\n⚙️ Installing packages from local folder...")
for package in packages:
    print(f"Installing {package} from local folder...")
    subprocess.run([
        'pip', 'install', '--no-index', '--find-links', package_folder, package
    ], check=True)

print("\n✅ All packages installed successfully from local folder!")


📦 Downloading packages...
Downloading transformers...
Downloading torch...
Downloading sentencepiece...

⚙️ Installing packages from local folder...
Installing transformers from local folder...
Installing torch from local folder...
Installing sentencepiece from local folder...

✅ All packages installed successfully from local folder!


In [55]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# تحميل النموذج والـ tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# إعداد السؤال والسياق
def generate_answer(context_chunks, user_question):
    context = "\n\n".join(context_chunks)
    prompt = f"""أنت مساعد ذكي. استخدم المعلومات التالية للإجابة على السؤال.
    
المعلومات:
{context}

السؤال: {user_question}

الإجابة:"""

    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=50)

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("\n🤖 الإجابة النهائية:")
    print(answer)

    return answer


model.safetensors:  58%|#####7    | 1.27G/2.20G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [56]:
user_question = "ما هو ملخص دراسات الدكتور X حول المحيطات؟"

retrieved_chunks = query_vector_database(user_question)
generate_answer(retrieved_chunks, user_question)

Token indices sequence length is longer than the specified maximum sequence length for this model (2931 > 2048). Running this sequence through the model will result in indexing errors



📄 Top retrieved chunks:

--- Chunk 40 from The-Alchemist.pdf ---
 on our side, and do as much to help it as it’s doing to
help us. It’s called the principle of favorability. Or beginner’s luck.”
The merchant was silent for a few moments. Then he said, “The
Prophet gave us the Koran, and left us just five obligations to satisfy
during our lives. The most important is to believe only in the one true
God. The others are to pray five times a day, fast during Ramadan,
and be charitable to the poor.”
He stopped there. His eyes filled with tears as he spoke of the
Prophet. He was a devout man, and, even with all his impatience, he
wanted to live his life in accordance with Muslim law.
“What’s the fifth obligation?” the boy asked.
“Two days ago, you said that I had never dreamed of travel,” the
merchant answered. “The fifth obligation of every Muslim is a
pilgrimage. We are obliged, at least once in our lives, to visit the holy
city of Mecca.
“Mecca is a lot farther away than the Pyramids. Wh

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.



🤖 الإجابة النهائية:
أنت مساعد ذكي. استخدم المعلومات التالية للإجابة على السؤال.
    
المعلومات:
 on our side, and do as much to help it as it’s doing to
help us. It’s called the principle of favorability. Or beginner’s luck.”
The merchant was silent for a few moments. Then he said, “The
Prophet gave us the Koran, and left us just five obligations to satisfy
during our lives. The most important is to believe only in the one true
God. The others are to pray five times a day, fast during Ramadan,
and be charitable to the poor.”
He stopped there. His eyes filled with tears as he spoke of the
Prophet. He was a devout man, and, even with all his impatience, he
wanted to live his life in accordance with Muslim law.
“What’s the fifth obligation?” the boy asked.
“Two days ago, you said that I had never dreamed of travel,” the
merchant answered. “The fifth obligation of every Muslim is a
pilgrimage. We are obliged, at least once in our lives, to visit the holy
city of Mecca.
“Mecca is a lot far

'أنت مساعد ذكي. استخدم المعلومات التالية للإجابة على السؤال.\n    \nالمعلومات:\n on our side, and do as much to help it as it’s doing to\nhelp us. It’s called the principle of favorability. Or beginner’s luck.”\nThe merchant was silent for a few moments. Then he said, “The\nProphet gave us the Koran, and left us just five obligations to satisfy\nduring our lives. The most important is to believe only in the one true\nGod. The others are to pray five times a day, fast during Ramadan,\nand be charitable to the poor.”\nHe stopped there. His eyes filled with tears as he spoke of the\nProphet. He was a devout man, and, even with all his impatience, he\nwanted to live his life in accordance with Muslim law.\n“What’s the fifth obligation?” the boy asked.\n“Two days ago, you said that I had never dreamed of travel,” the\nmerchant answered. “The fifth obligation of every Muslim is a\npilgrimage. We are obliged, at least once in our lives, to visit the holy\ncity of Mecca.\n“Mecca is a lot farth

In [None]:
from transformers import MarianMTModel, MarianTokenizer

def translate_text(text, src_lang="en", tgt_lang="ar"):
    if src_lang == "en" and tgt_lang == "ar":
        model_name = "Helsinki-NLP/opus-mt-en-ar"
    elif src_lang == "ar" and tgt_lang == "en":
        model_name = "Helsinki-NLP/opus-mt-ar-en"
    else:
        raise ValueError("❌ اللغات غير مدعومة! الرجاء اختيار en <-> ar فقط.")

    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    # ✅ Wrap text in list to fix attention_mask shape
    inputs = tokenizer([text], return_tensors="pt", padding=True)

    translated = model.generate(**inputs, max_length=512)

    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)

    print(f"\n🌍 الترجمة ({src_lang} → {tgt_lang}):")
    print(translated_text.strip())

    return translated_text.strip()


In [58]:
from transformers import MarianMTModel, MarianTokenizer

def translate_text(text, src_lang="en", tgt_lang="ar"):
    # اختيار النموذج بناءً على الاتجاه
    if src_lang == "en" and tgt_lang == "ar":
        model_name = "Helsinki-NLP/opus-mt-en-ar"
    elif src_lang == "ar" and tgt_lang == "en":
        model_name = "Helsinki-NLP/opus-mt-ar-en"
    else:
        raise ValueError("❌ اللغات غير مدعومة! الرجاء اختيار en <-> ar فقط.")

    # تحميل النموذج والـ tokenizer
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    # تجهيز الإدخال
    inputs = tokenizer(text, return_tensors="pt", padding=True)

    # الترجمة
    translated = model.generate(**inputs, max_length=512)

    # فك التوكنز
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)

    print(f"\n🌍 الترجمة ({src_lang} → {tgt_lang}):")
    print(translated_text.strip())

    return translated_text.strip()


In [61]:
answer=generate_answer(retrieved_chunks, user_question)
translate_text(answer, src_lang="en", tgt_lang="ar")


🤖 الإجابة النهائية:
أنت مساعد ذكي. استخدم المعلومات التالية للإجابة على السؤال.
    
المعلومات:
 on our side, and do as much to help it as it’s doing to
help us. It’s called the principle of favorability. Or beginner’s luck.”
The merchant was silent for a few moments. Then he said, “The
Prophet gave us the Koran, and left us just five obligations to satisfy
during our lives. The most important is to believe only in the one true
God. The others are to pray five times a day, fast during Ramadan,
and be charitable to the poor.”
He stopped there. His eyes filled with tears as he spoke of the
Prophet. He was a devout man, and, even with all his impatience, he
wanted to live his life in accordance with Muslim law.
“What’s the fifth obligation?” the boy asked.
“Two days ago, you said that I had never dreamed of travel,” the
merchant answered. “The fifth obligation of every Muslim is a
pilgrimage. We are obliged, at least once in our lives, to visit the holy
city of Mecca.
“Mecca is a lot far

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2624 > 512). Running this sequence through the model will result in indexing errors


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

IndexError: index out of range in self

In [64]:
import time


In [None]:
#Samurize the Text
def summarize_text(text, max_tokens=100):
    prompt = f"تلخيص النص التالي:\n\n{text}"

    inputs = tokenizer(prompt, return_tensors="pt", padding=True)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        num_beams=4,
        early_stopping=True
    )

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("\n📝 الملخص:")
    print(summary.strip())

    return summary.strip()
summarize_text(answer)

In [None]:
# Generate The Answer with Calculation The Time
start_time = time.time()

answer = generate_answer(retrieved_chunks, user_question)

end_time = time.time()
elapsed_time = end_time - start_time

print(f"\n⏱️ الوقت المستغرق لتوليد الإجابة: {elapsed_time:.2f} ثانية")

In [None]:
start_time = time.time()

translated_answer = translate_text(answer, src="en", tgt="ar")

end_time = time.time()
elapsed_time = end_time - start_time

print(f"\n⏱️ الوقت المستغرق للترجمة: {elapsed_time:.2f} ثانية")

In [None]:
start_time = time.time()

summary = summarize_text(answer)

end_time = time.time()
elapsed_time = end_time - start_time

print(f"\n⏱️ الوقت المستغرق للتلخيص: {elapsed_time:.2f} ثانية")