In [1]:
import json
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data used below: the Punkt tokenizer models and the
# stop-word corpora.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' when tokenizing — confirm against the installed NLTK version.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Arabic stop words, kept in a set for fast membership tests
stop_words = set(stopwords.words('arabic'))

# Input and output locations (relative to the working directory)
json_folder = "books_json"  # Folder holding the per-book JSON files
output_file = "processed_books_content.json"  # Destination for the processed result

def load_data(file_path):
    """Load the text payload of one book from a JSON file.

    The file is expected to contain a JSON object. The value stored under
    its first key is returned; the key itself and any further entries are
    ignored.

    Args:
        file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        The value of the first key/value pair in the JSON object.

    Raises:
        ValueError: If the top-level JSON value is not an object, or the
            object has no entries.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Fail with a message that names the file instead of leaking a bare
    # StopIteration (empty object) or AttributeError (non-object).
    if not isinstance(data, dict) or not data:
        raise ValueError(
            f"Expected a non-empty JSON object in '{file_path}', "
            f"got {type(data).__name__}"
        )

    return next(iter(data.values()))

def clean_arabic_text(text):
    """Reduce text to Arabic-block characters separated by single spaces.

    Steps, in order:
      1. Delete Arabic diacritics (U+0610-U+061A, U+064B-U+065F).
      2. Delete tatweel / kashida (U+0640).
      3. Replace every character outside the Arabic block (U+0600-U+06FF)
         and every Arabic punctuation mark with a space.
      4. Collapse whitespace runs to one space and trim both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; empty if nothing Arabic remains.
    """
    # Combining marks and tatweel sit inside words, so they are deleted
    # outright rather than replaced by a space.
    text = re.sub(r'[\u0610-\u061A\u064B-\u065F]', '', text)
    text = re.sub(r'\u0640', '', text)

    # Punctuation and foreign characters separate words, so they become a
    # space: deleting them would fuse neighbours ("word,word" -> "wordword").
    # Arabic punctuation (comma, semicolon, question mark, percent sign,
    # full stop, ...) lies inside U+0600-U+06FF and must be listed explicitly.
    text = re.sub(
        r'[^\u0600-\u06FF\s]|[\u060C\u060D\u061B-\u061F\u066A-\u066D\u06D4]',
        ' ',
        text,
    )

    return re.sub(r'\s+', ' ', text).strip()

def process_arabic_text(text):
    """Tokenize text and discard tokens that are Arabic stop words.

    No case folding or other normalization is applied to the tokens.

    Args:
        text: The string to tokenize (the pipeline passes the output of
            clean_arabic_text).

    Returns:
        A list of tokens, in their original order, that are not in the
        module-level ``stop_words`` set.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue  # stop word: drop it
        kept_tokens.append(token)
    return kept_tokens

def preprocess_books(json_folder):
    """Load, clean and tokenize every ``.json`` file in a folder.

    Each file goes through load_data -> clean_arabic_text ->
    process_arabic_text. Results are keyed by book name, which is the file
    name with a trailing ``_content.json`` removed (or just the ``.json``
    extension when that suffix is absent).

    Args:
        json_folder: Directory containing the per-book JSON files.

    Returns:
        A dict mapping book name to the value returned by
        process_arabic_text for that book. Files are visited in sorted
        name order, so the dict's key order is reproducible.
    """
    content_suffix = '_content.json'
    all_books_content = {}

    # os.listdir() returns entries in arbitrary order; sort so repeated
    # runs produce the same output.
    for file_name in sorted(os.listdir(json_folder)):
        if not file_name.endswith('.json'):
            continue

        file_path = os.path.join(json_folder, file_name)
        if not os.path.isfile(file_path):
            continue  # skip directories that merely have a .json-like name

        # Load, clean, and process text
        raw_text = load_data(file_path)
        cleaned_text = clean_arabic_text(raw_text)
        processed_text = process_arabic_text(cleaned_text)

        # Strip the suffix only at the end of the name; str.replace would
        # also remove an occurrence in the middle and would leave ".json"
        # on files that lack the "_content" part.
        if file_name.endswith(content_suffix):
            book_name = file_name[:-len(content_suffix)]
        else:
            book_name = os.path.splitext(file_name)[0]

        all_books_content[book_name] = processed_text

    return all_books_content

# Run the whole pipeline over every book found in json_folder
books_content = preprocess_books(json_folder)

# Write the result as indented UTF-8 JSON; ensure_ascii=False keeps the
# Arabic text readable instead of \u-escaped.
with open(output_file, 'w', encoding='utf-8') as out_handle:
    out_handle.write(json.dumps(books_content, ensure_ascii=False, indent=4))

print(f"Preprocessing complete! Processed data saved to '{output_file}'.")


[nltk_data] Downloading package punkt to C:\Users\Sai
[nltk_data]     Rajagopal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Sai
[nltk_data]     Rajagopal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessing complete! Processed data saved to 'processed_books_content.json'.


In [3]:
pip install langchain langchain_community ollama faiss-cpu


Collecting langchain
  Downloading langchain-0.3.12-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.12-py3-none-any.whl.metadata (2.9 kB)
Collecting ollama
  Using cached ollama-0.4.4-py3-none-any.whl.metadata (4.7 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp39-cp39-win_amd64.whl.metadata (4.5 kB)
Collecting langchain-core<0.4.0,>=0.3.25 (from langchain)
  Using cached langchain_core-0.3.25-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.3 (from langchain)
  Downloading langchain_text_splitters-0.3.3-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.3,>=0.1.17 (from langchain)
  Using cached langsmith-0.2.3-py3-none-any.whl.metadata (14 kB)
Collecting pydantic<3.0.0,>=2.7.4 (from langchain)
  Using cached pydantic-2.10.3-py3-none-any.whl.metadata (172 kB)
Collecting tenacity!=8.4.0,<10,>=8.1.0 (from langchain)
  Using cached tenacity-9.0.0-py3-none-any.whl.metadata (1.2

DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.3.0 requires daal==2021.2.3, which is not installed.
spyder 5.1.5 requires pyqt5<5.13, which is not installed.
spyder 5.1.5 requires pyqtwebengine<5.13, which is not installed.
numba 0.54.1 requires numpy<1.21,>=1.17, but you have numpy 1.26.4 which is incompatible.
scipy 1.7.1 requires numpy<1.23.0,>=1.16.5, but you have numpy 1.26.4 which is incompatible.
tensorflow-intel 2.13.0 requires keras<2.14,>=2.13.1, but you have k

Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting httpcore==1.* (from httpx<0.28.0,>=0.27.0->ollama)
  Using cached httpcore-1.0.7-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<0.28.0,>=0.27.0->ollama)
  Using cached h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4.0,>=0.3.25->langchain)
  Using cached jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting packaging (from faiss-cpu)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.3,>=0.1.17->langchain)
  Downloading orjson-3.10.12-cp39-none-win_amd64.whl.metadata (42 kB)
     --------

tensorflow-intel 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.12.2 which is incompatible.

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
!pip install --upgrade typing-extensions
!pip install --upgrade langchain pydantic
!pip install sentence-transformers



DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip




DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import json
# FAISS and HuggingFaceEmbeddings are imported from langchain_community
# (like Ollama below); the langchain.vectorstores / langchain.embeddings
# paths are deprecated re-exports of the same classes.
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

# Embedding model, spelled out instead of relying on the class default.
# This is the default HuggingFaceEmbeddings() resolves to, so the index is
# unchanged.
# NOTE(review): this model is trained mainly on English text; a
# multilingual sentence-transformers model is probably a better fit for an
# Arabic corpus — confirm, and use the same model wherever the index is loaded.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# Load preprocessed content (book name -> list of tokens, as written by the
# preprocessing cell)
with open("processed_books_content.json", "r", encoding="utf-8") as file:
    books_content = json.load(file)

# Re-join each book's tokens with spaces, then join the books with newlines
all_text = [" ".join(tokens) for tokens in books_content.values()]
combined_text = "\n".join(all_text)

# Split text into chunks for retrieval: at most 500 characters per chunk,
# with 50 characters of overlap between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
documents = text_splitter.create_documents([combined_text])

# Embed every chunk and build an in-memory FAISS index over the vectors
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
vectorstore = FAISS.from_documents(documents, embeddings)

# Save the FAISS index to the "faiss_index" directory for later use
vectorstore.save_local("faiss_index")
