In [6]:
!pip install pandas
!pip install numpy
!pip install sentence-transformers faiss-cpu

Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp313-cp313-win_amd64.whl.metadata (5.0 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.15.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting Pillow (from sentence-transformers)
  Using cached pillow-11.2.1-cp313-cp313-win_amd64.whl.metadata (9.1 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence-transformers)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Downloading faiss_cpu-1.11.0-cp313-cp313-win_amd64.whl (15.0 MB)
   -----------------------

In [8]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import faiss
import numpy as np

In [9]:
# Load the ayah table from the working directory. Each row is one ayah and
# carries its English text (`ayah_en`), Arabic text (`ayah_ar`) and
# surah/juz metadata used later when reporting search hits.
df = pd.read_csv('data.csv')

In [10]:
# Preview the first rows to sanity-check the column layout and text fields.
df.head()

Unnamed: 0,surah_no,surah_name_en,surah_name_ar,surah_name_roman,ayah_no_surah,ayah_no_quran,ayah_ar,ayah_en,ruko_no,juz_no,manzil_no,hizb_quarter,total_ayah_surah,total_ayah_quran,place_of_revelation,sajah_ayah,sajdah_no,no_of_word_ayah,list_of_words
0,1,The Opener,الفاتحة,Al-Fatihah,1,1,بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,"In the Name of Allah—the Most Compassionate, M...",1,1,1,1,7,6236,Meccan,False,,4,"[بِسْمِ,ٱللَّهِ,ٱلرَّحْمَٰنِ,ٱلرَّحِيمِ]"
1,1,The Opener,الفاتحة,Al-Fatihah,2,2,ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ,"All praise is for Allah—Lord of all worlds,",1,1,1,1,7,6236,Meccan,False,,4,"[ٱلْحَمْدُ,لِلَّهِ,رَبِّ,ٱلْعَٰلَمِينَ]"
2,1,The Opener,الفاتحة,Al-Fatihah,3,3,ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,"the Most Compassionate, Most Merciful,",1,1,1,1,7,6236,Meccan,False,,2,"[ٱلرَّحْمَٰنِ,ٱلرَّحِيمِ]"
3,1,The Opener,الفاتحة,Al-Fatihah,4,4,مَٰلِكِ يَوْمِ ٱلدِّينِ,Master of the Day of Judgment.,1,1,1,1,7,6236,Meccan,False,,3,"[مَٰلِكِ,يَوْمِ,ٱلدِّينِ]"
4,1,The Opener,الفاتحة,Al-Fatihah,5,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,You ˹alone˺ we worship and You ˹alone˺ we ask ...,1,1,1,1,7,6236,Meccan,False,,4,"[إِيَّاكَ,نَعْبُدُ,وَإِيَّاكَ,نَسْتَعِينُ]"


In [12]:
# Corpus layout: every English translation first, then every Arabic text, so
# positions p and p + len(df) in `texts` both refer to DataFrame row p.
texts = [*df['ayah_en'].tolist(), *df['ayah_ar'].tolist()]

# Sentence encoder shared by both halves of the corpus.
# NOTE(review): this checkpoint was originally tagged "English" here, yet the
# Arabic column is embedded with it as well - confirm it represents Arabic
# text adequately.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


In [13]:
# Embed every entry of `texts` with the sentence encoder. This is the slow
# step of the notebook: the recorded run needed about six minutes (390 batches).
embeddings = model.encode(texts, show_progress_bar=True)


Batches: 100%|███████████████████████████████████████████████████████████████████████| 390/390 [05:56<00:00,  1.09it/s]


In [14]:
# Build a flat L2 FAISS index whose vector width matches the embeddings.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
# Vectors are added in `texts` order; search_ayah relies on that ordering when
# it folds an index position back onto a DataFrame row.
index.add(np.array(embeddings))

In [19]:
def search_ayah(query, top_k=5):
    """Return the ayahs whose indexed text is closest to ``query``.

    The query is embedded with the notebook-level ``model`` and looked up in
    the notebook-level FAISS ``index``. The index holds the English texts of
    ``df`` followed by its Arabic texts, so a hit position is folded back onto
    its DataFrame row with ``position % len(df)``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of neighbours requested from the index.

    Returns:
        A list of at most ``top_k`` dicts, in the order ``index.search``
        returns them, each with the keys ``surah_name_en``,
        ``surah_name_roman``, ``surah_no``, ``ayah_no``, ``juz``, ``ayah_en``,
        ``ayah_ar`` and ``confidence``. ``confidence`` is ``1 / (1 + score)``,
        which maps a non-negative distance into (0, 1]; it is only meaningful
        while ``index`` reports distances (as the L2 index does), not
        similarities.
    """
    query_embedding = model.encode([query])
    distances, indices = index.search(np.array(query_embedding), top_k)

    n_rows = len(df)
    results = []
    for score, i in zip(distances[0], indices[0]):
        # FAISS pads the result with label -1 when it finds fewer than top_k
        # neighbours; without this guard `-1 % n_rows` would silently report
        # the last row of df as a hit.
        if i < 0:
            continue

        # English and Arabic copies of a row sit n_rows apart in the index.
        row = df.iloc[i % n_rows]
        results.append({
            "surah_name_en": row['surah_name_en'],
            "surah_name_roman": row['surah_name_roman'],
            "surah_no": row['surah_no'],
            "ayah_no": row['ayah_no_surah'],
            "juz": row['juz_no'],
            "ayah_en": row['ayah_en'],
            "ayah_ar": row['ayah_ar'],
            "confidence": float(1 / (1 + score))  # distance -> (0, 1]
        })

    return results


In [28]:
# Example lookup with an Arabic query; each hit is printed as a dict.
# The literal carries no trailing whitespace, so exactly this text is embedded.
query = "بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ"
results = search_ayah(query)
for r in results:
    print(r)


{'surah_name_en': 'The Opener', 'surah_name_roman': 'Al-Fatihah', 'surah_no': np.int64(1), 'ayah_no': np.int64(1), 'juz': np.int64(1), 'ayah_en': 'In the Name of Allah—the Most Compassionate, Most Merciful.', 'ayah_ar': 'بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ', 'confidence': 1.0}
{'surah_name_en': 'The Fig', 'surah_name_roman': 'At-Tin', 'surah_no': np.int64(95), 'ayah_no': np.int64(1), 'juz': np.int64(30), 'ayah_en': 'By the fig and the olive ˹of Jerusalem˺,', 'ayah_ar': 'بِّسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ وَٱلتِّينِ وَٱلزَّيْتُونِ', 'confidence': 0.965474009513855}
{'surah_name_en': 'The Ant', 'surah_name_roman': 'An-Naml', 'surah_no': np.int64(27), 'ayah_no': np.int64(30), 'juz': np.int64(19), 'ayah_en': 'It is from Solomon, and it reads: ‘In the Name of Allah—the Most Compassionate, Most Merciful.', 'ayah_ar': 'إِنَّهُۥ مِن سُلَيْمَٰنَ وَإِنَّهُۥ بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ', 'confidence': 0.7217244505882263}
{'surah_name_en': 'The Troops', 'surah_name_roman': 'Az-Z

In [30]:
# build_model.py
# Build step: embed every ayah (English + Arabic), index the unit-normalised
# vectors for cosine-similarity search, and persist the artefacts to disk.
#
# NOTE(review): when executed inside the notebook this rebinds `df`, `texts`,
# `model`, `embeddings` and `index`. `index` becomes an inner-product index,
# so the distance-based confidence in `search_ayah` no longer applies to it.
import pickle

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

df = pd.read_csv("data.csv")

# English block first, Arabic block second: one shared index serves both.
texts = [*df['ayah_en'].tolist(), *df['ayah_ar'].tolist()]

# Encode the whole corpus into a NumPy matrix.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)

# Normalise in place so that inner product equals cosine similarity.
faiss.normalize_L2(embeddings)

# Inner-product index over the normalised vectors.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# Persist the index, then the DataFrame together with the encoder identifier.
faiss.write_index(index, "quran_faiss.index")
with open("quran_model.pkl", "wb") as f:
    pickle.dump(
        dict(df=df, model_path="sentence-transformers/all-MiniLM-L6-v2"),
        f,
    )


Batches: 100%|███████████████████████████████████████████████████████████████████████| 390/390 [05:44<00:00,  1.13it/s]
