In [1]:
import pandas as pd
import re
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load the asbabun nuzul spreadsheet into a DataFrame.
# The path is relative, so the file has to be in the notebook's working directory.
# Later cells read the `nuzul` and `surah` columns of this frame.
asbab_nuzul = pd.read_excel('asbabun_nuzul.xlsx')


In [3]:
# Text cleaning function
def clean_text(text):
    """Replace each pair of consecutive newlines with a single space.

    Args:
        text: Raw cell value. Normally a ``str``; missing values (``None`` or
            a float NaN, e.g. an empty spreadsheet cell) and other non-string
            values are tolerated instead of raising ``TypeError``.

    Returns:
        str: The text with every two-newline sequence replaced by one space.
        Missing values yield ``""``; any other non-string value is converted
        with ``str()`` first. Single newlines are left untouched.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    return re.sub("\n\n", " ", text)


In [4]:
# Run clean_text over every entry of the `nuzul` column and keep the
# result in a new `clean` column.
asbab_nuzul['clean'] = asbab_nuzul['nuzul'].apply(clean_text)

In [5]:
# Fetch the NLTK stopword corpus (nothing is re-downloaded when it is already
# up to date). quiet=True suppresses the downloader log, which otherwise writes
# the local nltk_data path, including the OS user name, into the saved output.
nltk.download('stopwords', quiet=True)

[nltk_data] Downloading package stopwords to C:\Users\AINA
[nltk_data]     REDIZO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Load NLTK's Indonesian stopword list.
# NOTE(review): the variable is called `stopword_malay`, but the corpus
# requested is "indonesian" — presumably a stand-in for Malay; confirm intent.
stopword_malay = stopwords.words("indonesian")

In [7]:
# Load the custom stopword list (presumably one word per line).
# dtype=str and keep_default_na=False keep every entry as literal text: with
# pandas' defaults, entries such as "nan", "null" or "NA" are parsed as missing
# values and numeric-looking entries become numbers, so they could never match
# a token.
txt_stopword = pd.read_csv(
    "stopword-list.txt",
    names=["stopwords"],
    header=None,
    dtype=str,
    keep_default_na=False,
)
# remove_stopwords looks tokens up by `word.lower()`, so entries are stored
# stripped and lowercased; empty entries are dropped.
custom_stopwords = set(
    txt_stopword["stopwords"].dropna().str.strip().str.lower()
) - {""}

In [8]:
# Merge the NLTK list and the custom list into one de-duplicated lookup set
all_stopwords = {*stopword_malay, *custom_stopwords}

In [9]:
# Function to remove stopwords
def remove_stopwords(text, stopword_set=None):
    """Drop stopwords from a whitespace-delimited text.

    Args:
        text (str): Text to filter; it is split on runs of whitespace.
        stopword_set: Optional collection of lowercase stopwords. When
            omitted, the notebook-level ``all_stopwords`` set is used, which
            is the original behaviour.

    Returns:
        str: The surviving words, in their original order and casing, joined
        by single spaces.
    """
    if stopword_set is None:
        stopword_set = all_stopwords
    # Membership is tested on the lowercased token; the token itself is kept
    # as written. Tokens are not stripped of punctuation, so "kata." does not
    # match a stopword "kata".
    return ' '.join(
        word for word in text.split() if word.lower() not in stopword_set
    )


In [10]:
# Strip stopwords from the cleaned text.
# Despite its name, the `Stopwords` column holds the text that remains
# *after* stopword removal, not the stopwords themselves.
asbab_nuzul['Stopwords'] = asbab_nuzul['clean'].apply(remove_stopwords)

In [11]:
# Fit a BERTopic model on the stopword-filtered documents.
# fit_transform returns two values, kept here as `topics` and `probs`;
# later cells store `topics` next to each document.
# NOTE(review): no random seed is set in this notebook, so topic assignments
# may differ between runs — confirm whether reproducibility matters here.
# NOTE(review): "all-MiniLM-L6-v2" is loaded again as `model` in a later cell
# and the same documents are encoded a second time there.
docs = asbab_nuzul['Stopwords'].tolist()
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")
topics, probs = topic_model.fit_transform(docs)

In [12]:
# Persist the fitted topic model under the relative path "bertopic_model".
# NOTE(review): no serialization format is passed, so BERTopic's default is
# used — confirm it suits how the model will be reloaded.
topic_model.save("bertopic_model")



In [13]:
# Pair each document with its assigned topic and its surah, then write the
# table to bertopic_results.csv without the index column.
df = pd.DataFrame(
    data={
        "topic": topics,
        "document": docs,
        "surah": asbab_nuzul['surah'],
    }
)
df.to_csv(path_or_buf='bertopic_results.csv', index=False)

In [14]:
# Load the sentence-embedding model used for similarity search.
# This is the same model name that was passed to BERTopic as `embedding_model`.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [15]:
# Encode every stopword-filtered document with the SentenceTransformer model.
# The result is used below as the search corpus for knn_search and is also
# written to CSV together with the topic labels.
document_vectors = model.encode(docs)

In [16]:
# One row per document: the embedding components come first (integer-named
# columns), followed by the topic label, the document text and the surah.
vector_df = pd.DataFrame(document_vectors).assign(
    topic=topics,
    document=docs,
    surah=asbab_nuzul['surah'],
)

In [17]:
# Write the embeddings plus their topic/document/surah metadata to
# document_vectors.csv (relative path), omitting the DataFrame index.
vector_df.to_csv('document_vectors.csv', index=False)

In [18]:
# Function to perform k-NN search
def knn_search(input_keyword, model, document_vectors, docs, surahs, k=2):
    """Return the ``k`` documents most similar to a query by cosine similarity.

    Args:
        input_keyword: Query passed to ``model.encode``.
        model: Object whose ``encode`` method turns the query into a vector
            (a ``SentenceTransformer`` in this notebook).
        document_vectors: 2-D array-like with one row per document; assumed to
            come from the same ``model`` so dimensions match — TODO confirm
            for other callers.
        docs: Documents, indexable by row position of ``document_vectors``.
        surahs: Surah identifiers, indexable by the same positions.
        k (int): Maximum number of results to return.

    Returns:
        list[tuple]: ``(document, surah)`` pairs, most similar first. Holds at
        most ``k`` items, and is empty when ``k <= 0`` or when there are no
        document vectors.
    """
    # Guard first: slicing with `[-k:]` when k == 0 is `[0:]`, i.e. the whole
    # array, so k=0 used to return every document; negative k is meaningless.
    if k <= 0 or len(document_vectors) == 0:
        return []

    query_vector = model.encode(input_keyword)

    # cosine_similarity expects 2-D inputs; wrap the query and take row 0.
    similarities = cosine_similarity([query_vector], document_vectors)[0]

    # argsort is ascending: reverse it for best-first order, then keep k.
    top_k_indices = similarities.argsort()[::-1][:k]

    return [(docs[idx], surahs[idx]) for idx in top_k_indices]

In [19]:
# Demo: retrieve the two passages closest to the query "quran" and print
# them as (document, surah) pairs.
input_keyword = "quran"
top_k_docs = knn_search(
    input_keyword=input_keyword,
    model=model,
    document_vectors=document_vectors,
    docs=docs,
    surahs=asbab_nuzul['surah'].tolist(),
    k=2,
)
print(top_k_docs)

[('Ibnu hibban shahihnya ibnu mardawaih ibnu umar. berkata. turun 261. Rasulullah berdoa."ya Allah. berilah tambahn ummatku\' . turunlah', 2), ('Imam Al-Hakim Aisyah berkata. "Ketika turun ayat. \'Wahai berselimut (Muhammad)! Bangunlah (untuk shalat) malam hari. kecuali kecil.\' (Nabi saw sahabat) shalat malam henti kaki-kaki bengkak. Allah menurunkan 20 surah Al-Muzzammil. \'...karena baclah (bagimu) Al-Qur\'an....\' " Ibnu Jarir hadits diatas Ibnu Abbas lainnya.', 73)]
