> **Ibrahim AbuAlhaol**, Ottawa, Canada 

## Doanload Data

In [121]:
import os

import warnings
warnings.filterwarnings("ignore")

if not os.path.exists("data"):
    os.makedirs("data")

  
if not os.path.exists("models"):
    os.makedirs("models")
    
if not os.path.exists("data/Textfiles/EnglishText"):
    os.makedirs("data/Textfiles/EnglishText")
    
if not os.path.exists("data/Textfiles/ArabicText"):
    os.makedirs("data/Textfiles/ArabicText")
    
    
if not os.path.exists("data/Quran_English.csv"):
    !wget -O data/Quran_English.csv https://raw.githubusercontent.com/UBISOFT-1/Quran_Module/master/Quran_English.csv
    
if not os.path.exists("data/Quran_Arabic.csv"):
    !wget  -O data/Quran_Arabic.csv https://raw.githubusercontent.com/UBISOFT-1/Quran_Module/master/Quran_Arabic.csv

## Explore data

In [109]:
import pandas as pd

In [110]:
Quran_Arabic_DF=pd.read_csv('data/Quran_Arabic.csv')
Quran_English_DF=pd.read_csv('data/Quran_English.csv')


In [111]:
Quran_English_DF.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6236 entries, 0 to 6235
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   DatabaseID  6236 non-null   int64 
 1   SuraID      6236 non-null   int64 
 2   VerseID     6236 non-null   int64 
 3   AyahText    6236 non-null   object
dtypes: int64(3), object(1)
memory usage: 195.0+ KB


In [112]:
Quran_English_DF.head(3)

Unnamed: 0,DatabaseID,SuraID,VerseID,AyahText
0,59,1,1,"In the name of Allah, Most Gracious, Most Merc..."
1,59,1,2,"Praise be to Allah, the Cherisher and Sustaine..."
2,59,1,3,"Most Gracious, Most Merciful;"


In [113]:
Quran_Arabic_DF.head(3)

Unnamed: 0,DatabaseID,SuraID,VerseID,AyahText
0,1,1,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ
1,1,1,2,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ
2,1,1,3,الرَّحْمَٰنِ الرَّحِيمِ


## Store data into text files 

In [114]:
import pandas as pd
path='data/Textfiles/EnglishText/'
# Iterate through the rows of the DataFrame
for index, row in Quran_English_DF.iterrows():
    # Get the text from the "texts" column
    text = row["AyahText"]
    name= f"{path}english_{row['SuraID']}_{row['VerseID']}.txt"
    # Write the text to a file named by the row number with a ".txt" extension
    with open(name, "w") as file:
        file.write(text)

In [115]:
import pandas as pd
path='data/Textfiles/ArabicText/'
# Iterate through the rows of the DataFrame
for index, row in Quran_Arabic_DF.iterrows():
    # Get the text from the "texts" column
    text = row["AyahText"]
    name= f"{path}arabic_{row['SuraID']}_{row['VerseID']}.txt"
    # Write the text to a file named by the row number with a ".txt" extension
    with open(name, "w") as file:
        file.write(text)

In [117]:
import os
import glob
import re
from gensim import corpora, models
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


# Function to preprocess text
def clean_text(text):
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    text = re.sub(r"\s+", " ", text)
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

# Folder containing the text files
folder = 'data/Textfiles/EnglishText'

# Get the list of text files
files = glob.glob(os.path.join(folder, '*.txt'))

# Read the contents of each file
texts = []
for file in files:
    with open(file, 'r') as f:
        texts.append(f.read())

texts=texts[1:100]
# Preprocess the text of each file
processed_texts = [clean_text(text) for text in texts]

# Create a list of lists of words (i.e. list of sentences)
sentences = [text.split() for text in processed_texts]

# do you need to run 2-Gram model 
Grams_option=False

if Grams_option:
    # Create 2-grams
    bigram = Phrases(sentences, min_count=1, threshold=1)
    bigram_model = Phraser(bigram)

    # Apply the 2-grams to the sentences
    bigram_sentences = bigram_model[sentences]

    sentences= bigram_sentences


# Create a dictionary from the 2-grams sentences
dictionary = corpora.Dictionary(sentences)

# Create a bag-of-words representation of the texts
bow_corpus = [dictionary.doc2bow(sentence) for sentence in sentences]

# Train the LDA model on the corpus
lda_model = models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary)
lda_model.save('models/lda_english.mdl')

lda_model=models.LdaModel.load('models/lda_english.mdl')

# Print the topics
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# feed the LDA model into the pyLDAvis instance
lda_viz = gensimvis.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.save_html(lda_viz, 'models/lda_vis.html')

# Model visulization 

lda_viz


Topic: 0 
Words: 0.031*"ye" + 0.030*"people" + 0.018*"right" + 0.013*"felicity" + 0.013*"quot" + 0.013*"get" + 0.012*"wrong" + 0.012*"forbidding" + 0.012*"enjoining" + 0.012*"allah"
Topic: 1 
Words: 0.055*"ye" + 0.026*"fear" + 0.017*"able" + 0.017*"deal" + 0.017*"shall" + 0.017*"justly" + 0.017*"one" + 0.009*"right" + 0.009*"marry" + 0.009*"two"
Topic: 2 
Words: 0.058*"quot" + 0.021*"pharaoh" + 0.018*"command" + 0.012*"said" + 0.009*"forth" + 0.008*"ye" + 0.008*"work" + 0.008*"believe" + 0.008*"thy" + 0.008*"scripture"
Topic: 3 
Words: 0.084*"allah" + 0.029*"quot" + 0.018*"grace" + 0.015*"earth" + 0.015*"power" + 0.014*"ye" + 0.011*"another" + 0.011*"believe" + 0.011*"helpers" + 0.011*"book"
Topic: 4 
Words: 0.051*"quot" + 0.023*"allah" + 0.014*"said" + 0.011*"would" + 0.011*"confederates" + 0.011*"witness" + 0.010*"ye" + 0.010*"say" + 0.009*"think" + 0.009*"word"
Topic: 5 
Words: 0.045*"allah" + 0.025*"earth" + 0.019*"lord" + 0.018*"made" + 0.016*"heavens" + 0.010*"angels" + 0.010*"re

- The visualization shows a scatter plot of the topics, where each point represents a topic and the x and y axes represent the topic's two main dimensions. 

- The points are labeled with the top terms of the topic, and the size of the point corresponds to the topic's prevalence in the corpus. 

- You can also select a term in the right-side panel to see where it appears in the topics, and select a topic to see the top terms of that topic.


## Putting the model into class

In [122]:
class LDA_Model:
    def __init__(self, folder, num_topics, grams_option):
        self.folder = folder
        self.num_topics = num_topics
        self.grams_option = grams_option
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
        text = re.sub(r"\s+", " ", text)
        text = " ".join([word for word in text.split() if word not in self.stop_words])
        return text

    def read_files(self):
        files = glob.glob(os.path.join(self.folder, '*.txt'))
        texts = []
        for file in files:
            with open(file, 'r') as f:
                texts.append(f.read())
        return texts

    def preprocess_texts(self, texts):
        processed_texts = [self.clean_text(text) for text in texts]
        sentences = [text.split() for text in processed_texts]
        if self.grams_option:
            bigram = Phrases(sentences, min_count=1, threshold=1)
            bigram_model = Phraser(bigram)
            bigram_sentences = bigram_model[sentences]
            sentences = bigram_sentences
        return sentences

    def train_model(self, sentences):
        dictionary = corpora.Dictionary(sentences)
        bow_corpus = [dictionary.doc2bow(sentence) for sentence in sentences]
        lda_model = models.LdaModel(bow_corpus, num_topics=self.num_topics, id2word=dictionary)
        return lda_model, bow_corpus, dictionary

    def save_model(self, lda_model, bow_corpus, dictionary):
        lda_model.save('models/lda.mdl')
        lda_viz = gensimvis.prepare(lda_model, bow_corpus, dictionary)
        pyLDAvis.save_html(lda_viz, 'models/lda_vis.html')
        return lda_viz

# create object of the class
lda_model = LDA_Model(folder='data/Textfiles/EnglishText', num_topics=5, grams_option=True)

# read and preprocess texts
texts = lda_model.read_files()

# how many verses you would like to consider 
#texts=texts[1:1000]
sentences = lda_model.preprocess_texts(texts)

# train the model and save it
lda, bow_corpus, dictionary = lda_model.train_model(sentences)
lda_viz=lda_model.save_model(lda, bow_corpus, dictionary)

In [123]:
lda_viz