 ## بسم الله الرّحمٰن الرّحيم 

# Theory of Concentrism in the Qur'an using Genetic Algorithm with CL-AraBERT Embeddings

_by Al-Ahmadgaid B. Asaad_

#### Install Python Libraries

There are Julia's equivalent for the following Python libraries, but for this paper the author decided to use the official one which is in Python.

In [11]:
using Pkg

ENV["PYTHON"]="" # necessary for Conda.pip
Pkg.build("PyCall")

[32m[1m    Building[22m[39m Conda ─→ `~/.julia/scratchspaces/44cfe95a-1eb2-52ea-b672-e2afdf69b78f/b19db3927f0db4151cb86d073689f2428e524576/build.log`
[32m[1m    Building[22m[39m PyCall → `~/.julia/scratchspaces/44cfe95a-1eb2-52ea-b672-e2afdf69b78f/9816a3826b0ebf49ab4926e2b18842ad8b5c8f04/build.log`


In [12]:
using Conda

Conda.pip_interop(true)
Conda.pip("install", "sentence-transformers")
Conda.pip("install", "umap-learn")

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `conda config --set pip_interop_enabled true --file /Users/al-ahmadgaidasaad/.julia/conda/3/aarch64/condarc-julia.yml` in root environment
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `pip install sentence-transformers` in root environment




[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `pip install umap-learn` in root environment




#### Load Libraries

In [1]:
using CairoMakie
using Clustering
using DataFrames
using Distributions
using Distances
using PyCall
using QuranTree
using Statistics
using Yunir

import Base: show

In [2]:
# load the python libraries
sentence_transformers = pyimport("sentence_transformers")
umap_py = pyimport("umap.umap_")
UMAP = umap_py.UMAP

PyObject <class 'umap.umap_.UMAP'>

#### Load CL-Arabert Embedding Model

In [3]:
model_path = "/Users/al-ahmadgaidasaad/Documents/School/Islamic Studies/ma-thesis-codes/models/CL-Arabert"
emodel = sentence_transformers.SentenceTransformer(model_path);

No sentence-transformers model found with name /Users/al-ahmadgaidasaad/Documents/School/Islamic Studies/ma-thesis-codes/models/CL-Arabert. Creating a new one with mean pooling.


#### Load Qur'an Data

In [4]:
_, tnzl = load(QuranData());
tnzl_tbl = table(tnzl)

Tanzil Quran Text (Uthmani)
(C) 2008-2010 Tanzil.net

[1m6236×3 DataFrame[0m
[1m  Row [0m│[1m chapter [0m[1m verse [0m[1m form                              [0m
      │[90m Int64   [0m[90m Int64 [0m[90m String                            [0m
──────┼───────────────────────────────────────────────────
    1 │       1      1  بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
    2 │       1      2  ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ
    3 │       1      3  ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
    4 │       1      4  مَٰلِكِ يَوْمِ ٱلدِّينِ
    5 │       1      5  إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ
    6 │       1      6  ٱهْدِنَا ٱلصِّرَٰطَ ٱلْمُسْتَقِيمَ
    7 │       1      7  صِرَٰطَ ٱلَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ ٱلْمَغْضُو…
    8 │       2      1  بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ الٓمٓ
    9 │       2      2  ذَٰلِكَ ٱلْكِتَٰبُ لَا رَيْبَ فِيهِ هُدًى لِّلْمُتَّقِينَ
   10 │       2      3  ٱلَّذِينَ يُؤْمِنُونَ بِٱلْغَيْبِ وَيُقِيمُونَ ٱلصَّل…
   11 │       2      4  وَٱ

In [5]:
"""
Slicer Configuration

params:
    num_slices - number of slices
    var_slices - variability of slices, parameter for Dirichlet distribution
"""
struct Slicer
    num_slices::Int64
    var_slices::Union{Float64,Vector{Float64}}
end

Slicer

In [6]:
"""
Ayah Embeddings Type

params:
    num_slices - number of slices
    var_slices - variability of slices, parameter for Dirichlet distribution
"""
struct AyahEmbeddings{T <: Union{Float32,Float64}}
    emb::Matrix{T} 
end

function show(io::IO, ::MIME"text/plain", ae::AyahEmbeddings{T}) where T
    print(io, "AyahEmbeddings{$T), ")
    show(io, MIME("text/plain"), ae.emb)
end

show (generic function with 710 methods)

In [7]:
ayahs_ver = verses(tnzl_tbl[2]);
ayahs_emb = AyahEmbeddings(emodel.encode(ayahs_ver))

AyahEmbeddings{Float32), 286×768 Matrix{Float32}:
 0.531018  -0.132496    -0.756551   …  -0.305054   0.989819   0.201078
 0.784264  -0.187889     0.303152       0.0520253  0.646556   0.616321
 0.677887  -0.778916    -0.807045      -0.0719035  1.12943    0.859909
 0.506211  -0.325038    -0.502814       0.459864   1.43758    0.466729
 0.269263  -0.194165    -0.539852      -0.0628042  0.77668    0.554814
 0.467806  -0.373838    -0.0348215  …   0.392261   1.14069    0.293043
 0.886688  -0.527031    -0.530024      -0.0749496  0.983624   0.308669
 0.60995   -0.301923    -0.17381        0.103315   0.984032   0.482254
 0.412077  -0.309258    -0.512169       0.890515   0.994584   0.67909
 0.996288  -0.699789    -0.477904      -0.24042    0.772055   0.535724
 0.123426  -0.530353     0.13315    …   0.0387213  0.998667   0.42128
 0.153852   0.00949113  -0.474826       0.17242    0.753725  -0.188628
 0.563945  -0.655017    -0.349274      -0.40291    1.35588    0.388349
 ⋮                           

In [8]:
"""
Slices Generator

params:
    n - number of samples of slices to generate
    slicer - Slicer configuration
    ayahs_emb - the embeddings to slice
"""
function gen_slices(ayahs_emb::AyahEmbeddings{T}, n::Integer, slicer::Slicer)::Vector{Vector{AyahEmbeddings{T}}} where T <: Union{Float32,Float64}
    ayahs_emb = ayahs_emb.emb
    if slicer.var_slices isa Float64
        dir_samples = rand(Dirichlet(repeat([slicer.var_slices], slicer.num_slices - 1)), n)    
    else
        if slicer.num_slices != length(slicer.var_slices)
            error("Slicer.num_slices should be equal to length of Slicer.var_slices")
        else
            dir_samples = rand(Dirichlet(slicer.var_slices), n)
        end
    end

    # proportion the ayahs into slices using Dirichlet samples
    midpoints = Int64.(floor.(size(ayahs_emb)[1] .* dir_samples))
    midpoints = mapslices(sort, midpoints, dims=1)
    midpoints = unique(midpoints, dims=2) # drop any duplicate samples
    
    slices = Vector{AyahEmbeddings{<:Union{Float32,Float64}}}[]
    mp_size = size(midpoints)
    for j in 1:mp_size[2]
        slice = AyahEmbeddings{<:Union{Float32,Float64}}[]
        for i in 1:mp_size[1]
            if i == 1
                if midpoints[i, j] == 0
                    push!(slice, ayahs_emb[1:1, :] |> AyahEmbeddings)
                else
                    push!(slice, ayahs_emb[1:midpoints[i, j],:] |> AyahEmbeddings)
                end
            elseif i < mp_size[1]
                if midpoints[i-1, j] == midpoints[i, j]
                    push!(slice, ayahs_emb[(midpoints[i-1, j]+1):midpoints[i, j]+2,:] |> AyahEmbeddings)
                else
                    push!(slice, ayahs_emb[(midpoints[i-1, j]+1):midpoints[i, j],:] |> AyahEmbeddings)
                end
            else
                if midpoints[i-1, j] == midpoints[i, j]
                    push!(slice, ayahs_emb[(midpoints[i-1, j]+1):midpoints[i, j]+2,:] |> AyahEmbeddings)
                    push!(slice, ayahs_emb[(midpoints[i, j]+1):end,:] |> AyahEmbeddings)
                else
                    push!(slice, ayahs_emb[(midpoints[i-1, j]+1):midpoints[i, j],:] |> AyahEmbeddings)
                    push!(slice, ayahs_emb[(midpoints[i, j]+1):end,:] |> AyahEmbeddings)
                end
            end
        end
        push!(slices, slice)
    end
    return slices
end

gen_slices

In [9]:
"""
Compute Five Number Summary

five_summary(v::Vector{T}) where T<:Union{Float32,Float64}
params:
    v - data (e.g. embeddings)

five_summary(slices::Vector{Vector{Matrix{T}}}) where T<:Union{Float32,Float64}
params:
    slices - slices of data (e.g. ayah embeddings)
"""
function five_summary(v::Vector{T})::Vector{T} where T <: Union{Float32,Float64}
    sv = sort(v)

    min = minimum(sv)
    q1 = quantile(sv, 0.25)
    med = median(sv)
    q3 = quantile(sv, 0.5)
    max = maximum(sv)

    return [min, q1, med, q3, max]
end

function five_summary(slices::Vector{Vector{AyahEmbeddings{T}}})::Vector{Vector{AyahEmbeddings{T}}} where T <: Union{Float32,Float64}
    five_nums = Vector{AyahEmbeddings{<:Union{Float32,Float64}}}[]
    for slice in slices
        five_num = AyahEmbeddings{<:Union{Float32,Float64}}[]
        for i in 1:size(slice)[1]
            push!(five_num, mapslices(five_summary, slice[i].emb, dims=1) |> AyahEmbeddings)
        end
        push!(five_nums, five_num)
    end
    return five_nums
end

five_summary (generic function with 2 methods)

In [15]:
struct AyahDistances{T <: Union{Float32,Float64}}
    dist::Vector{T}
end

function show(io::IO, ::MIME"text/plain", ae::AyahDistances{T}) where T
    print(io, "AyahDistances{$T}, ")
    show(io, MIME("text/plain"), ae.dist)
end

show (generic function with 711 methods)

In [16]:
"""
Circular-wise Computation of the Distance of Slices

params:
    five_nums - five number summaries
    slicer - Slicer configuration
    dist - a Distances UnionSemiMetric
"""
function distance(five_nums::Vector{Vector{AyahEmbeddings{T}}}, slicer::Slicer, dist::Distances.UnionSemiMetric)::AyahDistances{Float64} where T <: Union{Float32,Float64}
    med_idx = Int64(median(1:slicer.num_slices))
    costs = Float64[]
    for five_num in five_nums
        cost = Float64[]
        for i in 1:(med_idx-1)
            ring_dist = sum(abs.(colwise(dist, five_num[i].emb, five_num[end-i+1].emb)))
            cen_lower = sum(abs.(colwise(dist, five_num[med_idx].emb, five_num[i].emb)))
            cen_upper = sum(abs.(colwise(dist, five_num[med_idx].emb, five_num[end-i+1].emb)))
            
            push!(cost, ring_dist)
            push!(cost, cen_lower)
            push!(cost, cen_upper)
        end
        push!(costs, sum(cost))
    end
    return AyahDistances(costs)
end

distance

In [17]:
slicer = Slicer(7, 1.5)
slices = gen_slices(ayahs_emb, 10, slicer)
slices[1][1]

AyahEmbeddings{Float32), 1×768 Matrix{Float32}:
 0.531018  -0.132496  -0.756551  0.111261  …  -0.305054  0.989819  0.201078

In [18]:
five_nums = five_summary(slices)
five_nums[1][1]

AyahEmbeddings{Float32), 5×768 Matrix{Float32}:
 0.531018  -0.132496  -0.756551  0.111261  …  -0.305054  0.989819  0.201078
 0.531018  -0.132496  -0.756551  0.111261     -0.305054  0.989819  0.201078
 0.531018  -0.132496  -0.756551  0.111261     -0.305054  0.989819  0.201078
 0.531018  -0.132496  -0.756551  0.111261     -0.305054  0.989819  0.201078
 0.531018  -0.132496  -0.756551  0.111261     -0.305054  0.989819  0.201078

### Computing Distances

In [20]:
ayah_dist = distance(five_nums, slicer, ChiSqDist())

AyahDistances{Float64}, 10-element Vector{Float64}:
 25864.22802734375
 12482.024536132812
 29138.103912353516
 14739.898986816406
 11997.844451904297
  7471.916107177734
 12075.943359375
  8517.262390136719
  8849.407196044922
 16113.497344970703

### Sampling Parents

In [15]:
function sample(ayah_emb::AyahEmbeddings, n::Integer, w::AyahDistances)
    ayah_emb = ayah_emb.emb
    
end

LoadError: ParseError:
[90m# Error @ [0;0m]8;;file:///Users/al-ahmadgaidasaad/Documents/School/Islamic Studies/ma-thesis-codes/In[15]#1:58\[90mIn[15]:1:58[0;0m]8;;\
function sample(ayah_emb::AyahEmbeddings, n::Integer, w::[48;2;120;70;70m)[0;0m
[90m#                                                        ╙ ── [0;0m[91munexpected `)`[0;0m

In [59]:
med_idx = Int64(median(1:num_slices))
costs = Float64[]
for fivenum in fivenums
    cost = Float64[]
    for i in 1:(med_idx-1)
        ring_dist = sum(abs.(colwise(ChiSqDist(), fivenum[i], fivenum[end-i+1])))
        cen_lower = sum(abs.(colwise(ChiSqDist(), fivenum[med_idx], fivenum[i])))
        cen_upper = sum(abs.(colwise(ChiSqDist(), fivenum[med_idx], fivenum[end-i+1])))
        
        push!(cost, ring_dist)
        push!(cost, cen_lower)
        push!(cost, cen_upper)
    end
    push!(costs, sum(cost))
end

LoadError: UndefVarError: `num_slices` not defined in `Main`
Suggestion: check for spelling errors or missing imports.

In [10]:
dir_samples = rand(Dirichlet(repeat([1.5], num_slices - 1)), 10_000)

6×10000 Matrix{Float64}:
 0.148985   0.0532617  0.40407    …  0.104077   0.165932   0.121923
 0.253078   0.381313   0.164189      0.283156   0.0857659  0.0269265
 0.0768869  0.0457845  0.028863      0.302439   0.167866   0.119631
 0.326921   0.0946558  0.154129      0.103038   0.0473106  0.259263
 0.0950957  0.391862   0.0822693     0.0398453  0.336879   0.439312
 0.0990342  0.0331232  0.166479   …  0.167444   0.196247   0.032944

In [11]:
midpoints = Int64.(floor.(size(ayahs)[1] .* dir_samples))
midpoints = mapslices(sort, midpoints, dims=1)
midpoints = unique(midpoints, dims=2)

6×9997 Matrix{Int64}:
 21    9    8   10    5    3  33  14  …    1   7    4   2    6  11  13    7
 27   13   23   15   11   19  38  22       8  28   26  16   15  29  24    9
 28   15   44   34   23   29  46  33      16  32   37  35   32  29  47   34
 42   27   46   35   48   41  47  62      16  61   39  72   35  47  48   34
 72  109   47   75   68   51  54  74      85  69   62  74   64  80  56   74
 93  112  115  112  128  140  65  78  …  156  85  115  84  132  86  96  125

In [12]:
slices = Vector{Matrix{Float32}}[]
mp_size = size(midpoints)
for j in 1:mp_size[2]
    slice = Matrix{Float32}[]
    for i in 1:mp_size[1]
        if i == 1
            if midpoints[i, j] == 0
                push!(slice, ayahs[1:1, :])
            else
                push!(slice, ayahs[1:midpoints[i, j],:])
            end
        elseif i < mp_size[1]
            if midpoints[i-1, j] == midpoints[i, j]
                push!(slice, ayahs[(midpoints[i-1, j]+1):midpoints[i, j]+2,:])
            else
                push!(slice, ayahs[(midpoints[i-1, j]+1):midpoints[i, j],:])
            end
        else
            if midpoints[i-1, j] == midpoints[i, j]
                push!(slice, ayahs[(midpoints[i-1, j]+1):midpoints[i, j]+2,:])
                push!(slice, ayahs[(midpoints[i, j]+1):end,:])
            else
                push!(slice, ayahs[(midpoints[i-1, j]+1):midpoints[i, j],:])
                push!(slice, ayahs[(midpoints[i, j]+1):end,:])
            end
        end
    end
    push!(slices, slice)
end

In [13]:
slices[1][1]

21×768 Matrix{Float32}:
 0.531018  -0.132496    -0.756551   …  -0.305054   0.989819   0.201078
 0.784264  -0.187889     0.303152       0.0520253  0.646556   0.616321
 0.677887  -0.778916    -0.807045      -0.0719035  1.12943    0.859909
 0.506211  -0.325038    -0.502814       0.459864   1.43758    0.466729
 0.269263  -0.194165    -0.539852      -0.0628042  0.77668    0.554814
 0.467806  -0.373838    -0.0348215  …   0.392261   1.14069    0.293043
 0.886688  -0.527031    -0.530024      -0.0749496  0.983624   0.308669
 0.60995   -0.301923    -0.17381        0.103315   0.984032   0.482254
 0.412077  -0.309258    -0.512169       0.890515   0.994584   0.67909
 0.996288  -0.699789    -0.477904      -0.24042    0.772055   0.535724
 0.123426  -0.530353     0.13315    …   0.0387213  0.998667   0.42128
 0.153852   0.00949113  -0.474826       0.17242    0.753725  -0.188628
 0.563945  -0.655017    -0.349274      -0.40291    1.35588    0.388349
 0.535295  -0.527559    -0.118389      -0.281729   1.32

### Summarizing Embeddings

In [14]:
function quantile_summary(v::Vector)
    sv = sort(v)

    min = minimum(sv)
    q1 = quantile(sv, 0.25)
    med = median(sv)
    q3 = quantile(sv, 0.5)
    max = maximum(sv)

    return [min, q1, med, q3, max]
end

quantile_summary (generic function with 1 method)

In [15]:
fivenums = Vector{Matrix{Float32}}[]
for slice in slices
    fivenum = Matrix{Float32}[]
    for i in 1:size(slice)[1]
        push!(fivenum, mapslices(quantile_summary, slice[i], dims=1))
    end
    push!(fivenums, fivenum)
end

### Computing Distances

In [16]:
med_idx = Int64(median(1:num_slices))
costs = Float64[]
for fivenum in fivenums
    cost = Float64[]
    for i in 1:(med_idx-1)
        ring_dist = sum(abs.(colwise(ChiSqDist(), fivenum[i], fivenum[end-i+1])))
        cen_lower = sum(abs.(colwise(ChiSqDist(), fivenum[med_idx], fivenum[i])))
        cen_upper = sum(abs.(colwise(ChiSqDist(), fivenum[med_idx], fivenum[end-i+1])))
        
        push!(cost, ring_dist)
        push!(cost, cen_lower)
        push!(cost, cen_upper)
    end
    push!(costs, sum(cost))
end

In [17]:
costs

9997-element Vector{Float64}:
  11677.102172851562
  11203.320617675781
  29691.85430908203
  38849.152587890625
   5972.433624267578
 165887.71115112305
  22161.464385986328
   8001.195373535156
  16007.111450195312
  11884.476165771484
   9487.483123779297
  13865.978698730469
  13942.094940185547
      ⋮
   9917.428771972656
  26086.905029296875
  14108.519622802734
   8135.8023681640625
  49250.58459472656
   9383.729797363281
  37451.48733520508
 276468.17462158203
  17038.392974853516
 160143.00164794922
  45076.55291748047
  23954.095764160156

### Sampling Parents

In [24]:
k = 10
idx = rand(DiscreteUniform(1, size(ayahs)[1]), k)


10-element Vector{Int64}:
 111
 174
 135
 196
 231
 213
 281
   5
 196
 191

### Mutation

In [18]:
sum(abs.(colwise(ChiSqDist(), fivenums[1][2], fivenums[1][end-i-1])))

LoadError: UndefVarError: `i` not defined in `Main`
Suggestion: check for spelling errors or missing imports.

In [19]:
sum(abs.(colwise(ChiSqDist(), fivenums[1][med_idx], fivenums[1][2])))

916.2252f0

In [55]:
fivenums[1][end-i-1]

5×768 Matrix{Float32}:
 0.0219057  -0.87156    -0.971374   …  -0.585858   0.621963  -0.275423
 0.330805   -0.395003   -0.511762      -0.0990097  0.931669   0.205508
 0.662539   -0.284216   -0.335894       0.0457841  1.12092    0.377689
 0.662539   -0.284216   -0.335894       0.0457841  1.12092    0.377689
 1.07862    -0.0962282   0.0669256      0.429768   1.67765    0.726807

In [56]:
fivenums[1][2]

5×768 Matrix{Float32}:
 0.412077  -0.527031  -0.530024   …  -0.0749496  0.983624  0.293043
 0.453874  -0.412136  -0.516633       0.0587488  0.98393   0.304762
 0.538878  -0.341548  -0.34299        0.247788   0.989308  0.395461
 0.538878  -0.341548  -0.34299        0.247788   0.989308  0.395461
 0.886688  -0.301923  -0.0348215      0.890515   1.14069   0.67909

In [63]:
fivenums[1][end-i]

5×768 Matrix{Float32}:
 -0.306053  -0.675701   -0.968244   …  -0.740139   0.420536  0.0429858
  0.530863  -0.421813   -0.610742      -0.209497   0.9515    0.320158
  0.657967  -0.376442   -0.424982      -0.0615468  1.16018   0.408021
  0.657967  -0.376442   -0.424982      -0.0615468  1.16018   0.408021
  1.19378    0.0773699   0.0711554      0.654553   1.58278   0.753576

In [62]:
fivenums[1]

7-element Vector{Matrix{Float32}}:
 [0.269263 -0.7789163 … 0.64655566 0.20107773; 0.50621146 -0.3250376 … 0.7766804 0.4667288; … ; 0.5310184 -0.19416487 … 0.9898191 0.5548137; 0.784264 -0.13249622 … 1.4375801 0.8599088]
 [0.41207725 -0.52703136 … 0.98362446 0.2930433; 0.45387352 -0.41213608 … 0.9839301 0.30476224; … ; 0.53887784 -0.34154764 … 0.98930806 0.39546126; 0.8866878 -0.30192348 … 1.1406853 0.67909]
 [0.12342641 -0.8578767 … 0.668599 -0.1886282; 0.48168612 -0.5319276 … 0.8003974 0.32322973; … ; 0.5983705 -0.37594244 … 1.0017582 0.4692089; 0.9962884 0.035326615 … 1.3602252 0.86905515]
 [0.021905743 -0.8715603 … 0.62196314 -0.27542344; 0.33080453 -0.39500266 … 0.9316688 0.20550822; … ; 0.662539 -0.28421557 … 1.1209154 0.37768933; 1.0786216 -0.09622821 … 1.6776503 0.7268074]
 [-0.306053 -0.6757015 … 0.42053604 0.042985797; 0.53086317 -0.4218128 … 0.95150036 0.32015765; … ; 0.6579674 -0.37644216 … 1.160177 0.40802145; 1.1937826 0.077369876 … 1.5827835 0.75357646]
 [0.2113618 -0.450