 ## بسم الله الرّحمٰن الرّحيم 

# Theory of Concentrism in the Qur'an using Bayesian Optimization & Large Language Model

_by Al-Ahmadgaid B. Asaad_

#### Install Python Libraries

There are Julia's equivalent for the following Python libraries, but for this paper the author decided to use the official one which is in Python.

In [11]:
using Pkg

ENV["PYTHON"]="" # necessary for Conda.pip
Pkg.build("PyCall")

[32m[1m    Building[22m[39m Conda ─→ `~/.julia/scratchspaces/44cfe95a-1eb2-52ea-b672-e2afdf69b78f/b19db3927f0db4151cb86d073689f2428e524576/build.log`
[32m[1m    Building[22m[39m PyCall → `~/.julia/scratchspaces/44cfe95a-1eb2-52ea-b672-e2afdf69b78f/9816a3826b0ebf49ab4926e2b18842ad8b5c8f04/build.log`


In [12]:
using Conda

Conda.pip_interop(true)
Conda.pip("install", "sentence-transformers")
Conda.pip("install", "umap-learn")

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `conda config --set pip_interop_enabled true --file /Users/al-ahmadgaidasaad/.julia/conda/3/aarch64/condarc-julia.yml` in root environment
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `pip install sentence-transformers` in root environment




[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `pip install umap-learn` in root environment




#### Load Libraries

In [2]:
using BOSS
using Combinatorics
using CairoMakie
using Clustering
using DataFrames
using Distributions
using Distances
using Optimization
using PyCall
using QuranTree
using Statistics
using Turing
using Yunir

In [3]:
# load the python libraries
sentence_transformers = pyimport("sentence_transformers")
umap_py = pyimport("umap.umap_")
UMAP = umap_py.UMAP

PyObject <class 'umap.umap_.UMAP'>

#### Load CL-Arabert Embedding Model

In [4]:
model_path = "/Users/al-ahmadgaidasaad/Documents/School/Islamic Studies/ma-thesis-codes/models/CL-Arabert"
emodel = sentence_transformers.SentenceTransformer(model_path);

No sentence-transformers model found with name /Users/al-ahmadgaidasaad/Documents/School/Islamic Studies/ma-thesis-codes/models/CL-Arabert. Creating a new one with mean pooling.


#### Load Qur'an Data

In [5]:
_, tnzl = load(QuranData());
tnzl_tbl = table(tnzl)

Tanzil Quran Text (Uthmani)
(C) 2008-2010 Tanzil.net

[1m6236×3 DataFrame[0m
[1m  Row [0m│[1m chapter [0m[1m verse [0m[1m form                              [0m
      │[90m Int64   [0m[90m Int64 [0m[90m String                            [0m
──────┼───────────────────────────────────────────────────
    1 │       1      1  بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
    2 │       1      2  ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ
    3 │       1      3  ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
    4 │       1      4  مَٰلِكِ يَوْمِ ٱلدِّينِ
    5 │       1      5  إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ
    6 │       1      6  ٱهْدِنَا ٱلصِّرَٰطَ ٱلْمُسْتَقِيمَ
    7 │       1      7  صِرَٰطَ ٱلَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ ٱلْمَغْضُو…
    8 │       2      1  بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ الٓمٓ
    9 │       2      2  ذَٰلِكَ ٱلْكِتَٰبُ لَا رَيْبَ فِيهِ هُدًى لِّلْمُتَّقِينَ
   10 │       2      3  ٱلَّذِينَ يُؤْمِنُونَ بِٱلْغَيْبِ وَيُقِيمُونَ ٱلصَّل…
   11 │       2      4  وَٱ

In [6]:
surah2 = verses(tnzl_tbl[2]);

In [7]:
surah2_emb = emodel.encode(surah2)

286×768 Matrix{Float32}:
 0.531018  -0.132496    -0.756551   …  -0.305054   0.989819   0.201078
 0.784264  -0.187889     0.303152       0.0520253  0.646556   0.616321
 0.677887  -0.778916    -0.807045      -0.0719035  1.12943    0.859909
 0.506211  -0.325038    -0.502814       0.459864   1.43758    0.466729
 0.269263  -0.194165    -0.539852      -0.0628042  0.77668    0.554814
 0.467806  -0.373838    -0.0348215  …   0.392261   1.14069    0.293043
 0.886688  -0.527031    -0.530024      -0.0749496  0.983624   0.308669
 0.60995   -0.301923    -0.17381        0.103315   0.984032   0.482254
 0.412077  -0.309258    -0.512169       0.890515   0.994584   0.67909
 0.996288  -0.699789    -0.477904      -0.24042    0.772055   0.535724
 0.123426  -0.530353     0.13315    …   0.0387213  0.998667   0.42128
 0.153852   0.00949113  -0.474826       0.17242    0.753725  -0.188628
 0.563945  -0.655017    -0.349274      -0.40291    1.35588    0.388349
 ⋮                                  ⋱   ⋮             

In [8]:
struct Slicer
    num_slices::Int64
    min_ayahs::Int64
end

In [9]:
function gen_slices(slicer::Slicer, ayahs::Vector{String})
    ayah_len = length(ayahs)
    if (slicer.slices < ayah_len)
        error("`slices` should be less than the length of `ayahs` vector to slice.")
    end
    rand(Uniform(slicer.min_ayahs, ayah_len - slicer.min_ayahs), )     
end

gen_slices (generic function with 1 method)

In [10]:
ayahs = surah2_emb;

num_slices = 7

7

In [11]:
dir_samples = rand(Dirichlet(repeat([1.5], num_slices - 1)), 10_000)

6×10000 Matrix{Float64}:
 0.307168   0.0824626  0.105111    …  0.096869  0.114672  0.0288761
 0.348614   0.0188442  0.0790516      0.253143  0.088062  0.158101
 0.0176506  0.115432   0.00746888     0.030411  0.251269  0.183359
 0.0338239  0.123356   0.570426       0.221196  0.147714  0.268946
 0.195637   0.467883   0.0327441      0.263562  0.148368  0.16067
 0.0971076  0.192022   0.205198    …  0.13482   0.249914  0.200049

In [12]:
midpoints = Int64.(floor.(size(ayahs)[1] .* dir_samples))
midpoints = mapslices(sort, midpoints, dims=1)
midpoints = unique(midpoints, dims=2)

6×10000 Matrix{Int64}:
  5    5    2    3  10  22   10   13  …    6   19   2  13    7   8  25   8
  9   23    9   42  13  25   15   22      11   20  24  22    8  27  32  45
 27   33   22   42  40  26   18   28      22   31  29  35   30  38  42  45
 55   35   30   43  63  30   62   34      45   35  67  52   32  63  42  52
 87   54   58   48  74  85   75   35      70   58  75  67   57  72  71  57
 99  133  163  106  83  95  102  151  …  129  119  86  94  148  75  71  76

In [13]:
slices = Vector{Matrix{Float32}}[]
mp_size = size(midpoints)
for j in 1:mp_size[2]
    slice = Matrix{Float32}[]
    for i in 1:mp_size[1]
        if i == 1
            if midpoints[i, j] == 0
                push!(slice, ayahs[1:1, :])
            else
                push!(slice, ayahs[1:midpoints[i, j],:])
            end
        elseif i < mp_size[1]
            if midpoints[i-1, j] == midpoints[i, j]
                push!(slice, ayahs[(midpoints[i-1, j]+1):midpoints[i, j]+2,:])
            else
                push!(slice, ayahs[(midpoints[i-1, j]+1):midpoints[i, j],:])
            end
        else
            if midpoints[i-1, j] == midpoints[i, j]
                push!(slice, ayahs[(midpoints[i-1, j]+1):midpoints[i, j]+2,:])
                push!(slice, ayahs[(midpoints[i, j]+1):end,:])
            else
                push!(slice, ayahs[(midpoints[i-1, j]+1):midpoints[i, j],:])
                push!(slice, ayahs[(midpoints[i, j]+1):end,:])
            end
        end
    end
    push!(slices, slice)
end

In [14]:
slices[1][1]

5×768 Matrix{Float32}:
 0.531018  -0.132496  -0.756551  0.111261  …  -0.305054   0.989819  0.201078
 0.784264  -0.187889   0.303152  0.511859      0.0520253  0.646556  0.616321
 0.677887  -0.778916  -0.807045  0.14974      -0.0719035  1.12943   0.859909
 0.506211  -0.325038  -0.502814  0.422343      0.459864   1.43758   0.466729
 0.269263  -0.194165  -0.539852  0.505806     -0.0628042  0.77668   0.554814

### Summarizing Embeddings

In [15]:
function quantile_summary(v::Vector)
    sv = sort(v)

    min = minimum(sv)
    q1 = quantile(sv, 0.25)
    med = median(sv)
    q3 = quantile(sv, 0.5)
    max = maximum(sv)

    return [min, q1, med, q3, max]
end

quantile_summary (generic function with 1 method)

In [16]:
fivenums = Vector{Matrix{Float32}}[]
for slice in slices
    fivenum = Matrix{Float32}[]
    for i in 1:size(slice)[1]
        push!(fivenum, mapslices(quantile_summary, slice[i], dims=1))
    end
    push!(fivenums, fivenum)
end

### Computing Distances

In [68]:
med_idx = Int64(median(1:num_slices))
costs = Float64[]
for fivenum in fivenums
    cost = Float64[]
    for i in 1:(med_idx-1)
        ring_dist = sum(abs.(colwise(ChiSqDist(), fivenum[i], fivenum[end-i+1])))
        cen_lower = sum(abs.(colwise(ChiSqDist(), fivenum[med_idx], fivenum[i])))
        cen_upper = sum(abs.(colwise(ChiSqDist(), fivenum[med_idx], fivenum[end-i+1])))
        
        push!(cost, ring_dist)
        push!(cost, cen_lower)
        push!(cost, cen_upper)
    end
    push!(costs, sum(cost))
end

In [72]:
costs

10000-element Vector{Float64}:
  7452.843200683594
 41067.15090942383
 11501.478302001953
 46163.764099121094
  6079.689697265625
 26787.86083984375
  7930.223815917969
 14932.815063476562
 34924.868713378906
  9764.357818603516
 22705.226989746094
 13131.507537841797
  6077.573547363281
     ⋮
  7784.376251220703
 12113.333190917969
  9717.5419921875
 16551.457611083984
  6929.189239501953
 14802.379638671875
 16896.49038696289
  6202.122131347656
 31594.0556640625
  7723.901672363281
 18107.681243896484
  8041.982604980469

In [54]:
sum(abs.(colwise(ChiSqDist(), fivenums[1][2], fivenums[1][end-i-1])))

1165.7137f0

In [47]:
sum(abs.(colwise(ChiSqDist(), fivenums[1][med_idx], fivenums[1][2])))

1165.7137f0

In [55]:
fivenums[1][end-i-1]

5×768 Matrix{Float32}:
 0.0219057  -0.87156    -0.971374   …  -0.585858   0.621963  -0.275423
 0.330805   -0.395003   -0.511762      -0.0990097  0.931669   0.205508
 0.662539   -0.284216   -0.335894       0.0457841  1.12092    0.377689
 0.662539   -0.284216   -0.335894       0.0457841  1.12092    0.377689
 1.07862    -0.0962282   0.0669256      0.429768   1.67765    0.726807

In [56]:
fivenums[1][2]

5×768 Matrix{Float32}:
 0.412077  -0.527031  -0.530024   …  -0.0749496  0.983624  0.293043
 0.453874  -0.412136  -0.516633       0.0587488  0.98393   0.304762
 0.538878  -0.341548  -0.34299        0.247788   0.989308  0.395461
 0.538878  -0.341548  -0.34299        0.247788   0.989308  0.395461
 0.886688  -0.301923  -0.0348215      0.890515   1.14069   0.67909

In [63]:
fivenums[1][end-i]

5×768 Matrix{Float32}:
 -0.306053  -0.675701   -0.968244   …  -0.740139   0.420536  0.0429858
  0.530863  -0.421813   -0.610742      -0.209497   0.9515    0.320158
  0.657967  -0.376442   -0.424982      -0.0615468  1.16018   0.408021
  0.657967  -0.376442   -0.424982      -0.0615468  1.16018   0.408021
  1.19378    0.0773699   0.0711554      0.654553   1.58278   0.753576

In [62]:
fivenums[1]

7-element Vector{Matrix{Float32}}:
 [0.269263 -0.7789163 … 0.64655566 0.20107773; 0.50621146 -0.3250376 … 0.7766804 0.4667288; … ; 0.5310184 -0.19416487 … 0.9898191 0.5548137; 0.784264 -0.13249622 … 1.4375801 0.8599088]
 [0.41207725 -0.52703136 … 0.98362446 0.2930433; 0.45387352 -0.41213608 … 0.9839301 0.30476224; … ; 0.53887784 -0.34154764 … 0.98930806 0.39546126; 0.8866878 -0.30192348 … 1.1406853 0.67909]
 [0.12342641 -0.8578767 … 0.668599 -0.1886282; 0.48168612 -0.5319276 … 0.8003974 0.32322973; … ; 0.5983705 -0.37594244 … 1.0017582 0.4692089; 0.9962884 0.035326615 … 1.3602252 0.86905515]
 [0.021905743 -0.8715603 … 0.62196314 -0.27542344; 0.33080453 -0.39500266 … 0.9316688 0.20550822; … ; 0.662539 -0.28421557 … 1.1209154 0.37768933; 1.0786216 -0.09622821 … 1.6776503 0.7268074]
 [-0.306053 -0.6757015 … 0.42053604 0.042985797; 0.53086317 -0.4218128 … 0.95150036 0.32015765; … ; 0.6579674 -0.37644216 … 1.160177 0.40802145; 1.1937826 0.077369876 … 1.5827835 0.75357646]
 [0.2113618 -0.450

In [28]:
costs[1]

9-element Vector{Float64}:
 161.41178289614618
 116.84815433062613
 181.7003574827686
 257.81669568642974
 257.81669568642974
   0.0
   0.0
 320.6357750436291
 320.6357750436291

In [58]:
vectors = 1:7

# Generate all combinations of size 2 from the vectors
combinations_of_vectors = collect(combinations(vectors, 2))

21-element Vector{Vector{Int64}}:
 [1, 2]
 [1, 3]
 [1, 4]
 [1, 5]
 [1, 6]
 [1, 7]
 [2, 3]
 [2, 4]
 [2, 5]
 [2, 6]
 [2, 7]
 [3, 4]
 [3, 5]
 [3, 6]
 [3, 7]
 [4, 5]
 [4, 6]
 [4, 7]
 [5, 6]
 [5, 7]
 [6, 7]

In [54]:
Pkg.add("Combinatorics")

[32m[1m   Resolving[22m[39m package versions...
[32m[1m    Updating[22m[39m `~/Documents/School/Islamic Studies/ma-thesis-codes/Project.toml`
  [90m[861a8166] [39m[92m+ Combinatorics v1.0.2[39m
[32m[1m  No Changes[22m[39m to `~/Documents/School/Islamic Studies/ma-thesis-codes/Manifest.toml`


In [53]:
using Pkg