 ## بسم الله الرّحمٰن الرّحيم 

# Theory of Concentrism in the Qur'an using Bayesian Optimization & Large Language Model

_by Al-Ahmadgaid B. Asaad_

#### Install Python Libraries

There are Julia's equivalent for the following Python libraries, but for this paper the author decided to use the official one which is in Python.

In [11]:
using Pkg

ENV["PYTHON"]="" # necessary for Conda.pip
Pkg.build("PyCall")

[32m[1m    Building[22m[39m Conda ─→ `~/.julia/scratchspaces/44cfe95a-1eb2-52ea-b672-e2afdf69b78f/b19db3927f0db4151cb86d073689f2428e524576/build.log`
[32m[1m    Building[22m[39m PyCall → `~/.julia/scratchspaces/44cfe95a-1eb2-52ea-b672-e2afdf69b78f/9816a3826b0ebf49ab4926e2b18842ad8b5c8f04/build.log`


In [12]:
using Conda

Conda.pip_interop(true)
Conda.pip("install", "sentence-transformers")
Conda.pip("install", "umap-learn")

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `conda config --set pip_interop_enabled true --file /Users/al-ahmadgaidasaad/.julia/conda/3/aarch64/condarc-julia.yml` in root environment
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `pip install sentence-transformers` in root environment




[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `pip install umap-learn` in root environment




#### Load Libraries

In [55]:
using BOSS
using Combinatorics
using CairoMakie
using Clustering
using DataFrames
using Distributions
using Distances
using Optimization
using PyCall
using QuranTree
using Statistics
using Turing
using Yunir

In [2]:
# load the python libraries
sentence_transformers = pyimport("sentence_transformers")
umap_py = pyimport("umap.umap_")
UMAP = umap_py.UMAP

PyObject <class 'umap.umap_.UMAP'>

#### Load CL-Arabert Embedding Model

In [3]:
model_path = "/Users/al-ahmadgaidasaad/Documents/School/Islamic Studies/ma-thesis-codes/models/CL-Arabert"
emodel = sentence_transformers.SentenceTransformer(model_path);

No sentence-transformers model found with name /Users/al-ahmadgaidasaad/Documents/School/Islamic Studies/ma-thesis-codes/models/CL-Arabert. Creating a new one with mean pooling.


#### Load Qur'an Data

In [4]:
_, tnzl = load(QuranData());
tnzl_tbl = table(tnzl)

Tanzil Quran Text (Uthmani)
(C) 2008-2010 Tanzil.net

[1m6236×3 DataFrame[0m
[1m  Row [0m│[1m chapter [0m[1m verse [0m[1m form                              [0m
      │[90m Int64   [0m[90m Int64 [0m[90m String                            [0m
──────┼───────────────────────────────────────────────────
    1 │       1      1  بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
    2 │       1      2  ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ
    3 │       1      3  ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
    4 │       1      4  مَٰلِكِ يَوْمِ ٱلدِّينِ
    5 │       1      5  إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ
    6 │       1      6  ٱهْدِنَا ٱلصِّرَٰطَ ٱلْمُسْتَقِيمَ
    7 │       1      7  صِرَٰطَ ٱلَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ ٱلْمَغْضُو…
    8 │       2      1  بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ الٓمٓ
    9 │       2      2  ذَٰلِكَ ٱلْكِتَٰبُ لَا رَيْبَ فِيهِ هُدًى لِّلْمُتَّقِينَ
   10 │       2      3  ٱلَّذِينَ يُؤْمِنُونَ بِٱلْغَيْبِ وَيُقِيمُونَ ٱلصَّل…
   11 │       2      4  وَٱ

In [5]:
surah2 = verses(tnzl_tbl[2]);

In [6]:
surah2_emb = emodel.encode(surah2)

286×768 Matrix{Float32}:
 0.531018  -0.132496    -0.756551   …  -0.305054   0.989819   0.201078
 0.784264  -0.187889     0.303152       0.0520253  0.646556   0.616321
 0.677887  -0.778916    -0.807045      -0.0719035  1.12943    0.859909
 0.506211  -0.325038    -0.502814       0.459864   1.43758    0.466729
 0.269263  -0.194165    -0.539852      -0.0628042  0.77668    0.554814
 0.467806  -0.373838    -0.0348215  …   0.392261   1.14069    0.293043
 0.886688  -0.527031    -0.530024      -0.0749496  0.983624   0.308669
 0.60995   -0.301923    -0.17381        0.103315   0.984032   0.482254
 0.412077  -0.309258    -0.512169       0.890515   0.994584   0.67909
 0.996288  -0.699789    -0.477904      -0.24042    0.772055   0.535724
 0.123426  -0.530353     0.13315    …   0.0387213  0.998667   0.42128
 0.153852   0.00949113  -0.474826       0.17242    0.753725  -0.188628
 0.563945  -0.655017    -0.349274      -0.40291    1.35588    0.388349
 ⋮                                  ⋱   ⋮             

In [7]:
struct Slicer
    num_slices::Int64
    min_ayahs::Int64
end

In [8]:
function gen_slices(slicer::Slicer, ayahs::Vector{String})
    ayah_len = length(ayahs)
    if (slicer.slices < ayah_len)
        error("`slices` should be less than the length of `ayahs` vector to slice.")
    end
    rand(Uniform(slicer.min_ayahs, ayah_len - slicer.min_ayahs), )     
end

gen_slices (generic function with 1 method)

In [9]:
ayahs = surah2_emb;

num_slices = 7

7

In [10]:
dir_samples = rand(Dirichlet(repeat([1.5], num_slices - 1)), 10_000)

6×10000 Matrix{Float64}:
 0.0509858  0.221487   0.0278969  …  0.183275  0.0466492  0.110775
 0.204048   0.214299   0.0143003     0.127203  0.122204   0.0504098
 0.43801    0.141789   0.370441      0.104917  0.018396   0.290213
 0.0430842  0.0914996  0.269069      0.221556  0.163428   0.321631
 0.135317   0.317604   0.103178      0.306106  0.0960785  0.0916322
 0.128555   0.0133206  0.215115   …  0.056942  0.553244   0.135339

In [11]:
midpoints = Int64.(floor.(size(ayahs)[1] .* dir_samples))
midpoints = mapslices(sort, midpoints, dims=1)
midpoints = unique(midpoints, dims=2)

6×9999 Matrix{Int64}:
  12   3    4  10  30   17  19  11    8  …  19    2  13   12   6  16    5  14
  14  26    7  46  36   18  21  29   13     24    8  14   21  26  30   13  26
  36  40   29  47  44   28  30  39   18     31   29  30   27  44  36   27  31
  38  61   61  59  45   34  59  56   21     47   35  69   43  51  52   34  38
  58  63   76  60  61   53  63  62   98     80   79  74   57  56  63   46  83
 125  90  105  61  66  133  91  86  125  …  84  130  83  122  99  87  158  91

In [31]:
slices = Vector{Matrix{Float32}}[]
mp_size = size(midpoints)
for j in 1:mp_size[2]
    slice = Matrix{Float32}[]
    for i in 1:mp_size[1]
        if i == 1
            if midpoints[i, j] == 0
                push!(slice, ayahs[1:1, :])
            else
                push!(slice, ayahs[1:midpoints[i, j],:])
            end
        elseif i < mp_size[1]
            if midpoints[i-1, j] == midpoints[i, j]
                push!(slice, ayahs[(midpoints[i-1, j]+1):midpoints[i, j]+2,:])
            else
                push!(slice, ayahs[(midpoints[i-1, j]+1):midpoints[i, j],:])
            end
        else
            if midpoints[i-1, j] == midpoints[i, j]
                push!(slice, ayahs[(midpoints[i-1, j]+1):midpoints[i, j]+2,:])
                push!(slice, ayahs[(midpoints[i, j]+1):end,:])
            else
                push!(slice, ayahs[(midpoints[i-1, j]+1):midpoints[i, j],:])
                push!(slice, ayahs[(midpoints[i, j]+1):end,:])
            end
        end
    end
    push!(slices, slice)
end

In [33]:
slices[1][1]

12×768 Matrix{Float32}:
 0.531018  -0.132496    -0.756551   …  -0.305054   0.989819   0.201078
 0.784264  -0.187889     0.303152       0.0520253  0.646556   0.616321
 0.677887  -0.778916    -0.807045      -0.0719035  1.12943    0.859909
 0.506211  -0.325038    -0.502814       0.459864   1.43758    0.466729
 0.269263  -0.194165    -0.539852      -0.0628042  0.77668    0.554814
 0.467806  -0.373838    -0.0348215  …   0.392261   1.14069    0.293043
 0.886688  -0.527031    -0.530024      -0.0749496  0.983624   0.308669
 0.60995   -0.301923    -0.17381        0.103315   0.984032   0.482254
 0.412077  -0.309258    -0.512169       0.890515   0.994584   0.67909
 0.996288  -0.699789    -0.477904      -0.24042    0.772055   0.535724
 0.123426  -0.530353     0.13315    …   0.0387213  0.998667   0.42128
 0.153852   0.00949113  -0.474826       0.17242    0.753725  -0.188628

### Summarizing Embeddings

In [40]:
function quantile_summary(v::Vector)
    sv = sort(v)

    min = minimum(sv)
    q1 = quantile(sv, 0.25)
    med = median(sv)
    q3 = quantile(sv, 0.5)
    max = maximum(sv)

    return [min, q1, med, q3, max]
end

quantile_summary (generic function with 1 method)

In [41]:
fivenums = Vector{Matrix{Float32}}[]
for slice in slices
    fivenum = Matrix{Float32}[]
    for i in 1:size(slice)[1]
        push!(fivenum, mapslices(quantile_summary, slice[i], dims=1))
    end
    push!(fivenums, fivenum)
end

### Computing Distances

In [45]:
fivenums[1]

7-element Vector{Matrix{Float32}}:
 [0.12342641 -0.7789163 … 0.64655566 -0.1886282; 0.37637368 -0.5278617 … 0.775524 0.30476224; … ; 0.5186149 -0.3171476 … 0.9869256 0.47449136; 0.9962884 0.009491126 … 1.4375801 0.8599088]
 [0.53529483 -0.65501744 … 1.324009 0.38834926; 0.5424573 -0.62315285 … 1.3319763 0.45472947; … ; 0.5496198 -0.5912883 … 1.3399436 0.52110964; 0.5639447 -0.5275592 … 1.3558782 0.65387005]
 [0.16106658 -0.8578767 … 0.668599 -0.19475842; 0.47324833 -0.4187825 … 0.90622646 0.2205224; … ; 0.6096772 -0.31558573 … 1.0329251 0.41992408; 0.94127524 0.035326615 … 1.3669378 0.86905515]
 [0.91756034 -0.33898774 … 0.993578 -0.1230087; 0.95782566 -0.2930137 … 1.0634463 0.011466838; … ; 0.998091 -0.24703965 … 1.1333145 0.14594238; 1.0786216 -0.15509157 … 1.2730509 0.41489345]
 [0.021905743 -0.8715603 … 0.62196314 -0.27542344; 0.35184312 -0.44728988 … 0.9316688 0.24854964; … ; 0.662539 -0.31483316 … 1.1160702 0.37768933; 0.9537341 -0.09622821 … 1.6776503 0.7268074]
 [-0.306053 -0.6

In [49]:
fivenums[1][Int64(median(1:num_slices))]

5×768 Matrix{Float32}:
 0.91756   -0.338988  -0.259021  …  -0.182396   0.993578  -0.123009
 0.957826  -0.293014  -0.255753     -0.0325653  1.06345    0.0114668
 0.998091  -0.24704   -0.252486      0.117266   1.13331    0.145942
 0.998091  -0.24704   -0.252486      0.117266   1.13331    0.145942
 1.07862   -0.155092  -0.24595       0.416928   1.27305    0.414893

In [59]:
med_idx = Int64(median(1:num_slices))
center = fivenums[1][med_idx]

5×768 Matrix{Float32}:
 0.91756   -0.338988  -0.259021  …  -0.182396   0.993578  -0.123009
 0.957826  -0.293014  -0.255753     -0.0325653  1.06345    0.0114668
 0.998091  -0.24704   -0.252486      0.117266   1.13331    0.145942
 0.998091  -0.24704   -0.252486      0.117266   1.13331    0.145942
 1.07862   -0.155092  -0.24595       0.416928   1.27305    0.414893

In [None]:
sum(abs.(quantile_summary(colwise(ChiSqDist(), a1_sum, a2_sum))))

In [None]:
med_idx = Int64(median(1:num_slices))
center = fivenums[1][med_idx]
costs = Vector{Float64}[]
for fivenum in fivenums
    cost = Float64[]
    for i in 1:size(fivenum)[1]
        push!(cost, fivenum[i])
    end
end

In [58]:
vectors = 1:7

# Generate all combinations of size 2 from the vectors
combinations_of_vectors = collect(combinations(vectors, 2))

21-element Vector{Vector{Int64}}:
 [1, 2]
 [1, 3]
 [1, 4]
 [1, 5]
 [1, 6]
 [1, 7]
 [2, 3]
 [2, 4]
 [2, 5]
 [2, 6]
 [2, 7]
 [3, 4]
 [3, 5]
 [3, 6]
 [3, 7]
 [4, 5]
 [4, 6]
 [4, 7]
 [5, 6]
 [5, 7]
 [6, 7]

In [54]:
Pkg.add("Combinatorics")

[32m[1m   Resolving[22m[39m package versions...
[32m[1m    Updating[22m[39m `~/Documents/School/Islamic Studies/ma-thesis-codes/Project.toml`
  [90m[861a8166] [39m[92m+ Combinatorics v1.0.2[39m
[32m[1m  No Changes[22m[39m to `~/Documents/School/Islamic Studies/ma-thesis-codes/Manifest.toml`


In [53]:
using Pkg