In [1]:
import pandas as pd
import collections

with open('quran-text/quran-uthmani.txt') as f:
    quran_text = [l.strip() for l in f if not l.strip().startswith('#') and l.strip()]

In [2]:
# split basmallah to its own
i = 0 
while i < len(quran_text):
    aya = quran_text[i].split('|')
    if aya[2].startswith('بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ '):
        quran_text.insert(i, f'{aya[0]}|0|بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ')
        i += 1
        quran_text[i] = f'{aya[0]}|1|{aya[2][38:]}'
    i += 1

In [3]:
# number of aya with basmallah
len(quran_text)

6346

In [4]:
# number of aya without basmallah except for first sura
len([qt for qt in quran_text if '|0|' not in qt])

6236

In [5]:
df = pd.DataFrame({'sura': int(a[0]), 'verse': int(a[1]), 'text': a[2]} for a in [a.split('|') for a in quran_text])
df

Unnamed: 0,sura,verse,text
0,1,1,بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
1,1,2,ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ
2,1,3,ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
3,1,4,مَٰلِكِ يَوْمِ ٱلدِّينِ
4,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ
...,...,...,...
6341,114,2,مَلِكِ ٱلنَّاسِ
6342,114,3,إِلَٰهِ ٱلنَّاسِ
6343,114,4,مِن شَرِّ ٱلْوَسْوَاسِ ٱلْخَنَّاسِ
6344,114,5,ٱلَّذِى يُوَسْوِسُ فِى صُدُورِ ٱلنَّاسِ


In [6]:
# Surah lengths
df_length = df[['sura','verse']].groupby('sura').count()
df_length

Unnamed: 0_level_0,verse
sura,Unnamed: 1_level_1
1,7
2,287
3,201
4,177
5,121
...,...
110,4
111,6
112,5
113,6


In [7]:
# longest suras
df_length.sort_values('verse', ascending=False).head(10)

Unnamed: 0_level_0,verse
sura,Unnamed: 1_level_1
2,287
26,228
7,207
3,201
37,183
4,177
6,166
20,136
9,129
16,129


In [8]:
# shortest suras
df_length.sort_values('verse').head(10)

Unnamed: 0_level_0,verse
sura,Unnamed: 1_level_1
110,4
108,4
103,4
112,5
106,5
97,5
111,6
105,6
113,6
1,7


In [9]:
FILTERED_CHARS = \
    '|123456789 \t\nًٌَُّْ	' \
    'ِ ۫ ۨ ۬ ۬' \
    '۪ ۣۭۜ۠' \
    '	' \
    'ٰ'

def len_verse(text):
    return len([c for c in text if c not in FILTERED_CHARS])

df['verse_charlen'] = df['text'].apply(len_verse)

In [10]:
# statistic of verse charlen
df['verse_charlen'].describe()

count    6346.000000
mean       54.135361
std        41.165902
min         2.000000
25%        23.000000
50%        44.000000
75%        71.000000
max       580.000000
Name: verse_charlen, dtype: float64

In [11]:
df[['sura','verse_charlen']].groupby('sura').describe()

Unnamed: 0_level_0,verse_charlen,verse_charlen,verse_charlen,verse_charlen,verse_charlen,verse_charlen,verse_charlen,verse_charlen
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
sura,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,7.0,20.000000,11.075498,11.0,14.50,18.0,19.00,44.0
2,287.0,94.675958,62.036445,5.0,56.00,76.0,117.00,580.0
3,201.0,76.701493,39.853487,5.0,49.00,69.0,96.00,296.0
4,177.0,95.124294,55.706172,19.0,57.00,80.0,116.00,316.0
5,121.0,104.388430,57.304068,19.0,67.00,92.0,122.00,297.0
...,...,...,...,...,...,...,...,...
110,4.0,25.500000,6.952218,19.0,19.75,25.5,31.25,32.0
111,6.0,17.833333,1.940790,16.0,16.25,17.5,18.75,21.0
112,5.0,13.600000,4.219005,9.0,11.00,12.0,17.00,19.0
113,6.0,15.333333,3.559026,9.0,14.50,16.0,17.50,19.0


In [12]:
# sura character length
df_length = df[['sura','verse_charlen']].groupby('sura').sum()

In [13]:
# longest sura by charlen (with basmallah)
df_length.sort_values('verse_charlen', ascending=False).head(10)

Unnamed: 0_level_0,verse_charlen
sura,Unnamed: 1_level_1
2,27172
4,16837
3,15417
7,14980
6,13203
5,12631
9,11435
11,8196
16,8056
10,7902


In [14]:
# shortest sura by charlen (with basmallah)
df_length.sort_values('verse_charlen').head(10)

Unnamed: 0_level_0,verse_charlen
sura,Unnamed: 1_level_1
108,62
112,68
113,92
103,95
114,99
106,101
110,102
111,107
105,120
109,121


In [15]:
# arabic character values according to Ibn Arabi

ref = 'ابجدهوزحطيكلمنسعفصقرشتثخذضظغ';
mapping = {
    'ؤ': 'ا',
    'ئ': 'ا',
    'إ': 'ا',
    'أ': 'ا',
    'آ': 'ا',
    'ى': 'ي',
    'ء': 'ا',
    'ة': 'ه'
}
def val1(c):
    c = mapping.get(c, c)
    i = ref.index(c) if c in ref else -1
    if i == -1:
        if c != ' ':
            return None
        return 0
    else:
        return ((i % 9)+1) * (10 ** (i//9))

def val(s):
    values = [val1(c) for c in s]
    return sum(val for val in values if val)


# total char value of each verses
df['verse_value'] = df['text'].apply(val)

In [16]:
# top value verses
df.sort_values('verse_value', ascending=False).head(10)

Unnamed: 0,sura,verse,text,verse_charlen,verse_value
289,2,282,يَٰٓأَيُّهَا ٱلَّذِينَ ءَامَنُوٓا۟ إِذَا تَدَا...,580,39212
5565,73,20,إِنَّ رَبَّكَ يَعْلَمُ أَنَّكَ تَقُومُ أَدْنَى...,342,29589
2843,24,31,وَقُل لِّلْمُؤْمِنَٰتِ يَغْضُضْنَ مِنْ أَبْصَٰ...,356,25563
782,5,110,إِذْ قَالَ ٱللَّهُ يَٰعِيسَى ٱبْنَ مَرْيَمَ ٱذ...,281,25534
675,5,3,حُرِّمَتْ عَلَيْكُمُ ٱلْمَيْتَةُ وَٱلدَّمُ وَل...,284,25333
597,4,102,وَإِذَا كُنتَ فِيهِمْ فَأَقَمْتَ لَهُمُ ٱلصَّل...,295,25279
2619,22,5,يَٰٓأَيُّهَا ٱلنَّاسُ إِن كُنتُمْ فِى رَيْبٍ م...,282,24409
520,4,25,وَمَن لَّمْ يَسْتَطِعْ مِنكُمْ طَوْلًا أَن يَن...,261,24201
3616,33,53,يَٰٓأَيُّهَا ٱلَّذِينَ ءَامَنُوا۟ لَا تَدْخُلُ...,310,23440
4657,48,29,مُّحَمَّدٌ رَّسُولُ ٱللَّهِ وَٱلَّذِينَ مَعَهُ...,261,23399


In [17]:
# top sura by total verse value
df[['sura', 'verse_value']].groupby('sura').sum().sort_values('verse_value', ascending=False).head(10)

Unnamed: 0_level_0,verse_value
sura,Unnamed: 1_level_1
2,1823606
4,1116434
3,1055898
7,1054508
6,933617
5,824303
9,737106
16,557806
11,537999
10,536169
