# Annotating the Fusus for Quran citations
All Quran citations appear to be cited accurately by the edition. The task is therefore to extract the citation information from the `word` column. This is not easy, because no single pattern picks up every Quran citation. Let's explore.

In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import arabicABC as abc

In [433]:
# Load the word-level table. Bounding-box and counter columns that may be
# missing are read with the nullable "Int64" dtype.
column_types = {
    "page": int, "line": int, "column": int, "span": int, "direction": str,
    "left": "Int64", "top": "Int64", "right": "Int64", "bottom": "Int64",
    "word": str, "short": str, "haspunct": str, "punctAfter": str, "punctBefore": str,
    "QunawiMS": str, "poetryMeter": str, "poetryVerse": "Int64", "fass": "Int64", "lwcvl": str,
    "quran": str,
}
fusus = pd.read_csv('fusus.csv', dtype=column_types)

# Replace missing values in these text columns by empty strings so that the
# `.str` filters used throughout the notebook return plain booleans.
# `direction` and `QunawiMS` are deliberately not in this list, as before.
for text_column in ("word", "short", "haspunct", "punctAfter", "punctBefore",
                    "poetryMeter", "lwcvl", "quran"):
    fusus[text_column] = fusus[text_column].fillna('')

In [434]:
fusus

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
0,8,2,1,1,r,356,197,384,218,الحَمْدُ,الحمد,,,,1b,,,0,,
1,8,2,1,1,r,341,197,356,218,لِلهِ,لله,,,,1b,,,0,,
2,8,2,1,1,r,312,197,341,218,مُـنَـزِّلِ,منزل,,,,1b,,,0,,
3,8,2,1,1,r,274,197,312,218,الحِكَمِ,الحكم,,,,1b,,,0,,
4,8,2,1,1,r,260,197,274,218,عَلَىٰ,على,,,,1b,,,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39498,409,5,1,1,r,331,191,356,212,يُقُوْلُ,يقول,,,,78a,,,27,,
39499,409,5,1,1,r,305,191,331,212,ٱلحَقُّ,ٱلحق,,,,78a,,,27,,
39500,409,5,1,1,r,284,191,305,212,وَهُوُ,وهو,,,,78a,,,27,,
39501,409,5,1,1,r,266,191,284,212,يَهْدِي,يهدي,,,,78a,,,27,,


In [456]:
# Remove every row whose `word` is empty (or already missing).
# The replacement is assigned back to the column instead of being done with
# `inplace=True` on `fusus['word']`: an in-place call on a column selection
# operates on an intermediate object and is not guaranteed to change `fusus`
# (under pandas Copy-on-Write it never does).
fusus['word'] = fusus['word'].replace("", np.nan)
fusus.dropna(subset=['word'], inplace=True)

In [457]:
fusus.to_csv('fusus.csv',index=False)

### Different elements of a Quran citation

In [4]:
# Print, for each citation marker, the shape (rows, columns) of the rows that
# contain it. The order of the checks matches the order of the printed output.
# The square-bracket patterns are raw strings: "\[" in a normal literal is an
# invalid escape sequence, which newer Python versions warn about.
marker_checks = [
    ("word", abc.QURANCLOSE),
    ("word", abc.QURANOPEN),
    ("word", r"\["),
    ("word", r"\]"),
    ("short", r"\["),
    ("short", r"\]"),
    ("word", "سورة"),
]
for column_name, pattern in marker_checks:
    print(fusus[fusus[column_name].str.contains(pat=pattern)].shape)

(520, 20)
(520, 20)
(97, 20)
(97, 20)
(96, 20)
(96, 20)
(79, 20)


### Generally you would think that ﴾﴿ indicates a Quran citation.  ﴿ gives 520 and ﴾ gives 520. So there seem to be 520 Quran citations.
But note that the chapter titles were also adorned with these brackets (which have since been cleaned).

In [5]:
# Rows whose `short` holds the Quran closing bracket but no "[" ...
# (raw strings: "\[" / "\]" in normal literals are invalid escape sequences)
close_without_open = (fusus.short.str.contains(pat=abc.QURANCLOSE)
                      & ~fusus.short.str.contains(pat=r'\['))
# ... and, for each of them, the row that follows it.
# NOTE(review): index labels are used as positions here, which assumes the
# default 0..n-1 index of a freshly loaded CSV.
sura = fusus.iloc[fusus[close_without_open].index + 1]
# Of those following rows, show the ones carrying a complete "[...]" reference.
sura[sura.short.str.contains(pat=r'\[') & sura.short.str.contains(pat=r'\]')]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran


In [3]:
# `noclose`: rows whose `short` has both the Quran closing bracket and "[".
# `close`: the subset of those that also contain the matching "]".
# Raw strings are used because "\[" / "\]" in normal literals are invalid
# escape sequences.
has_close_and_open = (fusus.short.str.contains(pat=abc.QURANCLOSE)
                      & fusus.short.str.contains(pat=r'\['))
noclose = fusus[has_close_and_open]
close = noclose[noclose.short.str.contains(pat=r'\]')]

index numbers

In [4]:
# Index of the rows in `close` whose `short` contains a colon.
# The previous version ran the whole split chain first and then took `.index`;
# string operations keep the index unchanged, so the chain had no effect here.
endwords = close[close.short.str.contains(pat=':')].index

verse numbers

In [5]:
# Verse number of every colon-bearing reference in `close`.
verseNumber = []
# Text between ":" and "]" inside the "[sura:verse]" reference that follows
# the Quran closing bracket.
verse_texts = (close[close.short.str.contains(pat=':')].short
               .str.split(abc.QURANCLOSE).str[1]
               .str.split(']').str[0]
               .str.split('[').str[1]
               .str.split(":").str[1])
for verse_text in verse_texts.tolist():
    # Only the first run of digits is used (for a range such as "5–6" that is
    # the first number in the text).
    first_run = re.findall(r'\d+', verse_text)[0]
    # The digits are stored in mirrored order, so they have to be reversed.
    # Reverse the digit *string* before converting: converting first drops
    # leading zeros, which turned "٠٣" (verse 30) into 3.
    verseNumber.append(int(first_run[::-1]))

sura numbers

In [6]:
# Sura name of every colon-bearing reference in `close`: the text between "["
# and ":" with the word "سورة" and any spaces removed.
sura_names = (close[close.short.str.contains(pat=':')].short
              .str.split(abc.QURANCLOSE).str[1]
              .str.split(']').str[0]
              .str.split('[').str[1]
              .str.split(":").str[0]
              .str.replace("سورة", "")
              .str.replace(" ", ""))
# Translate each name through the lookup table in arabicABC; a name missing
# from that table raises KeyError.
suraNumber = [abc.QURAN_SURAS[sura_name] for sura_name in sura_names.tolist()]

Mass change it

In [8]:
# for i in range(400):
#     fusus.iloc[endwords[i], fusus.columns.get_loc("quran")] = str(suraNumber[i])+":"+str(verseNumber[i])
#     fusus.iloc[endwords[i], fusus.columns.get_loc("word")] = fusus.iloc[endwords[i]].word.split("[")[0]
#     fusus.iloc[endwords[i], fusus.columns.get_loc("short")] = fusus.iloc[endwords[i]].word.split("[")[0]

In [7]:
# For every row in `close`, keep the text that follows the first Quran closing
# bracket in `short` (the raw "[sura:verse]" reference plus punctuation).
suras = [pieces[1] for pieces in close.short.str.split(abc.QURANCLOSE).tolist()]
suras
    # cleansuras = []
# for i in suras:
#     new = i.replace(':', '').replace('سورة','').replace('[','').replace(']','').replace('،','').replace('.','').replace('١','').replace('٢','').replace('٣','').replace('٤','').replace('٥','').replace('٦','').replace('٧','').replace('٨','').replace('٩','').replace('٠','').replace('أ','').replace('؛','').replace('–','').replace('‐','').replace('؟','')
#     cleansuras.append(new)
# usedSuras = list(set(cleansuras))

['[هود:٣٢١]',
 '،[سورةالبقرة:٠٣].',
 '،[سورةص:٥٧]',
 '.[سورةالنساء:١]',
 '؛[سورةالنساء:١]',
 '[سورةغافر٠٦]',
 '[سورةمحمد:١٣]',
 '[سورةالحجر:١٢]',
 '[سورةطه:٠٥]',
 '[سورةفصلت:٣٥]',
 '[سورةفصلت:٣٥]',
 '[سورةفصلت:٣٥]',
 '.[سورةفصلت:٣٥]',
 '،[سورةالفاتحة:٢]أ',
 '[سورةالشورى:١١]',
 '[سورةالشورى:١١]',
 '[سورةالشورى:١١]',
 '[سورةالشورى:١١]',
 '[سورةنوح٨]،',
 '[سورةنوح:٩]،',
 '.[سورةنوح:٠١]',
 '.[سورةنوح:٥–٦].',
 '[سورةالبقرة:٠١١]',
 '[سورةالشورى:١١]',
 '[سورةنوح:٥]',
 '[سورةنوح:٥]',
 '،[سورةالشورى:١١]',
 '،[سورةنوح:٧]',
 '[سورةالشورى:١١]',
 '[سورةنوح١١]',
 '[سورةنوح:٢١]',
 '،[سورةنوح:١٢].',
 '،[سورةالبقرة:٦١]',
 '[سورةالحديد:٧].',
 '،[سورةالإسراء:٢]',
 '،[سورةنوح:٢٢]',
 '[سورةيوسف٨٠١]',
 '.[سورةيوسف:٨٠١]',
 '،[سورةمريم٥٨]',
 '،[سورةنوح:٣٢]',
 '،[سورةالإسرى:٣٢]',
 '[سورةالرعد:٣٣]',
 '.[سورةالزمر:٣]',
 '[سورةالحج:٤٣]',
 '؛[سورةالحج:٤٣]',
 '[سورةنوح:٤٢]',
 '[سورةنوح:٤٢]',
 '[سورةنوح:٤٢]',
 '،[سورةالبقرة:٠٢].',
 '؛[سورةنوح:٥٢]',
 '[سورةنوح:٥٢]',
 '،[سورةالتكوير:٦]',
 '.[سورةنوح:٥٢]',
 '،[سورةالرح

In [39]:
list(map(int, re.findall(r'\d+', '٩٦٢')))[0]

962

In [229]:
# One-off repair: glue row 38964 onto the end of row 38963.
target_row, donor_row = 38963, 38964
for column_name in ("word", "short", "haspunct"):
    column = fusus.columns.get_loc(column_name)
    fusus.iloc[target_row, column] += fusus.iloc[donor_row, column]
# Only the text columns of the donor row are emptied; its `haspunct` is left
# as it was.
for column_name in ("word", "short"):
    fusus.iloc[donor_row, fusus.columns.get_loc(column_name)] = ""

In [14]:
# Write to 66.csv, with the row index, whatever follows the Quran closing
# bracket in `word` for every row whose `short` has both that bracket and "[".
# Raw string: "\[" in a normal literal is an invalid escape sequence.
bracketed_rows = fusus[fusus.short.str.contains(pat=abc.QURANCLOSE)
                       & fusus.short.str.contains(pat=r'\[')]
bracketed_rows.word.str.split(abc.QURANCLOSE).str[1].to_csv("66.csv", index=True)

In [168]:
SliceAndDice(39334,1)

In [111]:
fusus.iloc[5247, fusus.columns.get_loc("word")][:2]

'٥]'

### Note that the edition gives the sura number. This has not come through in the extraction and is not available anywhere in the data.
Some minor cleaning was done: in two cases brackets had been left in, and in two cases ordinary brackets were used instead of Quran brackets.

In [52]:
98+357+10+6+6+13+14+14+17

535

### Does the word `سورة` always follow a Quran-close-bracket?
98 have it in the same row, 357 in the next row, 10 two rows down, 6 three rows down, 6 four rows down, 13 five rows down, 14 six rows down, 14 seven rows down, and 17 eight rows down. That adds up to 535, which is 15 more than the actual total of 520, so these crude counts already overlap.

In [31]:
fusus[fusus.word.str.contains(pat=abc.QURANCLOSE) & fusus.word.str.contains(pat='سورة')]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
2020,28,9,1,1,r,138.0,304.0,225.0,325.0,وَنِسَـآءً﴾.[سورةالنساء:,ونساء﴾.[سورةالنساء:,﴾.[:,,,5b,,,
2024,28,10,1,1,r,236.0,330.0,325.0,351.0,رَبَّـكُمْ﴾؛[سورةالنساء:,ربكم﴾؛[سورةالنساء:,﴾؛[:,,,5b,,,
5132,56,8,1,1,r,173.0,285.0,262.0,306.0,شَيءٌ﴾[سورةالشورى:,شيء﴾[سورةالشورى:,﴾[:,,,11b,,,
5137,57,1,1,1,r,284.0,87.0,372.0,108.0,ٱلبَصِيْرُ﴾[سورةالشورى:,ٱلبصير﴾[سورةالشورى:,﴾[:,,,11b,,,
5181,57,6,1,1,r,268.0,217.0,343.0,238.0,غَفَّارًا﴾.[سورةنوح:,غفارا﴾.[سورةنوح:,﴾.[:,,,11b,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40982,405,4,1,1,r,79.0,165.0,157.0,186.0,مَعَاذِيْـرَهُ﴾[سورة,معاذيره﴾[سورة,﴾[,,,77b,,,
41127,407,2,1,1,r,173.0,113.0,262.0,134.0,وَتَسْبِيحَهُ﴾[سورةالنور:,وتسبيحه﴾[سورةالنور:,﴾[:,,,77b,,,
41177,407,7,1,1,r,232.0,243.0,331.0,264.0,بِحَمْدِهِ﴾[سورةالإسراء:,بحمده﴾[سورةالإسراء:,﴾[:,,,78a,,,
41187,407,8,1,1,r,193.0,269.0,295.0,290.0,﴿بِحَمْدِهِ﴾[سورةالإسراء:,﴿بحمده﴾[سورةالإسراء:,﴿﴾[:,,,78a,,,


In [40]:
# Count the Quran closing brackets whose *next* row contains the word "سورة".
# NOTE(review): index labels are used as positions, which assumes the default
# 0..n-1 index; a closing bracket in the very last row would raise IndexError.
closing_rows = fusus[fusus.word.str.contains(pat=abc.QURANCLOSE)].index.tolist()
total = sum(1 for q in closing_rows if 'سورة' in fusus.iloc[q + 1].word)
print(total)

357


### Another way is to look at square brackets, which seem to be used after a Quran citation.
Here we can already find two cases in which the word `سورة` is not found.

In [145]:
# Show the row directly before each row whose `short` contains "]".
# Raw string: "\]" in a normal literal is an invalid escape sequence.
# NOTE(review): index labels are used as positions (assumes a 0..n-1 index).
fusus.iloc[fusus[fusus.short.str.contains(pat=r'\]')].index - 1]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
43,9,4,1,1,r,206,165,219,186,[وَ,[و,[,,,1b,,,0,,
58,9,5,1,1,r,104,191,124,212,[٧٢٦,[٧٢٦,[٧٢٦,,,1b,,,0,,
66,9,6,1,1,r,223,217,236,238,[وَ,[و,[,,,1b,,,0,,
279,13,5,1,1,r,157,207,196,225,الصّفحة,الصفحة,,,,2a,,,0,,
430,16,7,1,1,r,387,247,402,262,[هود:,[هود:,[:,,,2b,,,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40444,405,8,1,1,r,210,273,264,288,سورةالأحزاب:,سورةالأحزاب,,:,,77b,,,27,,
40553,407,2,1,1,r,173,113,262,134,وَتَسْبِيحَهُ﴾[سورةالنور:,وتسبيحه﴾[سورةالنور:,﴾[:,,,77b,,,27,,
40602,407,7,1,1,r,232,243,331,264,بِحَمْدِهِ﴾[سورةالإسراء:,بحمده﴾[سورةالإسراء:,﴾[:,,,78a,,,27,,
40612,407,8,1,1,r,193,269,295,290,﴿بِحَمْدِهِ﴾[سورةالإسراء:,﴿بحمده﴾[سورةالإسراء:,﴿﴾[:,,,78a,,,27,,


In [62]:
fusus.haspunct.unique().tolist()

['',
 '[',
 ']',
 '[٧٢٦',
 ':.',
 '―',
 '﴿',
 '﴾',
 '[:',
 '٣٢١]',
 '﴾،',
 '٠٣].',
 '«»،',
 '«».',
 '«»',
 '٥٧]',
 '»،',
 '﴾.[:',
 '١]',
 '﴾؛[:',
 '٠٦]',
 '١٣]',
 '؟!',
 '١٢]',
 ':٠٥]',
 '–',
 '٣٥]',
 '﴾.',
 '٢]:',
 '﴾[:',
 '١١]',
 '﴿﴾',
 '٨]،',
 '٩]،',
 '٠١]',
 ':﴿',
 '٥–٦].',
 '٠١١]',
 '٥]',
 '﴿﴾[:',
 '٧]',
 '﴾[',
 '٢١]',
 '١٢].',
 '٦١]',
 '٧].',
 '٢]',
 '٢٢]',
 '٨٠١]',
 '٥٨]',
 '٣٢]',
 '٣٣]',
 '»؟',
 '٣]',
 '٤٣]',
 '٤٢]',
 '٠٢].',
 '٥٢]',
 '٦]',
 '«»؛',
 '٩٢]',
 '٦٢]',
 ':٥٥]',
 '﴿﴾،',
 '٧٢]',
 '٨٢]',
 '١٩].',
 '﴾:[:',
 '٨٨]',
 '٧٥].',
 '٥٣]',
 '٥٣]،',
 '١]،',
 ':٥]،',
 '٨٨]،',
 '٣٢١]،',
 '٠٦].',
 '٧٥]،',
 '٠٣]،',
 '٥٧]،',
 '[]',
 '؟﴿',
 '٢٠١].',
 '٧٠١].',
 ']،',
 '٧٦]',
 '٩٤١]،',
 '؟»',
 '٢٤]',
 '٩٤١].',
 '﴾؛',
 '﴿﴾:',
 '»:',
 '٤٦١].',
 '.:',
 '٤].',
 '٢٠١]،',
 '٤٠١‐٥٠١]،',
 '٣٤].',
 '٦٠١]',
 '٨٣].',
 '٩٢].',
 ':٠٥]،',
 '٨٢–٩٢]',
 '[]]',
 '٨]',
 '٧٤].',
 '٤٥].',
 '٢٣١]،',
 '٩١]،',
 '٧٢].',
 '٧٢]،',
 '٩١]؛',
 '٦١]،',
 '٩٤١]',
 '٢١١]،',
 ':«',
 '٤]',
 '٥].',
 '٥]،',
 '٠٠١]،',
 '٠٠١]'

In [151]:
def SliceAndDice(r, l):
    """Move the first ``l`` characters of row ``r`` to the end of row ``r-1``.

    Works on the global ``fusus`` table, addressing rows by position:

    * ``word`` and ``short`` of row ``r-1`` each receive the first ``l``
      characters of the same column in row ``r``, and row ``r`` keeps the
      remainder;
    * ``haspunct`` of row ``r`` is appended as a whole to row ``r-1`` and then
      emptied.

    ``l`` counts characters in both columns alike, so the function is meant
    for prefixes that are identical in ``word`` and ``short``.

    Raises:
        ValueError: if ``r`` is 0, because row 0 has no preceding row.
    """
    if r == 0:
        # iloc[-1] would silently address the *last* row of the table.
        raise ValueError("row 0 has no preceding row to receive the prefix")

    word_col = fusus.columns.get_loc("word")
    short_col = fusus.columns.get_loc("short")
    punct_col = fusus.columns.get_loc("haspunct")

    word_text = fusus.iloc[r, word_col]
    short_text = fusus.iloc[r, short_col]

    fusus.iloc[r-1, word_col] += word_text[:l]
    # Take the prefix from `short` itself (previously it came from `word`), so
    # that exactly the characters removed from row r below are the ones added.
    fusus.iloc[r-1, short_col] += short_text[:l]
    fusus.iloc[r-1, punct_col] += fusus.iloc[r, punct_col]

    fusus.iloc[r, word_col] = word_text[l:]
    fusus.iloc[r, short_col] = short_text[l:]
    fusus.iloc[r, punct_col] = ""

In [178]:
def AddAndSpread(r):
    """Append row ``r`` to row ``r-2`` and empty rows ``r`` and ``r-1``.

    Works on the global ``fusus`` table, addressing rows by position. The
    ``word``, ``short`` and ``haspunct`` values of row ``r`` are appended to
    the same columns of row ``r-2``; afterwards those three columns are set to
    the empty string in row ``r`` and in row ``r-1``.
    """
    merged_columns = ("word", "short", "haspunct")

    # Fold row r into the row two positions above it.
    for column_name in merged_columns:
        column = fusus.columns.get_loc(column_name)
        fusus.iloc[r-2, column] += fusus.iloc[r, column]

    # Empty the donor row first, then the row in between.
    for emptied_row in (r, r-1):
        for column_name in merged_columns:
            fusus.iloc[emptied_row, fusus.columns.get_loc(column_name)] = ""

In [75]:
SliceAndDice(33082,2)

In [6]:
fusus[(fusus.quran=="")&(fusus.word.str.contains(pat=abc.QURANCLOSE))]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
2562,34,11,1,1,r,127,347,149,368,لَكُمْ﴾[سورةغافر:٠٦],لكم﴾[سورةغافر٠٦],﴾[٠٦],,,6b,,,2,,
5058,57,4,1,1,r,109,165,147,186,﴿جِهَارًا﴾[سورةنوح:٨]،,﴿جهارا﴾[سورةنوح٨]،,﴿﴾[٨]،,,,11b,,,3,,
5290,59,2,1,1,r,79,113,140,134,مِدْرَارًا﴾[سورةنوح:١١],مدرارا﴾[سورةنوح١١],﴾[١١],,,12a,,,3,,
5421,60,9,1,1,r,79,295,135,316,اللهِ﴾[سورةيوسف:٨٠١],الله﴾[سورةيوسف٨٠١],﴾[٨٠١],,,12b,,,3,,
5457,61,3,1,1,r,104,139,135,160,وَفْدًا﴾،[سورةمريم:٥٨],وفدا﴾،[سورةمريم٥٨],﴾،[٥٨],,,12b,,,3,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38873,398,10,1,1,r,183,304,215,322,الدِّيْنِ﴾،,الدين﴾،,﴾،,,,76b,,,27,,
38895,398,12,1,1,r,208,348,246,366,نَسْتَعِْيْنُ﴾،,نستعين﴾،,﴾،,,,76b,,,27,,
38926,398,15,1,1,r,237,414,259,432,لِّيْنَ﴾،,لين﴾،,﴾،,,,76b,,,27,,
38948,399,3,1,1,r,210,135,262,156,العَـٰلَمِيْنَ﴾،[سورةالفاتحة:٢،],العلمين﴾،[سورةالفاتحة٢،],﴾،[٢،],,,76b,,,27,,


In [183]:
fusus.iloc[iqtibas-2]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
13188,152,13,1,1,r,159,399,189,420,حَدِيدٌ﴾[سورةق:٢٢].,حديد﴾[سورةق٢٢].,﴾[٢٢].,,,27b,,,10,,
13271,153,7,1,1,r,276,243,318,264,الأُجَاجِ».[سورةالفرقان:٣٥؛],الأجاج[سورةالفرقان٣٥؛],[٣٥؛],».,,28a,,,10,,
13281,153,8,1,1,r,125,269,155,290,لِشَارِبِهِ»[سورةالفرقان:٣٥؛],لشاربه[سورةالفرقان٣٥؛],[٣٥؛],»,,28a,,,10,,
13563,156,1,1,1,r,284,87,325,108,ٱلفَوٰحِشَ﴾[سورةالأعراف:٣٣].,ٱلفوحش﴾[سورةالأعراف٣٣].,﴾[٣٣].,,,28b,,,10,,
20261,217,2,1,1,r,215,113,233,134,اللهِ»[سورةآلعمران:٩٤].,الله[سورةآلعمران٩٤].,[٩٤].,»,,41a,,,15,,
20286,217,5,1,1,r,239,191,284,212,وَالأَبْرَصَ[سورةالمائدة:٠١١],والأبرص[سورةالمائدة٠١١],[٠١١],,,41a,,,15,,
26972,286,13,1,1,r,204,399,239,420,وَسَلَامًا[سورةالأنبياء:٩٦],وسلاما[سورةالأنبياء٩٦],[٩٦],,,53b,,,18,,
27202,288,4,1,1,r,184,209,202,230,اللهِ[سورةالأسراء:٤٤]،,الله[سورةالأسراء٤٤]،,[٤٤]،,,,53aII,,,19,,
27823,293,12,1,1,r,266,373,304,394,المَسِّ[سورةالأنبياء:٣٨‐٤٨]،,المس[سورةالأنبياء٣٨‐٤٨]،,[٣٨‐٤٨]،,,,54b,,,19,,
31090,324,4,1,1,r,364,165,402,186,خَيرًاكَثِيرًا[سورةالبقرة:٩٦٢].,خيراكثيرا[سورةالبقرة٩٦٢].,[٩٦٢].,,,61a,,,22,,


In [179]:
# For every row position in `iqtibas` (defined outside this excerpt):
for i in iqtibas:
    # 1. move the first character of row i-1 onto the end of row i-2 ...
    SliceAndDice(i-1,1)
    # 2. ... then append row i to row i-2 and empty rows i-1 and i.
    #    The order matters: step 2 empties row i-1, so step 1 must run first.
    AddAndSpread(i)

In [131]:
# Rows whose `short` is under five characters long, contains a digit directly
# followed by "]", and also contains at least one of the letters in abc.LETTERS.
# Raw string: the "\]" in the original literal was an invalid escape sequence.
fusus[
    fusus.short.str.contains(r'\d\]', regex=True)
    & (fusus.short.str.len() < 5)
    & fusus.short.str.contains('|'.join(abc.LETTERS))
]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran


In [69]:
# For each row whose `short` is entirely "<digit>...]", report whether the
# `word` of the row before it contains "[".
# Raw strings: "\]" and "\[" in normal literals are invalid escape sequences.
# NOTE(review): index labels are used as positions (assumes a 0..n-1 index).
digit_to_bracket = fusus.short.str.fullmatch(r'\d.*\]')
fusus.iloc[fusus[digit_to_bracket].index - 1].word.str.contains(pat=r'\[').tolist()

[True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 Tr

398,10 and 330,14

In [21]:
fusus[(fusus.page==57)&(fusus.line==4)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
5049,57,4,1,1,r,381,165,385,186,لَو,لو,,,,11b,,,3,,
5050,57,4,1,1,r,362,165,381,186,أَنَّ,أن,,,,11b,,,3,,
5051,57,4,1,1,r,341,165,362,186,نُوحًا,نوحا,,,,11b,,,3,,
5052,57,4,1,1,r,313,165,341,186,جَمَعَ,جمع,,,,11b,,,3,,
5053,57,4,1,1,r,285,165,313,186,لِقَومِهِ,لقومه,,,,11b,,,3,,
5054,57,4,1,1,r,268,165,285,186,بَينَ,بين,,,,11b,,,3,,
5055,57,4,1,1,r,226,165,268,186,الدَعْوَتَينِ,الدعوتين,,,,11b,,,3,,
5056,57,4,1,1,r,188,165,226,186,لَأَجَابُوهُ؛,لأجابوه,,؛,,11b,,,3,,
5057,57,4,1,1,r,147,165,188,186,فَدَعَاهُمْ,فدعاهم,,,,11b,,,3,,
5058,57,4,1,1,r,109,165,147,186,﴿جِهَارًا﴾,﴿جهارا﴾,﴿﴾,,,11b,,,3,,71:8


In [71]:
def ShowContext(i):
    """Print the ``word`` of rows ``i-1`` to ``i+3`` of ``fusus`` on one line.

    Rows are addressed by position and the five words are separated by single
    spaces.
    """
    window = [str(fusus.iloc[position].word) for position in range(i - 1, i + 4)]
    print(" ".join(window))

In [72]:
# Walk over every row that contains a Quran closing bracket but has no value
# in the `quran` column yet, and print it with its neighbouring words.
# NOTE(review): the index labels are handed to ShowContext, which treats them
# as positions — this assumes the default 0..n-1 index.
for i in fusus[(fusus.quran=="")&(fusus.word.str.contains(abc.QURANCLOSE))].index:
    print(i)
    ShowContext(i)
    # separator line between two entries
    print(" ")

5826
فِي ٱلأَرْضِ﴾. وَإِذَا دُفِنْتَ فِيهَا،
 
5869
إِنْ تَذَرْهُمْ﴾ أَيْ: تَدَعُهُمْ وَتَـتْـرُكُهُمْ
 
5896
﴿وَلَا يَلِدُوْا﴾ أَي: مَايُنْتِجُونَ، وَلَايُظْهِرُونَ
 
6170
رَبَّكَ الأَعْلَىٰ﴾ [الأعلىٰ : ١]،
 
6241
فَجَعَلَ ﴿عَلِيٍّا﴾ نَعْتًا لِلْمَكَانِ. ﴿وَإِذْ
 
6718
ٱفْعَلْ مَاتُـؤمَرُ﴾ [الصافات: ٢٠١]. وَالوَلَدُ
 
6731
﴿بِذِبْحٍ عَظِيمٍ﴾ [الصافات: ٧٠١]. فَظَهَرَ
 
6753
مِنْهَا زَوْجَهَا﴾ [النساء: ١]، فَمَا
 
7504
أَغْرَاضَهُمْ، فَـ﴿يُكْشَفُ﴾ لَهُمْ ﴿عَنْ سَاقٍ﴾
 
7550
﴿لَوْ شَاءَ﴾؛ ﴿لَوْ﴾ حَـرْفُ ٱمْتِنَاعِ
 
7551
شَاءَ﴾؛ ﴿لَوْ﴾ حَـرْفُ ٱمْتِنَاعِ الٱمْتِنَاعِ.
 
7584
وَمَعْنَىٰ ﴿لَهَدَىٰكُمْ﴾: لَبَيَّنَ لَـكُمْ. وَمَاكُلُّ
 
8257
الٱخْتِبَارُ، ﴿ٱلمُبِينُ﴾، أَي: الظَّاهِرُ. يَعْنِي:
 
9292
فكَانَتْ ﴿راَضِيَةً﴾ بِمَا يَظْهَرُ فِيهَا
 
9300
رَبَّهَا، ﴿مَرْضِيَّةً﴾ تِلْكَ الأَفْعَالُ؛ لِأَنَّكُلَّ
 
10365
وَرَضُوا عَنْهُ﴾ هٰذَا جَـزَاءٌ بِمَا
 
10981
كَمَآ أُمِرْتَ﴾، فَشَـيَّـبَتْهُ ﴿كَمَآ أُمِرْتَ﴾
 
12725
مَغْضُوبٍ عَلَيهِمْ﴾، مِنْ هٰذَا الوَجهِ
 
13723
عَلَىٰ العَرشِ﴾ فَهٰذَا أَيض

In [398]:
fusus.iloc[3339].word.split('!')[0][:-1]

'الأَولِيَاءِ'

In [447]:
fusus[(fusus.page==391)&(fusus.line==1)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
37830,391,1,1,1,r,320,87,402,108,مُذَكَّرٌ―وَعَادَةُ,مذكر―وعادة,―,,,75a,,,27,,
37831,391,1,1,1,r,283,87,320,108,العَـرَبِ,العرب,,,,75a,,,27,,
37832,391,1,1,1,r,269,87,283,108,أَنْ,أن,,,,75a,,,27,,
37833,391,1,1,1,r,237,87,269,108,تُغَلِّبَ,تغلب,,,,75a,,,27,,
37834,391,1,1,1,r,186,87,231,108,التّذَكِيرَ,التذكير,,,,75a,,,27,,
37835,391,1,1,1,r,170,87,186,108,عَلَىٰ,على,,,,75a,,,27,,
37836,391,1,1,1,r,118,87,168,108,التَأنِيثِ,التأنيث,,,,75a,,,27,,
37837,391,1,1,1,r,83,87,118,108,فَتَقُولُ:,فتقول,,:,,75a,,,27,,


In [250]:
def ChangeQuran(r,s,a):
    """Record a Quran reference on row ``r`` and strip the inline ``[...]`` part.

    Works on the global ``fusus`` table, addressing the row by position.

    Args:
        r: position of the row to update.
        s: sura number; stored as ``str(s)``.
        a: verse number; stored as ``str(a)``.

    Effects on row ``r``:
        * ``quran`` becomes ``"<s>:<a>"``;
        * ``word`` and ``short`` are cut off at their first ``"["``;
        * ``haspunct`` becomes the opening plus closing Quran bracket when it
          started with the opening bracket, otherwise just the closing one.
    """
    quran_col = fusus.columns.get_loc("quran")
    word_col = fusus.columns.get_loc("word")
    short_col = fusus.columns.get_loc("short")
    punct_col = fusus.columns.get_loc("haspunct")

    fusus.iloc[r, quran_col] = str(s)+":"+str(a)
    fusus.iloc[r, word_col] = fusus.iloc[r, word_col].split('[')[0]
    fusus.iloc[r, short_col] = fusus.iloc[r, short_col].split('[')[0]

    # startswith() instead of haspunct[0]: indexing an empty `haspunct` raised
    # IndexError after the other three columns had already been rewritten.
    if fusus.iloc[r, punct_col].startswith(abc.QURANOPEN):
        fusus.iloc[r, punct_col] = abc.QURANOPEN+abc.QURANCLOSE
    else:
        fusus.iloc[r, punct_col] = abc.QURANCLOSE

In [399]:
# One-off repair of row 3339: keep only the text before the first "!" minus
# its last character in `word` and `short`, clear `haspunct`, and record the
# punctuation in `punctAfter` instead.
row = 3339
for column_name in ("word", "short"):
    column = fusus.columns.get_loc(column_name)
    fusus.iloc[row, column] = fusus.iloc[row, column].split('!')[0][:-1]
fusus.iloc[row, fusus.columns.get_loc("haspunct")] = ""
fusus.iloc[row, fusus.columns.get_loc("punctAfter")] = "؟!"
# fusus.iloc[13997, fusus.columns.get_loc("haspunct")] = ""
# fusus.iloc[13998, fusus.columns.get_loc("haspunct")] = ""
# fusus.iloc[13998, fusus.columns.get_loc("punctAfter")] = ":"
# fusus.iloc[13997, fusus.columns.get_loc("punctAfter")] = "؛"

In [3]:
# Rows that still lack a `quran` value although their `short` contains a
# complete "[...]" reference and a Quran closing bracket.
# Raw strings: "\[" / "\]" in normal literals are invalid escape sequences.
unannotated = fusus[(fusus.quran == "")
                    & fusus.short.str.contains(r'\[')
                    & fusus.short.str.contains(r'\]')
                    & fusus.short.str.contains(abc.QURANCLOSE)]
# Text between "[" and the first "]", with the words "سورة" and "اقتباس" and
# the Arabic comma removed.
someMore = (unannotated.short
            .str.split(r'\]').str[0]
            .str.split(r'\[').str[1]
            .str.replace('سورة', '')
            .str.replace('،', '')
            .str.replace('اقتباس', '')
            .tolist())

In [21]:
# Turn every raw reference in `someMore` into a "sura:verse" string.
contentMore = []
for reference in someMore:
    name_chars = []
    digits = ""
    for char in reference:
        if char in abc.LETTERS:
            # letters, in reading order, make up the sura name
            name_chars.append(char)
        elif char in abc.NUMBERS:
            # prepend each digit: this reverses the order in which the digits
            # appear in the text
            digits = char + digits
    # Sura name -> number; a name missing from the table raises KeyError.
    sura_number = abc.QURAN_SURAS["".join(name_chars)]
    # First run of digits as an integer; IndexError if there were no digits.
    verse_number = int(re.findall(r'\d+', digits)[0])
    contentMore.append(str(sura_number) + ":" + str(verse_number))

In [27]:
# Write each reference in `contentMore` onto the row at the matching position
# in `indexMore` (defined outside this excerpt) and strip the inline "[...]"
# part from that row.
if len(indexMore) != len(contentMore):
    # A mismatch would attach references to the wrong rows; stop before
    # touching anything.
    raise ValueError("indexMore and contentMore must have the same length")

quran_col = fusus.columns.get_loc("quran")
word_col = fusus.columns.get_loc("word")
short_col = fusus.columns.get_loc("short")
punct_col = fusus.columns.get_loc("haspunct")

for row, reference in zip(indexMore, contentMore):
    fusus.iloc[row, quran_col] = reference
    fusus.iloc[row, word_col] = fusus.iloc[row, word_col].split('[')[0]
    fusus.iloc[row, short_col] = fusus.iloc[row, short_col].split('[')[0]
    # startswith() instead of haspunct[0], which raised IndexError on an empty
    # `haspunct` after the row had already been partly rewritten.
    if fusus.iloc[row, punct_col].startswith(abc.QURANOPEN):
        fusus.iloc[row, punct_col] = abc.QURANOPEN+abc.QURANCLOSE
    else:
        fusus.iloc[row, punct_col] = abc.QURANCLOSE

In [31]:
fusus.to_csv('fusus.csv',index=False)

In [45]:
from pandasgui import show

In [452]:
fusus.iloc[413].word[1:]

'كَمَا'

In [460]:
fusus[fusus.haspunct.str.contains("﴾")]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
22498,245,17,1,1,r,320,503,373,524,قَولُهُ:﴿مُلكاً﴾,قوله:﴿ملكا﴾,:﴿﴾,,,46a,,,16,,4:54


In [445]:
# One-off repair: drop the last character of `word` and `short` in row 13424,
# clear that row's `haspunct`, and append a "―" to `punctAfter` of row 13425.
trimmed_row, dash_row = 13424, 13425
for column_name in ("word", "short"):
    column = fusus.columns.get_loc(column_name)
    fusus.iloc[trimmed_row, column] = fusus.iloc[trimmed_row, column][:-1]
fusus.iloc[trimmed_row, fusus.columns.get_loc("haspunct")] = ""
fusus.iloc[dash_row, fusus.columns.get_loc("punctAfter")] += "―"
# fusus.iloc[25919, fusus.columns.get_loc("word")] = ""
# fusus.iloc[25920, fusus.columns.get_loc("punctAfter")] += "―"
# fusus.iloc[28041, fusus.columns.get_loc("word")] = ""
# fusus.iloc[28042, fusus.columns.get_loc("punctAfter")] += "―"
# fusus.iloc[28046, fusus.columns.get_loc("word")] = ""
# fusus.iloc[28047, fusus.columns.get_loc("punctAfter")] += "―"

# fusus.iloc[28059, fusus.columns.get_loc("word")] = ""
# fusus.iloc[28060, fusus.columns.get_loc("punctAfter")] += "―"
# fusus.iloc[28063, fusus.columns.get_loc("word")] = ""
# fusus.iloc[28064, fusus.columns.get_loc("punctAfter")] += "―"

In [203]:
fusus.iloc[37766].short[:-1]

'النساء'

In [454]:
fusus.haspunct.unique().tolist()

['', ':﴿', '؟﴿', ':﴿﴾', '―']