# Annotating the Fusus for Quran citations
All citations from the Quran seem to be marked accurately by the edition. The task is thus to extract that information from the `word` column. This is not easy, since no single pattern picks up all Quran citations. Let's explore.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import arabicABC as abc

In [46]:
# Load the word-level table with explicit column types ("Int64" is pandas'
# nullable integer dtype).
column_types = {
    "page": int, "line": int, "column": int, "span": int, "direction": str,
    "left": "Int64", "top": "Int64", "right": "Int64", "bottom": "Int64",
    "word": str, "short": str, "haspunct": str, "punctAfter": str, "punctBefore": str,
    "QunawiMS": str, "poetryMeter": str, "poetryVerse": "Int64", "fass": "Int64",
    "lwcvl": str, "quran": str,
}
fusus = pd.read_csv('fusus.csv', dtype=column_types)

# Replace missing values in these text columns with empty strings
# (direction and QunawiMS are left as loaded).
for text_column in ("word", "short", "haspunct", "punctAfter", "punctBefore",
                    "poetryMeter", "lwcvl", "quran"):
    fusus[text_column] = fusus[text_column].fillna('')

In [3]:
# Show the first five rows of the loaded table.
fusus.head()

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
0,8,2,1,1,r,356,197,384,218,الحَمْدُ,الحمد,,,,1b,,,0,,
1,8,2,1,1,r,341,197,356,218,لِلهِ,لله,,,,1b,,,0,,
2,8,2,1,1,r,312,197,341,218,مُـنَـزِّلِ,منزل,,,,1b,,,0,,
3,8,2,1,1,r,274,197,312,218,الحِكَمِ,الحكم,,,,1b,,,0,,
4,8,2,1,1,r,260,197,274,218,عَلَىٰ,على,,,,1b,,,0,,


### Different elements of a Quran citation

In [50]:
# Count the rows that contain each possible marker of a Quran citation:
# the Quran brackets, square brackets (in `word` and in `short`) and the sura keyword.
# The square-bracket patterns are raw strings, because "\[" and "\]" in a normal
# string literal are invalid escape sequences.
print(fusus[fusus.word.str.contains(pat=abc.QURANCLOSE)].shape)
print(fusus[fusus.word.str.contains(pat=abc.QURANOPEN)].shape)
print(fusus[fusus.word.str.contains(pat=r"\[")].shape)
print(fusus[fusus.word.str.contains(pat=r"\]")].shape)
print(fusus[fusus.short.str.contains(pat=r"\[")].shape)
print(fusus[fusus.short.str.contains(pat=r"\]")].shape)
print(fusus[fusus.word.str.contains(pat="سورة")].shape)

(520, 20)
(520, 20)
(582, 20)
(581, 20)
(505, 20)
(504, 20)
(485, 20)


### Generally you would think that ﴾﴿ indicates a Quran citation.  ﴿ gives 520 and ﴾ gives 520. So there seem to be 520 Quran citations.
But note that the chapter titles were also adorned with these brackets (which have since been cleaned).

In [4]:
# Show every row whose word contains the opening Quran bracket.
fusus.loc[fusus["word"].str.contains(abc.QURANOPEN)]

page                       28
line                        8
column                      1
span                        1
direction                   r
left                      307
top                       278
right                     345
bottom                    299
word           ﴿يٰـٓـأَيُّهَا
short                  ﴿يأيها
haspunct                    ﴿
punctAfter                   
punctBefore                  
QunawiMS                   5b
poetryMeter                  
poetryVerse              <NA>
fass                        1
lwcvl                        
quran                        
Name: 2003, dtype: object

In [11]:
# Show every row whose word contains the closing Quran bracket.
fusus.loc[fusus["word"].str.contains(abc.QURANCLOSE)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
432,16,6,1,1,r,79.0,217.0,133.0,238.0,الأَمْـرُكُلُّهُ﴾,الأمركله﴾,﴾,,,2b,,,
819,20,6,1,1,r,192.0,217.0,219.0,238.0,فِيهَا﴾،,فيها﴾،,﴾،,,,3a,,,
1768,26,11,1,1,r,273.0,347.0,308.0,368.0,بِيَدَىَّ﴾،,بيدى﴾،,﴾،,,,5a,,,
2020,28,9,1,1,r,138.0,304.0,225.0,325.0,وَنِسَـآءً﴾.[سورةالنساء:,ونساء﴾.[سورةالنساء:,﴾.[:,,,5b,,,
2024,28,10,1,1,r,236.0,330.0,325.0,351.0,رَبَّـكُمْ﴾؛[سورةالنساء:,ربكم﴾؛[سورةالنساء:,﴾؛[:,,,5b,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40982,405,4,1,1,r,79.0,165.0,157.0,186.0,مَعَاذِيْـرَهُ﴾[سورة,معاذيره﴾[سورة,﴾[,,,77b,,,
41127,407,2,1,1,r,173.0,113.0,262.0,134.0,وَتَسْبِيحَهُ﴾[سورةالنور:,وتسبيحه﴾[سورةالنور:,﴾[:,,,77b,,,
41177,407,7,1,1,r,232.0,243.0,331.0,264.0,بِحَمْدِهِ﴾[سورةالإسراء:,بحمده﴾[سورةالإسراء:,﴾[:,,,78a,,,
41187,407,8,1,1,r,193.0,269.0,295.0,290.0,﴿بِحَمْدِهِ﴾[سورةالإسراء:,﴿بحمده﴾[سورةالإسراء:,﴿﴾[:,,,78a,,,


In [22]:
# First page value of every fass group, ordered by fass.
chapterPages = fusus.groupby('fass')['page'].first().tolist()

In [113]:
# Blank the word and its short form for the rows at positions 40952..41118.
word_col = fusus.columns.get_loc("word")
short_col = fusus.columns.get_loc("short")
for i in range(40952, 41119):
    fusus.iloc[i, word_col] = ""
    fusus.iloc[i, short_col] = ""

In [82]:
# Show all words on page 58, line 3.
fusus.loc[fusus.page.eq(58) & fusus.line.eq(3)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
5217,58,3,1,1,r,366,139,402,160,أُخْرِجَتْ,أخرجت,,,,12a,,,3,,
5218,58,3,1,1,r,336,139,366,160,لِلنَّاسِ﴾,للناس﴾,﴾,,,12a,,,3,,
5219,58,3,1,1,r,285,141,336,159,[سورةالبقرة:,[سورةالبقرة:,[:,,,12a,,,3,,
5220,58,3,1,1,r,255,143,270,158,٠١١],٠١١],٠١١],,,12a,,,3,,
5221,58,3,1,1,r,207,139,255,160,فَــ﴿لَـيْسَ,ف﴿ليس,﴿,,,12a,,,3,,
5222,58,3,1,1,r,181,139,207,160,كَمِثْلِهِ,كمثله,,,,12a,,,3,,
5223,58,3,1,1,r,100,139,181,160,شَيءٌ﴾[سورةالشورىٰ:,شيء﴾[سورةالشورى:,﴾[:,,,12a,,,3,,


In [85]:
# Look up the rows whose short form equals this exact word.
fusus.loc[fusus["short"].eq("فحققت")]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
92,10,2,1,1,r,348,113,384,134,فَحَقَّقْتُ,فحققت,,,,1b,,,0,,


In [98]:

verse_col = fusus.columns.get_loc("poetryVerse")
meter_col = fusus.columns.get_loc("poetryMeter")
word_col = fusus.columns.get_loc("word")
short_col = fusus.columns.get_loc("short")

# Row 39000 becomes verse 1 and takes the word of row 38999 as its meter.
# This must happen before row 38999 is blanked below.
fusus.iloc[39000, verse_col] = 1
fusus.iloc[39000, meter_col] = fusus.iloc[38999, word_col]

# Row 38999 is then emptied.
fusus.iloc[38999, word_col] = ""
fusus.iloc[38999, short_col] = ""

In [61]:
# Write the table back to fusus.csv (the same file it was loaded from), without the index column.
fusus.to_csv('fusus.csv',index=False)

### Note that the edition gives the sura number. This did not come through in the extraction and is not available in the data at all.
Some minor cleaning was done: in two cases brackets were left in, and in two cases normal brackets were used instead of Quran-brackets.

In [52]:
# Add up the per-offset counts (same row, next row, ..., eight rows down).
sum((98, 357, 10, 6, 6, 13, 14, 14, 17))

535

### Does the word `سورة` always follow a Quran-close-bracket?
98 have it in the same row, 357 in the next row, 10 two rows down, 6 three rows down, 6 four rows down, 13 five rows down, 14 six rows down, 14 seven rows down, and 17 eight rows down. That adds up to 535, which is 15 more than the actual total, so this crude count already contains some overlap.

In [31]:
# Rows in which the closing Quran bracket and the sura keyword occur in the same word.
closes_citation = fusus.word.str.contains(pat=abc.QURANCLOSE)
names_sura = fusus.word.str.contains(pat='سورة')
fusus[closes_citation & names_sura]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
2020,28,9,1,1,r,138.0,304.0,225.0,325.0,وَنِسَـآءً﴾.[سورةالنساء:,ونساء﴾.[سورةالنساء:,﴾.[:,,,5b,,,
2024,28,10,1,1,r,236.0,330.0,325.0,351.0,رَبَّـكُمْ﴾؛[سورةالنساء:,ربكم﴾؛[سورةالنساء:,﴾؛[:,,,5b,,,
5132,56,8,1,1,r,173.0,285.0,262.0,306.0,شَيءٌ﴾[سورةالشورى:,شيء﴾[سورةالشورى:,﴾[:,,,11b,,,
5137,57,1,1,1,r,284.0,87.0,372.0,108.0,ٱلبَصِيْرُ﴾[سورةالشورى:,ٱلبصير﴾[سورةالشورى:,﴾[:,,,11b,,,
5181,57,6,1,1,r,268.0,217.0,343.0,238.0,غَفَّارًا﴾.[سورةنوح:,غفارا﴾.[سورةنوح:,﴾.[:,,,11b,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40982,405,4,1,1,r,79.0,165.0,157.0,186.0,مَعَاذِيْـرَهُ﴾[سورة,معاذيره﴾[سورة,﴾[,,,77b,,,
41127,407,2,1,1,r,173.0,113.0,262.0,134.0,وَتَسْبِيحَهُ﴾[سورةالنور:,وتسبيحه﴾[سورةالنور:,﴾[:,,,77b,,,
41177,407,7,1,1,r,232.0,243.0,331.0,264.0,بِحَمْدِهِ﴾[سورةالإسراء:,بحمده﴾[سورةالإسراء:,﴾[:,,,78a,,,
41187,407,8,1,1,r,193.0,269.0,295.0,290.0,﴿بِحَمْدِهِ﴾[سورةالإسراء:,﴿بحمده﴾[سورةالإسراء:,﴿﴾[:,,,78a,,,


In [40]:
# Count the closing Quran brackets whose immediately following row contains the sura keyword.
# shift(-1) lines every row up with the word of the row physically after it, so the
# count does not depend on the index labels matching row positions, and the last
# row (which has no successor) is compared against an empty string instead of
# raising an IndexError.
closes_citation = fusus.word.str.contains(pat=abc.QURANCLOSE)
next_word = fusus.word.shift(-1, fill_value="")
total = int((closes_citation & next_word.str.contains('سورة', regex=False)).sum())
print(total)

357


### Another way is to look at square brackets, which seem to be used after a Quran citation.
Here we can already find two cases in which the word `سورة` is not found.

In [5]:
# Rows with a closing Quran bracket and an opening square bracket but without the sura keyword.
# The bracket pattern is a raw string: '\[' in a normal literal is an invalid escape sequence.
fusus[fusus.word.str.contains(pat=abc.QURANCLOSE) & fusus.word.str.contains(pat=r'\[') & ~fusus.word.str.contains(pat='سورة')]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
39721,396,11,1,1,r,227,347,303,368,أَنْفُسَهُمْ﴾[الأنعام:,أنفسهم﴾[الأنعام:,﴾[:,,,76a,,,,,


In [62]:
# List the distinct haspunct values in order of first appearance.
fusus["haspunct"].drop_duplicates().tolist()

['',
 '[',
 ']',
 '[٧٢٦',
 ':.',
 '―',
 '﴿',
 '﴾',
 '[:',
 '٣٢١]',
 '﴾،',
 '٠٣].',
 '«»،',
 '«».',
 '«»',
 '٥٧]',
 '»،',
 '﴾.[:',
 '١]',
 '﴾؛[:',
 '٠٦]',
 '١٣]',
 '؟!',
 '١٢]',
 ':٠٥]',
 '–',
 '٣٥]',
 '﴾.',
 '٢]:',
 '﴾[:',
 '١١]',
 '﴿﴾',
 '٨]،',
 '٩]،',
 '٠١]',
 ':﴿',
 '٥–٦].',
 '٠١١]',
 '٥]',
 '﴿﴾[:',
 '٧]',
 '﴾[',
 '٢١]',
 '١٢].',
 '٦١]',
 '٧].',
 '٢]',
 '٢٢]',
 '٨٠١]',
 '٥٨]',
 '٣٢]',
 '٣٣]',
 '»؟',
 '٣]',
 '٤٣]',
 '٤٢]',
 '٠٢].',
 '٥٢]',
 '٦]',
 '«»؛',
 '٩٢]',
 '٦٢]',
 ':٥٥]',
 '﴿﴾،',
 '٧٢]',
 '٨٢]',
 '١٩].',
 '﴾:[:',
 '٨٨]',
 '٧٥].',
 '٥٣]',
 '٥٣]،',
 '١]،',
 ':٥]،',
 '٨٨]،',
 '٣٢١]،',
 '٠٦].',
 '٧٥]،',
 '٠٣]،',
 '٥٧]،',
 '[]',
 '؟﴿',
 '٢٠١].',
 '٧٠١].',
 ']،',
 '٧٦]',
 '٩٤١]،',
 '؟»',
 '٢٤]',
 '٩٤١].',
 '﴾؛',
 '﴿﴾:',
 '»:',
 '٤٦١].',
 '.:',
 '٤].',
 '٢٠١]،',
 '٤٠١‐٥٠١]،',
 '٣٤].',
 '٦٠١]',
 '٨٣].',
 '٩٢].',
 ':٠٥]،',
 '٨٢–٩٢]',
 '[]]',
 '٨]',
 '٧٤].',
 '٤٥].',
 '٢٣١]،',
 '٩١]،',
 '٧٢].',
 '٧٢]،',
 '٩١]؛',
 '٦١]،',
 '٩٤١]',
 '٢١١]،',
 ':«',
 '٤]',
 '٥].',
 '٥]،',
 '٠٠١]،',
 '٠٠١]'

In [124]:
# Rows whose short form contains this word directly followed by a closing square bracket.
fusus.loc[fusus["short"].str.contains("آله]")]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
44,9,4,1,1,r,191,165,206,186,آلِهِ],آله],],,,1b,,,0,,
67,9,6,1,1,r,208,217,223,238,آلِهِ],آله],],,,1b,,,0,,


In [106]:
# Short form stored in the row at position 6788.
fusus.iat[6788, fusus.columns.get_loc("short")]

'ما'

In [56]:
# Blank word, short form and punctuation for every row on page 31 whose short form
# contains a closing square bracket.
# The rows are selected by label with .loc: feeding index labels to .iloc only
# hits the right rows while the index is an unbroken 0..n-1 range.
# The pattern is a raw string because "\]" is an invalid escape sequence otherwise.
page31_brackets = fusus.short.str.contains(pat=r"\]") & (fusus.page == 31)
fusus.loc[page31_brackets, ["word", "short", "haspunct"]] = ""


In [51]:
# Display all fields of the row at position 2295.
fusus.iloc[2295]

page             31
line              5
column            1
span              1
direction         r
left            358
top             211
right           374
bottom          232
word              ]
short             ]
haspunct           
punctAfter         
punctBefore        
QunawiMS         6a
poetryMeter        
poetryVerse    <NA>
fass           <NA>
lwcvl              
quran              
Name: 2295, dtype: object

In [44]:
# Remove the rows whose word is empty.
# The replacement is assigned back to the column: replace(..., inplace=True) on
# fusus['word'] is a chained in-place call on an intermediate Series and does not
# update the frame under pandas' Copy-on-Write behaviour.
fusus['word'] = fusus['word'].replace("", np.nan)
fusus.dropna(subset=['word'], inplace=True)
# Show what remains on page 410.
fusus[(fusus.page==410)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran


In [45]:
# Write the table back to fusus.csv (the same file it was loaded from), without the index column.
fusus.to_csv('fusus.csv',index=False)

In [78]:
# Short forms of fewer than five characters that contain a digit followed by "]"
# together with any of abc.LETTERS.
# The pattern is a raw string: '\\d\]' relied on "\]" being an invalid (and
# therefore preserved) escape sequence.
has_verse_end = fusus.short.str.contains(r'\d\]', regex=True)
is_short_token = fusus.short.str.len() < 5
has_letters = fusus.short.str.contains('|'.join(abc.LETTERS))
fusus[has_verse_end & is_short_token & has_letters]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
5257,58,7,1,1,r,197,243,226,264,٥]مِنْ,٥]من,٥],,,12a,,,3,,
9912,116,10,1,1,r,205,330,225,351,٨]أَن,٨]أن,٨],,,21a,,,7,,
12672,144,11,1,1,r,199,347,223,368,١]مِنْ,١]من,١],,,26b,,,9,,
12678,144,12,1,1,r,302,373,328,394,٢]مِنْ,٢]من,٢],,,26b,,,9,,
33567,340,9,1,1,r,357,295,385,316,٣]مَعَ,٣]مع,٣],,,64b,,,24,,


In [69]:
# For every row whose short form is "<digit> ... ]", report whether the row directly
# before it contains an opening square bracket.
# shift(1) pairs each row with its physical predecessor, so the check does not rely
# on index labels being row positions, and a match on the very first row is compared
# against an empty string instead of wrapping around to the last row (index -1).
# The patterns are raw strings because "\]" and "\[" are invalid escape sequences otherwise.
is_verse_ref = fusus.short.str.fullmatch(r'\d.*\]')
prev_has_open = fusus.word.shift(1, fill_value="").str.contains(pat=r'\[')
prev_has_open[is_verse_ref].tolist()

[True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 Tr