# Annotating the Fusus for Quran citations
All citations from the Quran seem to be cited accurately by the edition. The task is thus to extract that information from the `word` column. This is not easy, since no single pattern picks up all Quran citations. Let's explore.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import arabicABC as abc

In [2]:
# Read the word-level table and replace every missing value with an empty string.
fusus = pd.read_csv('fusus.csv').fillna('')

In [3]:
# (number of rows, number of columns) of the loaded table.
fusus.shape

(41532, 18)

### It may be noted: there is currently no `Quran` column. We will make it when we can actually extract the information.

In [6]:
# Peek at the first five rows.
fusus.iloc[:5]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
0,8,2,1,1,r,356.0,197.0,384.0,218.0,الحَمْدُ,الحمد,,,,1b,,,
1,8,2,1,1,r,341.0,197.0,356.0,218.0,لِلهِ,لله,,,,1b,,,
2,8,2,1,1,r,312.0,197.0,341.0,218.0,مُـنَـزِّلِ,منزل,,,,1b,,,
3,8,2,1,1,r,274.0,197.0,312.0,218.0,الحِكَمِ,الحكم,,,,1b,,,
4,8,2,1,1,r,260.0,197.0,274.0,218.0,عَلَىٰ,على,,,,1b,,,


### Generally you would think that ﴾﴿ indicates a Quran citation.  ﴿ gives 520 and ﴾ gives 520. So there seem to be 520 Quran citations.
But note that the chapter titles were also adorned with these brackets (which have since been cleaned).

In [5]:
# Rows whose `word` contains the opening Quran bracket.
fusus.loc[fusus["word"].str.contains(abc.QURANOPEN)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
430,16,6,1,1,r,163.0,217.0,210.0,238.0,﴿وَإِلَيهِ,﴿وإليه,﴿,,,2b,,,
815,20,6,1,1,r,284.0,217.0,328.0,238.0,﴿أَتَجْعَلُ,﴿أتجعل,﴿,,,3a,,,
1762,26,10,1,1,r,110.0,321.0,131.0,342.0,﴿مَا,﴿ما,﴿,,,5a,,,
2005,28,8,1,1,r,307.0,278.0,345.0,299.0,﴿يٰـٓـأَيُّهَا,﴿يأيها,﴿,,,5b,,,
2023,28,10,1,1,r,325.0,330.0,359.0,351.0,﴿ٱتَّقُوا,﴿ٱتقوا,﴿,,,5b,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40976,405,4,1,1,r,277.0,165.0,320.0,186.0,﴿الإِنْسٰنُ,﴿الإنسن,﴿,,,77b,,,
41123,407,2,1,1,r,329.0,113.0,360.0,134.0,﴿كُلٌّ,﴿كل,﴿,,,77b,,,
41172,407,6,1,1,r,96.0,217.0,127.0,238.0,﴿وَإِنْ,﴿وإن,﴿,,,78a,,,
41187,407,8,1,1,r,193.0,269.0,295.0,290.0,﴿بِحَمْدِهِ﴾[سورةالإسراء:,﴿بحمده﴾[سورةالإسراء:,﴿﴾[:,,,78a,,,


In [11]:
# Rows whose `word` contains the closing Quran bracket.
fusus.loc[fusus["word"].str.contains(abc.QURANCLOSE)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
432,16,6,1,1,r,79.0,217.0,133.0,238.0,الأَمْـرُكُلُّهُ﴾,الأمركله﴾,﴾,,,2b,,,
819,20,6,1,1,r,192.0,217.0,219.0,238.0,فِيهَا﴾،,فيها﴾،,﴾،,,,3a,,,
1768,26,11,1,1,r,273.0,347.0,308.0,368.0,بِيَدَىَّ﴾،,بيدى﴾،,﴾،,,,5a,,,
2020,28,9,1,1,r,138.0,304.0,225.0,325.0,وَنِسَـآءً﴾.[سورةالنساء:,ونساء﴾.[سورةالنساء:,﴾.[:,,,5b,,,
2024,28,10,1,1,r,236.0,330.0,325.0,351.0,رَبَّـكُمْ﴾؛[سورةالنساء:,ربكم﴾؛[سورةالنساء:,﴾؛[:,,,5b,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40982,405,4,1,1,r,79.0,165.0,157.0,186.0,مَعَاذِيْـرَهُ﴾[سورة,معاذيره﴾[سورة,﴾[,,,77b,,,
41127,407,2,1,1,r,173.0,113.0,262.0,134.0,وَتَسْبِيحَهُ﴾[سورةالنور:,وتسبيحه﴾[سورةالنور:,﴾[:,,,77b,,,
41177,407,7,1,1,r,232.0,243.0,331.0,264.0,بِحَمْدِهِ﴾[سورةالإسراء:,بحمده﴾[سورةالإسراء:,﴾[:,,,78a,,,
41187,407,8,1,1,r,193.0,269.0,295.0,290.0,﴿بِحَمْدِهِ﴾[سورةالإسراء:,﴿بحمده﴾[سورةالإسراء:,﴿﴾[:,,,78a,,,


In [26]:
# Rows with a non-empty `fass` value.
fusus.loc[fusus["fass"].ne("")]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
283,14,1,1,1,r,299.0,134.0,357.0,162.0,فَصُّ,فص,,,,2a,,,1.0
284,14,1,1,1,r,259.0,134.0,299.0,162.0,حِكْمَةٍ,حكمة,,,,2a,,,1.0
285,14,1,1,1,r,230.0,134.0,259.0,162.0,إِلٰهِيَّةٍ,إلهية,,,,2a,,,1.0
286,14,1,1,1,r,196.0,134.0,213.0,162.0,فِي,في,,,,2a,,,1.0
287,14,1,1,1,r,157.0,134.0,196.0,162.0,كَلِمَةٍ,كلمة,,,,2a,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38385,377,1,1,1,r,268.0,108.0,308.0,136.0,حِكْمَةٍ,حكمة,,,,72b,,,27.0
38386,377,1,1,1,r,232.0,108.0,268.0,136.0,فَـرْدِيَّةٍ,فردية,,,,72b,,,27.0
38387,377,1,1,1,r,204.0,108.0,220.0,136.0,فِي,في,,,,72b,,,27.0
38388,377,1,1,1,r,165.0,108.0,204.0,136.0,كَلِمَةٍ,كلمة,,,,72b,,,27.0


### Note that the edition gives the sura number. It did not come through in the extraction, so it is not available in the data at all.
Some minor cleaning was done: in two cases brackets were left in, and in two cases normal brackets were used instead of Quran-brackets.

In [52]:
# Total of the per-offset counts (same row, next row, ... eight rows down).
sum((98, 357, 10, 6, 6, 13, 14, 14, 17))

535

### Does the word `سورة` always follow a Quran-close-bracket?
98 have it in the same row, 357 in the next row, 10 two rows down, 6 three rows down, 6 four rows down, 13 five rows down, 14 six rows down, 14 seven rows down, and 17 eight rows down. That adds up to 535, which is 15 more than the actual total of 520 citations, so these counts already overlap somewhat.

In [31]:
# Rows where the closing Quran bracket and the word سورة share one cell.
fusus.loc[
    fusus["word"].str.contains(abc.QURANCLOSE)
    & fusus["word"].str.contains("سورة")
]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
2020,28,9,1,1,r,138.0,304.0,225.0,325.0,وَنِسَـآءً﴾.[سورةالنساء:,ونساء﴾.[سورةالنساء:,﴾.[:,,,5b,,,
2024,28,10,1,1,r,236.0,330.0,325.0,351.0,رَبَّـكُمْ﴾؛[سورةالنساء:,ربكم﴾؛[سورةالنساء:,﴾؛[:,,,5b,,,
5132,56,8,1,1,r,173.0,285.0,262.0,306.0,شَيءٌ﴾[سورةالشورى:,شيء﴾[سورةالشورى:,﴾[:,,,11b,,,
5137,57,1,1,1,r,284.0,87.0,372.0,108.0,ٱلبَصِيْرُ﴾[سورةالشورى:,ٱلبصير﴾[سورةالشورى:,﴾[:,,,11b,,,
5181,57,6,1,1,r,268.0,217.0,343.0,238.0,غَفَّارًا﴾.[سورةنوح:,غفارا﴾.[سورةنوح:,﴾.[:,,,11b,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40982,405,4,1,1,r,79.0,165.0,157.0,186.0,مَعَاذِيْـرَهُ﴾[سورة,معاذيره﴾[سورة,﴾[,,,77b,,,
41127,407,2,1,1,r,173.0,113.0,262.0,134.0,وَتَسْبِيحَهُ﴾[سورةالنور:,وتسبيحه﴾[سورةالنور:,﴾[:,,,77b,,,
41177,407,7,1,1,r,232.0,243.0,331.0,264.0,بِحَمْدِهِ﴾[سورةالإسراء:,بحمده﴾[سورةالإسراء:,﴾[:,,,78a,,,
41187,407,8,1,1,r,193.0,269.0,295.0,290.0,﴿بِحَمْدِهِ﴾[سورةالإسراء:,﴿بحمده﴾[سورةالإسراء:,﴿﴾[:,,,78a,,,


In [40]:
# Count the rows holding a closing Quran bracket whose *following* row contains سورة.
# shift(-1) pairs every row with its successor by position, so the result does not
# depend on the index labels; the last row has no successor and is compared against
# '' instead of raising an IndexError.
next_word = fusus["word"].shift(-1, fill_value="")
has_close = fusus["word"].str.contains(abc.QURANCLOSE)
# regex=False: plain substring test, like the `in` operator.
next_has_sura = next_word.str.contains("سورة", regex=False)
total = int((has_close & next_has_sura).sum())
print(total)

357


### Another way is to look at square brackets, which seem to be used after a Quran citation.
Here we can already find two cases in which the word `سورة` is not found.

In [55]:
# Rows that close a Quran citation and open a square bracket but lack the word سورة.
# '[' is matched literally (regex=False); the earlier pattern '\[' was written in a
# non-raw string and relied on an invalid escape sequence, which newer Python
# versions warn about.
fusus[
    fusus.word.str.contains(pat=abc.QURANCLOSE)
    & fusus.word.str.contains(pat='[', regex=False)
    & ~fusus.word.str.contains(pat='سورة')
]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
14569,162,7,1,1,r,345.0,243.0,379.0,264.0,ـٰبِ﴾[س,ب﴾[س,﴾[,,,29b,,,
40108,396,11,1,1,r,227.0,347.0,303.0,368.0,أَنْفُسَهُمْ﴾[الأنعام:,أنفسهم﴾[الأنعام:,﴾[:,,,76a,,,


In one case, the line was not extracted correctly.

In [56]:
# All rows of page 162, line 7.
fusus.loc[fusus["page"].eq(162) & fusus["line"].eq(7)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
14567,162,7,1,1,r,386.0,243.0,402.0,264.0,الأَلْ,الأل,,,,29b,,,
14568,162,7,1,1,r,381.0,243.0,385.0,264.0,بَ,ب,,,,29b,,,
14569,162,7,1,1,r,345.0,243.0,379.0,264.0,ـٰبِ﴾[س,ب﴾[س,﴾[,,,29b,,,
14570,162,7,1,1,r,325.0,247.0,344.0,262.0,ورةال,ورةال,,,,29b,,,
14571,162,7,1,1,r,316.0,247.0,324.0,262.0,زُّمَ,زم,,,,29b,,,
14572,162,7,1,1,r,279.0,247.0,314.0,262.0,ر:٩]،,ر:٩]،,:٩]،,,,29b,,,
14573,162,7,1,1,r,260.0,243.0,279.0,264.0,وَهُ,وه,,,,29b,,,
14574,162,7,1,1,r,252.0,243.0,259.0,264.0,مْ,م,,,,29b,,,
14575,162,7,1,1,r,241.0,243.0,252.0,264.0,ال,ال,,,,29b,,,
14576,162,7,1,1,r,236.0,243.0,240.0,264.0,نَّ,ن,,,,29b,,,


### This actually happened in more cases. It seems we first need to fix this.

In [23]:
# Rounded mean length of the `short` forms on page 162, line 7.
round(
    fusus.loc[fusus["page"].eq(162) & fusus["line"].eq(7), "short"]
    .str.len()
    .mean()
)

2

In [12]:
# Length of each `short` form on page 162, line 8, for comparison.
fusus.loc[fusus["page"].eq(162) & fusus["line"].eq(8), "short"].str.len()

14589    7
14590    2
14591    5
14592    3
14593    3
14594    4
14595    4
14596    4
14597    2
14598    5
14599    4
14600    4
Name: short, dtype: int64

In [77]:
# Summary statistics of the `short` lengths on page 162, line 7.
(
    fusus.loc[fusus["page"].eq(162) & fusus["line"].eq(7), "short"]
    .str.len()
    .describe()
)

count    22.000000
mean      2.136364
std       1.283427
min       1.000000
25%       1.000000
50%       2.000000
75%       2.000000
max       5.000000
Name: short, dtype: float64

In [17]:
# Summary statistics of the `short` lengths over the whole table.
fusus["short"].str.len().describe()

count    41532.000000
mean         4.256814
std          1.948673
min          0.000000
25%          3.000000
50%          4.000000
75%          5.000000
max         23.000000
Name: short, dtype: float64

In [58]:
# Subset of rows whose `QunawiMS` value is "3b".
testdf = fusus.loc[fusus["QunawiMS"].eq("3b")]

In [86]:
# Report every (page, line) whose non-empty `short` forms average fewer than three
# characters and include at least one single-character form.
for name, group in fusus.groupby(['page', 'line']):
    lengtes = [n for n in group["short"].str.len().tolist() if n != 0]
    if not lengtes:
        continue
    gemiddelde = np.mean(lengtes)
    if gemiddelde < 3 and np.min(lengtes) == 1:
        print(name)
        print(gemiddelde)
        print(" ")

(35, 9)
2.0869565217391304
 
(61, 6)
1.875
 
(77, 6)
1.6333333333333333
 
(82, 2)
1.6538461538461537
 
(151, 8)
1.68
 
(154, 3)
1.3571428571428572
 
(162, 7)
2.1363636363636362
 
(186, 6)
1.4230769230769231
 
(237, 11)
1.76
 
(245, 8)
2.3529411764705883
 
(266, 2)
1.1818181818181819
 
(268, 9)
1.64
 
(290, 10)
1.7083333333333333
 
(300, 6)
1.3571428571428572
 
(307, 10)
1.4615384615384615
 
(314, 4)
2.933333333333333
 
(320, 6)
2.0
 
(324, 8)
1.7083333333333333
 
(331, 9)
2.6666666666666665
 
(372, 4)
2.45
 
(381, 4)
1.56
 
(382, 6)
1.5357142857142858
 


In [93]:
# All rows of page 21, line 1.
fusus.loc[fusus["page"].eq(21) & fusus["line"].eq(1)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
911,21,1,1,1,r,393.0,87.0,402.0,108.0,فَلَا,فلا,,,,3b,,,
912,21,1,1,1,r,360.0,87.0,393.0,108.0,نَدَّعِيَ,ندعي,,,,3b,,,
913,21,1,1,1,r,360.0,87.0,360.0,108.0,―,―,―,,,3b,,,
914,21,1,1,1,r,333.0,87.0,356.0,108.0,مَا,ما,,,,3b,,,
915,21,1,1,1,r,317.0,87.0,333.0,108.0,أَنَا,أنا,,,,3b,,,
916,21,1,1,1,r,284.0,87.0,320.0,108.0,مُـحَقَّقٌ,محقق,,,,3b,,,
917,21,1,1,1,r,272.0,87.0,284.0,108.0,بِهِ,به,,,,3b,,,
918,21,1,1,1,r,246.0,87.0,272.0,108.0,وَحَاوٍ,وحاو,,,,3b,,,
919,21,1,1,1,r,225.0,87.0,246.0,108.0,عَلَيهِ,عليه,,,,3b,,,
920,21,1,1,1,r,225.0,87.0,225.0,108.0,―,―,―,,,3b,,,


In [94]:
# Report every (page, line) that has more than three single-character `short` forms,
# together with the mean length of its non-empty forms.
for name, group in fusus.groupby(['page', 'line']):
    lengtes = [n for n in group["short"].str.len().tolist() if n != 0]
    if not lengtes:
        continue
    if lengtes.count(1) > 3:
        print(name)
        print(np.mean(lengtes))
        print(" ")

(35, 9)
2.0869565217391304
 
(61, 6)
1.875
 
(77, 6)
1.6333333333333333
 
(82, 2)
1.6538461538461537
 
(151, 8)
1.68
 
(154, 3)
1.3571428571428572
 
(162, 7)
2.1363636363636362
 
(186, 6)
1.4230769230769231
 
(237, 11)
1.76
 
(245, 8)
2.3529411764705883
 
(266, 2)
1.1818181818181819
 
(268, 9)
1.64
 
(290, 10)
1.7083333333333333
 
(300, 6)
1.3571428571428572
 
(307, 10)
1.4615384615384615
 
(320, 6)
2.0
 
(324, 8)
1.7083333333333333
 
(372, 4)
2.45
 
(381, 4)
1.56
 
(382, 6)
1.5357142857142858
 
