# Fixing badly extracted lines, which split on letters not words

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import arabicABC as abc

In [15]:
fusus = pd.read_csv('fusus.csv', dtype={"page":int, "line":int, "column":int, "span":int, "direction": str, 
                                        "left":"Int64", "top":"Int64", "right":"Int64", "bottom":"Int64", 
                                        "word":str, "short":str, "haspunct":str, "punctAfter":str, "punctBefore":str, 
                                        "QunawiMS":str, "poetryMeter":str, "poetryVerse":"Int64", "fass":"Int64", "lwcvl":str, "quran":str})
fusus.word = fusus.word.fillna('')
fusus.short = fusus.short.fillna('')
fusus.haspunct = fusus.haspunct.fillna('')
fusus.punctAfter = fusus.punctAfter.fillna('')
fusus.punctBefore = fusus.punctBefore.fillna('')
fusus.poetryMeter = fusus.poetryMeter.fillna('')
fusus.lwcvl = fusus.lwcvl.fillna('')
fusus.quran = fusus.quran.fillna('')

In [16]:
fusus.head()

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
0,8,2,1,1,r,356,197,384,218,الحَمْدُ,الحمد,,,,1b,,,,,
1,8,2,1,1,r,341,197,356,218,لِلهِ,لله,,,,1b,,,,,
2,8,2,1,1,r,312,197,341,218,مُـنَـزِّلِ,منزل,,,,1b,,,,,
3,8,2,1,1,r,274,197,312,218,الحِكَمِ,الحكم,,,,1b,,,,,
4,8,2,1,1,r,260,197,274,218,عَلَىٰ,على,,,,1b,,,,,


### Different elements of a Quran citation

In [22]:
print(fusus[fusus.word.str.contains(pat=abc.QURANCLOSE)].shape)
print(fusus[fusus.word.str.contains(pat=abc.QURANOPEN)].shape)
print(fusus[fusus.word.str.contains(pat="\[")].shape)
print(fusus[fusus.word.str.contains(pat="\]")].shape)
print(fusus[fusus.word.str.contains(pat="سورة")].shape)

(520, 20)
(520, 20)
(699, 20)
(698, 20)
(485, 20)


In one case, the line was not extracted correctly.

In [105]:
fusus[(fusus['fass']!="")].fass.unique()

array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0,
       13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
       24.0, 25.0, 26.0, 27.0], dtype=object)

### This actually happened in more cases. It seems we first need to fix this.

In [4]:
fusus[(fusus['page']==162) & (fusus.line==7)].short.str.len()

14567    4
14568    1
14569    4
14570    5
14571    2
14572    5
14573    2
14574    1
14575    2
14576    1
14577    2
14578    3
14579    1
14580    2
14581    1
14582    2
14583    1
14584    2
14585    2
14586    2
14587    1
14588    1
Name: short, dtype: int64

In [77]:
fusus[(fusus['page']==162) & (fusus.line==7)].short.str.len().describe()

count    22.000000
mean      2.136364
std       1.283427
min       1.000000
25%       1.000000
50%       2.000000
75%       2.000000
max       5.000000
Name: short, dtype: float64

In [17]:
fusus.short.str.len().describe()

count    41532.000000
mean         4.256814
std          1.948673
min          0.000000
25%          3.000000
50%          4.000000
75%          5.000000
max         23.000000
Name: short, dtype: float64

I created a subset of the dataframe, to ensure easy calculation before we attempt to calculate on the entire text. Let's call it `testdf`

In [58]:
testdf = fusus[fusus.QunawiMS=="3b"]

After testing with this I came up with the next code for which I replaced `testdf` with `fusus` to calculate on the entire text. We print out all lines for which the mean length of each row (should be word) is less than 3 and which has at least one row with length 1 (zeroes are removed).

In [366]:
namesA = []
for name, group in fusus.groupby(['page', 'line']):
    lengtes = group.short.str.len().tolist()
    lengtes = [i for i in lengtes if i != 0]
    if lengtes:
        if (np.mean(lengtes) < 3.5)&(np.min(lengtes)==1):
            namesA.append(name)
            print(name)
            print(np.mean(lengtes))
            print(" ")

(21, 1)
3.2
 
(21, 6)
3.3076923076923075
 
(29, 4)
3.0714285714285716
 
(47, 5)
3.357142857142857
 
(56, 7)
3.3333333333333335
 
(61, 8)
3.357142857142857
 
(75, 7)
3.4615384615384617
 
(101, 1)
3.2142857142857144
 
(107, 9)
3.0714285714285716
 
(111, 1)
3.3076923076923075
 
(131, 1)
3.2857142857142856
 
(151, 7)
3.3076923076923075
 
(161, 2)
3.142857142857143
 
(166, 2)
3.4285714285714284
 
(166, 3)
3.4285714285714284
 
(167, 7)
3.4166666666666665
 
(168, 13)
3.0
 
(168, 14)
3.357142857142857
 
(174, 2)
3.2857142857142856
 
(183, 4)
3.4285714285714284
 
(183, 8)
3.4545454545454546
 
(205, 9)
3.2857142857142856
 
(209, 3)
3.4285714285714284
 
(214, 5)
3.357142857142857
 
(215, 6)
3.142857142857143
 
(222, 13)
3.2
 
(231, 8)
3.3333333333333335
 
(255, 7)
3.375
 
(258, 10)
3.1333333333333333
 
(267, 9)
3.2857142857142856
 
(271, 9)
3.230769230769231
 
(279, 4)
3.3333333333333335
 
(287, 9)
3.3846153846153846
 
(288, 9)
3.357142857142857
 
(291, 12)
3.2142857142857144
 
(294, 5)
3.3846153

Let's test on one of these and we see we got a positive hit

In [5]:
fusus[(fusus['page']==381) & (fusus.line==4)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
38635,381,4,1,1,r,379.0,165.0,383.0,186.0,فَ,ف,,,,73a,,,
38636,381,4,1,1,r,368.0,165.0,377.0,186.0,إِنَّ,إن,,,,73a,,,
38637,381,4,1,1,r,361.0,165.0,366.0,186.0,هُ,ه,,,,73a,,,
38638,381,4,1,1,r,353.0,165.0,361.0,186.0,قَ,ق,,,,73a,,,
38639,381,4,1,1,r,339.0,165.0,351.0,186.0,الَ,ال,,,,73a,,,
38640,381,4,1,1,r,331.0,165.0,339.0,186.0,فِ,ف,,,,73a,,,
38641,381,4,1,1,r,307.0,165.0,329.0,186.0,يحَ,يح,,,,73a,,,
38642,381,4,1,1,r,294.0,165.0,305.0,186.0,دِيْ,دي,,,,73a,,,
38643,381,4,1,1,r,280.0,165.0,292.0,186.0,ثِ,ث,,,,73a,,,
38644,381,4,1,1,r,270.0,165.0,280.0,186.0,ال,ال,,,,73a,,,


Are there other ways to find these lines with bad splitting?

In [4]:
namesB = []
for name, group in fusus.groupby(['page', 'line']):
    lengtes = group.short.str.len().tolist()
    lengtes = [i for i in lengtes if i != 0]
    if lengtes:
        if (lengtes.count(1)>3):
            namesB.append(name)
            print(name)
            print(np.mean(lengtes))
            print(" ")

(35, 9)
2.0869565217391304
 
(61, 6)
1.875
 
(77, 6)
1.6333333333333333
 
(82, 2)
1.6538461538461537
 
(151, 8)
1.68
 
(154, 3)
1.3571428571428572
 
(162, 7)
2.1363636363636362
 
(186, 6)
1.4230769230769231
 
(237, 11)
1.76
 
(245, 8)
2.3529411764705883
 
(266, 2)
1.1818181818181819
 
(268, 9)
1.64
 
(290, 10)
1.7083333333333333
 
(300, 6)
1.3571428571428572
 
(307, 10)
1.4615384615384615
 
(320, 6)
2.0
 
(324, 8)
1.7083333333333333
 
(372, 4)
2.45
 
(381, 4)
1.56
 
(382, 6)
1.5357142857142858
 


In [11]:
set(namesB).issubset(set(namesA))

True

The list `namesA` is sufficient, then. The original list consisted of: [(35, 9),
 (61, 6),
 (77, 6),
 (82, 2),
 (151, 8),
 (154, 3),
 (162, 7),
 (186, 6),
 (237, 11),
 (245, 8),
 (266, 2),
 (268, 9),
 (290, 10),
 (300, 6),
 (307, 10),
 (314, 4),
 (320, 6),
 (324, 8),
 (331, 9),
 (372, 4),
 (381, 4),
 (382, 6)]
 Since we are manually solving the lines and saving each time and reloading the CSV, the list is shrinking as we recalculate.

In [19]:
namesA

[(35, 9),
 (61, 6),
 (77, 6),
 (82, 2),
 (151, 8),
 (154, 3),
 (162, 7),
 (186, 6),
 (237, 11),
 (245, 8),
 (266, 2),
 (268, 9),
 (290, 10),
 (300, 6),
 (307, 10),
 (314, 4),
 (320, 6),
 (324, 8),
 (331, 9),
 (372, 4),
 (381, 4),
 (382, 6)]

In [373]:
fusus.groupby(['page', 'line']).get_group(namesA[6])

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
6625,75,7,1,1,r,369.0,243.0,384.0,264.0,قَالَ,قال,,,,14b,,,
6626,75,7,1,1,r,338.0,243.0,369.0,264.0,الخَرَّازُ,الخراز,,,,14b,,,
6627,75,7,1,1,r,325.0,243.0,329.0,256.0,―,―,―,,,14b,,,
6628,75,7,1,1,r,287.0,243.0,321.0,264.0,وَهُوَ,وهو,,,,14b,,,
6629,75,7,1,1,r,265.0,243.0,287.0,264.0,وَجْهٌ,وجه,,,,14b,,,
6630,75,7,1,1,r,249.0,243.0,265.0,264.0,مِنْ,من,,,,14b,,,
6631,75,7,1,1,r,222.0,243.0,249.0,264.0,وُجُوهِ,وجوه,,,,14b,,,
6632,75,7,1,1,r,196.0,243.0,222.0,264.0,الحَقِّ,الحق,,,,14b,,,
6633,75,7,1,1,r,167.0,243.0,196.0,264.0,وَلِسَانٌ,ولسان,,,,14b,,,
6634,75,7,1,1,r,150.0,243.0,167.0,264.0,مِنْ,من,,,,14b,,,


In [359]:
# fusus.iloc[38401, fusus.columns.get_loc("word")] += fusus.iloc[38402, fusus.columns.get_loc("word")]+fusus.iloc[38403, fusus.columns.get_loc("word")]
# fusus.iloc[38401, fusus.columns.get_loc("short")] += fusus.iloc[38402, fusus.columns.get_loc("short")]+fusus.iloc[38403, fusus.columns.get_loc("short")]
fusus.iloc[38404, fusus.columns.get_loc("word")] += fusus.iloc[38405, fusus.columns.get_loc("word")]
fusus.iloc[38404, fusus.columns.get_loc("short")] += fusus.iloc[38405, fusus.columns.get_loc("short")]
fusus.iloc[38405, fusus.columns.get_loc("word")]=""

# fusus.iloc[38411, fusus.columns.get_loc("haspunct")] += fusus.iloc[38413, fusus.columns.get_loc("haspunct")]
# fusus.iloc[38413, fusus.columns.get_loc("haspunct")] = ""


fusus[(fusus.page==382)&(fusus.line==6)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
38401,382,6,1,1,r,390.0,217.0,402.0,238.0,أَحَدَكُمْ,أحدكم,,,,73b,,,
38402,382,6,1,1,r,374.0,217.0,389.0,238.0,لَا,لا,,,,73b,,,
38403,382,6,1,1,r,365.0,217.0,372.0,238.0,,م,,,,73b,,,
38404,382,6,1,1,r,349.0,217.0,365.0,238.0,يَرَىٰ,يرى,,,,73b,,,
38405,382,6,1,1,r,343.0,217.0,348.0,238.0,,رى,,,,73b,,,
38406,382,6,1,1,r,321.0,217.0,343.0,238.0,رَبَّهُ,ربه,,,,73b,,,
38407,382,6,1,1,r,314.0,217.0,320.0,238.0,,ه,,,,73b,,,
38408,382,6,1,1,r,302.0,217.0,314.0,238.0,حَتَّىٰ,حتى,,,,73b,,,
38409,382,6,1,1,r,296.0,217.0,301.0,238.0,,ت,,,,73b,,,
38410,382,6,1,1,r,294.0,217.0,295.0,238.0,,ى,,,,73b,,,


In [336]:
fusus.iloc[38297, fusus.columns.get_loc("short")][2:4]

'ال'

In [299]:
fusus.iloc[32556, fusus.columns.get_loc("word")][:2]

'له'

In [78]:
fusus['word'].replace("", np.nan, inplace=True)
fusus.dropna(subset=['word'],inplace=True)
fusus[(fusus.page==382)&(fusus.line==6)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl
38401,382,6,1,1,r,390.0,217.0,402.0,238.0,أَحَدَكُمْ,أحدكم,,,,73b,,,,
38402,382,6,1,1,r,374.0,217.0,389.0,238.0,لَا,لا,,,,73b,,,,
38403,382,6,1,1,r,349.0,217.0,365.0,238.0,يَرَىٰ,يرى,,,,73b,,,,
38404,382,6,1,1,r,321.0,217.0,343.0,238.0,رَبَّهُ,ربه,,,,73b,,,,
38405,382,6,1,1,r,302.0,217.0,314.0,238.0,حَتَّىٰ,حتى,,,,73b,,,,
38406,382,6,1,1,r,278.0,217.0,294.0,238.0,يَمُوتَ»،,يموت،,»،,,,73b,,,,
38407,382,6,1,1,r,233.0,217.0,244.0,238.0,لِذٰلِكَ,لذلك,,,,73b,,,,
38408,382,6,1,1,r,200.0,217.0,209.0,238.0,قَالَ,قال,,,,73b,,,,
38409,382,6,1,1,r,180.0,217.0,187.0,238.0,تَعَالَىٰ:,تعالى,,:,,73b,,,,
38410,382,6,1,1,r,120.0,217.0,152.0,238.0,«وَلَابُدَّ,ولابد,,,«,73b,,,,


In [361]:
fusus.to_csv('fusus2.csv', index=False)

In [386]:
fusus[fusus.short=="مارأى"]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
3144,38,11,1,1,r,358.0,347.0,379.0,368.0,مَارَأَىٰ,مارأى,,,,7b,,,
3154,38,11,1,1,r,121.0,347.0,141.0,368.0,مَارَأَىٰ,مارأى,,,,8a,,,
8394,98,5,1,1,r,151.0,191.0,181.0,212.0,مَارَأىٰ.,مارأى,,.,,18a,,,


In [77]:
fusus[(fusus.page==371)&(fusus.line==16)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl
37524,371,16,1,1,r,375.0,477.0,402.0,498.0,خَلْفَكَ,خلفك,,,,71b,,,,
37525,371,16,1,1,r,366.0,477.0,375.0,498.0,,,,,,71b,,,,
37526,371,16,1,1,r,355.0,477.0,366.0,498.0,ءَايَةً﴾,ءاية﴾,﴾,,,71b,,,,
37527,371,16,1,1,r,304.0,479.0,355.0,497.0,[سورةيونس:,[سورةيونس:,[:,,,71b,,,,
37528,371,16,1,1,r,272.0,481.0,288.0,496.0,٢٩].,٢٩].,٢٩].,,,71b,,,,


In [76]:
fusus.iloc[37526, fusus.columns.get_loc("word")] = fusus.iloc[37525, fusus.columns.get_loc("word")]+fusus.iloc[37526, fusus.columns.get_loc("word")]
fusus.iloc[37526, fusus.columns.get_loc("short")] = fusus.iloc[37525, fusus.columns.get_loc("short")]+fusus.iloc[37526, fusus.columns.get_loc("short")]

# fusus.iloc[31784, fusus.columns.get_loc("word")] += fusus.iloc[31785, fusus.columns.get_loc("word")]
# fusus.iloc[31784, fusus.columns.get_loc("short")] += fusus.iloc[31785, fusus.columns.get_loc("short")]

fusus.iloc[37525, fusus.columns.get_loc("word")] = ""
fusus.iloc[37525, fusus.columns.get_loc("short")] = ""

In [13]:
fusus.iloc[21855, fusus.columns.get_loc("lwcvl")] = "It belongs to the previous word but is like this across the hemistich"

In [4]:
fusus[fusus.lwcvl!=""]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass,lwcvl,quran
8977,106,8,1,1,r,242.0,269.0,258.0,290.0,ــكَ,ك,,,,19a,,,,Entire word is fīka but it is indeed split ove...,
13035,148,6,1,1,r,150.0,261.0,179.0,282.0,صِـرَٰطٍ,صرط,,,,27a,,,,Overly beatified spelling of ṣirāṭ. Perhaps ne...,
14287,161,2,2,1,r,213.0,113.0,224.0,134.0,تَ,ت,,,,29b,,,,It belongs to the previous word but is like th...,
21855,231,8,2,1,r,201.0,278.0,206.0,299.0,رٌ,ر,,,,43b,,,,It belongs to the previous word but is like th...,
28116,291,13,1,1,r,293.0,399.0,350.0,420.0,فَإِنْكَانَكَمَا,فإنكانكما,,,,54a,,,,Needs fixing,


In [17]:
fusus.to_csv('fusus.csv',index=False)

In [16]:
fusus['lwcvl'] = fusus['lwcvl'].astype('string')