# Fixing badly extracted lines that were split on letters rather than words

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import arabicABC as abc

In [3]:
# Load the extracted text and turn every missing cell into an empty string.
fusus = pd.read_csv('fusus.csv').fillna('')

### We can look at square brackets, which seem to be used after a Quran citation.

In [55]:
# Rows whose word matches abc.QURANCLOSE (presumably the Quran closing bracket
# seen in the output below -- confirm in arabicABC) and contains a literal '['
# but does not contain the word 'سورة'.
# The '[' pattern is a raw string: in a normal string literal '\[' is an
# invalid escape sequence, which Python warns about; r'\[' hands the regex
# engine the same two characters without the warning.
fusus[
    fusus.word.str.contains(pat=abc.QURANCLOSE)
    & fusus.word.str.contains(pat=r'\[')
    & ~fusus.word.str.contains(pat='سورة')
]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
14569,162,7,1,1,r,345.0,243.0,379.0,264.0,ـٰبِ﴾[س,ب﴾[س,﴾[,,,29b,,,
40108,396,11,1,1,r,227.0,347.0,303.0,368.0,أَنْفُسَهُمْ﴾[الأنعام:,أنفسهم﴾[الأنعام:,﴾[:,,,76a,,,


In one case, the line was not extracted correctly.

In [56]:
# All rows that belong to page 162, line 7.
fusus.loc[fusus['page'].eq(162) & fusus['line'].eq(7)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
14567,162,7,1,1,r,386.0,243.0,402.0,264.0,الأَلْ,الأل,,,,29b,,,
14568,162,7,1,1,r,381.0,243.0,385.0,264.0,بَ,ب,,,,29b,,,
14569,162,7,1,1,r,345.0,243.0,379.0,264.0,ـٰبِ﴾[س,ب﴾[س,﴾[,,,29b,,,
14570,162,7,1,1,r,325.0,247.0,344.0,262.0,ورةال,ورةال,,,,29b,,,
14571,162,7,1,1,r,316.0,247.0,324.0,262.0,زُّمَ,زم,,,,29b,,,
14572,162,7,1,1,r,279.0,247.0,314.0,262.0,ر:٩]،,ر:٩]،,:٩]،,,,29b,,,
14573,162,7,1,1,r,260.0,243.0,279.0,264.0,وَهُ,وه,,,,29b,,,
14574,162,7,1,1,r,252.0,243.0,259.0,264.0,مْ,م,,,,29b,,,
14575,162,7,1,1,r,241.0,243.0,252.0,264.0,ال,ال,,,,29b,,,
14576,162,7,1,1,r,236.0,243.0,240.0,264.0,نَّ,ن,,,,29b,,,


### This actually happened in more cases, so it seems we first need to fix this.

In [4]:
# Length of the `short` form of every row on page 162, line 7.
fusus.loc[fusus['page'].eq(162) & fusus['line'].eq(7), 'short'].str.len()

14567    4
14568    1
14569    4
14570    5
14571    2
14572    5
14573    2
14574    1
14575    2
14576    1
14577    2
14578    3
14579    1
14580    2
14581    1
14582    2
14583    1
14584    2
14585    2
14586    2
14587    1
14588    1
Name: short, dtype: int64

In [77]:
# Summary statistics of the `short` lengths on page 162, line 7.
fusus.loc[fusus['page'].eq(162) & fusus['line'].eq(7), 'short'].str.len().describe()

count    22.000000
mean      2.136364
std       1.283427
min       1.000000
25%       1.000000
50%       2.000000
75%       2.000000
max       5.000000
Name: short, dtype: float64

In [17]:
# Summary statistics of the `short` lengths over the whole text, for comparison.
fusus['short'].str.len().describe()

count    41532.000000
mean         4.256814
std          1.948673
min          0.000000
25%          3.000000
50%          4.000000
75%          5.000000
max         23.000000
Name: short, dtype: float64

I created a subset of the dataframe so that the calculation can be tried out quickly before we run it on the entire text. Let's call it `testdf`.

In [58]:
# Small test subset: only the rows whose QunawiMS value is "3b".
testdf = fusus.loc[fusus['QunawiMS'] == "3b"]

After testing on this subset, I arrived at the code below, in which `testdf` is replaced by `fusus` so that it runs on the entire text. We print every line for which the mean length of its rows (each row should be a word) is less than 3 and which has at least one row of length 1 (rows of length zero are ignored).

In [86]:
# Report every (page, line) whose rows look like letter fragments instead of
# words: mean row length below 3 and at least one row of length 1.
for name, group in fusus.groupby(['page', 'line']):
    # Lengths of the `short` forms on this line, with empty rows left out.
    lengtes = [i for i in group.short.str.len().tolist() if i != 0]
    if not lengtes:
        # Nothing but empty rows on this line.
        continue
    gemiddelde = np.mean(lengtes)
    if gemiddelde < 3 and np.min(lengtes) == 1:
        print(name)
        print(gemiddelde)
        print(" ")

(35, 9)
2.0869565217391304
 
(61, 6)
1.875
 
(77, 6)
1.6333333333333333
 
(82, 2)
1.6538461538461537
 
(151, 8)
1.68
 
(154, 3)
1.3571428571428572
 
(162, 7)
2.1363636363636362
 
(186, 6)
1.4230769230769231
 
(237, 11)
1.76
 
(245, 8)
2.3529411764705883
 
(266, 2)
1.1818181818181819
 
(268, 9)
1.64
 
(290, 10)
1.7083333333333333
 
(300, 6)
1.3571428571428572
 
(307, 10)
1.4615384615384615
 
(314, 4)
2.933333333333333
 
(320, 6)
2.0
 
(324, 8)
1.7083333333333333
 
(331, 9)
2.6666666666666665
 
(372, 4)
2.45
 
(381, 4)
1.56
 
(382, 6)
1.5357142857142858
 


Let's check one of these lines: it is indeed a positive hit.

In [5]:
# All rows that belong to page 381, line 4 (one of the flagged lines).
fusus.loc[fusus['page'].eq(381) & fusus['line'].eq(4)]

Unnamed: 0,page,line,column,span,direction,left,top,right,bottom,word,short,haspunct,punctAfter,punctBefore,QunawiMS,poetryMeter,poetryVerse,fass
38635,381,4,1,1,r,379.0,165.0,383.0,186.0,فَ,ف,,,,73a,,,
38636,381,4,1,1,r,368.0,165.0,377.0,186.0,إِنَّ,إن,,,,73a,,,
38637,381,4,1,1,r,361.0,165.0,366.0,186.0,هُ,ه,,,,73a,,,
38638,381,4,1,1,r,353.0,165.0,361.0,186.0,قَ,ق,,,,73a,,,
38639,381,4,1,1,r,339.0,165.0,351.0,186.0,الَ,ال,,,,73a,,,
38640,381,4,1,1,r,331.0,165.0,339.0,186.0,فِ,ف,,,,73a,,,
38641,381,4,1,1,r,307.0,165.0,329.0,186.0,يحَ,يح,,,,73a,,,
38642,381,4,1,1,r,294.0,165.0,305.0,186.0,دِيْ,دي,,,,73a,,,
38643,381,4,1,1,r,280.0,165.0,292.0,186.0,ثِ,ث,,,,73a,,,
38644,381,4,1,1,r,270.0,165.0,280.0,186.0,ال,ال,,,,73a,,,


Are there other ways to find these lines with bad splitting? For example, we can flag every line that contains more than three rows of length 1.

In [94]:
# Alternative heuristic: report every (page, line) that has more than three
# rows of length 1, together with the mean row length of that line.
for name, group in fusus.groupby(['page', 'line']):
    # Lengths of the `short` forms on this line, with empty rows left out.
    lengtes = [i for i in group.short.str.len().tolist() if i != 0]
    if not lengtes:
        # Nothing but empty rows on this line.
        continue
    aantal_enen = lengtes.count(1)
    if aantal_enen > 3:
        print(name)
        print(np.mean(lengtes))
        print(" ")

(35, 9)
2.0869565217391304
 
(61, 6)
1.875
 
(77, 6)
1.6333333333333333
 
(82, 2)
1.6538461538461537
 
(151, 8)
1.68
 
(154, 3)
1.3571428571428572
 
(162, 7)
2.1363636363636362
 
(186, 6)
1.4230769230769231
 
(237, 11)
1.76
 
(245, 8)
2.3529411764705883
 
(266, 2)
1.1818181818181819
 
(268, 9)
1.64
 
(290, 10)
1.7083333333333333
 
(300, 6)
1.3571428571428572
 
(307, 10)
1.4615384615384615
 
(320, 6)
2.0
 
(324, 8)
1.7083333333333333
 
(372, 4)
2.45
 
(381, 4)
1.56
 
(382, 6)
1.5357142857142858
 
