In [None]:
import click
import codecs
import os
import re
import nltk
import string

from Bio import pairwise2

from nlppln.utils import create_dirs, out_file_name


def tokenize(text):
    """Split ``text`` into word tokens, keeping plain double quotes.

    The nltk tokenizer emits `` and '' in place of a double quote; both
    are turned back into ``"`` so that quotes survive tokenization.
    """
    quote_markers = ('``', "''")
    return ['"' if token in quote_markers else token
            for token in nltk.word_tokenize(text)]


def get_spaces_pattern(text):
    """Return the runs of spaces found between consecutive tokens of ``text``.

    The tokens produced by ``tokenize`` are joined into one regular
    expression with a ``( *)`` capture group between each pair of tokens,
    and that expression is matched against the start of ``text``.

    Every token is passed through ``re.escape`` so that characters with a
    special meaning in regular expressions (``(``, ``)``, ``|``, ``[``,
    ``?``, ``*``, ...) are matched literally.

    Returns:
        A tuple with one string of zero or more spaces per gap between
        consecutive tokens (empty when there are fewer than two tokens).

    Raises:
        ValueError: if the tokens cannot be matched back onto ``text``,
            e.g. because they are separated by something other than spaces.
    """
    tokens = tokenize(text)
    pattern = r'( *)'.join(re.escape(token) for token in tokens)
    m = re.match(pattern, text)
    if m is None:
        raise ValueError(
            'tokens do not line up with the text: {!r}'.format(text))
    return m.groups()


In [None]:
# Make sure the NLTK 'punkt' tokenizer data is available; it is downloaded
# only when the lookup fails.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

In [None]:
# Document to inspect.  Only one path is active; the alternatives are kept
# commented out so that switching documents is a one-line change.
#text_file = '/home/jvdzwaan/data/tmp/adh/merge-test/text.txt'
#text_file = '/home/jvdzwaan/data/tmp/adh/merge-test/0179MalikIbnAnas.Muwatta.txt'
#text_file = '/home/jvdzwaan/data/adh-corpora/fiqh_corpus/txt/0381IbnBabawayh.Hidaya.txt'
#text_file = '/home/jvdzwaan/data/adh-corpora/fiqh_corpus/txt/1078ShaykhiZadahDamadAfandi.MajmacAnhur..txt'
text_file = '/home/jvdzwaan/data/adh-corpora/fiqh_corpus/txt/1122MuhammadZarqani.SharhCalaMuwatta.txt'
# Read with an explicit encoding so the result does not depend on the
# platform's default.  NOTE(review): assumes the corpus files are UTF-8.
with open(text_file, encoding='utf-8') as f:
    text = f.read()
print(text)

In [None]:
# Number of U+2028 characters in the loaded document.
print(text.count('\u2028'))

In [None]:
import re
import os

from nlppln.utils import get_files

# Report which corpus files contain U+2028 characters, and how many.
corpus_dir = '/home/jvdzwaan/data/adh-corpora/fiqh_corpus/txt/'
#corpus_dir = '/home/jvdzwaan/data/adh-corpora/dawa/'
#corpus_dir = '/home/jvdzwaan/data/adh-corpora/poetry/txt/'

for txt_file in get_files(corpus_dir):
    # Read into a loop-local name so that the document held in `text` is not
    # replaced by the last file of the corpus.
    # NOTE(review): assumes the corpus files are UTF-8.
    with open(txt_file, encoding='utf-8') as f:
        corpus_text = f.read()
    n_separators = corpus_text.count('\u2028')
    if n_separators > 0:
        print(os.path.basename(txt_file), n_separators)

In [None]:
#text_file = '/home/jvdzwaan/data/tmp/adh/merge-test/text.txt'
#text_file = '/home/jvdzwaan/data/tmp/adh/merge-test/0179MalikIbnAnas.Muwatta.txt'
#text_file = '/home/jvdzwaan/data/adh-corpora/fiqh_corpus/txt/0381IbnBabawayh.Hidaya.txt'
text_file = '/home/jvdzwaan/data/adh-corpora/fiqh_corpus/txt/1078ShaykhiZadahDamadAfandi.MajmacAnhur..txt'
#text_file = '/home/jvdzwaan/data/adh-corpora/fiqh_corpus/txt/1122MuhammadZarqani.SharhCalaMuwatta.txt'
# The file is read and written back with the same explicit encoding, so the
# rewrite cannot re-encode the document with a platform default.
# NOTE(review): assumes the corpus files are UTF-8.
with open(text_file, encoding='utf-8') as f:
    text = f.read()
print(text)

# Turn every U+2028 character into a regular newline.
text = text.replace('\u2028', '\n')
print(text)

# Careful: this overwrites the source document in place.
with open(text_file, 'w', encoding='utf-8') as f:
    f.write(text)

In [None]:
import re

# First header line: a '|', a space, one or more '#' characters and then
# the header text up to the end of the line.
regex = r'\| \#{1,}(.+)\n'
match = re.search(regex, text)
header = match.group(1)
header = header.strip()
print(header)

In [None]:
# Tokenize the whole document; `tokens` is the first sequence of the
# alignment computed further down.
tokens = tokenize(text)
print(len(tokens))

In [None]:
import codecs
from bs4 import BeautifulSoup
import pandas as pd

def analyzer_xml2df(fname):
    """Load the word analyses from an analyzer XML file into a DataFrame.

    Every ``<word>`` element becomes one row with the columns:

    * ``word``: the element's ``value`` attribute
    * ``proposed_root``: the distinct ``root`` attributes of its
      ``<analysis>`` children joined with a backslash, in order of first
      appearance; ``NO_ROOT`` stands in for an analysis without a root and
      ``NOANALYSIS`` for a word without any analysis
    * ``id``: the element's ``w_id`` attribute

    Args:
        fname: path of the XML file.

    Returns:
        pandas.DataFrame with the columns ``word``, ``proposed_root`` and
        ``id`` (also when the file contains no ``<word>`` elements).
    """
    # Hand the raw bytes to the parser so it can work out the encoding
    # itself instead of relying on the platform's default text encoding.
    with open(fname, 'rb') as f:
        soup = BeautifulSoup(f.read(), 'xml')

    result = []
    for word in soup.find_all('word'):
        analyses = word.find_all('analysis')
        # dict.fromkeys drops duplicates but keeps first-seen order, so the
        # joined string is the same on every run (set order is not).
        roots = list(dict.fromkeys(a.get('root', 'NO_ROOT') for a in analyses))
        if not roots:
            roots.append('NOANALYSIS')
        result.append({'word': word['value'],
                       'proposed_root': '\\'.join(roots),
                       'id': word['w_id']})

    # Fixing the columns keeps them present when no words were found.
    return pd.DataFrame(result, columns=['word', 'proposed_root', 'id'])

In [None]:
# Analyzer output to load.  The second assignment overrides the first, so
# the Muwatta file is the one that is read.
xml_file = '/home/jvdzwaan/data/tmp/adh/merge-test/text.xml'
xml_file = '/home/jvdzwaan/data/tmp/adh/merge-test/0179MalikIbnAnas.Muwatta.xml'

# NOTE(review): the text cells above load a different document than this
# XML file -- confirm that `tokens` and `df` describe the same text before
# aligning them.
df = analyzer_xml2df(xml_file)
df.head()

In [None]:
# Align the text tokens with the analyzer's words.  Both inputs are lists,
# and gap_char is a one-element list as well; the cells below test for the
# element 'GAP' to detect gaps.
# NOTE(review): the four numbers are presumably match=2, mismatch=-1,
# gap open=-0.5 and gap extend=-0.1 -- confirm against the Biopython
# pairwise2 documentation.
alignment = pairwise2.align.localms(tokens,list(df['word']),2,-1,-0.5,-0.1, gap_char=["GAP"])

In [None]:
# Tokens of the header text extracted earlier.
h_tok = tokenize(header)

In [None]:
# Walk over the first alignment and print, for every header line, the
# analyzer words aligned with it together with their running word numbers.
#
# State kept while walking:
#   h            - True from a '|' token until the next 'NEWLINE' token
#   level        - number of '#' tokens seen since the header started
#   header_words - analyzer words (second sequence) collected for the header
#   w_ids        - running word numbers of those words, as strings
#   w_id         - count of non-gap entries of the second sequence so far
h = False
level = 0
header_words = []
w_ids = []
w_id = 0

for t1, t2 in zip(alignment[0][0], alignment[0][1]):
    # Every non-gap entry of the second sequence is one analyzer word.
    if t2 != 'GAP':
        w_id += 1

    if t1 == '|':
        h = True
    elif h and t1 == '#':
        level += 1
    # NOTE(review): a header is only printed when a 'NEWLINE' token follows
    # it; nothing visible in this notebook inserts such tokens -- confirm
    # how `tokens` is expected to be prepared.
    elif t1 == 'NEWLINE': # end of header
        if h:
            print('HEADER [{}] {} ({})'.format(level, ' '.join(header_words), ', '.join(w_ids)))
        # Reset the state for the next header.
        h = False
        level = 0
        header_words = []
        w_ids = []
    elif h and t2 != 'GAP':
        header_words.append(t2)
        w_ids.append(str(w_id))
        #print('H', t1)
    #print(t1,t2)

In [None]:
# Show the first alignment pair by pair.
aligned_first, aligned_second = alignment[0][0], alignment[0][1]
for t1, t2 in zip(aligned_first, aligned_second):
    print(t1, t2)
    

In [None]:
# A '|', a space and '###', then (lazily) the rest of the line as group 1.
regex = r'\| \#\#\#(.+?)\n'

In [None]:
# openiti headers
regex = r'\#\#\# (?P<level>\|+) (?P<text>.+?)\n'
for m in re.finditer(regex, text):
    print(m)
    # the run of '|' characters; its length is printed
    print(len(m.group('level')))
    print(m.group('text'))

In [None]:
# `m` is the loop variable left over from an earlier finditer loop, i.e.
# the last match that loop produced.
print(m.groupdict())

In [None]:
# Check whether the leftover match `m` has a named group called 'level'.
'level' in m.groupdict()

In [None]:
# quran/hadith quotes
regex = r'@(?P<source>[QH])B@(?P<text>.+?)@(?P=source)E@'
for m in re.finditer(regex, text):
    print(m)
    # the marker letter first, then the quoted text
    for group_name in ('source', 'text'):
        print(m.group(group_name))

In [None]:
# openiti headers and quran/hadith quotes
# One pattern with two alternatives: a header (groups 'level' and 'header')
# or a quote (groups 'source' and 'quote').  In groupdict() the groups of
# the alternative that did not match are None.
regex = r'\#\#\# (?P<level>\|+) (?P<header>.+?)\n|@(?P<source>[QH])B@(?P<quote>.+?)@(?P=source)E@'
for m in re.finditer(regex, text):
    print(m)
    print(m.groupdict())

In [None]:
start = 0

# Look at the first match only: show the text that precedes it.
for m in re.finditer(regex, text):
    print(m)
    prev_text = text[start:m.start()]
    print(len(prev_text.strip()))
    print(repr(prev_text))
    # Stop after the first match.  Unlike sys.exit(), `break` just leaves
    # the loop and does not raise SystemExit inside the notebook.
    break

In [None]:
import unicodedata

# Unicode general category of U+200F and of an Arabic letter.
for char in ('\u200f', 'ب'):
    print(unicodedata.category(char))

In [None]:
def smart_strip(text, to_remove=('\u200f',)):
    """Strip ``text`` after dropping the characters listed in ``to_remove``.

    Args:
        text: the string to clean.
        to_remove: characters to delete everywhere in ``text`` before the
            surrounding whitespace is stripped; defaults to U+200F only.

    Returns:
        ``text`` without the ``to_remove`` characters and without leading
        or trailing whitespace.
    """
    text = ''.join(char for char in text if char not in to_remove)
    return text.strip()

print(repr(smart_strip('\n\n\n\n\n\n\n\u200f')))

In [None]:
# Plan how the document is split into files: one file per header and one
# per non-empty stretch of text between headers.  Nothing is written here;
# the file names are only printed and collected in `names`.
regex = r'\#\#\# (\||\|\|) (.+?)\n'
start = 0
i = 0
names = []

out_dir = '/home/jvdzwaan/data/tmp'
doc_name = os.path.splitext(os.path.basename(text_file))[0]
print(doc_name)
for m in re.finditer(regex, text):
    print(m)
    # Text between the previous header (or the start) and this header.
    prev_text = smart_strip(text[start:m.start()])
    print(len(prev_text))

    if len(prev_text) > 0:
        print('Write file for prev text')
        fname = '{}-{:05}.txt'.format(doc_name, i)
        fname = os.path.join(out_dir, fname)
        print(fname)
        names.append(fname)
        i += 1

    # The level is the number of '|' characters in the header marker.
    level = len(m.group(1))
    print('level {}'.format(level))
    print('header: {}'.format(m.group(2)))
    print('Write file for header')
    fname = '{}-{:05}-header-{}.txt'.format(doc_name, i, level)
    fname = os.path.join(out_dir, fname)
    print(fname)
    names.append(fname)
    i += 1

    start = m.end()

# The text after the last header (or the whole document when there are no
# headers at all) also needs a file.
tail_text = smart_strip(text[start:])
print(len(tail_text))
if len(tail_text) > 0:
    print('Write file for text after last header')
    fname = '{}-{:05}.txt'.format(doc_name, i)
    fname = os.path.join(out_dir, fname)
    print(fname)
    names.append(fname)
    i += 1

In [None]:
# Sort the collected file names in place and list them one per line.
names.sort()
for n in names:
    print(n)

In [None]:
# Show U+200F both as its repr and printed as-is.
c = u"\u200f"
print(repr(c))
print(c)

In [None]:
# Length of the string held in `c`.
print(len(c))