In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from collections import OrderedDict

from lxml import etree
from tqdm import tqdm_notebook as tqdm

import re


def read_book(fname, book_id):
    """Parse a book XML file into an ordered mapping of text entries.

    Every ``<book>`` element contributes one entry, keyed by the text of its
    ``<id>`` child. The ``<nass>`` child holds the running text; a run of
    nine or more underscores followed by a newline separates the text from
    the footnotes.

    Args:
        fname: path of the XML file to parse.
        book_id: number of the book; selects the special handling for texts
            that contain more than one separator.

    Returns:
        OrderedDict mapping entry id to a dict with the keys ``'text'``,
        ``'page'`` and ``'part'``, plus ``'footnotes'`` when footnotes were
        split off. Entries without text are left out.
    """
    result = OrderedDict()
    regex = '_{9,}\n'
    # Kept across elements on purpose: the "None text" message reports the
    # most recently seen id, which may belong to the previous element.
    t_id = None

    # Extract the words
    context = etree.iterparse(fname, events=('end', ), tag=('book'))
    for event, elem in context:
        # Start every element from a clean slate, so a missing child tag can
        # neither raise a NameError nor reuse the value of the previous entry.
        text = fn = page = part = None

        for e in elem.iterchildren():
            if e.tag == 'id':
                t_id = e.text
            elif e.tag == 'nass':   # text
                # sometimes the text is None (e.g., file 16, id 214)
                text = e.text
                fn = None
                if text is None:
                    print('None text found in {} (prev? id: {})'.format(fname, t_id))
                    continue

                # &#xd; is automatically replaced by \r, but we would like to have \n
                text = text.replace('\r', '\n')

                # get the footnotes if they are there
                parts = re.split(regex, text)

                if len(parts) == 2:
                    text, fn = parts
                elif len(parts) > 2:
                    # we shouldn't have more than 2 parts; all exceptions
                    print('book_id', book_id)
                    if book_id in (27, 38):
                        # Everything after the first separator belongs to the
                        # footnotes. Splitting once keeps the later separator
                        # lines as they were, instead of gluing the parts
                        # together with the regex source text.
                        text, fn = re.split(regex, text, maxsplit=1)
                    elif book_id in (36, 70, 89, 124, 140, 142, 143, 146, 147, 148):
                        # seems to be a table of contents and/or index
                        # so, put the content in the notes
                        # (save the content before blanking the text)
                        fn = text
                        text = ' '
                    else:
                        print('More parts found in {} ({})'.format(t_id, len(parts)))
            elif e.tag == 'page':
                page = e.text
            elif e.tag == 'part':
                part = e.text

        # ignore entries that don't have text
        if text is not None:
            entry = {'text': text, 'page': page, 'part': part}
            if fn is not None:
                entry['footnotes'] = fn
            result[t_id] = entry

        # make iteration over context fast and consume less memory
        #https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    return result

# Quick check: parse one book file and print the number of text entries.
# NOTE(review): the path points at book 217 while book_id is passed as 1, so
# the book-specific exceptions in read_book are looked up for 1 - confirm.
book = read_book('/home/jvdzwaan/data/adh-corpora/dawa/New_xml_corpus/217/book.xml', 1)
print(len(book))

In [None]:
def read_title(fname):
    """Collect the titles of a title XML file, grouped by entry id.

    Each ``<title>`` element supplies an id (``<id>``), the title text
    (``<tit>``) and its level (``<lvl>``).

    Args:
        fname: path of the XML file to parse.

    Returns:
        A tuple ``(titles, levels)``: ``titles`` maps an id to the list of
        ``{'title': ..., 'level': ...}`` dicts found for it, ``levels`` is a
        keys view of all distinct level values that were seen.
    """
    titles_by_id = {}
    seen_levels = {}

    for _, node in etree.iterparse(fname, events=('end', ), tag=('title')):
        for child in node.iterchildren():
            tag = child.tag
            if tag == 'id':
                t_id = child.text
            elif tag == 'tit':
                # &#xd; is parsed as \r, but we would like to have \n
                title = child.text.replace('\r', '\n')
            elif tag == 'lvl':
                level = child.text
                seen_levels[level] = None

        titles_by_id.setdefault(t_id, []).append({'title': title, 'level': level})

        # free the memory of elements that were already handled, see
        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        node.clear()
        while node.getprevious() is not None:
            del node.getparent()[0]

    return titles_by_id, seen_levels.keys()

# Quick check: parse the titles of book 217 and print the number of ids that have titles.
title, levels = read_title('/home/jvdzwaan/data/adh-corpora/dawa/New_xml_corpus/217/title.xml')
print(len(title))

In [None]:
def normalize_arabic(text):
    """Reduce text to a normalized form for matching.

    Every character outside U+0621-U+064A (other than a space) is removed,
    surrounding whitespace is stripped, and then the letter substitutions
    listed below are applied in order.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    cleaned = re.sub('[^\u0621-\u064A ]', '', text).strip()

    # (pattern, replacement) pairs, applied one after the other
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ة", "ه"),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def preprocess_arabic(text):
    """Remove every character outside U+0621-U+064A (spaces are kept).

    Args:
        text: the string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    arabic_only = re.compile('[^\u0621-\u064A ]').sub('', text)
    return arabic_only.strip()

In [None]:
# combine book and title
# and write to file
import re
import codecs

from fuzzywuzzy import fuzz, process

def replace_in_lines(text, title, lvl):
    """Turn the line(s) of `text` that hold `title` into header lines.

    A header line has the form ``### <bars> <line>`` followed by an extra
    newline, where ``<bars>`` is ``int(lvl)`` times ``|``. When a line equals
    `title` exactly, only the first such line is converted. Otherwise every
    line that contains `title` and does not start with ``#`` is converted.

    Args:
        text: the text to search, lines separated by ``\\n``.
        title: the title to look for.
        lvl: the title level; anything ``int()`` accepts.

    Returns:
        The text with the matching line(s) replaced by header lines.
    """
    def as_header(row):
        return '### {} {}\n'.format(int(lvl)*'|', row)

    rows = text.split('\n')
    if title in rows:
        # exact match: convert the first matching line only
        position = rows.index(title)
        rows[position] = as_header(rows[position])
    else:
        rows = [
            as_header(row) if title in row and not row.startswith('#') else row
            for row in rows
        ]
    return '\n'.join(rows)


def combine_data(book, title, out_file):
    """Write the text of a book to `out_file`, marking titles and pages.

    For every entry in `book`, each title registered under the same id in
    `title` is looked up in the entry text and the text holding it is turned
    into a header of the form ``### <bars> <text>`` (one ``|`` per level).
    The lookup falls back step by step: text starts with the title, title
    occurs in the text, normalized title occurs in the text, best fuzzy match
    among the lines of the text. After the text of an entry, a
    ``V<part>P<page>`` marker is written when both part and page are present.
    At the end a summary of the match counters is printed.

    Args:
        book: mapping of entry id to a dict with the keys 'text', 'page' and
            'part' (the structure read_book returns).
        title: mapping of entry id to a list of dicts with the keys 'title'
            and 'level' (the structure read_title returns). The lists are
            sorted in place by level.
        out_file: path of the output file; it is written as UTF-8.
    """
    # counters for the summary that is printed at the end
    start = 0       # title found at the start of the text
    in_text = 0     # title found elsewhere in the text
    not_found = 0   # neither the title nor its normalized form found
    norm_tit = 0    # normalized title found in the text
    with codecs.open(out_file, 'w', encoding='utf-8') as f:
        for b_id, data in book.items():
            text = data['text']
            num = b_id
            #num = str(int(b_id)+1)
            if num in title:
                # Sometimes the titles in the xml are in the wrong order
                # So sort them to put them in the right order
                title[num].sort(key=lambda t: int(t['level']))
                for t in title[num]:
                    ti = t['title']
                    lvl = t['level']
                    # `replaced` is set below but not read anywhere in this function
                    replaced = False
                    if text.strip().startswith(ti):
                        # title at the start of the text: mark its first occurrence
                        text = text.replace(ti, '### {} {}\n'.format(int(lvl)*'|', ti), 1)
                        start += 1
                        replaced = True
                    elif ti in text:
                        # title somewhere in the text: mark the line(s) holding it
                        text = replace_in_lines(text, ti, lvl)
                        in_text += 1
                        replaced = True
                    else:
                        # the literal title is not in the text; try its normalized
                        # form (the text itself is not normalized)
                        title_normalized = normalize_arabic(ti)
                        if title_normalized in text:
                            norm_tit += 1
                            text = replace_in_lines(text, title_normalized, lvl)
                            
                        else:
                            lines = text.split('\n')
                            print('Title not found', b_id)
                            print('<id>{}</id>'.format(b_id))
                            not_found += 1
                            
                            # For these titles, the fuzzy matching does not work properly, so don't do it
                            # NOTE(review): the ids that read_book stores are element
                            # text (strings) while this tuple holds ints, so with such
                            # input the test is always true and fuzzy matching is never
                            # skipped. Confirm whether the ids should be converted, or
                            # whether these numbers are book numbers rather than entry ids.
                            if b_id not in (12, 17, 25, 32, 33, 34, 35, 36, 37, 38, 43, 44, 47, 48, 49, 
                                            51, 52, 53, 54, 57, 59, 60, 61, 62, 65, 68, 70, 72, 73, 75, 
                                            76, 77, 78, 79, 80, 81, 85, 90, 91, 92, 93, 94, 95, 96, 97, 
                                            98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 
                                            110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 
                                            122, 123, 124, 125, 127, 128, 129, 130, 131, 134, 135, 136, 
                                            137, 138, 139, 141, 142, 143, 144, 145, 146, 147, 148, 149, 
                                            150, 151, 152, 154, 155, 156, 162, 164, 167, 170, 176, 181, 
                                            188, 189, 190, 191, 192, 193, 194, 195, 198, 199, 201, 202, 
                                            203, 204, 205, 206, 207, 208, 210, 211, 212, 214, 217, 218, 
                                            219, 229, 245, 246, 250, 251, 252, 253, 254, 255, 256, 259, 
                                            263, 264, 271, 273, 275, 278, 281, 283):
                                # pick the line that resembles the normalized title most;
                                # presumably match is (best line, score) - see fuzzywuzzy docs
                                match = process.extractOne(title_normalized, lines)
                                print('fuzzy match', match[1])
                                print(match[0])
                            
                                # every occurrence of the matched line's text is replaced
                                # NOTE(review): if the best match is an empty line,
                                # str.replace('') inserts the header between all
                                # characters - confirm this cannot happen
                                text = text.replace(match[0], '### {} {}\n'.format(int(lvl)*'|', match[0]))
                            
                                # (A commented-out experiment that searched the normalized
                                # text for the normalized title with a regex used to live
                                # here; it was not executed.)
                            print('unnormalized:\t', ti)
                            print('normalized:\t', title_normalized)
                            print()
            f.write(text.strip())
            f.write('\n')
            # sometimes the page marker information is incomplete (e.g. for book 12)
            # in that case, we don't write the marker information
            if not data['part'] is None and not data['page'] is None:
                # only the first whitespace-separated token of each value is used
                page = data['page'].split()[0]
                part = data['part'].split()[0]
                
                # values that are not numbers are reported and replaced by 0
                try:
                    part = int(part)
                except ValueError:
                    print('Invalid part {} ({})'.format(part, out_file))
                    part = 0
                    
                try:
                    page = int(page)
                except ValueError:
                    print('Invalid page {} ({})'.format(page, out_file))
                    page = 0
                
                # marker format: V + 3-digit part, P + 4-digit page
                page_marker = 'V{:03}P{:04}'.format(part, page)
                f.write(page_marker)
                f.write('\n')
    print('Titles found: {}, in text: {}, normalized: {}, not found: {}'.format(start, in_text, norm_tit, not_found))
    #print('---')
    #print('---')

# Try the conversion on a single book (12) and write the result to test.txt.
# NOTE(review): book_id is passed as 1 although the file is book 12 - confirm.
book = read_book('/home/jvdzwaan/data/adh-corpora/dawa/New_xml_corpus/12/book.xml', 1)
title, levels = read_title('/home/jvdzwaan/data/adh-corpora/dawa/New_xml_corpus/12/title.xml')
combine_data(book, title, 'test.txt')

In [None]:
import os
import pandas as pd

# Convert the books 1-285: merge text and titles into <out_dir>/<n>.txt,
# write the footnotes of each book to <notes_dir>/<n>.txt and record the
# title levels that occur in each book.
in_dir = '/home/jvdzwaan/data/adh-corpora/dawa/New_xml_corpus/'
out_dir = '/home/jvdzwaan/data/adh-corpora/dawa/txt/'
notes_dir = '/home/jvdzwaan/data/adh/dawa-notes/'

level_data = []

for i in tqdm(range(1, 286)):
    out_name = '{}.txt'.format(i)

    d = os.path.join(in_dir, str(i))
    book_file = os.path.join(d, 'book.xml')
    title_file = os.path.join(d, 'title.xml')

    missing_input = False
    if not os.path.isfile(book_file):
        print('Book file "{}" not found!'.format(book_file))
        missing_input = True

    if not os.path.isfile(title_file):
        print('Title file "{}" not found!'.format(title_file))
        missing_input = True

    if missing_input:
        # Parsing a missing file raises and would abort the run for all
        # remaining books, so skip this book after reporting it.
        continue

    book = read_book(book_file, i)
    title, levels = read_title(title_file)

    level_data.append({'book': i, 'levels': list(levels)})

    # write txt to output file
    out_file = os.path.join(out_dir, out_name)
    print(out_file)
    combine_data(book, title, out_file)

    # collect the footnotes of all entries that have them
    notes = [
        data['footnotes']
        for data in book.values()
        if data.get('footnotes') is not None
    ]

    # save notes to file
    if len(notes) > 0:
        out_file = os.path.join(notes_dir, out_name)
        print('Writing notes to', out_file)
        # Explicit UTF-8, like the text file written by combine_data; the
        # default encoding depends on the platform.
        with open(out_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(notes))
            f.write('\n')

    print('-'*80)

# one row per converted book, listing the title levels found in it
levels = pd.DataFrame(level_data)
levels = levels.set_index('book')
levels.head()

In [None]:
# Save the `levels` table (title levels per book) to a CSV file.
levels.to_csv('dawa_levels.csv', encoding='utf-8')

In [None]:
from collections import OrderedDict

from lxml import etree
from tqdm import tqdm_notebook as tqdm

import re


def read_almanar(fname, book_id=286):
    """Parse a book XML file whose texts may start with an author part.

    The ``<nass>`` text of every ``<book>`` element is split on a line of
    ten underscores. Depending on the number of parts and on whether the
    first part starts with the author marker, the parts are interpreted as
    author name, text and footnotes; unexpected combinations are printed.

    Args:
        fname: path of the XML file to parse.
        book_id: not used by this function.

    Returns:
        OrderedDict mapping entry id to a dict with the keys ``'text'``,
        ``'page'`` and ``'part'``, plus ``'footnotes'`` and/or ``'author'``
        when those were found. Entries without text are left out.
    """
    entries = OrderedDict()
    separator = '__________\n'
    author_marker = 'الكاتب'

    for _, book_elem in etree.iterparse(fname, events=('end', ), tag=('book')):
        for child in book_elem.iterchildren():
            if child.tag == 'id':
                t_id = child.text
            elif child.tag == 'page':
                page = child.text
            elif child.tag == 'part':
                part = child.text
            elif child.tag == 'nass':   # text
                # sometimes the text is None (e.g., file 16, id 214)
                text = child.text
                fn = None
                author = None
                if text is None:
                    print('None text found in {} (prev? id: {})'.format(fname, t_id))
                    continue

                # &#xd; is parsed as \r, but we would like to have \n
                text = text.replace('\r', '\n')

                # get the author name and footnotes
                parts = re.split(separator, text)
                head = parts[0]
                n_parts = len(parts)
                starts_with_author = head.startswith(author_marker)

                # the author marker is expected at the very start of the first part
                if author_marker in head and not starts_with_author:
                    print('Unexpected in', t_id)

                if n_parts == 3:
                    # three parts: author name, text, footnotes
                    if not starts_with_author:
                        print('3 parts found without author name', t_id)
                    text = head + parts[1]
                    fn = parts[2]
                    author = head
                elif n_parts == 2:
                    if starts_with_author:
                        # author name followed by the text; the author name
                        # stays in the text as well
                        text = head + parts[1]
                        author = head
                        print('2 parts found with author name', t_id)
                    else:
                        # text followed by footnotes
                        print('2 parts found without author name', t_id)
                        text = head
                        fn = parts[1]
                elif n_parts == 1:
                    # a single part probably is a title (level 1 or level 2)
                    if starts_with_author:
                        print('author name in title', t_id)
                    text = head

        # ignore entries that don't have text
        if text is not None:
            entries[t_id] = {'text': text, 'page': page, 'part': part}

        if fn is not None:
            entries[t_id]['footnotes'] = fn

        if author is not None:
            entries[t_id]['author'] = author

        # free the memory of elements that were already handled, see
        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        book_elem.clear()
        while book_elem.getprevious() is not None:
            del book_elem.getparent()[0]

    return entries

# Quick check: parse book 1 and print the number of text entries.
# NOTE(review): this calls read_book, not the read_almanar function defined
# in this cell - confirm that this is intended.
book = read_book('/home/jvdzwaan/data/adh-corpora/dawa/New_xml_corpus/1/book.xml', 1)
print(len(book))

In [None]:
# Split file 286 on level 1 (this file contains multiple journals).
# This cell only loads the text entries and the titles of file 286.

title_file = '/home/jvdzwaan/data/adh-corpora/dawa/New_xml_corpus/286/title.xml'
book_file = '/home/jvdzwaan/data/adh-corpora/dawa/New_xml_corpus/286/book.xml'

book = read_almanar(book_file, 286)
print(len(book))
title, levels = read_title(title_file)

In [None]:
# combine book and title
# and write to file
import codecs
import os

def write_almanar(book, title, start_n, level='1'):
    """Split a book into numbered text files at every title of `level`.

    The entries of `book` are walked in order. Each title is written as a
    header line (``### <bars> <title>``, one ``|`` per level) and its first
    occurrence is removed from the entry text. Whenever a title of `level`
    is met, the section collected so far is closed and a new one is started.
    Author names and footnotes of a section are collected as its notes.

    The sections are written to ``<n>.txt`` in the txt output directory,
    with ``n`` counting up from `start_n`; non-empty notes are written to
    ``<n>.txt`` in the module-level ``notes_dir``. All files are UTF-8.

    NOTE(review): text collected before the first title of `level` is
    discarded, and the section that is still open when the loop ends is
    never added to the output, so the last section of the book is not
    written. Confirm whether that is intended before changing it: the
    `start_n` values used by callers may rely on the current file count.

    Args:
        book: mapping of entry id to a dict with the keys 'text', 'page' and
            'part', and optionally 'author' and 'footnotes'.
        title: mapping of entry id to a list of dicts with the keys 'title'
            and 'level'. The lists are sorted in place by level.
        start_n: number used for the first output file.
        level: level (as a string) of the titles that start a new file.
    """
    to_write = []
    journal = []
    notes_year = []
    notes = []
    first = True
    for b_id, data in book.items():
        text = data['text']
        if b_id in title:
            # Sometimes the titles in the xml are in the wrong order
            # So sort them to put them in the right order
            title[b_id].sort(key=lambda t: int(t['level']))
            for t in title[b_id]:
                ti = t['title']
                lvl = t['level']

                if lvl == level:
                    # a new section starts here; close the previous one
                    if not first:
                        to_write.append(''.join(journal))
                        notes_year.append(''.join(notes))
                    else:
                        first = False
                    journal = []
                    notes = []

                journal.append('### {} {}\n'.format(int(lvl)*'|', ti))

                # Remove the first occurence of the title string
                text = text.replace(ti, '', 1)
        journal.append(text.strip())
        journal.append('\n')

        author = data.get('author')
        if author is not None:
            notes.append(author)

        fn = data.get('footnotes')
        if fn is not None:
            notes.append(fn)

        # sometimes the page marker information is incomplete (e.g. for book 12)
        # in that case, we don't write the marker information
        if data['part'] is not None and data['page'] is not None:
            # only the first whitespace-separated token of each value is used
            page = data['page'].split()[0]
            part = data['part'].split()[0]

            # values that are not numbers are reported and replaced by 0
            try:
                part = int(part)
            except ValueError:
                print('Invalid part {}'.format(part))
                part = 0

            try:
                page = int(page)
            except ValueError:
                print('Invalid page {}'.format(page))
                page = 0

            page_marker = 'V{:03}P{:04}'.format(part, page)
            journal.append(page_marker)
            journal.append('\n')
    print(len(to_write))

    # write text files
    for i, text in enumerate(to_write):
        n = start_n + i
        out_file = '/home/jvdzwaan/data/adh-corpora/dawa/txt/{}.txt'.format(n)
        print(out_file)
        # Explicit UTF-8: the default encoding depends on the platform and
        # the text is Arabic.
        with open(out_file, 'w', encoding='utf-8') as f:
            f.write(text)
            f.write('\n')

    # write notes
    for i, ns in enumerate(notes_year):
        n = i + start_n
        if ns != '':
            out_file = os.path.join(notes_dir, '{}.txt'.format(n))
            print(out_file)
            with open(out_file, 'w', encoding='utf-8') as f:
                f.write(ns)
                f.write('\n')
        
# Directory for the notes files; write_almanar reads this module-level name.
notes_dir = '/home/jvdzwaan/data/adh/dawa-notes/'
# Write the sections of the loaded book, numbering the output files from 286.
write_almanar(book, title, 286)

In [None]:
# Files 283 and 262 also need to be split into separate files;
# read_almanar and write_almanar are reused for that.
# This cell handles file 262, numbering the output files from 321.
# NOTE(review): the original comment said "split on level 1", but the call
# below splits on level '2' - confirm which is right.

i = 262
start_n = 321

title_file = '/home/jvdzwaan/data/adh-corpora/dawa/New_xml_corpus/{}/title.xml'.format(i)
book_file = '/home/jvdzwaan/data/adh-corpora/dawa/New_xml_corpus/{}/book.xml'.format(i)

book = read_almanar(book_file, i)
print(len(book))
title, levels = read_title(title_file)
    
write_almanar(book, title, start_n, level='2')


In [None]:
# Split file 283 on write_almanar's default level ('1'), numbering the
# output files from 336.
i = 283
start_n = 336

title_file = '/home/jvdzwaan/data/adh-corpora/dawa/New_xml_corpus/{}/title.xml'.format(i)
book_file = '/home/jvdzwaan/data/adh-corpora/dawa/New_xml_corpus/{}/book.xml'.format(i)

print(title_file)
print(book_file)

book = read_almanar(book_file, i)
print(len(book))
title, levels = read_title(title_file)
    
write_almanar(book, title, start_n)


In [None]:
# 278 should be split on level 3 headers; output files are numbered from 366.

# Directory for the notes files; write_almanar reads this module-level name.
notes_dir = '/home/jvdzwaan/data/adh/dawa-notes/'

i = 278
start_n = 366

title_file = '/home/jvdzwaan/data/adh-corpora/dawa/New_xml_corpus/{}/title.xml'.format(i)
book_file = '/home/jvdzwaan/data/adh-corpora/dawa/New_xml_corpus/{}/book.xml'.format(i)

print(title_file)
print(book_file)

book = read_almanar(book_file, i)
print(len(book))
title, levels = read_title(title_file)
write_almanar(book, title, start_n, '3')