In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# get Max's words
# The word list is read without a header row, so its first column is labelled 0.
# Alternative word list (commented out):
#terms_file = '/home/jvdzwaan/data/adh/word-lists/farada.txt'
terms_file = '/home/jvdzwaan/data/adh/word-lists/wajaba.txt'
terms = pd.read_csv(terms_file, encoding='utf-8', index_col=None, header=None)
# Display the raw single-table view of the word list.
terms

In [None]:
# Flatten column 0 to a list and compare the total count with the number of unique entries.
# NOTE(review): this rebinds `terms` from a DataFrame to a set, so running this
# cell a second time fails on `terms[0]` (a set cannot be indexed).
t = terms[0].tolist()
print('total number of terms:', len(t))
terms = set(t)
print('number of unique terms:', len(terms))

In [None]:
# root farada
# Root string passed to match_roots() further down, where it is compared with
# the backslash-separated 'proposed_root' values.
farada = 'فرض'

In [None]:
# root wajaba
# NOTE(review): in the cells visible here this name is not referenced again;
# a later cell repeats the same literal instead.
wajaba = 'وجب'

In [None]:
from lxml import etree
from tqdm import tqdm

def stemmer_xml2df2(fname):
    """Load stemmer XML output into a DataFrame with one row per <word> element.

    Args:
        fname: path (or file object) handed to ``etree.iterparse``.

    Returns:
        pandas.DataFrame with the columns ``word`` (the element's ``value``
        attribute) and ``proposed_root`` (the ``stem`` attribute of the last
        ``<analysis>`` child that has one, or None when there is none).
        The two columns are present even when no <word> element is found.

    Raises:
        KeyError: if a <word> element has no ``value`` attribute.
    """
    result = []

    # Stream the <word> elements one at a time instead of loading the whole tree.
    context = etree.iterparse(fname, events=('end', ), tag='word')
    for _event, elem in context:
        stem = None
        # Iterating an element yields its children; getchildren() is deprecated.
        for child in elem:
            if child.tag == 'analysis':
                # Keep the previous value when this analysis carries no stem.
                stem = child.get('stem', stem)
        result.append({'word': elem.attrib['value'], 'proposed_root': stem})

        # make iteration over context fast and consume less memory
        #https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(result, columns=['word', 'proposed_root'])

def analyzer_xml2df2(fname):
    """Load analyzer XML output into a DataFrame with one row per non-empty word.

    Args:
        fname: path (or file object) handed to ``etree.iterparse``.

    Returns:
        pandas.DataFrame with the columns ``word`` (the element's ``value``
        attribute) and ``proposed_root``: the distinct ``root`` attributes of
        the ``<analysis>`` children, sorted and joined with a backslash, or
        ``'NOANALYSIS'`` when no child provides a root. <word> elements whose
        value is the empty string are skipped. The two columns are present
        even when the result is empty.

    Raises:
        KeyError: if a <word> element has no ``value`` attribute.
    """
    result = []

    # Stream the <word> elements one at a time; tqdm reports progress.
    context = etree.iterparse(fname, events=('end', ), tag='word')
    for _event, elem in tqdm(context):
        word = elem.attrib['value']
        if word != '':
            # Only a missing 'root' attribute is tolerated; nothing else is
            # swallowed. Sorting makes the joined string reproducible.
            roots = sorted({
                child.attrib['root']
                for child in elem
                if child.tag == 'analysis' and 'root' in child.attrib
            })
            if not roots:
                roots = ['NOANALYSIS']
            result.append({'word': word, 'proposed_root': '\\'.join(roots)})

        # make iteration over context fast and consume less memory
        #https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(result, columns=['word', 'proposed_root'])

In [None]:
import os
import re

from tabulate import tabulate

def print_table(data):
    """Print five summary columns of ``data`` as a pipe-format table.

    The columns listed below are selected in this order and transposed, so
    each of them becomes one table row; the header row is 'name' / 'number'.
    ``data`` must contain all five columns (the metadata frame built by
    match_roots does).
    """
    row_order = [
        '# tokens',
        'semi-automatic root matches',
        'automatically extracted root matches',
        'overlap',
        'root',
    ]
    column_titles = ['name', 'number']
    transposed = data[row_order].transpose()
    rendered = tabulate(transposed, column_titles, tablefmt="pipe")
    print(rendered)

def root_correct(row, root):
    """Tell whether ``root`` is one of the roots proposed for a token.

    Args:
        row: mapping-like object (e.g. a DataFrame row) whose
            ``'proposed_root'`` entry holds the proposals joined by a backslash.
        root: the root string to look for.

    Returns:
        bool: True when ``root`` equals one of the backslash-separated
        proposals. False otherwise, including when the entry is not a string
        (None, or the float NaN that read_csv produces for an empty cell).
    """
    proposed = row['proposed_root']
    if not isinstance(proposed, str):
        # A missing proposal has no .split(); treat it as "no root proposed".
        return False
    return root in proposed.split('\\')

def regex_search(row, regex):
    """Tell whether ``regex`` matches anywhere inside the row's word.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a ``'word'`` entry.
        regex: compiled regular expression (anything with a ``search`` method).

    Returns:
        bool: True when ``regex.search`` finds a match in the word. False
        otherwise, including when the word is not a string (e.g. the float
        NaN that read_csv produces for empty or NA-like cells).
    """
    word = row['word']
    if not isinstance(word, str):
        # regex.search() raises TypeError on non-strings; they cannot match.
        return False
    return regex.search(word) is not None

def get_terms(txt_file):
    """Read a word list and return its unique entries.

    Args:
        txt_file: path of a UTF-8 text file. It is parsed with
            ``pd.read_csv(..., header=None)`` and only the first column
            (labelled 0) is used.

    Returns:
        set: the unique values of the first column. The total and unique
        counts are printed as a side effect.
    """
    # Read the file named by the argument, not a notebook-level variable.
    term_table = pd.read_csv(txt_file, encoding='utf-8', index_col=None, header=None)
    all_terms = term_table[0].tolist()
    print('total number of terms:', len(all_terms))
    unique_terms = set(all_terms)
    print('number of unique terms:', len(unique_terms))
    return unique_terms

def match_roots(data, terms, root, document, regex=False):
    """Compare word-list matches with the automatically extracted roots.

    Three boolean columns are added to ``data`` (the frame is modified in
    place and also returned):
      * ``root`` - the token matches the word list;
      * ``proposed_root_correct`` - ``root`` is one of the token's
        backslash-separated ``proposed_root`` values;
      * ``overlap`` - both of the above.
    Tokens or proposals that are not strings (e.g. NaN after a CSV
    round-trip) count as "no match".

    Args:
        data: DataFrame with ``word`` and ``proposed_root`` columns.
        terms: collection of term strings (the word list).
        root: root string to look for among the proposed roots.
        document: path of the analysed document; only its basename is used,
            as the index label of the metadata frame.
        regex: if True, the terms are joined with ``|`` into one regular
            expression (used as-is, not escaped) and a token matches when the
            pattern is found anywhere in it. If False, a token matches only
            when it equals one of the terms.

    Returns:
        tuple ``(metadata, matches, data)``: a one-row DataFrame of counts
        indexed by the document basename, the rows of ``data`` where either
        check is True, and ``data`` itself with the added columns.
    """
    print('number of tokens: ', data.shape[0])
    print('number of terms: ', len(terms))

    words = data['word']
    if regex:
        if len(terms) > 0:
            expr = re.compile(r'({})'.format('|'.join(terms)))
            in_word_list = words.map(
                lambda w: isinstance(w, str) and expr.search(w) is not None)
        else:
            # An empty alternation compiles to '()' and would match every token.
            in_word_list = pd.Series(False, index=data.index)
    else:
        # token matching
        in_word_list = words.isin(terms)

    # Column-wise map avoids building a Series object for every row.
    has_root = data['proposed_root'].map(
        lambda value: isinstance(value, str) and root in value.split('\\'))

    data['root'] = in_word_list.astype(bool)
    data['proposed_root_correct'] = has_root.astype(bool)
    data['overlap'] = data['root'] & data['proposed_root_correct']
    result = {
        'root': root,
        '# tokens': data.shape[0],
        'semi-automatic root matches': data['root'].sum(),
        'automatically extracted root matches': data['proposed_root_correct'].sum(),
        'overlap': data['overlap'].sum(),
        'document': os.path.basename(document)
    }

    metadata = pd.DataFrame.from_records([result], index='document')
    matches = data[data['proposed_root_correct'] | data['root']]

    return metadata, matches, data

## Khoja data

In [None]:
%%time
# Parse the stemmer XML into a DataFrame and load the word list to match against.
xml_file = '/home/jvdzwaan/data/tmp/adh/stemmer/0483IbnAhmadSarakhsi.Mabsut.xml'
terms_file = '/home/jvdzwaan/data/adh/word-lists/farada-short.txt'

terms = get_terms(terms_file)
khoja_data = stemmer_xml2df2(xml_file)
# Keep an untouched copy: match_roots() adds columns to the frame it is given.
khoja_orig = khoja_data.copy()

In [None]:
# NOTE(review): repeats the last statement of the previous cell.
khoja_orig = khoja_data.copy()

In [None]:
# Save the parsed tokens as CSV.
# NOTE(review): index=False is not passed, so the row index is written as well and
# the read_csv below will return it as an extra unnamed column - confirm this is intended.
khoja_orig.to_csv('/home/jvdzwaan/data/adh/0483IbnAhmadSarakhsi.Mabsut-khoja.csv', encoding='utf-8')

In [None]:
%%time
# Reload the tokens from the CSV written above (presumably to avoid re-parsing the XML).
khoja_orig = pd.read_csv('/home/jvdzwaan/data/adh/0483IbnAhmadSarakhsi.Mabsut-khoja.csv', encoding='utf-8')
khoja_data = khoja_orig.copy()

In [None]:
# Same word list path as in the parsing cell above.
terms_file = '/home/jvdzwaan/data/adh/word-lists/farada-short.txt'
terms = get_terms(terms_file)

In [None]:
# (rows, columns) of the working frame.
khoja_data.shape

In [None]:
# First rows of the untouched copy.
khoja_orig.head()

In [None]:
%%time
# regex=True: the terms are joined into one regular expression that is searched within each word.
# `document` is only used for its basename, which labels the metadata row.
document = '/home/jvdzwaan/data/adh/0483IbnAhmadSarakhsi.Mabsut-khoja.csv'
khoja_metadata, khoja_matches, khoja_data = match_roots(khoja_data, terms, farada, document, regex=True)

In [None]:
# Summary counts for this document.
print_table(khoja_metadata)

In [None]:
# Save the rows where either check matched; the relative path resolves against the working directory.
khoja_matches.to_csv('0483IbnAhmadSarakhsi.Mabsut-khoja-farada-short.csv', index=None, encoding='utf-8')

## AlKhalil data

In [None]:
%%time
# Parse the analyzer XML into a DataFrame (one row per non-empty word).
# NOTE(review): the wajaba word list loaded here is replaced by farada-short.txt
# two cells further down, before match_roots() is called.
xml_file = '/home/jvdzwaan/data/tmp/adh/big-xml/0483IbnAhmadSarakhsi.Mabsut.xml'
terms_file = '/home/jvdzwaan/data/adh/word-lists/wajaba.txt'

terms = get_terms(terms_file)
alk_data = analyzer_xml2df2(xml_file)
# Keep an untouched copy: match_roots() adds columns to the frame it is given.
alk_orig = alk_data.copy()

In [None]:
# NOTE(review): repeats the last statement of the previous cell.
alk_orig = alk_data.copy()

In [None]:
# Save the parsed tokens as CSV.
# NOTE(review): index=False is not passed, so the row index is written as well and
# the read_csv below will return it as an extra unnamed column - confirm this is intended.
alk_orig.to_csv('/home/jvdzwaan/data/adh/0483IbnAhmadSarakhsi.Mabsut-alkhalil.csv', encoding='utf-8')

In [None]:
# Word list actually used for the matching below.
terms_file = '/home/jvdzwaan/data/adh/word-lists/farada-short.txt'
terms = get_terms(terms_file)

In [None]:
%%time
# Reload the tokens from the CSV written above (presumably to avoid re-parsing the XML).
alk_orig = pd.read_csv('/home/jvdzwaan/data/adh/0483IbnAhmadSarakhsi.Mabsut-alkhalil.csv', encoding='utf-8')
alk_data = alk_orig.copy()

In [None]:
%%time
# regex=True: the terms are joined into one regular expression that is searched within each word.
# `document` is only used for its basename, which labels the metadata row.
document = '/home/jvdzwaan/data/adh/0483IbnAhmadSarakhsi.Mabsut-alkhalil.csv'
alk_metadata, alk_matches, alk_data = match_roots(alk_data, terms, farada, document, regex=True)

In [None]:
# Summary counts for this document.
print_table(alk_metadata)

In [None]:
# Save the rows where either check matched; the relative path resolves against the working directory.
alk_matches.to_csv('0483IbnAhmadSarakhsi.Mabsut-alkhalil-farada-short.csv', index=None, encoding='utf-8')

In [None]:
# Work on a fresh copy of the AlKhalil tokens so alk_orig stays unchanged.
data = alk_orig.copy()

In [None]:
data.head()

In [None]:
# root wajaba
# Flag the tokens whose backslash-separated proposed roots include this root.
root = 'وجب'
data['wajaba'] = data.apply(lambda row: root_correct(row, root), axis=1)

In [None]:
# root farada
# Same flag for the second root; note that `root` is rebound here.
root = 'فرض'
data['farada'] = data.apply(lambda row: root_correct(row, root), axis=1)

In [None]:
def num_roots(row):
    """Count the distinct roots proposed for a token.

    Args:
        row: mapping-like object (e.g. a DataFrame row) whose
            ``'proposed_root'`` entry holds the proposals joined by a backslash.

    Returns:
        int: number of distinct backslash-separated values (a placeholder
        such as ``'NOANALYSIS'`` counts as one), or 0 when the entry is not a
        string (None, or the float NaN that read_csv produces for an empty cell).
    """
    proposed = row['proposed_root']
    if not isinstance(proposed, str):
        # A missing proposal has no .split(); report zero proposed roots.
        return 0
    return len(set(proposed.split('\\')))

# Number of distinct proposed roots per token (row-wise apply of num_roots).
data['num_proposed_roots'] = data.apply(lambda row: num_roots(row), axis=1)

In [None]:
data.head()

In [None]:
# Mean number of proposed roots over all tokens.
data['num_proposed_roots'].mean()

In [None]:
# Mean number of proposed roots for tokens flagged in the 'wajaba' column.
data[data['wajaba'] == True]['num_proposed_roots'].mean()

In [None]:
# Mean number of proposed roots for tokens flagged in the 'farada' column.
data[data['farada'] == True]['num_proposed_roots'].mean()