In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# get Max's words
# The list is read without a header row, so pandas labels the column(s) with
# integers and the words end up in column 0 (used by the next cell).
term = '/home/jvdzwaan/data/adh/word-lists/farada.txt'
# Fixed: the path variable is `term`; the original passed the undefined name
# `terms`, which raised a NameError on a fresh kernel.
farada = pd.read_csv(term, encoding='utf-8', index_col=None, header=None)
farada

In [None]:
# Column 0 of the word-list frame as a plain Python list (the file was read with
# header=None, so the column label is the integer 0).
f = farada[0].tolist()

In [None]:
# Number of entries in the word list, duplicates included.
print(len(f))

In [None]:
# Deduplicate the word list for membership tests.
# NOTE: this rebinds `farada` from the DataFrame to a set, so the cell that
# computes `f = farada[0].tolist()` cannot be re-run after this one without
# reloading the word list first.
farada = set(f)

In [None]:
# Number of distinct entries in the word list.
len(farada)

In [None]:
# Root of "farada": the value the extracted/proposed roots are compared against below.
root = 'فرض'

In [None]:
# taken from root-extraction-performance.ipynb
import codecs

from bs4 import BeautifulSoup

def stemmer_xml2df(fname):
    """Parse a stemmer XML file into a DataFrame with one row per ``<word>``.

    Args:
        fname: Path of the XML file to read.

    Returns:
        pandas.DataFrame with the columns ``word`` (the ``value`` attribute of
        the ``<word>`` element) and ``proposed_root`` (the ``stem`` attribute
        of the word's first ``<analysis>`` element, or ``None`` when the word
        has no ``<analysis>`` element).

    Raises:
        KeyError: If a ``<word>`` lacks ``value`` or its ``<analysis>`` lacks
            ``stem``.
    """
    # Read bytes rather than text: decoding is then left to BeautifulSoup,
    # which can honour the encoding declared in the XML, instead of silently
    # depending on the platform's default text encoding.
    with open(fname, 'rb') as f:
        soup = BeautifulSoup(f.read(), 'xml')

    result = []
    for word in soup.find_all('word'):
        # `word.analysis` is None when there is no <analysis> element; record
        # a missing root instead of failing with a TypeError on subscripting.
        analysis = word.analysis
        stem = analysis['stem'] if analysis is not None else None
        result.append({'word': word['value'], 'proposed_root': stem})

    return pd.DataFrame(result)

def analyzer_xml2df(fname):
    """Parse an analyzer XML file into a DataFrame with one row per ``<word>``.

    Args:
        fname: Path of the XML file to read.

    Returns:
        pandas.DataFrame with the columns ``word`` (the ``value`` attribute of
        the ``<word>`` element) and ``proposed_root``: the distinct ``root``
        attributes of the word's ``<analysis>`` elements, sorted and joined
        with a backslash. An analysis without a ``root`` attribute contributes
        ``'NO_ROOT'``; a word without any analysis gets ``'NOANALYSIS'``.
    """
    # Read bytes rather than text: decoding is then left to BeautifulSoup,
    # which can honour the encoding declared in the XML, instead of silently
    # depending on the platform's default text encoding.
    with open(fname, 'rb') as f:
        soup = BeautifulSoup(f.read(), 'xml')

    result = []

    for word in soup.find_all('word'):
        analyses = word.find_all('analysis')
        # Deduplicate, then sort: the iteration order of a set of strings can
        # change between interpreter runs, which would make the joined value
        # (and any CSV written from it) differ from run to run.
        roots = sorted({a.get('root', 'NO_ROOT') for a in analyses})
        if len(roots) == 0:
            roots.append('NOANALYSIS')
        result.append({'word': word['value'], 'proposed_root': '\\'.join(roots)})

    return pd.DataFrame(result)

In [None]:
from lxml import etree
from tqdm import tqdm

def stemmer_xml2df2(fname):
    """Stream a stemmer XML file into a DataFrame with one row per ``<word>``.

    The file is parsed incrementally with ``etree.iterparse`` and every
    ``<word>`` element is discarded once it has been turned into a row.

    Args:
        fname: Path of the XML file to read.

    Returns:
        pandas.DataFrame with the columns ``word`` (the ``value`` attribute of
        the ``<word>`` element) and ``proposed_root`` (the ``stem`` attribute
        of the last direct ``<analysis>`` child, or ``None`` if there is none).
    """
    rows = []

    # Only fully parsed <word> elements are delivered by the iterator.
    for _, word_elem in etree.iterparse(fname, events=('end',), tag='word'):
        stem = None
        for child in word_elem:
            if child.tag != 'analysis':
                continue
            # A later <analysis> child overrides an earlier one.
            stem = child.attrib['stem']
        rows.append({'word': word_elem.attrib['value'], 'proposed_root': stem})

        # Free the element and the already processed siblings before it, so
        # that iterating stays fast and memory use stays low.
        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        word_elem.clear()
        while word_elem.getprevious() is not None:
            del word_elem.getparent()[0]

    return pd.DataFrame(rows)

def analyzer_xml2df2(fname):
    """Stream an analyzer XML file into a DataFrame with one row per ``<word>``.

    The file is parsed incrementally with ``etree.iterparse`` (progress is
    reported through ``tqdm``) and every ``<word>`` element is discarded once
    it has been processed.

    Args:
        fname: Path of the XML file to read.

    Returns:
        pandas.DataFrame with the columns ``word`` (the ``value`` attribute of
        the ``<word>`` element) and ``proposed_root``: the distinct ``root``
        attributes of the word's direct ``<analysis>`` children, sorted and
        joined with a backslash, or ``'NOANALYSIS'`` when no root was found.
        Words whose ``value`` is the empty string are skipped, and analyses
        without a ``root`` attribute are ignored.
    """
    result = []

    # Extract the words
    context = etree.iterparse(fname, events=('end', ), tag='word')
    for event, elem in tqdm(context):
        word = elem.attrib['value']
        if word != '':
            roots = set()
            for a in elem:
                if a.tag != 'analysis':
                    continue
                try:
                    roots.add(a.attrib['root'])
                except KeyError:
                    # Analysis without a root attribute: deliberately ignored.
                    # Only the missing-attribute case is swallowed; any other
                    # error now surfaces instead of being hidden.
                    continue
            # Sort so the joined value is reproducible: the iteration order of
            # a set of strings can change between interpreter runs.
            roots = sorted(roots)
            if len(roots) == 0:
                roots.append('NOANALYSIS')
            result.append({'word': word, 'proposed_root': '\\'.join(roots)})

        # make iteration over context fast and consume less memory
        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    return pd.DataFrame(result)

## Khoja data

In [None]:
%%time
# Parse the stemmer output with the BeautifulSoup-based reader; the cell is
# timed so it can be compared with the lxml-based reader in the next cell.
stemmed_file = '/home/jvdzwaan/data/tmp/adh/stemmer/0483IbnAhmadSarakhsi.Mabsut.xml'

khoja = stemmer_xml2df(stemmed_file)
# (rows, columns) of the parsed frame
print(khoja.shape)

In [None]:
%%time
# Parse the same stemmer file again, this time with the streaming lxml-based
# reader; the resulting `data` frame is the one used in the cells below.
stemmed_file = '/home/jvdzwaan/data/tmp/adh/stemmer/0483IbnAhmadSarakhsi.Mabsut.xml'

data = stemmer_xml2df2(stemmed_file)
# (rows, columns) of the parsed frame
print(data.shape)

In [None]:
# Preview the first rows of the parsed stemmer output.
data.head()

In [None]:
%%time
# Flag the rows whose word occurs in the `farada` word list. `Series.isin`
# does the membership test for the whole column at once instead of calling a
# Python lambda once per row.
data['farada'] = data['word'].isin(farada)

In [None]:
# Summing the boolean column counts the rows whose word is in the word list.
data['farada'].sum()

In [None]:
# Flag the rows whose proposed root is exactly equal to `root`.
data['proposed_root_farada'] = data['proposed_root'] == root

In [None]:
# Number of rows whose proposed root matches `root`.
data['proposed_root_farada'].sum()

In [None]:
# overlap: rows that are in the word list AND have the matching proposed root.
# Both columns are already boolean, so they are combined directly instead of
# first comparing each of them to True.
data['overlap'] = data['farada'] & data['proposed_root_farada']

In [None]:
# Number of rows flagged by both criteria.
data['overlap'].sum()

In [None]:
# Rows flagged by at least one criterion (matching proposed root or word-list
# hit); the boolean columns are used as masks directly, without `== True`.
data[data['proposed_root_farada'] | data['farada']]

In [None]:
# Save the rows flagged by at least one criterion (matching proposed root or
# word-list hit). `index` is a boolean option of `to_csv`: pass False
# explicitly (instead of None) to leave the row index out of the file.
data[data['proposed_root_farada'] | data['farada']].to_csv('0483IbnAhmadSarakhsi.Mabsut-khoja-farada.csv', index=False, encoding='utf-8')

## AlKhalil data

In [None]:
%%time
# Parse the analyzer output with the streaming lxml-based reader.
# NOTE: this rebinds `data`; the frame built from the stemmer output above is
# no longer reachable under that name after this cell has run.
analyzed_file = '/home/jvdzwaan/data/tmp/adh/big-xml/0483IbnAhmadSarakhsi.Mabsut.xml'

data = analyzer_xml2df2(analyzed_file)
# (rows, columns) of the parsed frame
print(data.shape)

In [None]:
# Preview the first rows of the parsed analyzer output.
data.head()

In [None]:
%%time
# Flag the rows whose word occurs in the `farada` word list. `Series.isin`
# does the membership test for the whole column at once instead of calling a
# Python lambda once per row.
data['farada'] = data['word'].isin(farada)

In [None]:
# Summing the boolean column counts the rows whose word is in the word list.
data['farada'].sum()

In [None]:
def root_correct(row, root):
    """Tell whether ``root`` is one of the roots proposed for a row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose
            ``'proposed_root'`` entry is a string of roots separated by
            backslashes.
        root: The root to look for.

    Returns:
        True if ``root`` is equal to one of the separated roots, else False.
    """
    separated = row['proposed_root'].split('\\')
    return root in {*separated}

# Show which root is being matched.
print(root)
# Row-wise check: True where `root` is among the row's backslash-separated
# proposed roots (a row can carry several roots here, so a plain equality
# test on the column would miss those rows).
data['proposed_root_farada'] = data.apply(lambda row: root_correct(row, root), axis=1)

In [None]:
# Number of rows for which `root` is among the proposed roots.
data['proposed_root_farada'].sum()

In [None]:
# overlap: rows that are in the word list AND have the matching proposed root.
# Both columns are already boolean, so they are combined directly instead of
# first comparing each of them to True.
data['overlap'] = data['farada'] & data['proposed_root_farada']

In [None]:
# Number of rows flagged by both criteria.
data['overlap'].sum()

In [None]:
# Rows flagged by at least one criterion (matching proposed root or word-list
# hit); the boolean columns are used as masks directly, without `== True`.
data[data['proposed_root_farada'] | data['farada']]

In [None]:
# Save the rows flagged by at least one criterion (matching proposed root or
# word-list hit). `index` is a boolean option of `to_csv`: pass False
# explicitly (instead of None) to leave the row index out of the file.
data[data['proposed_root_farada'] | data['farada']].to_csv('0483IbnAhmadSarakhsi.Mabsut-alkhalil-farada.csv', index=False, encoding='utf-8')