In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from lxml import etree
from tqdm import tqdm

def stemmer_xml2df2(fname):
    """Parse a stemmer XML file into a DataFrame of words and stems.

    Streams over every ``<word>`` element of ``fname`` and records its
    ``value`` attribute together with the ``stem`` attribute of its
    ``<analysis>`` child. If a word has several ``<analysis>`` children the
    last one wins; if it has none, the stem is ``None``.

    Args:
        fname: Input XML, handed unchanged to ``etree.iterparse``.

    Returns:
        pandas.DataFrame with one row per ``<word>`` element and the columns
        ``word`` and ``proposed_root``.

    Raises:
        KeyError: If a ``<word>`` lacks a ``value`` attribute or an
            ``<analysis>`` lacks a ``stem`` attribute.
    """
    result = []

    # Only 'end' events for <word> elements are requested, so each element is
    # handed over after its children have been parsed.
    context = etree.iterparse(fname, events=('end', ), tag=('word'))
    for event, elem in context:
        stem = None
        # Iterate the element directly: getchildren() is deprecated in lxml.
        for child in elem:
            if child.tag == 'analysis':
                stem = child.attrib['stem']
        result.append({'word': elem.attrib['value'], 'proposed_root': stem})

        # make iteration over context fast and consume less memory
        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    return pd.DataFrame(result)

def analyzer_xml2df2(fname):
    """Parse an analyzer XML file into a DataFrame of words and candidate roots.

    Streams over every ``<word>`` element of ``fname``. Words whose ``value``
    attribute is the empty string are skipped. For every other word the
    ``root`` attribute of each ``<analysis>`` child is collected; duplicates
    are dropped and the distinct roots are sorted and joined with a
    backslash. A word without any root gets the placeholder ``'NOANALYSIS'``.

    Args:
        fname: Input XML, handed unchanged to ``etree.iterparse``.

    Returns:
        pandas.DataFrame with one row per non-empty word and the columns
        ``word`` and ``proposed_root``.

    Raises:
        KeyError: If a ``<word>`` element lacks a ``value`` attribute.
    """
    result = []

    # Extract the words
    context = etree.iterparse(fname, events=('end', ), tag=('word'))
    for event, elem in tqdm(context):
        word = elem.attrib['value']
        if word != '':
            # Iterate the element directly (getchildren() is deprecated in
            # lxml) and skip analyses that carry no 'root' attribute.
            roots = {
                child.attrib['root']
                for child in elem
                if child.tag == 'analysis' and 'root' in child.attrib
            }
            # Sort so the joined string is identical on every run; the
            # iteration order of a set of strings can differ between
            # interpreter sessions.
            proposed = '\\'.join(sorted(roots)) if roots else 'NOANALYSIS'
            result.append({'word': word, 'proposed_root': proposed})

        # make iteration over context fast and consume less memory
        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    return pd.DataFrame(result)

In [None]:
from nlppln.utils import get_files

#xml_file1 = '/home/jvdzwaan/data/tmp/adh/chapters/1266MuhammadHasanNajafiJawhari.JawahirKalam.xml'
#xml_file2 = '/home/jvdzwaan/data/tmp/adh/chapters/0381IbnBabawayh.Hidaya.xml'

# Directory whose files are listed with get_files() and parsed by
# analyzer_xml2df2 (through corpus).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
in_dir = '/home/jvdzwaan/data/tmp/adh/analysis/alkhalil/'

def corpus(in_files):
    """Lazily produce the word list of every analyzed input.

    Args:
        in_files: Iterable of inputs; each one is passed to
            ``analyzer_xml2df2``.

    Yields:
        list: The entries of the ``word`` column returned by
        ``analyzer_xml2df2`` for one input, in the order of ``in_files``.
    """
    # The generator expression keeps the work lazy: an input is only analyzed
    # when its word list is requested.
    frames = (analyzer_xml2df2(path) for path in in_files)
    for frame in frames:
        yield list(frame['word'])
        
# get_files comes from nlppln.utils; presumably it returns the files found in
# in_dir as a re-iterable list -- TODO confirm (in_files is iterated again later).
in_files = get_files(in_dir)
# corpus() is a generator function, so c is single-use: after one complete
# pass it yields nothing more until this cell is run again.
c = corpus(in_files)

In [None]:
from weighwords import ParsimoniousLM

# Materialize one word list per input file. A fresh generator is built here
# instead of draining the shared generator c, so this cell can be re-run:
# c is exhausted after a single pass and would leave terms empty on a re-run.
terms = list(corpus(in_files))

# Fit the language model on all documents.
# NOTE(review): w is passed straight to ParsimoniousLM -- see the weighwords
# documentation for its exact meaning.
model = ParsimoniousLM(terms, w=.01)

In [None]:
import os

top_k = 20
words = {}

# Print and store, per input file, the top_k terms returned by the model.
# model.top() yields (term, p) pairs; p is exponentiated before it is printed
# and stored, i.e. it is treated as a log-probability.
for fname, doc in zip(in_files, terms):
    base = os.path.basename(fname)  # header label and key into words
    print("Top %d words in %s:" % (top_k, base))
    words[base] = {}
    for term, p in model.top(top_k, doc):
        weight = np.exp(p)
        print("    %s %.4f" % (term, weight))
        words[base][term] = weight
    # Blank line separates the lists of consecutive files.
    print("")

In [None]:
# Number of documents in the corpus (terms holds one word list per input file).
print(len(terms))

In [None]:
from wordcloud import WordCloud

# White-background word cloud rendered with the font file at font_path.
# NOTE(review): font_path is an absolute system path -- confirm the font is
# installed there on the machine running this notebook.
wc = WordCloud(background_color="white", font_path='/usr/share/fonts/opentype/fonts-hosny-amiri/amiri-quran.ttf')
# generate word cloud from the term -> weight mapping stored for this file
wc.generate_from_frequencies(words['0179MalikIbnAnas.Muwatta.xml'])

# show the rendered cloud with the axes hidden
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Reuse the existing WordCloud object wc and re-fit it to the term -> weight
# mapping stored for this file.
wc.generate_from_frequencies(words['0483IbnAhmadSarakhsi.Mabsut.xml'])

# show the rendered cloud with the axes hidden
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# List every stored (term, weight) pair for this file so the numbers can be
# inspected directly.
for word, w in words['0483IbnAhmadSarakhsi.Mabsut.xml'].items():
    print(word, w)