In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from lxml import etree
from tqdm import tqdm

def stemmer_xml2df2(fname):
    """Read a stemmer XML output file into a DataFrame of words and stems.

    Every ``word`` element contributes one row. The row's ``proposed_root`` is
    the ``stem`` attribute of the last ``analysis`` child of that element, or
    ``None`` when the element has no ``analysis`` child.

    Parameters
    ----------
    fname : str or file-like
        XML source handed to ``etree.iterparse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``proposed_root``, one row per ``word`` element.
        Both columns exist even when no ``word`` element was found.

    Raises
    ------
    KeyError
        If a ``word`` element has no ``value`` attribute, or an ``analysis``
        element has no ``stem`` attribute.
    """
    result = []

    # Only 'end' events of <word> elements are requested, so every element is
    # complete (children included) by the time the loop body sees it.
    context = etree.iterparse(fname, events=('end',), tag='word')
    for _event, elem in context:
        stem = None
        # Iterate the element itself: Element.getchildren() is deprecated.
        for child in elem:
            if child.tag == 'analysis':
                stem = child.attrib['stem']
        result.append({'word': elem.attrib['value'], 'proposed_root': stem})

        # make iteration over context fast and consume less memory
        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    # Explicit columns keep the frame usable (data['proposed_root']) when the
    # file yielded no rows at all.
    return pd.DataFrame(result, columns=['word', 'proposed_root'])

def analyzer_xml2df2(fname):
    """Read an analyzer XML output file into a DataFrame of words and roots.

    Every ``word`` element with a non-empty ``value`` contributes one row.
    ``proposed_root`` holds the distinct ``root`` attributes of the element's
    ``analysis`` children, in order of first appearance, joined with a single
    backslash. ``analysis`` children without a ``root`` attribute are ignored;
    when no root is found at all the value is ``'NOANALYSIS'``.

    Parameters
    ----------
    fname : str or file-like
        XML source handed to ``etree.iterparse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``proposed_root``. Both columns exist even when
        no row was produced.

    Raises
    ------
    KeyError
        If a ``word`` element has no ``value`` attribute.
    """
    result = []

    # Only 'end' events of <word> elements are requested, so every element is
    # complete (children included) by the time the loop body sees it.
    context = etree.iterparse(fname, events=('end',), tag='word')
    for _event, elem in context:
        word = elem.attrib['value']
        if word != '':
            # Explicit membership test instead of a bare `except: pass`, which
            # would also have hidden unrelated errors and KeyboardInterrupt.
            roots = [
                child.attrib['root']
                for child in elem
                if child.tag == 'analysis' and 'root' in child.attrib
            ]
            # De-duplicate while keeping document order; going through set()
            # made the order of the joined roots vary between runs.
            roots = list(dict.fromkeys(roots))
            if not roots:
                roots.append('NOANALYSIS')
            result.append({'word': word, 'proposed_root': '\\'.join(roots)})

        # make iteration over context fast and consume less memory
        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    # Explicit columns keep the frame usable (data['word']) when the file
    # yielded no rows at all.
    return pd.DataFrame(result, columns=['word', 'proposed_root'])

In [None]:
# one document in the corpus is a book
from nlppln.utils import get_files

#xml_file1 = '/home/jvdzwaan/data/tmp/adh/chapters/1266MuhammadHasanNajafiJawhari.JawahirKalam.xml'
#xml_file2 = '/home/jvdzwaan/data/tmp/adh/chapters/0381IbnBabawayh.Hidaya.xml'

# Directory passed to get_files() below to collect the input XML files.
# NOTE(review): hard-coded absolute path; adjust to the local environment.
in_dir = '/home/jvdzwaan/data/tmp/adh/analysis/alkhalil/'

def corpus(in_files):
    """Lazily yield one list of words per input file.

    Each path in ``in_files`` is parsed with ``analyzer_xml2df2`` and the
    ``word`` column of the resulting frame is yielded as a plain list.
    Nothing is read until the generator is iterated.

    Note: a later cell defines another ``corpus`` with a different signature;
    once that cell has run, it replaces this function.
    """
    for xml_path in in_files:
        yield list(analyzer_xml2df2(xml_path)['word'])
        
# get_files comes from nlppln.utils; presumably it lists the files in `in_dir` - confirm.
in_files = get_files(in_dir)
# `corpus` is a generator function, so no file is parsed until `c` is iterated.
c = corpus(in_files)

In [None]:
%%time
# one document in the corpus is a school
import os
import codecs

from itertools import chain

import pandas as pd

from nlppln.utils import get_files

#xml_file1 = '/home/jvdzwaan/data/tmp/adh/chapters/1266MuhammadHasanNajafiJawhari.JawahirKalam.xml'
#xml_file2 = '/home/jvdzwaan/data/tmp/adh/chapters/0381IbnBabawayh.Hidaya.xml'

# CSV with the corpus metadata; this notebook uses its BookSUBJ and BookURI columns.
# NOTE(review): hard-coded absolute path; adjust to the local environment.
md_file = '/home/jvdzwaan/data/adh-corpora/fiqh_corpus/Meta/Metadata_Fiqh.csv'

# Directory expected to contain one `<BookURI>.xml` file per book (see `corpus` below).
in_dir = '/home/jvdzwaan/data/tmp/adh/analysis/alkhalil/'

metadata = pd.read_csv(md_file, encoding='utf-8')
#print(metadata.head())
# One group per BookSUBJ value; `corpus` turns every group into a single document.
schools = metadata.groupby('BookSUBJ')

def read_file_analyzer(in_file):
    """Return the words of one analyzer XML file as a plain list.

    The file is parsed with ``analyzer_xml2df2``; only the ``word`` column is
    returned, the ``proposed_root`` column is not used here.
    """
    return list(analyzer_xml2df2(in_file)['word'])

def read_file_stemmer(in_file):
    """Return the stems of one stemmer XML file as a plain list.

    The file is parsed with ``stemmer_xml2df2``; the ``proposed_root`` column
    (one entry per word) is returned, the ``word`` column is not used here.
    """
    return list(stemmer_xml2df2(in_file)['proposed_root'])
    

def corpus(schools, in_dir, analyzer=True):
    """Yield one chained stream of terms per group in ``schools``.

    Parameters
    ----------
    schools : iterable of (name, data) pairs
        ``data['BookURI']`` must be iterable and give the book identifiers of
        the group (this notebook passes ``metadata.groupby('BookSUBJ')``).
    in_dir : str
        Directory that contains ``<BookURI>.xml`` for every book.
    analyzer : bool
        When truthy, books are read with ``read_file_analyzer``; otherwise
        with ``read_file_stemmer``.

    Yields
    ------
    itertools.chain
        The terms of all books of one group, in ``BookURI`` order. All books
        of a group are read before that group's chain is yielded.
    """
    for idx, (name, data) in enumerate(schools):
        # progress output: running index and group name
        print(idx, name)

        book_paths = (
            os.path.join(in_dir, '{}.xml'.format(book))
            for book in data['BookURI']
        )
        per_book = [
            read_file_analyzer(path) if analyzer else read_file_stemmer(path)
            for path in book_paths
        ]

        yield chain.from_iterable(per_book)
            
# analyzer=False selects read_file_stemmer, so the documents consist of proposed roots.
# `corpus` is a generator function: no XML is read until `c` is iterated.
c = corpus(schools, in_dir, analyzer=False)

In [None]:
# NOTE(review): this cell originally called `read_file`, which is not defined
# anywhere in the visible notebook and raises NameError on a fresh kernel.
# `read_file_analyzer` (returns the words of one file) is used instead;
# switch to `read_file_stemmer` if the stems were intended.
words = read_file_analyzer('/home/jvdzwaan/data/tmp/adh/analysis/alkhalil/0311AbuBakrKhallal.WuqufWaTarajjul.xml')
print(len(words))

In [None]:
def read_text_file(in_file):
    """Lazily yield the whitespace-separated words of a UTF-8 text file.

    Parameters
    ----------
    in_file : str
        Path of the text file to read.

    Yields
    ------
    str
        The words in file order; empty lines contribute nothing.
    """
    # Built-in open() replaces the legacy codecs.open(), so this cell no longer
    # depends on `import codecs` from another cell. Because every line is split
    # on whitespace, the words produced are the same.
    with open(in_file, encoding='utf-8') as f:
        for ln in f:
            yield from ln.split()

In [None]:
%%time
# Consume the generator `c`: one list of terms per document.
# This is the step where the XML files are actually read, since `corpus` is lazy.
data = [list(terms) for terms in c]

In [None]:
# Number of terms in each document.
# NOTE: after this loop the global name `terms` stays bound to the last document in `data`.
for terms in data:
    print(len(terms))

In [None]:
# Quick sanity check: first term of the first document.
print(data[0][0])

In [None]:
%%time
from weighwords import ParsimoniousLM

# Build the language model over all documents (a shallow copy of `data` is passed).
# `w` is handed to weighwords unchanged; presumably the document-model weight - confirm in its docs.
model = ParsimoniousLM([terms for terms in data], w=.01)

In [None]:
%%time

# One two-column frame ('<i>_term', '<i>_p') per document, collected in `dfs`.
dfs = []

top_k = 25
for i, terms in enumerate(data):
    result = []
    # model.top yields (term, p) pairs; np.exp(p) is stored, so p is presumably a log-probability - confirm.
    for term, p in model.top(top_k, terms, max_iter=100):
        result.append({'{}_term'.format(i): term, '{}_p'.format(i): np.exp(p)})
        #print("    %s %.4f" % (term, np.exp(p)))
        #print(term)
    dfs.append(pd.DataFrame(result))
    #print('---')
    #print('---')

In [None]:
# Side-by-side table: one term column and one probability column per document.
pd.concat(dfs, axis=1, sort=False)

In [None]:
from weighwords import ParsimoniousLM

def calculate(data, w, top_k=25, max_iter=10000):
    """Build a ParsimoniousLM over ``data`` and tabulate the top terms per document.

    Parameters
    ----------
    data : iterable of sequences of terms
        One entry per document. It is materialised once, so a one-shot
        iterable (e.g. a generator) is supported.
    w : float
        Passed unchanged to ``ParsimoniousLM`` as ``w``.
    top_k : int
        Number of terms requested per document from ``model.top``.
    max_iter : int
        Passed unchanged to ``model.top`` as ``max_iter``; the default is the
        value that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        For document ``i`` the columns ``'<i>_term'`` and ``'<i>_p'``, placed
        side by side. ``'<i>_p'`` is ``np.exp`` of the score returned by
        ``model.top`` (presumably a log-probability - confirm in weighwords).
    """
    # Materialise once: the documents are needed both to build the model and
    # to query it, and iterating a generator twice would silently yield nothing
    # the second time.
    docs = list(data)
    model = ParsimoniousLM(docs, w=w)

    # calculate terms and weights
    frames = []
    for i, terms in enumerate(docs):
        term_col = '{}_term'.format(i)
        p_col = '{}_p'.format(i)
        rows = [
            {term_col: term, p_col: np.exp(p)}
            for term, p in model.top(top_k, terms, max_iter=max_iter)
        ]
        # Explicit columns keep both columns present when a document has no terms.
        frames.append(pd.DataFrame(rows, columns=[term_col, p_col]))
    return pd.concat(frames, axis=1, sort=False)

In [None]:
import logging

# Switch on INFO-level logging before running the model,
# presumably to see progress messages emitted by weighwords - confirm.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Top 25 terms per document for w=0.005; the resulting table is the cell output.
calculate(data, w=0.005, top_k=25)

In [None]:
%%time
# Sweep over several values of w; wordcloud_data[k] holds the table for the k-th value below.
wordcloud_data = []

for w in (1.0, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001):
    wordcloud_data.append(calculate(data, w=w))
    

In [None]:
def get_terms(txt_file):
    """Load a term list from a UTF-8 text file and return the distinct terms.

    The file is read with ``pandas.read_csv`` without a header row; the terms
    are taken from the first column. The total and the unique number of terms
    are printed.

    Parameters
    ----------
    txt_file : str
        Path of the term list, one term per line.

    Returns
    -------
    set of str
        The distinct terms.
    """
    # get the terms list
    # dtype=str and keep_default_na=False keep every term as the string that is
    # in the file. With the read_csv defaults, entries such as 'NA' or 'null'
    # become NaN and digit-only entries may become numbers, so a later
    # `term in terms` test against a string would silently fail.
    terms = pd.read_csv(txt_file, encoding='utf-8', index_col=None, header=None,
                        dtype=str, keep_default_na=False)
    t = terms[0].tolist()
    print('total number of terms:', len(t))
    terms = set(t)
    print('number of unique terms:', len(terms))
    return terms
# Set of stopwords, used by `sw` to highlight table cells.
# NOTE(review): hard-coded absolute path; adjust to the local environment.
stopwords = get_terms('/home/jvdzwaan/data/adh/stopwords/custom.txt')

In [None]:
def sw(term):
    """Return the CSS for one table cell: yellow background for stopwords.

    ``term`` is looked up in the module-level ``stopwords`` collection. Terms
    that are not in it get an empty style string.
    """
    if term in stopwords:
        return 'background-color: yellow'
    return ''

In [None]:
# Table for the last w of the sweep (index 6, i.e. w=0.001); `sw` is applied to every cell,
# so cells whose value is in `stopwords` are shown with a yellow background.
wordcloud_data[6].style.applymap(sw)

In [None]:
# Top terms of document 0 for w=0.005 (index 5 of the sweep), one per line.
print('\n'.join(list(wordcloud_data[5]['0_term'])))

In [None]:
%%time
import codecs

# One word generator per text file (read_text_file is lazy, nothing is read yet).
c_from_text = [read_text_file(t) for t in ('0.txt', '1.txt', '2.txt', '3.txt', '4.txt')]
# Number of words per file. NOTE: list() consumes the generators, so `c_from_text`
# yields nothing if it is iterated again after this line.
[len(list(terms)) for terms in c_from_text]

In [None]:
import os

top_k = 20
# words[<file name>][<term>] = np.exp(score) for the top terms of that file.
words = {}

# NOTE(review): `terms` is not assigned in this cell. In the visible notebook its last
# binding is the loop variable of `for terms in data`, i.e. a single document, so `doc`
# would be one term rather than a document. Also, `in_files` lists book files while
# `model` was built from the per-school `data`. This cell looks like a leftover from the
# per-book analysis - TODO confirm the intended inputs before relying on its output.
for fname, doc in zip(in_files, terms):
    print("Top %d words in %s:" % (top_k, os.path.basename(fname)))
    words[os.path.basename(fname)] = {}
    for term, p in model.top(top_k, doc):
        print("    %s %.4f" % (term, np.exp(p)))
        words[os.path.basename(fname)][term] = np.exp(p)
print("")

In [None]:
# Book that Christian considers important
# 0620IbnQudamaMaqdisi.MughniFiFiqh.xml

In [None]:
# NOTE(review): relies on `terms` left over from an earlier cell (see `for terms in data`);
# it is not defined in this cell.
print(len(terms))

In [None]:
from wordcloud import WordCloud

# Word cloud renderer on a white background. The font file is presumably chosen so that
# the Arabic terms render correctly - confirm.
# NOTE(review): hard-coded system font path; it must exist on the machine running this.
wc = WordCloud(background_color="white", font_path='/usr/share/fonts/opentype/fonts-hosny-amiri/amiri-quran.ttf')
# generate word cloud
# Input is the {term: weight} dict stored in `words` for this book.
wc.generate_from_frequencies(words['0179MalikIbnAnas.Muwatta.xml'])

# show
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Reuse the `wc` renderer for another book; generate_from_frequencies is called again
# with that book's {term: weight} dict from `words`.
wc.generate_from_frequencies(words['0483IbnAhmadSarakhsi.Mabsut.xml'])

# show
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# List every stored term of this book with its weight.
# NOTE: the loop variable `w` rebinds the global `w` that the w-sweep cell also uses.
for word, w in words['0483IbnAhmadSarakhsi.Mabsut.xml'].items():
    print(word, w)