In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import holoviews as hv
# Load the HoloViews plotting extension with the 'bokeh' and 'matplotlib' backends.
hv.extension('bokeh', 'matplotlib')

In [None]:
# Load the category list; its 'cat' column is reshaped into a 10x10 grid in a later cell.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
data = pd.read_csv('/home/jvdzwaan/data/tmp/category-list.csv')
data.head()

In [None]:
# Arrange the 'cat' column as a 10x10 array (the reshape requires exactly 100 rows).
values = data['cat'].values.reshape(10,10)

In [None]:
# Display the reshaped grid.
values

In [None]:
import xarray as xr

# Wrap the grid in a DataArray whose two dimensions are named 'x' and 'y'.
xr_data = xr.DataArray(values, dims=['x', 'y'])

In [None]:
# Enumerate the (x, y) index pairs of the grid.
for ix, iy in np.ndindex(values.shape):
    print(ix,iy)

In [None]:
# Gridded dataset with key dimensions 'x' and 'y' and value dimension 'cat'.
ds = hv.Dataset((range(10), range(10), values), ['x', 'y'], ['cat'])
ds

In [None]:
# The same grid passed directly to an Image element, using the 'grid' datatype.
img = hv.Image((range(10), range(10), values), datatype=['grid'])
img

In [None]:
%opts Image (cmap='viridis')
# Convert the dataset to an Image over the 'x' and 'y' key dimensions.
ds.to(hv.Image, ['x', 'y'])

In [None]:
# taken from test2.ipynb
from lxml import etree
from tqdm import tqdm

def stemmer_xml2df2(fname):
    """Parse stemmer XML output into a table of words and proposed stems.

    The file is streamed with ``etree.iterparse`` and one row is produced per
    ``<word>`` element.

    Parameters
    ----------
    fname : str or file-like
        XML source handed to ``etree.iterparse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` (the element's ``value`` attribute) and
        ``proposed_root`` (the ``stem`` attribute of the last ``<analysis>``
        child, or ``None`` when the word has no such child). The columns are
        present even when the file contains no ``<word>`` elements.

    Raises
    ------
    KeyError
        If a ``<word>`` has no ``value`` attribute or an ``<analysis>`` child
        has no ``stem`` attribute.
    """
    result = []

    # Only the 'end' events of <word> elements are needed.
    context = etree.iterparse(fname, events=('end', ), tag='word')
    for _event, elem in context:
        stem = None
        # Iterate the element directly; getchildren() is deprecated in lxml.
        for child in elem:
            if child.tag == 'analysis':
                stem = child.attrib['stem']
        result.append({'word': elem.attrib['value'], 'proposed_root': stem})

        # make iteration over context fast and consume less memory
        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    # Explicit columns keep the schema stable when no words were found.
    return pd.DataFrame(result, columns=['word', 'proposed_root'])

def analyzer_xml2df2(fname):
    """Parse morphological-analyzer XML output into a table of words and roots.

    The file is streamed with ``etree.iterparse`` (wrapped in ``tqdm`` for a
    progress display). ``<word>`` elements whose ``value`` attribute is the
    empty string are skipped.

    Parameters
    ----------
    fname : str or file-like
        XML source handed to ``etree.iterparse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``proposed_root``. ``proposed_root`` holds the
        distinct ``root`` attributes of the word's ``<analysis>`` children,
        joined with a backslash in order of first appearance, or the literal
        ``'NOANALYSIS'`` when there are none. The columns are present even
        when no rows were produced.

    Raises
    ------
    KeyError
        If a ``<word>`` element has no ``value`` attribute.
    """
    result = []

    # Only the 'end' events of <word> elements are needed.
    context = etree.iterparse(fname, events=('end', ), tag='word')
    for _event, elem in tqdm(context):
        word = elem.attrib['value']
        if word != '':
            # De-duplicate while keeping first-appearance order so the joined
            # string is identical from run to run (a set has no stable order).
            roots = []
            # Iterate the element directly; getchildren() is deprecated in lxml.
            for child in elem:
                if child.tag == 'analysis':
                    # Analyses without a 'root' attribute are ignored.
                    candidate = child.attrib.get('root')
                    if candidate is not None and candidate not in roots:
                        roots.append(candidate)
            if not roots:
                roots.append('NOANALYSIS')
            result.append({'word': word, 'proposed_root': '\\'.join(roots)})

        # make iteration over context fast and consume less memory
        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    # Explicit columns keep the schema stable when no words were found.
    return pd.DataFrame(result, columns=['word', 'proposed_root'])

In [None]:
# Arabic roots for the five senses, one per line, in the order
# hear, see, touch, smell, taste.
senses_roots = '''سمع
بصر
لمس
شمم
ذوق'''.splitlines()
# Default selection: the first root in the list.
root = senses_roots[0]

In [None]:
from collections import OrderedDict

# Map each sense to the list of roots that signal it.
# The mapping is built from (key, value) pairs rather than a dict literal so
# the category order is guaranteed on every Python version; get_root_cat
# derives the numeric code of a category from its position in this mapping.
query = OrderedDict([
    ('hear', [senses_roots[0]]),
    ('see', [senses_roots[1]]),
    ('touch', [senses_roots[2]]),
    ('smell', [senses_roots[3]]),
    ('taste', [senses_roots[4]]),
])
query

In [None]:
# Parse the analyzer output for one text into a word / proposed_root table.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
fname = '/home/jvdzwaan/data/tmp/adh/analysis/alkhalil/0179MalikIbnAnas.Muwatta.xml'
df = analyzer_xml2df2(fname)

In [None]:
%%time
def get_root_cat(row, query):
    """Return the 1-based position of the first query category matching a row.

    Parameters
    ----------
    row : mapping
        Must provide ``row['proposed_root']``: a string of candidate roots
        separated by backslashes.
    query : mapping
        Category name -> list of roots. Categories are tried in iteration
        order, and only whole-root matches count.

    Returns
    -------
    int
        Position (starting at 1) of the first category that has at least one
        root among the row's candidates, or 0 when no category matches.
    """
    candidates = row['proposed_root'].split('\\')
    for code, wanted in enumerate(query.values(), start=1):
        if any(r in candidates for r in wanted):
            return code
    return 0

# Code every word with its category (0 = no match); adds or overwrites the 'senses' column.
df['senses'] = df.apply(lambda row: get_root_cat(row, query), axis=1)

In [None]:
# Preview the coded table.
df.head()

In [None]:
# Sum of the category codes. This is not the number of matched words,
# because codes can be larger than 1.
df['senses'].sum()

In [None]:
# Number of words per row of the mosaic grid.
line = 300

def pad_df(df, line):
    """Append 'PAD' rows so that the number of rows is a multiple of ``line``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``word``, ``proposed_root`` and ``senses`` columns.
    line : int
        Row length of the target grid; must be non-zero.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. Padding rows have ``word`` and
        ``proposed_root`` set to ``'PAD'`` and ``senses`` set to ``-1``. When
        ``len(df)`` is already a multiple of ``line`` (including an empty
        frame) no rows are added, so calling the function again is a no-op.
    """
    # (-n) % line is the distance to the next multiple of `line`, and is 0
    # when n is already a multiple (line - n % line would give `line` there).
    num_to_add = (-len(df)) % line
    if num_to_add == 0:
        return df.reset_index(drop=True)

    to_add = pd.DataFrame([{'word': 'PAD', 'proposed_root': 'PAD', 'senses': -1}] * num_to_add)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    result = pd.concat([df, to_add], sort=False)
    return result.reset_index(drop=True)
    
    
# Pad the table to a whole number of rows of `line` words (rebinds `df`).
df = pad_df(df, line)

In [None]:
# Sanity check: expected to be 0 after padding.
len(df)%line

In [None]:
# Number of rows in the grid.
y = len(df)//line
print(y)
# One grid row per `line` consecutive words; rebinds `values` from the earlier 10x10 demo grid.
values = df['senses'].values.reshape(y, line)
print(values.shape)

# Both coordinate lists are passed in descending order.
# NOTE(review): presumably to flip the rendered image -- confirm the intended orientation.
ds = hv.Dataset((list(range(line))[::-1], list(range(y))[::-1], values), ['x', 'y'], ['cat'])
ds

In [None]:
# Debug output: the descending x coordinates, line-1 down to 0.
print(list(range(line)[::-1]))

In [None]:
# Render the coded grid as an image.
mosaic = ds.to(hv.Image, ['x', 'y'])

# Seven colours for the seven possible codes: -1 (padding), 0 (no match), 1-5 (categories).
# NOTE(review): assumes the colour list is spread evenly over the value range -- confirm.
mosaic.options(cmap=['#d3d3d3', '#ffffff', '#e6194B', '#4363d8', '#ffe119', '#911eb4', '#3cb44b'], colorbar=True, width=600, height=800)

In [None]:
# Alternative rendering of the same dataset as points; rebinds `mosaic`.
mosaic = ds.to(hv.Points, kdims=['x', 'y'], vdims=['cat'])
mosaic

In [None]:
# Display the dataset.
ds

In [None]:
# Count the occurrences of each category code 1-5; codes below 1
# (no match and padding) are left out by the boolean mask.
frequencies, edges = np.histogram(values[values >= 1], bins=[1, 2, 3, 4, 5, 6])
hv.Histogram((edges, frequencies))

In [None]:
# Display the bin edges returned by np.histogram.
edges

In [None]:
# Display the codes of all matched words (code >= 1) as a flat array.
values[np.where(values >= 1)]

In [None]:
# Histogram of the category codes of matched words (code >= 1).
df.loc[df['senses'] >= 1, 'senses'].hist()

The five legal categories - STEMS

In THREE texts: Sarakhsi, Ibn Qudama, and Muhaqqiq al-Hilli

* forbidden: محظور = light red AND حرام = dark red
* discouraged: مكروه = dark orange AND مذموم = light orange
* neutral: مباح = green
* recommended: مندوب = light blue AND مستحب = dark blue
* obligatory: واجب = purple AND فرض = rose

In [None]:
from collections import OrderedDict

# Stems for the five legal categories, ordered from forbidden to obligatory.
# The mapping is built from (key, value) pairs rather than a dict literal so
# the category order is guaranteed on every Python version; that order matters
# wherever codes are assigned by position, as get_root_cat does.
query = OrderedDict([
    ('forbidden', ['محظور', 'حرام']),
    ('discouraged', ['مكروه', 'مذموم']),
    ('neutral', ['مباح']),
    ('recommended', ['مندوب', 'مستحب']),
    ('obligatory', ['واجب', 'فرض']),
])
query