In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
%%time
# load analysed words
analysed_words = pd.read_csv('/home/jvdzwaan/data/tmp/adh/merged/analysed_words.csv', encoding='utf-8')

In [None]:
analysed_words.head()

In [None]:
%%time
# select a single book
book = analysed_words[analysed_words['title'] == 'Fiqh2'].copy()

In [None]:
books = analysed_words.groupby('title')

In [None]:
book.shape

In [None]:
book.head()

In [None]:
# The five roots that are counted; their order is significant because later
# cells refer to them by position.
senses_roots = [
    'سمع',
    'بصر',
    'لمس',
    'شمم',
    'ذوق',
]
# select certain root
root = senses_roots[0]

In [None]:
# number of words in book: one group per distinct (file_name, position) pair
positions = book.groupby(['file_name', 'position']).count()
total_words = len(positions)

In [None]:
%%time


res = []
titles = []

for title, book in books:
    #print(title)
    b = {}
    total_words = book.groupby(['file_name', 'position']).count().shape[0]
    for i, r in enumerate(senses_roots):
        num = book[book['root'] == r].shape[0]
        print(u'sense {} ({}): {} of {}'.format(i, r, num, total_words))
        b['sense {}'.format(i)] = num
    print(sum(b.values()))
    b['total_words'] = total_words-sum(b.values())
    res.append(b)
    titles.append(title)
data = pd.DataFrame(res, index=titles)
data.head()

In [None]:
data.head()

In [None]:
data.columns = ['hear', 'see', 'touch', 'smell', 'taste', 'total_words']

In [None]:
data.plot.barh(figsize=(15,10), stacked=True)

In [None]:
data[['hear', 'see', 'touch', 'smell', 'taste']].plot.barh(figsize=(15,10), stacked=True)

In [None]:
# Start every sense with a single root, taken from `senses_roots` by position.
sense_names = ['hear', 'see', 'touch', 'smell', 'taste']
query = {name: [senses_roots[idx]] for idx, name in enumerate(sense_names)}

In [None]:
query

In [None]:
# Rebind `query` to a mapping with three roots per sense
# (sense name -> list of roots to count).
query = {'hear': ['سمع', 'صوت', 'ضجج'],
 'see': ['بصر', 'نظر', 'رأي'],
 'touch': ['لمس', 'مسس', 'لصق'],
 'smell': ['شمم', 'ريح', 'عطر'],
 'taste': ['ذوق', 'طعم', 'لذذ']}
query

In [None]:
senses = {'universe': ['كون']}

In [None]:
def match_roots(row, to_count):
    """Tell whether any root proposed for a word is one of the roots to count.

    Args:
        row: mapping-like record (e.g. a DataFrame row) whose 'proposed_root'
            entry holds one or more roots separated by a backslash.
        to_count: collection of roots to look for (set, list, ...).

    Returns:
        True if at least one proposed root occurs in ``to_count``. False
        otherwise, including when 'proposed_root' is not a string (for
        example the NaN that pandas uses for a missing value).
    """
    try:
        roots = row['proposed_root'].split('\\')
    except AttributeError:
        # No root was proposed for this word: NaN/None has no .split().
        return False
    return not set(roots).isdisjoint(to_count)

In [None]:
senses = query
senses

In [None]:
# Made self-contained so the cell also runs on a fresh kernel: it used to rely
# on `get_files` and `in_dir` having been defined by a cell further down.
from nlppln.utils import get_files

# NOTE(review): assumed to be the same directory the following cell processes — confirm.
in_dir = '/home/jvdzwaan/Downloads/2019-02-08-fiqh-newfiles-alkhalil/'

in_files = get_files(in_dir)
print(in_files)

In [None]:
import os

from tqdm import tqdm_notebook as tqdm

from nlppln.utils import get_files

from adhtools.utils import analyzer_xml2df

in_dir = '/home/jvdzwaan/Downloads/2019-02-08-fiqh-newfiles-alkhalil/'

in_files = get_files(in_dir)

# file id (file name without extension) -> matches per sense plus total row count
result = {}

for in_file in tqdm(in_files):
    df = analyzer_xml2df(in_file)
    res = {}
    for name, s in senses.items():
        to_count = set(s)
        # one boolean per row: does any proposed root belong to this sense?
        matched = df.apply(match_roots, axis=1, args=(to_count,))
        res[name] = matched.sum()
    res['total'] = df.shape[0]
    base_name = os.path.basename(in_file)
    file_id = os.path.splitext(base_name)[0]
    result[file_id] = res
alk = pd.DataFrame.from_dict(result, orient='index')
alk

In [None]:
# Parse every input file once and keep the parsed results in memory, in the
# same order as `in_files`, so later cells can recount without re-parsing.
alk_dfs = []
for in_file in tqdm(in_files):
    df = analyzer_xml2df(in_file)
    alk_dfs.append(df)

In [None]:
print(len(alk_dfs))

In [None]:
import pickle

def pickle_obj(fname, obj):
    """Serialise ``obj`` with pickle and write it to the file ``fname``.

    The file is opened in binary write mode, so an existing file is replaced.
    """
    with open(fname, 'wb') as out_file:
        out_file.write(pickle.dumps(obj))

pickle_obj('alk_dfs.pkl', alk_dfs)

In [None]:
def save_dfs(dfs, fname):
    """Write every frame in ``dfs`` to its own CSV file.

    The frame at position i (counting from 0) is written to '<fname>-<i>.csv'
    through its ``to_csv`` method.
    """
    for number, frame in enumerate(dfs):
        target = '{}-{}.csv'.format(fname, number)
        frame.to_csv(target)
save_dfs(alk_dfs, 'alkhalil-fiqh')

In [None]:
# Recompute the per-sense counts from the frames already parsed into `alk_dfs`.
result = {}
# `alk_dfs` was filled by iterating over `in_files`, so pairing the two restores
# the file each frame came from. Previously `in_file` was a stale leftover of an
# earlier loop: every frame was stored under the same key and only the last
# one survived in `result`.
assert len(in_files) == len(alk_dfs), 'in_files and alk_dfs are out of sync'
for in_file, df in tqdm(zip(in_files, alk_dfs), total=len(alk_dfs)):
    res = {}
    for name, s in senses.items():
        to_count = set(s)
        res[name] = df.apply(lambda row: match_roots(row, to_count), axis=1).sum()
    res['total'] = df.shape[0]
    file_id = os.path.splitext(os.path.basename(in_file))[0]
    result[file_id] = res
alk = pd.DataFrame.from_dict(result, orient='index')
alk

In [None]:
alk.shape

In [None]:
alk.to_csv('senses-fiqh-0179-alkhalil.csv')

In [None]:
# Load previously saved counts; the CSV's unnamed first column is renamed to
# 'BookURI' and becomes the index.
#alk = pd.read_csv('senses-single-fiqh-new_files-alkhalil.csv')
alk = (
    pd.read_csv('senses-fiqh-new_files-alkhalil.csv')
    .rename(columns={'Unnamed: 0': 'BookURI'})
    .set_index('BookURI')
)
alk

In [None]:
senses

In [None]:
import os

from tqdm import tqdm_notebook as tqdm

from nlppln.utils import get_files

from adhtools.utils import stemmer_xml2df

in_dir = '/home/jvdzwaan/data/tmp/adh/20190325-fiqh-khoja/'

in_files = get_files(in_dir)
# only the first file is processed
in_files = [in_files[0]]

# file id (file name without extension) -> matches per sense plus total row count
result = {}

for in_file in tqdm(in_files):
    df = stemmer_xml2df(in_file)
    res = {}
    for name, s in senses.items():
        match_col = 'match_{}'.format(name)
        # boolean column per sense, kept on df; its sum is the number of matching rows
        df[match_col] = df.apply(match_roots, axis=1, args=(s,))
        res[name] = df[match_col].sum()
    res['total'] = df.shape[0]
    base_name = os.path.basename(in_file)
    file_id = os.path.splitext(base_name)[0]
    result[file_id] = res
khod = pd.DataFrame.from_dict(result, orient='index')
khod

In [None]:
khod.to_csv('senses-fiqh-0179-khoja.csv')

In [None]:
senses

In [None]:
import os

from tqdm import tqdm_notebook as tqdm

from nlppln.utils import get_files

from adhtools.utils import stemmer_xml2df

in_dir = '/home/jvdzwaan/data/tmp/adh/20190326-fiqh-isri/'

in_files = get_files(in_dir)
# only the first file is processed
in_files = [in_files[0]]

# file id (file name without extension) -> matches per sense plus total row count
result = {}

for in_file in tqdm(in_files):
    df = stemmer_xml2df(in_file)
    res = {}
    for name, s in senses.items():
        match_col = 'match_{}'.format(name)
        # boolean column per sense, kept on df; its sum is the number of matching rows
        df[match_col] = df.apply(match_roots, axis=1, args=(s,))
        res[name] = df[match_col].sum()
    res['total'] = df.shape[0]
    base_name = os.path.basename(in_file)
    file_id = os.path.splitext(base_name)[0]
    result[file_id] = res
isri = pd.DataFrame.from_dict(result, orient='index')
isri

In [None]:
isri.to_csv('senses-fiqh-0179-isri.csv')

In [None]:
# Load the saved Khoja counts; the CSV's unnamed first column is renamed to
# 'BookURI' and becomes the index.
khod = (
    pd.read_csv('senses-fiqh-new_files-khoja.csv')
    .rename(columns={'Unnamed: 0': 'BookURI'})
    .set_index('BookURI')
)

In [None]:
khod

In [None]:
md_file = '/home/jvdzwaan/data/adh-corpora/fiqh_corpus/Meta/Metadata_Fiqh.csv'

# The separator is a regular expression (';' or ','), which only pandas' Python
# parser supports. Naming the engine explicitly avoids the ParserWarning pandas
# emits when it has to fall back from the C engine on its own.
md = pd.read_csv(md_file, sep=';|,', engine='python')
print(md.columns)
md = md.set_index('BookURI')
md

In [None]:
khoja = pd.concat([khod.copy(), md.copy()], axis=1, sort=True)

In [None]:
khoja.columns

In [None]:
def set_schools(row):
    """Label a record as 'Shi' or 'Sun' based on its 'BookSUBJ' value.

    Only the value 'جعفري' maps to 'Shi'; every other value maps to 'Sun'.
    """
    return 'Shi' if row['BookSUBJ'] == 'جعفري' else 'Sun'


khoja['school'] = khoja.apply(lambda row: set_schools(row), axis=1)

In [None]:
# Sum the counts per school (rows: senses and total, columns: schools).
# The count columns are selected before summing, so the text columns that came
# in with the metadata are never aggregated. That is cheaper and keeps working
# on pandas versions that no longer silently drop non-numeric columns in sum().
sense_cols = ['hear', 'see', 'touch', 'smell', 'taste', 'total']
k_s = khoja.groupby('school')[sense_cols].sum().T
k_s

In [None]:
# Express the 'Shi' counts as a percentage of the 'Shi' total row.
shii_total = k_s.loc['total', 'Shi']
print(shii_total)
k_s["Shi'i"] = k_s['Shi'].div(shii_total).mul(100.0)

In [None]:
# Express the 'Sun' counts as a percentage of the 'Sun' total row.
sunni_total = k_s.loc['total', 'Sun']
print(sunni_total)
k_s['Sunni'] = k_s['Sun'].div(sunni_total).mul(100.0)

In [None]:
k_s

In [None]:
# Put counts and metadata side by side (aligned on their index), then tag
# every row with its school.
alkhalil = pd.concat([alk.copy(), md.copy()], axis=1, sort=True)
alkhalil['school'] = alkhalil.apply(set_schools, axis=1)
alkhalil

In [None]:
# Sum the counts per school (rows: senses and total, columns: schools).
# The count columns are selected before summing, so the text columns that came
# in with the metadata are never aggregated. That is cheaper and keeps working
# on pandas versions that no longer silently drop non-numeric columns in sum().
sense_cols = ['hear', 'see', 'touch', 'smell', 'taste', 'total']
a_s = alkhalil.groupby('school')[sense_cols].sum().T
a_s

In [None]:
# Add percentage columns: each school's counts relative to that school's 'total' row.
shii_total = a_s.loc['total', 'Shi']
print(shii_total)
a_s["Shi'i"] = a_s['Shi'].div(shii_total).mul(100.0)
sunni_total = a_s.loc['total', 'Sun']
print(sunni_total)
a_s['Sunni'] = a_s['Sun'].div(sunni_total).mul(100.0)
a_s

In [None]:
k_s

In [None]:
# Bar chart of the two percentage columns, with the last row left out.
k_pct = k_s[["Shi'i", 'Sunni']]
k_pct.drop(k_pct.tail(1).index).plot(kind='bar', figsize=(7,5), fontsize=12)
plt.legend(fontsize=12)

In [None]:
# Bar chart of the two percentage columns, with the last row left out.
a_pct = a_s[["Shi'i", 'Sunni']]
a_pct.drop(a_pct.tail(1).index).plot(kind='bar', figsize=(7,5), fontsize=12)
plt.legend(fontsize=12)

In [None]:
# Load the saved ISRI counts; the CSV's unnamed first column is renamed to
# 'BookURI' and becomes the index.
isr = (
    pd.read_csv('senses-fiqh-new_files-isri.csv')
    .rename(columns={'Unnamed: 0': 'BookURI'})
    .set_index('BookURI')
)

# Put counts and metadata side by side, then tag every row with its school.
isri = pd.concat([isr.copy(), md.copy()], axis=1, sort=True)
isri['school'] = isri.apply(set_schools, axis=1)
isri

In [None]:
# Per-school sums of the counts, plus each school's counts as a percentage of
# its own 'total' row.
# The count columns are selected before summing, so the text columns that came
# in with the metadata are never aggregated. That is cheaper and keeps working
# on pandas versions that no longer silently drop non-numeric columns in sum().
sense_cols = ['hear', 'see', 'touch', 'smell', 'taste', 'total']
i_s = isri.groupby('school')[sense_cols].sum().T
shii_total = i_s.loc['total', 'Shi']
print(shii_total)
i_s['Shi\'i'] = i_s['Shi']/shii_total *100.0
sunni_total = i_s.loc['total', 'Sun']
print(sunni_total)
i_s['Sunni'] = i_s['Sun']/sunni_total *100.0
i_s

In [None]:
# Bar chart of the two percentage columns, with the last row left out.
i_pct = i_s[["Shi'i", 'Sunni']]
i_pct.drop(i_pct.tail(1).index).plot(kind='bar', figsize=(7,5), fontsize=12)
plt.legend(fontsize=12)

In [None]:
# Sum the counts per 'BookSUBJ' value (rows: senses and total, columns: subjects).
# The count columns are selected before summing, so the remaining text columns
# are never aggregated. That is cheaper and keeps working on pandas versions
# that no longer silently drop non-numeric columns in sum().
sense_cols = ['hear', 'see', 'touch', 'smell', 'taste', 'total']
a_s2 = alkhalil.groupby('BookSUBJ')[sense_cols].sum().T
a_s2

In [None]:
# Add a '<subject> %' column per subject: its counts as a percentage of that
# subject's 'total' row.
# Only the raw count columns are processed, so re-running this cell does not
# stack '... % %' columns on top of the ones it created earlier.
subjects = [c for c in a_s2.columns if not str(c).endswith(' %')]
for c in subjects:
    print(c)
    c_total = a_s2.loc['total', c]
    print(c_total)
    a_s2['{} %'.format(c)] = a_s2[c]/c_total *100.0
a_s2

In [None]:
# Sum the counts per 'BookSUBJ' value (rows: senses and total, columns: subjects).
# The count columns are selected before summing, so the remaining text columns
# are never aggregated. That is cheaper and keeps working on pandas versions
# that no longer silently drop non-numeric columns in sum().
sense_cols = ['hear', 'see', 'touch', 'smell', 'taste', 'total']
k_s2 = khoja.groupby('BookSUBJ')[sense_cols].sum().T
k_s2

In [None]:
# Add a '<subject> %' column per subject (its counts as a percentage of that
# subject's 'total' row) and collect the new column names in `cols`.
# Only the raw count columns are processed, so re-running this cell neither
# stacks '... % %' columns nor adds such names to `cols`.
cols = []
subjects = [c for c in k_s2.columns if not str(c).endswith(' %')]
for c in subjects:
    print(c)
    c_total = k_s2.loc['total', c]
    print(c_total)
    n = '{} %'.format(c)
    cols.append(n)
    k_s2[n] = k_s2[c]/c_total *100.0
k_s2

In [None]:
a_s2[cols].drop(a_s2[cols].tail(1).index).plot(kind='bar')

In [None]:
k_s2[cols].drop(k_s2[cols].tail(1).index).plot(kind='bar')

In [None]:
%%time
res = []
titles = []

for title, book in books:
    #print(title)
    b = {}
    for i, roots in query.items():
        num = book[book['root'].isin(roots)].shape[0]
        #print(u'sense {} ({}): {} of {}'.format(i, r, num, total_words))
        b[i] = num
    total_words = book.groupby(['file_name', 'position']).count().shape[0]
    b['other_words'] = total_words-sum(b.values)
    res.append(b)
    titles.append(title)
data = pd.DataFrame(res, index=titles)

In [None]:
shii = 'شيعي'