In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from nlppln.utils import get_files, create_dirs, out_file_name

# Directory holding the gold-standard CSV files.
gs_dir = '/home/jvdzwaan/data/tmp/adh/evaluation/gs/'
gs_files = get_files(gs_dir)

# Stack all gold-standard files into a single frame. The row index is not
# reset, so index values can repeat across files.
gs = pd.concat([pd.read_csv(f) for f in gs_files])
gs

In [None]:
# Every value of the 'root' column is split on a backslash, so each row
# yields a list of one or more root strings.
gs_roots = [root.split('\\') for root in list(gs['root'])]
print(gs_roots[0])

In [None]:
from collections import Counter

# Tally how often each individual root occurs over all gold-standard rows.
gs_root_counts = Counter(root for row_roots in gs_roots for root in row_roots)

# Number of distinct roots in the gold standard.
print(len(gs_root_counts))

In [None]:
%%time
import pickle

from tqdm import tqdm_notebook as tqdm

from nlppln.utils import get_files, create_dirs, out_file_name

from adhtools.utils import corpus_wordlist

def pickle_roots(in_dir, out_dir, analyzer):
    """Pickle, for every input file, the root sets derived from corpus_wordlist.

    For each file, every element produced by ``corpus_wordlist`` is split on
    a backslash and the pieces are stored as a set. The resulting list of
    sets is written to the path returned by
    ``out_file_name(out_dir, in_file, ext='pkl')``.

    Args:
        in_dir: directory whose files (as listed by ``get_files``) are read.
        out_dir: directory for the pickle files; created via ``create_dirs``.
        analyzer: passed through unchanged to ``corpus_wordlist``.
    """
    paths = get_files(in_dir)
    create_dirs(out_dir)

    # The generator is paired positionally with the input paths -- assumes
    # corpus_wordlist yields one item per file, in the same order (TODO confirm).
    per_file_roots = corpus_wordlist(paths, analyzer=analyzer)

    for entries, path in tqdm(zip(per_file_roots, paths), total=len(paths)):
        root_sets = []
        for entry in entries:
            root_sets.append(set(entry.split('\\')))

        target = out_file_name(out_dir, path, ext='pkl')
        with open(target, 'wb') as handle:
            pickle.dump(root_sets, handle)

In [None]:
# Khoja
# Input directory, analyzer flag and output directory for the Khoja run.
# Note: `analyzer` is a single notebook-level name that the ISRI and AlKhalil
# cells assign again.
khoja_in_dir = '/home/jvdzwaan/data/tmp/adh/20190325-fiqh-khoja/'
analyzer= False

khoja_out_dir = '/home/jvdzwaan/data/tmp/adh/20190325-fiqh-khoja-roots/'

# Disabled call; when enabled it writes the pickles into khoja_out_dir.
#pickle_roots(khoja_in_dir, khoja_out_dir, analyzer)

In [None]:
# ISRI
# Input directory, analyzer flag and output directory for the ISRI run.
# Note: `analyzer` is a single notebook-level name shared with the Khoja and
# AlKhalil cells.
isri_in_dir = '/home/jvdzwaan/data/tmp/adh/20190326-fiqh-isri/'
analyzer= False

isri_out_dir = '/home/jvdzwaan/data/tmp/adh/20190326-fiqh-isri-roots/'

# Disabled call; when enabled it writes the pickles into isri_out_dir.
#pickle_roots(isri_in_dir, isri_out_dir, analyzer)

In [None]:
# AlKhalil
# Input directory, analyzer flag and output directory for the AlKhalil run.
# This is the only run that sets `analyzer` to True; the name is shared with
# the Khoja and ISRI cells.
alk_in_dir = '/home/jvdzwaan/Downloads/2019-02-08-fiqh-newfiles-alkhalil/'
analyzer= True

alk_out_dir = '/home/jvdzwaan/data/tmp/adh/2019-02-08-fiqh-newfiles-alkhalil-roots/'

# Disabled call; when enabled it writes the pickles into alk_out_dir.
#pickle_roots(alk_in_dir, alk_out_dir, analyzer)

In [None]:
# Result: a DataFrame with the counts for every gold-standard root; the index holds the files
import os

def count_gs_roots(pkl_dir, gs_root_counts):
    """Count, per pickled file, how often each gold-standard root occurs.

    Args:
        pkl_dir: directory with pickle files; each file holds a sequence of
            root collections.
        gs_root_counts: mapping whose keys are the gold-standard roots.

    Returns:
        A tuple ``(tool_df, roots_not_found)``. ``tool_df`` has one row per
        file (labelled with the file name without extension), one column per
        gold-standard root and a ``'total'`` column with the number of
        entries in the file. ``roots_not_found`` lists the gold-standard
        roots that occur in none of the files; their columns are filled
        with 0.
    """
    known_roots = gs_root_counts.keys()
    per_book = {}

    for path in tqdm(get_files(pkl_dir)):
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # this at pickle files from a trusted source.
        with open(path, 'rb') as handle:
            entries = pickle.load(handle)

        counts = Counter(
            root
            for entry in entries
            for root in entry
            if root in known_roots
        )
        counts['total'] = len(entries)

        stem, _extension = os.path.splitext(str(os.path.basename(path)))
        per_book[stem] = counts

    # Roots absent from a file show up as NaN after the conversion.
    tool_df = pd.DataFrame.from_dict(per_book, orient='index').fillna(0)

    roots_not_found = [root for root in known_roots if root not in tool_df.columns]
    for missing in roots_not_found:
        tool_df[missing] = 0

    return tool_df, roots_not_found

In [None]:
# Per-file gold-standard root counts for the Khoja pickles, plus the
# gold-standard roots that occur in none of those files.
khoja_df, khoja_roots_not_found = count_gs_roots(khoja_out_dir, gs_root_counts)

In [None]:
# Per-file gold-standard root counts for the ISRI pickles, plus the
# gold-standard roots that occur in none of those files.
isri_df, isri_roots_not_found = count_gs_roots(isri_out_dir, gs_root_counts)

In [None]:
# Per-file gold-standard root counts for the AlKhalil pickles, plus the
# gold-standard roots that occur in none of those files.
alk_df, alk_roots_not_found = count_gs_roots(alk_out_dir, gs_root_counts)

In [None]:
# combine with metadata
# set schools (Shii Sunni)
# group by school
# calculate percentages for each column
# plot: for a given root, the percentages per school for each tool

def set_schools(row):
    """Return the school label for a metadata row.

    The label is "Shi'" when ``row['BookSUBJ']`` equals 'جعفري' and 'Sunn'
    for every other value. The labels lack the final 'i' because
    calculate_percentages appends an 'i' to name the percentage columns.
    """
    return 'Shi\'' if row['BookSUBJ'] == 'جعفري' else 'Sunn'

def combine_with_metadata(md_file, df):
    """Join the per-book counts in ``df`` with the corpus metadata.

    Args:
        md_file: path of the metadata CSV. Fields may be separated by ';' or
            ','; the file must contain a 'BookURI' column.
        df: frame indexed by book identifier.

    Returns:
        A new frame with the columns of ``df`` followed by the metadata
        columns, aligned on the index (union of both indexes, sorted).
        Neither input is modified.
    """
    # The separator is a regular expression, which only the Python parsing
    # engine supports. Requesting that engine explicitly avoids the
    # ParserWarning pandas emits when it has to fall back from the C engine.
    md = pd.read_csv(md_file, sep=';|,', engine='python').set_index('BookURI')

    # concat builds a new frame, so no defensive copies of the inputs are needed.
    return pd.concat([df, md], axis=1, sort=True)

def calculate_percentages(df):
    """Add, for every column, its values as a percentage of the 'total' row.

    Args:
        df: frame that has a row labelled 'total'. It is not modified.

    Returns:
        A tuple ``(result, cols)``. ``result`` is a copy of ``df`` extended
        with one percentage column per original column, named by appending
        'i' to the original column name. ``cols`` lists the new column names
        in the order of the original columns.

    Raises:
        KeyError: if ``df`` has no 'total' row.
    """
    # Work on a copy and on a snapshot of the column labels. That way calling
    # the function again on the same input gives the same result, instead of
    # deriving percentages of the percentage columns added by an earlier call.
    result = df.copy()
    base_cols = list(df.columns)

    cols = []
    for c in base_cols:
        c_total = df.loc['total', c]
        name = '{}i'.format(c)
        cols.append(name)
        result[name] = df[c] / c_total * 100.0

    return result, cols

def preprocess(df, md_file):
    """Turn per-book root counts into per-school totals and percentages.

    Steps: join ``df`` with the metadata in ``md_file``, label every book
    with its school via ``set_schools``, sum the numeric columns per school,
    transpose so that the schools become columns, and add the percentage
    columns via ``calculate_percentages``.

    Args:
        df: per-book counts, including a 'total' column.
        md_file: path of the metadata CSV.

    Returns:
        The tuple returned by ``calculate_percentages``: the transposed
        table and the names of its percentage columns.
    """
    data = combine_with_metadata(md_file, df)
    data['school'] = data.apply(set_schools, axis=1)

    # Restrict the sum to numeric columns. Without this, recent pandas
    # versions also "sum" the textual metadata columns, and the resulting
    # string rows break the division in calculate_percentages.
    data = data.groupby('school').sum(numeric_only=True).T
    return calculate_percentages(data)

# Metadata CSV; it has to provide the 'BookURI' and 'BookSUBJ' columns that
# combine_with_metadata and set_schools read.
md_file = '/home/jvdzwaan/data/adh-corpora/fiqh_corpus/Meta/Metadata_Fiqh.csv'

# One per-school table per tool, together with the names of its percentage
# columns.
khoja, k_cols = preprocess(khoja_df, md_file)
isri, i_cols = preprocess(isri_df, md_file)
alkhalil, a_cols = preprocess(alk_df, md_file)

In [None]:
from nltk.corpus import stopwords as sw

def get_terms(txt_file):
    """Read a term list and return its unique entries.

    The file is parsed with ``pandas.read_csv`` (UTF-8, no header row) and
    the first column is taken as the list of terms. The total and the unique
    number of terms are printed.

    Args:
        txt_file: path of the term list.

    Returns:
        A set with the distinct terms.
    """
    frame = pd.read_csv(txt_file, header=None, index_col=None, encoding='utf-8')
    all_terms = frame.iloc[:, 0].tolist()
    print('total number of terms:', len(all_terms))

    unique_terms = set(all_terms)
    print('number of unique terms:', len(unique_terms))
    return unique_terms

# Custom stop word list, returned by get_terms as a set.
stopwords = get_terms('/home/jvdzwaan/data/adh/stopwords/custom.txt')

# NLTK's Arabic stop word list, materialised as a list.
stopwords_nltk = list(sw.words('arabic'))

In [None]:
def plot_root_tools(khoja, isri, alkhalil, root, cols, sw_n, sw_c, gs_root_counts):
    """Collect the values of one root for the three tools in a single table.

    Despite its name the function does not plot; it returns the table that
    callers display or plot.

    Args:
        khoja, isri, alkhalil: per-tool frames with the root as a row label.
        root: row label to look up in each of the three frames.
        cols: labels of the values to keep (the percentage columns).
        sw_n, sw_c, gs_root_counts: not used; kept so that existing calls
            keep working.

    Returns:
        A frame with one row per tool ('khoja', 'isri', 'alkhalil') and one
        column per entry of ``cols``.

    Raises:
        KeyError: if ``root`` is missing from one of the frames or an entry
            of ``cols`` is missing from the Khoja row.
    """
    khoja_row = khoja.loc[root]

    # The Khoja row fixes the row labels; the other tools are aligned to it.
    to_plot = pd.DataFrame(
        {
            'khoja': khoja_row,
            'isri': isri.loc[root],
            'alkhalil': alkhalil.loc[root],
        },
        index=khoja_row.index,
    )

    return to_plot.loc[cols].T


def sw_type(root, sw_n, sw_c):
    """Return a tag that tells in which stop word lists ``root`` occurs.

    Args:
        root: the term to look up.
        sw_n: first stop word collection (tag '(n)').
        sw_c: second stop word collection (tag '(c)').

    Returns:
        '(b)' if ``root`` is in both collections, '(n)' or '(c)' if it is in
        only one of them, and '(not a stopword)' otherwise.
    """
    tags = {
        (True, True): '(b)',
        (True, False): '(n)',
        (False, True): '(c)',
        (False, False): '(not a stopword)',
    }
    return tags[(root in sw_n, root in sw_c)]



# Example call. 116 is a hard-coded row position in khoja.index, so the root
# that is shown depends on the order of the rows in the Khoja table.
plot_root_tools(khoja, isri, alkhalil, khoja.index[116], k_cols, stopwords_nltk, stopwords, gs_root_counts)

In [None]:
# Count the rows of the AlKhalil table whose percentage is zero for both
# schools. The columns are renamed first because DataFrame.query can only
# refer to a column by bare name if that name is a valid Python identifier;
# "Shi'i" is not, so it becomes "Shii", the name used in the query.
k = alkhalil.copy()
k.columns = ['Shi', 'Sunn', 'Shii', 'Sunni']
k.query("Shii == 0 and Sunni == 0").shape

In [None]:
# For every gold-standard root: build the per-tool percentage table, plot it
# when the root is not a stop word, and keep track of the tools that have no
# hits for the root.

# `num` is only incremented in the commented-out block below, so it stays 0.
num = 0
num_with_zero = 0
num_with_zero_sw = 0
num_sw = 0

# One Series per root: for each tool, the number of non-zero percentage columns.
non_zero = []

for root in gs_root_counts.keys():
    res = plot_root_tools(khoja, isri, alkhalil, root, k_cols, stopwords_nltk, stopwords, gs_root_counts)
    
    #print(res)
    
    # Note: this rebinds `sw`, the name under which an earlier cell imported
    # the NLTK stop word corpus; from here on it is a plain boolean.
    sw = False
    if root in stopwords_nltk or root in stopwords:
        sw = True
        num_sw += 1
    else:
        # Only roots that are in neither stop word list are printed and plotted.
        print(root, sw, 'freq. in gs:', gs_root_counts[root])
        print(res)
        res.plot(kind='bar', figsize=(7,5), fontsize=12)
        plt.legend(fontsize=12)
        plt.show() 
    
    # Rows of `res` are the tools, so this counts the non-zero values per tool.
    nz_tools = res.apply(lambda row: np.count_nonzero(row), axis=1)
    #print(nz_tools)
    non_zero.append(nz_tools)
    # Number of tools with at least one non-zero value; fewer than 3 means
    # that at least one tool has no hits at all for this root.
    nz = np.count_nonzero(nz_tools)
    if nz != 3:
        num_with_zero += 1
        if sw:
            num_with_zero_sw += 1

    
    # Disabled check for roots on which the tools disagree about which school
    # has the higher percentage.
    #c = res[k_cols[0]] > res[k_cols[1]]
    #if c.sum() != 0 and c.sum() != 3:
    #    print('Differences for', root)
    #    print(res)
    #    res.plot(kind='bar')
    #    plt.show() 
    #    num += 1
# Despite the 'non zero' labels, each of the next three values is the number
# of roots for which the tool has no non-zero percentage (comparison with 0).
print('khoja non zero', np.sum(pd.DataFrame(non_zero)['khoja'] == 0))
print('isri non zero', np.sum(pd.DataFrame(non_zero)['isri'] == 0))
print('alkhalil non zero', np.sum(pd.DataFrame(non_zero)['alkhalil'] == 0))
print('num stopwords', num_sw)

In [None]:
# Number of gold-standard roots for which at least one tool has only zero
# percentages.
num_with_zero

In [None]:
# The subset of those roots that is in one of the two stop word lists.
num_with_zero_sw

In [None]:
# Always 0: the only statement that increments `num` is commented out in the
# loop above.
num

In [None]:
# Compare the gold-standard roots with the row labels of the Khoja table.
# The set gets its own name so that `gs_roots` keeps holding the per-row
# lists of roots; rebinding it to a set of strings would make the cell that
# builds gs_root_counts count single characters when it is run again.
gs_root_set = set(gs_root_counts.keys())
print(len(gs_root_set))
pred_roots = set(khoja.index)

# Row labels of the Khoja table that are not gold-standard roots.
print(pred_roots.difference(gs_root_set))
