In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import codecs
from bs4 import BeautifulSoup

def stemmer_xml2df(fname):
    """Read a stemmer's XML output into a DataFrame.

    Each `word` element in the file becomes one row: the element's `value`
    attribute is stored as `word`, and the `stem` attribute of the first
    `analysis` element found inside it is stored as `proposed_root`.

    Args:
        fname: path of the XML file to read.

    Returns:
        DataFrame with the columns `word` and `proposed_root`.
    """
    with codecs.open(fname) as xml_file:
        markup = xml_file.read()
    soup = BeautifulSoup(markup, 'xml')

    records = [
        {'word': node['value'], 'proposed_root': node.analysis['stem']}
        for node in soup.find_all('word')
    ]
    return pd.DataFrame(records)

def analyzer_xml2df(fname):
    """Read a morphological analyzer's XML output into a DataFrame.

    Each `word` element becomes one row. The `root` attribute of every
    `analysis` element inside it is collected ('NO_ROOT' when the attribute
    is absent), duplicates are removed, and the remaining roots are joined
    with a backslash. A word without any `analysis` element gets the
    placeholder 'NOANALYSIS'.

    Args:
        fname: path of the XML file to read.

    Returns:
        DataFrame with the columns `word` and `proposed_root`; the columns
        are present even when the file contains no `word` elements.
    """
    with codecs.open(fname) as f:
        soup = BeautifulSoup(f.read(), 'xml')

    result = []
    for word in soup.find_all('word'):
        # sorted() keeps the joined string reproducible: the iteration order
        # of a plain set of strings can change from one interpreter run to
        # the next.
        roots = sorted({a.get('root', 'NO_ROOT') for a in word.find_all('analysis')})
        if not roots:
            roots = ['NOANALYSIS']
        result.append({'word': word['value'], 'proposed_root': '\\'.join(roots)})

    # Naming the columns keeps the schema stable for an empty result.
    return pd.DataFrame(result, columns=['word', 'proposed_root'])

# Parse one analyzer output file to check analyzer_xml2df on real data.
data = analyzer_xml2df('/home/jvdzwaan/data/tmp/adh/evaluation/alkhalil/0450AbuHasanMawardi.HawiKabir-sample.xml')
# A bare `data.shape` in the middle of a cell is evaluated and thrown away;
# print it so the size is actually shown next to the frame.
print(data.shape)
data

In [None]:
# Show every word of the parsed sample as a plain Python list.
data['word'].tolist()

In [None]:
from Bio import pairwise2

def root_correct(row):
    """Tell whether any proposed root equals one of the gold-standard roots.

    Both `row['root']` and `row['proposed_root']` are backslash-separated
    strings of alternatives; the row counts as correct when the two sets of
    alternatives share at least one entry.
    """
    gold = set(row['root'].split('\\'))
    proposed = set(row['proposed_root'].split('\\'))
    return not gold.isdisjoint(proposed)

def compare_to_gs(gs, xml, stemmer=False):
    """Line up a tool's proposed roots with the gold standard and score them.

    Args:
        gs: path of the gold-standard CSV. It is loaded with `pd.read_csv`
            and its `word` and `root` columns are used.
        xml: path of the tool's XML output.
        stemmer: if True the XML is parsed with `stemmer_xml2df`, otherwise
            with `analyzer_xml2df`.

    Returns:
        DataFrame containing at least the columns `word`, `root` and
        `proposed_root`, plus a boolean `root_correct` column computed by
        `root_correct`. When the two inputs differ in length the words are
        aligned first, and a row that exists on one side only carries the
        placeholder 'MISSING' in the columns that belong to the other side.
    """
    print(gs)
    print(xml)
    gs = pd.read_csv(gs)

    data = stemmer_xml2df(xml) if stemmer else analyzer_xml2df(xml)

    print(gs.shape)
    print(data.shape)

    if data.shape[0] != gs.shape[0]:
        gap = 'MISSING'
        alignments = pairwise2.align.localms(list(gs['word']), list(data['word']),
                                             2, -1, -0.5, -0.1,
                                             gap_char=[gap], one_alignment_only=True)
        aligned_gs = alignments[0][0]
        aligned_data = alignments[0][1]

        result = []
        # Cursors into the original frames; both frames are assumed to have
        # the default 0..n-1 index, as produced by read_csv / DataFrame(list).
        gs_pos = 0
        data_pos = 0
        for gs_token, data_token in zip(aligned_gs, aligned_data):
            if gs_token == data_token:
                # Same word on both sides: score the proposal against the
                # gold-standard root.
                result.append({'word': gs.loc[gs_pos, 'word'],
                               'root': gs.loc[gs_pos, 'root'],
                               'proposed_root': data.loc[data_pos, 'proposed_root']})
                gs_pos += 1
                data_pos += 1
                continue

            # The tokens differ: either one side is a gap, or both sides hold
            # different words. The second case used to fall through every
            # branch, which repeated the previous row and left both cursors
            # behind. Each token that is present now gets its own row and
            # advances its own cursor, so differing words are reported as
            # unmatched on both sides instead of shifting the rest.
            if gs_token != gap:
                result.append({'word': gs.loc[gs_pos, 'word'],
                               'root': gs.loc[gs_pos, 'root'],
                               'proposed_root': gap})
                gs_pos += 1
            if data_token != gap:
                result.append({'word': gap,
                               'root': gap,
                               'proposed_root': data.loc[data_pos, 'proposed_root']})
                data_pos += 1
        data = pd.DataFrame(result)
    else:
        # Same number of rows: put the frames side by side and keep the first
        # occurrence of every duplicated column name (the gold-standard one).
        data = pd.concat([gs, data], axis=1, sort=False)
        data = data.loc[:, ~data.columns.duplicated()]
        print(data.columns)

    data['root_correct'] = data.apply(root_correct, axis=1)

    return data

# Gold-standard CSV and analyzer XML output for the same sample document.
gs = '/home/jvdzwaan/data/tmp/adh/evaluation/gs/0450AbuHasanMawardi.HawiKabir-sample.csv'
xml = '/home/jvdzwaan/data/tmp/adh/evaluation/alkhalil/0450AbuHasanMawardi.HawiKabir-sample.xml'
# stemmer=False makes compare_to_gs parse the XML with analyzer_xml2df.
data = compare_to_gs(gs, xml, stemmer=False)
data

In [None]:
# Percentage of rows in the sample whose proposed root is correct.
data[['root_correct']].sum() / len(data) * 100

In [None]:
# After removing the words found by AlKhalil that are not in the gold standard (2 words)
data = data[data['word'] != 'MISSING']
data[['root_correct']].sum() / len(data) * 100

In [None]:
import os
from nlppln.utils import get_files

# File listings for the gold standard and for two of the evaluated tools.
# NOTE(review): the loops in this notebook pair these lists by position with
# zip(), which presumably relies on get_files returning the documents in the
# same order for every directory - confirm. zip() also stops silently at the
# shorter list.
gs_files = get_files('/home/jvdzwaan/data/tmp/adh/evaluation/gs/')
khoja_files = get_files('/home/jvdzwaan/data/tmp/adh/evaluation/khoja/', recursive=True)
alkhalil_files = get_files('/home/jvdzwaan/data/tmp/adh/evaluation/alkhalil/', recursive=True)

# Khoja output is parsed as stemmer output (stemmer=True); results are keyed
# by the file name of the gold-standard CSV.
khoja_results = {}
for gs, xml in zip(gs_files, khoja_files):
    doc_id = os.path.basename(gs)
    khoja_results[doc_id] = compare_to_gs(gs, xml, stemmer=True)

In [None]:
def get_terms(txt_file):
    """Load a term list and return its unique terms.

    The file is read with `pd.read_csv` as UTF-8 without a header row, and
    the first column holds the terms. The total and the unique number of
    terms are printed.

    Args:
        txt_file: path of the term list.

    Returns:
        Set with the distinct terms.
    """
    frame = pd.read_csv(txt_file, encoding='utf-8', index_col=None, header=None)
    all_terms = frame[0].tolist()
    print('total number of terms:', len(all_terms))
    unique_terms = set(all_terms)
    print('number of unique terms:', len(unique_terms))
    return unique_terms

In [None]:
# Custom stopword list; get_terms returns it as a set of unique terms.
stopwords = get_terms('/home/jvdzwaan/data/adh/stopwords/custom.txt')

In [None]:
from nltk.corpus import stopwords as sw
# Arabic stopwords from the NLTK corpus reader, kept as a list.
stopwords_nltk = list(sw.words('arabic'))

In [None]:
# Number of entries in the NLTK stopword list.
print(len(stopwords_nltk))

In [None]:
# Show the container type of the NLTK stopword list.
type(stopwords_nltk)

In [None]:
def is_stopword(row, stopwords):
    """Return True when the row's `word` value occurs in `stopwords`.

    Args:
        row: mapping-like object with a `word` entry (e.g. a DataFrame row).
        stopwords: container supporting the `in` operator.
    """
    token = row['word']
    return token in stopwords
 
# Flag stopwords in every Khoja result frame. The frames stored in
# khoja_results are modified in place; later cells read the new columns.
for doc_id, df in khoja_results.items():
    # Series.isin tests all words in one vectorised call. The row-wise
    # apply it replaces scanned the NLTK list once per row and does not
    # yield a column for an empty frame.
    df['stopword'] = df['word'].isin(stopwords)
    df['stopword_nltk'] = df['word'].isin(stopwords_nltk)
    # Number of stopwords according to the custom list and to the NLTK list.
    print(np.sum(df['stopword']), np.sum(df['stopword_nltk']))

In [None]:
# Percentage of rows per document for which Khoja's proposed root is correct.
for doc_id, df in khoja_results.items():
    # Work on the column itself instead of taking `[0]` from a Series that
    # is indexed by column name: integer keys on a non-integer index are a
    # deprecated positional lookup in recent pandas.
    accuracy = df['root_correct'].sum() / len(df) * 100
    print(doc_id, accuracy)

In [None]:
# Max' stopwords
# Khoja accuracy on the words that are not in the custom stopword list.
for doc_id, df in khoja_results.items():
    # Filter once and reuse the result for both the count and the score.
    content_words = df.query('stopword == False')
    # Direct column access avoids the deprecated positional `[0]` lookup on
    # a label-indexed Series.
    accuracy = content_words['root_correct'].sum() / len(content_words) * 100
    print(doc_id, content_words.shape[0], accuracy)

In [None]:
# nltk stopwords
# Khoja accuracy on the words that are not in the NLTK stopword list.
for doc_id, df in khoja_results.items():
    # Filter once and reuse the result for both the count and the score.
    content_words = df.query('stopword_nltk == False')
    # Direct column access avoids the deprecated positional `[0]` lookup on
    # a label-indexed Series.
    accuracy = content_words['root_correct'].sum() / len(content_words) * 100
    print(doc_id, content_words.shape[0], accuracy)

In [None]:
# Compare every AlKhalil output file with its gold standard. The XML is parsed
# as analyzer output (stemmer=False); results are keyed by the gold-standard
# file name.
# NOTE(review): files are paired by position via zip() - this presumably
# relies on both listings being in the same order; confirm.
alkhalil_results = {}
for gs, xml in zip(gs_files, alkhalil_files):
    doc_id = os.path.basename(gs)
    alkhalil_results[doc_id] = compare_to_gs(gs, xml, stemmer=False)

In [None]:
def count_proposed_roots(row):
    """Count the backslash-separated alternatives in `row['proposed_root']`."""
    # n separators delimit n + 1 alternatives.
    return row['proposed_root'].count('\\') + 1

# Per-document Series with the number of proposed roots for every word.
means = []
num_single_root = 0
total_words = 0

for doc_id, df in alkhalil_results.items():
    # Leave out rows that exist on one side of the alignment only.
    missing = df.index[df['word'] == 'MISSING']
    missing2 = df.index[df['proposed_root'] == 'MISSING']
    df = df.drop(missing).drop(missing2)

    df['num_roots'] = df.apply(count_proposed_roots, axis=1)
    root_counts = df['num_roots']
    print(doc_id, np.mean(root_counts), np.min(root_counts), np.max(root_counts))
    means.append(root_counts)
    print(df.shape[0])
    num_single_root += np.sum(root_counts == 1)
    total_words += df.shape[0]

print(num_single_root)
print(total_words)
# Mean number of proposed roots over all words of all documents.
print(np.mean([count for per_doc in means for count in per_doc]))

In [None]:
# AlKhalil accuracy per document, ignoring rows that exist on one side of the
# alignment only.
for doc_id, df in alkhalil_results.items():
    no_gs_word = df['word'] == 'MISSING'
    no_proposal = df['proposed_root'] == 'MISSING'
    # One boolean filter replaces two successive drop() calls, which raise a
    # KeyError as soon as a label appears in both index sets.
    df = df[~(no_gs_word | no_proposal)]
    # Direct column access avoids the deprecated positional `[0]` lookup on
    # a label-indexed Series.
    accuracy = df['root_correct'].sum() / len(df) * 100
    # Also report how many rows of each kind were left out.
    print(doc_id, accuracy, no_gs_word.sum(), no_proposal.sum())

In [None]:
# Flag stopwords in every AlKhalil result frame. The frames stored in
# alkhalil_results are modified in place; later cells read the new columns.
for doc_id, df in alkhalil_results.items():
    # Series.isin tests all words in one vectorised call. The row-wise
    # apply it replaces scanned the NLTK list once per row and does not
    # yield a column for an empty frame.
    df['stopword'] = df['word'].isin(stopwords)
    df['stopword_nltk'] = df['word'].isin(stopwords_nltk)
    # Number of stopwords according to the custom list and to the NLTK list.
    print(np.sum(df['stopword']), np.sum(df['stopword_nltk']))

In [None]:
# Max' stopwords
for doc_id, df in alkhalil_results.items():
    missing = df[df['word']=='MISSING'].index
    missing2 = df[df['proposed_root']=='MISSING'].index
    df = df.drop(missing)
    df = df.drop(missing2)
    print(doc_id, df.query('stopword == False').shape[0], df.query('stopword == False')[['root_correct']].apply(lambda x: np.sum(x)/len(x)*100)[0])

In [None]:
# nltk stopwords
for doc_id, df in alkhalil_results.items():
    missing = df[df['word']=='MISSING'].index
    missing2 = df[df['proposed_root']=='MISSING'].index
    df = df.drop(missing)
    df = df.drop(missing2)
    print(doc_id, df.query('stopword_nltk == False').shape[0], df.query('stopword_nltk == False')[['root_correct']].apply(lambda x: np.sum(x)/len(x)*100)[0])

In [None]:
# Does AlKhalil remove all duplicate words?
from collections import Counter

for doc_id, df in alkhalil_results.items():
    # Ten most frequent words of the document with their frequencies.
    word_counts = Counter(df['word'])
    for word, freq in word_counts.most_common(10):
        print(word, freq)

    print('---')
# No, apparently it doesn't

In [None]:
# isri stemmer
import os
from nlppln.utils import get_files

# Output files of the ISRI stemmer.
isri_files = get_files('/home/jvdzwaan/data/tmp/adh/evaluation/isri/', recursive=True)

# Compare every ISRI output file with its gold standard. The XML is parsed as
# stemmer output (stemmer=True); results are keyed by the gold-standard file
# name.
# NOTE(review): files are paired by position via zip() - this presumably
# relies on both listings being in the same order; confirm.
isri_results = {}
for gs, xml in zip(gs_files, isri_files):
    doc_id = os.path.basename(gs)
    isri_results[doc_id] = compare_to_gs(gs, xml, stemmer=True)

In [None]:
# ISRI accuracy per document, ignoring rows that exist on one side of the
# alignment only.
for doc_id, df in isri_results.items():
    no_gs_word = df['word'] == 'MISSING'
    no_proposal = df['proposed_root'] == 'MISSING'
    # One boolean filter replaces two successive drop() calls, which raise a
    # KeyError as soon as a label appears in both index sets.
    df = df[~(no_gs_word | no_proposal)]
    # Direct column access avoids the deprecated positional `[0]` lookup on
    # a label-indexed Series.
    accuracy = df['root_correct'].sum() / len(df) * 100
    print(doc_id, df.shape[0], accuracy)
    # Number of rows of each kind that were left out.
    print(no_gs_word.sum(), no_proposal.sum())

In [None]:
# Flag stopwords in every ISRI result frame. The frames stored in
# isri_results are modified in place; later cells read the new columns.
for doc_id, df in isri_results.items():
    # Series.isin tests all words in one vectorised call. The row-wise
    # apply it replaces scanned the NLTK list once per row and does not
    # yield a column for an empty frame.
    df['stopword'] = df['word'].isin(stopwords)
    df['stopword_nltk'] = df['word'].isin(stopwords_nltk)
    # Number of stopwords according to the custom list and to the NLTK list.
    print(np.sum(df['stopword']), np.sum(df['stopword_nltk']))

In [None]:
# Max' stopwords
for doc_id, df in isri_results.items():
    missing = df[df['word']=='MISSING'].index
    missing2 = df[df['proposed_root']=='MISSING'].index
    df = df.drop(missing)
    df = df.drop(missing2)
    print(doc_id, df.query('stopword == False').shape[0], df.query('stopword == False')[['root_correct']].apply(lambda x: np.sum(x)/len(x)*100)[0])

In [None]:
# nltk stopwords
# ISRI accuracy on the words that are not in the NLTK stopword list.
for doc_id, df in isri_results.items():
    # Leave out rows that exist on one side of the alignment only. A single
    # boolean filter cannot fail the way two successive drop() calls do when
    # a label appears in both index sets.
    one_sided = (df['word'] == 'MISSING') | (df['proposed_root'] == 'MISSING')
    df = df[~one_sided]
    # Filter once and reuse the result for both the count and the score.
    content_words = df.query('stopword_nltk == False')
    # Direct column access avoids the deprecated positional `[0]` lookup on
    # a label-indexed Series.
    accuracy = content_words['root_correct'].sum() / len(content_words) * 100
    print(doc_id, content_words.shape[0], accuracy)

In [None]:
def count_roots(row):
    """Count the backslash-separated alternatives in `row['root']`."""
    separators = row['root'].count('\\')
    # n separators delimit n + 1 alternatives.
    return separators + 1

# How many gold-standard words list exactly two roots?
num_two_roots = 0
total_words = 0

for gs_file in gs_files:
    gs = pd.read_csv(gs_file)
    # Every statistic is taken from the gold-standard frame loaded above.
    # The loop used to read `df`, a variable left over from an earlier cell,
    # so the files listed in gs_files were never actually inspected.
    gs['num_roots'] = gs.apply(count_roots, axis=1)
    print(np.max(gs['num_roots']))
    num_two_roots += np.sum(gs['num_roots'] == 2)
    total_words += gs.shape[0]
print(num_two_roots)
print(total_words)