In [None]:
import os
from bs4 import BeautifulSoup
import pandas as pd
import itertools

In [None]:
## Read the XML files
def extract_from_xml(search_roots, filenames):
    """Collect per-file metadata and root-matched words from morphology XML files.

    Every file is parsed with BeautifulSoup's ``'xml'`` parser. The ``<meta>``
    elements found under ``<metadata>`` are turned into one dict per file.
    Every ``<word>`` under ``<morphology_analysis>`` is kept when at least one
    of its ``<analysis>`` children has a ``root`` attribute contained in
    ``search_roots``.

    Parameters
    ----------
    search_roots : iterable of str
        Roots to search for.
    filenames : iterable of str
        Paths of the XML files to read.

    Returns
    -------
    metadata : list of dict
        One ``{meta name: stripped meta text}`` dict per file, in input order.
    df_total : pandas.DataFrame
        One row per ``<analysis>`` of every matched word. Columns are the
        analysis attributes, a ``Filename`` column holding the base name of
        the source file, and one column per attribute of the ``<word>``
        element. An empty DataFrame is returned when nothing matched.
    """
    # Build the lookup set once instead of once per word.
    search_roots = set(search_roots)
    metadata = []
    frames = []
    for filename in filenames:
        with open(filename) as fn:
            # The parser consumes the whole file here, so it can be closed afterwards.
            xml_data = BeautifulSoup(fn, 'xml')

        meta_dict = {meta['name']: meta.text.strip()
                     for meta in xml_data.metadata.find_all('meta')}
        metadata.append(meta_dict)

        # loop over words and match with the searched words
        # To do: do not include roots that are not within the search set
        # (currently ALL analyses of a matched word are kept, including
        # those whose root is not one of the searched roots).
        for word in xml_data.morphology_analysis.find_all('word'):
            analyses = word.find_all('analysis')
            roots = {a.get('root', '') for a in analyses}
            if search_roots.isdisjoint(roots):
                continue
            df_analyses = pd.DataFrame([a.attrs for a in analyses])
            df_analyses['Filename'] = os.path.basename(filename)
            # Broadcast the word-level attributes onto each analysis row.
            for att, value in word.attrs.items():
                df_analyses[att] = value
            frames.append(df_analyses)

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # appending inside a loop copies the accumulated frame every time.
    df_total = pd.concat(frames) if frames else pd.DataFrame()
    return metadata, df_total

In [None]:
## Read the csv files
def extract_from_csv(search_roots, filenames):
    """Read CSV files and keep only the rows whose ``root`` is searched for.

    Each file is read with its first column as the index, filtered on the
    ``root`` column, and the filtered pieces are concatenated in input order.
    The running file counter is printed every 1000 files as a progress
    indicator.

    Parameters
    ----------
    search_roots : set or list-like of str
        Roots to keep.
    filenames : iterable of str
        Paths of the CSV files to read; each must contain a ``root`` column.

    Returns
    -------
    pandas.DataFrame
        All matching rows with their original per-file index values.
        An empty DataFrame is returned when ``filenames`` is empty.
    """
    frames = []
    for i, filename in enumerate(filenames):
        if i % 1000 == 0:
            print(i)
        df_sub = pd.read_csv(filename, index_col=0)
        frames.append(df_sub[df_sub['root'].isin(search_roots)])
    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # appending inside a loop copies the accumulated frame every time.
    return pd.concat(frames) if frames else pd.DataFrame()

# Extract senses

In [None]:
# The roots to search for, written one per line in the literal below.
senses_roots = {root for root in '''سمع
بصر
لمس
شمم
ذوق'''.splitlines()}

In [None]:
## from XML
filepath = '/media/sf_VBox_Shared/Arabic/indices/20180424/merged/'
# Yield the full path of every file found anywhere below `filepath`.
xml_file_names = itertools.chain.from_iterable(
    (os.path.join(dirpath, basename) for basename in basenames)
    for dirpath, _subdirs, basenames in os.walk(filepath)
)
# Only the first 20 paths (in os.walk order) are passed on for extraction.
metadata, matched_words = extract_from_xml(senses_roots, list(xml_file_names)[:20])

In [None]:
# from CSV
filepath = '/media/sf_VBox_Shared/Arabic/Fiqh/Fiqh-Alkhalil-csv/csv'
# Full path of every entry directly inside the CSV directory.
csv_file_names = list(map(lambda entry: os.path.join(filepath, entry), os.listdir(filepath)))
# Keep only the rows whose root is one of the searched roots.
df_total = extract_from_csv(senses_roots, csv_file_names[:])

In [None]:
# Size of the extracted table as (rows, columns).
df_total.shape

In [None]:
# Number of extracted rows for each root.
df_total['root'].value_counts()

In [None]:
# English label for each searched root.
senses_dict = {
    'بصر': 'see',
    'سمع': 'hear',
    'لمس': 'touch',
    'شمم': 'smell',
    'ذوق': 'taste',
}

# Label every row with its sense; a root missing from senses_dict raises KeyError.
df_total['sense'] = list(map(senses_dict.__getitem__, df_total['root']))

## Merged with metadata

In [None]:
# Metadata columns that are carried over into the analysis tables.
metadata_fields = ['BookURI', 'Century', 'AuthorNAME', 'AuthorGeographicalArea', 'AuthorBORNH', 'AuthorBORNC', 'AuthorDIEDH', 'AuthorDIEDC', 'BookSUBJ', 'NumberOfTokens']

metadata_new = pd.read_csv('/media/sf_VBox_Shared/Arabic/Fiqh/merged_metadata.csv')

# Book name = old file name up to its '.txt' suffix.
# Raw string: '\.' must reach the regex engine as an escaped dot; in a normal
# string literal it is an invalid escape sequence that newer Python versions warn about.
metadata_new['Bookname'] = metadata_new.filename_old.str.extract(r'(.*)\.txt', expand=False)

#metadata_merged = metadata_df['Bookname'].reset_index().merge(metadata_new, left_on='Bookname', right_on='Bookname', how='left')
# Independent copy holding only the join key and the selected metadata columns.
metadata_merged = metadata_new[['Bookname'] + metadata_fields].copy()

metadata_merged.columns

In [None]:
# Left join: every extracted row is kept and gets the metadata of its book
# (matched on title == Bookname); both join-key columns are dropped afterwards.
df_merged = (
    pd.merge(df_total, metadata_merged, how='left', left_on='title', right_on='Bookname')
    .drop(columns=['Bookname', 'title'])
)

In [None]:
# Write the word-level table (joined with book metadata) to disk; the row index is not written.
df_merged.to_csv('/media/sf_VBox_Shared/Arabic/Analyses/senses_fiqh.csv', index=False)

In [None]:
# Lookup from each root to its `tr_root` value (presumably a transliteration — confirm).
# If a root occurs with several distinct tr_root values, the last pair wins.
tr_dict = dict(
    df_total[['root', 'tr_root']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)
tr_dict

In [None]:
# Also prepare aggregated csv
# Count the extracted rows per book ('title') and sense: one row per book,
# one column per sense, 0 where a book has no rows for a sense.
df_agg = df_total.groupby(['title', 'sense']).size().unstack(fill_value=0)
#df_agg.columns = [u'{} ({})'.format(c, tr_dict[c]) for c in df_agg.columns]

# Left join so that every aggregated book row is kept, with its metadata attached.
df_agg_merged = df_agg.reset_index().merge(metadata_merged, left_on='title', right_on='Bookname', how='left').drop(['Bookname'], axis=1)

senses_cols = df_agg.columns
senses_cols_relative = [c+'_p' for c in df_agg.columns]
# Relative frequency: each sense count divided by the book's NumberOfTokens.
# Vectorised column-wise division (aligned on rows) instead of a row-wise apply.
df_agg_merged[senses_cols_relative] = df_agg_merged[senses_cols].div(df_agg_merged['NumberOfTokens'], axis=0)

# NOTE(review): unlike the senses_fiqh.csv export, this one also writes the
# row index as the first column — confirm that downstream readers expect it.
df_agg_merged.to_csv('/media/sf_VBox_Shared/Arabic/Analyses/senses_fiqh_agg.csv')