In [30]:
import os
from bs4 import BeautifulSoup
import pandas as pd
import itertools

In [74]:
## Read the XML files
def extract_from_xml(search_roots, filenames):
    """Collect file metadata and all words whose analyses contain a searched root.

    Parameters
    ----------
    search_roots : iterable of str
        Roots to look for.
    filenames : iterable of str
        Paths of the XML files to parse.

    Returns
    -------
    metadata : list of dict
        One dict per file, mapping the ``name`` attribute of every ``<meta>``
        element under ``<metadata>`` to its stripped text.
    df_total : pandas.DataFrame
        One row per ``<analysis>`` of every matched ``<word>``, holding the
        analysis attributes, a ``Filename`` column (base name of the file) and
        the attributes of the word itself. Empty when nothing matched.
    """
    search_roots = set(search_roots)  # built once instead of once per word
    metadata = []
    frames = []
    for filename in filenames:
        # Binary mode: the XML parser works on the raw bytes instead of text
        # decoded with the platform's default encoding.
        with open(filename, 'rb') as fn:
            xml_data = BeautifulSoup(fn, 'xml')

        metadata.append({meta['name']: meta.text.strip()
                         for meta in xml_data.metadata.find_all('meta')})

        basename = os.path.basename(filename)
        # A word matches when any of its analyses has a searched root; all of
        # its analyses are then kept.
        # TODO: do not include roots that are not within the search set
        for word in xml_data.morphology_analysis.find_all('word'):
            analyses = word.find_all('analysis')
            roots = {a.get('root', '') for a in analyses}
            if search_roots.isdisjoint(roots):
                continue
            df_analyses = pd.DataFrame([a.attrs for a in analyses])
            df_analyses['Filename'] = basename
            for att, value in word.attrs.items():
                df_analyses[att] = value
            frames.append(df_analyses)

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and copied the accumulated frame on every call.
    if not frames:
        return metadata, pd.DataFrame()
    return metadata, pd.concat(frames)

In [75]:
## Read the csv files
def extract_from_csv(search_roots, filenames):
    """Load CSV files and keep only the rows whose ``root`` is searched for.

    Parameters
    ----------
    search_roots : list-like of str
        Roots to keep (passed to ``Series.isin``).
    filenames : iterable of str
        Paths of the CSV files; the first column of each file is used as index.

    Returns
    -------
    pandas.DataFrame
        The matching rows of all files, in file order, with each file's own
        index labels. Empty when ``filenames`` is empty.
    """
    frames = []
    for i, filename in enumerate(filenames):
        if i % 1000 == 0:
            print(i)  # coarse progress indicator, one line per 1000 files
        df_sub = pd.read_csv(filename, index_col=0)
        frames.append(df_sub[df_sub['root'].isin(search_roots)])

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and copied the accumulated frame on every call.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [25]:
from lxml import etree
from tqdm import tqdm
import pandas as pd

def analyzer_xml2df2(fname, filter_roots=None, filter_stems=None, filter_words=None):
    """Stream an analyser XML file and tabulate the proposed roots and stems per word.

    Parameters
    ----------
    fname : str
        Path of the XML file; its ``<word>`` elements are processed one at a
        time.
    filter_roots : iterable of str, optional
        Keep only words with at least one proposed root in this collection.
    filter_stems : iterable of str, optional
        Keep only words with at least one proposed stem in this collection.
    filter_words : container of str, optional
        Keep only words whose ``value`` attribute is in this container.

    Returns
    -------
    pandas.DataFrame
        Columns ``word``, ``proposed_root`` and ``proposed_stem``; one row per
        retained ``<word>`` element. The distinct roots/stems of a word are
        sorted and joined with a backslash; ``'NOANALYSIS'`` is used when no
        ``<analysis>`` child carries the attribute.
    """
    # Build the filter sets once instead of once per word.
    root_filter = None if filter_roots is None else set(filter_roots)
    stem_filter = None if filter_stems is None else set(filter_stems)

    result = []

    # Extract the words
    context = etree.iterparse(fname, events=('end', ), tag='word')
    for _event, elem in tqdm(context):
        # A word without a 'value' attribute is skipped like an empty one.
        word = elem.get('value', '')
        if word != '' and (filter_words is None or word in filter_words):
            analyses = [child for child in elem if child.tag == 'analysis']
            # sorted() keeps the joined strings identical between runs; plain
            # set order depends on per-process string hashing.
            roots = sorted({a.attrib['root'] for a in analyses if 'root' in a.attrib})
            stems = sorted({a.attrib['stem'] for a in analyses if 'stem' in a.attrib})
            if not roots:
                roots = ['NOANALYSIS']
            if not stems:
                stems = ['NOANALYSIS']
            roots_ok = root_filter is None or not root_filter.isdisjoint(roots)
            stems_ok = stem_filter is None or not stem_filter.isdisjoint(stems)
            if roots_ok and stems_ok:
                result.append({'word': word,
                               'proposed_root': '\\'.join(roots),
                               'proposed_stem': '\\'.join(stems)})

        # make iteration over context fast and consume less memory
        #https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    # Explicit columns so that a file without matches still yields the same schema.
    return pd.DataFrame(result, columns=['word', 'proposed_root', 'proposed_stem'])

## Free reasoning

In [58]:
# Stems searched for in the corpus, each mapped to the reasoning category
# ('free' or 'following') it is counted under.
stems_dict = dict([
    ('اجتهاد', 'free'),
    ('مجتهد', 'free'),
    ('مقلد', 'following'),
    ('تقليد', 'following'),
])

# View on the stems themselves, used as the search filter.
stems_list = stems_dict.keys()

In [34]:
fpath = '/media/sf_VBox_Shared/Arabic/Fiqh/2018-09-18-Al-Khalil/'

# Analyse every file in fpath, keeping the words with one of the target stems,
# and tag each row with the file it came from.
# sorted(): os.listdir returns names in arbitrary order.
frames = []
for fname in sorted(os.listdir(fpath)):
    print(fname)
    df_sub = analyzer_xml2df2(os.path.join(fpath, fname), filter_stems=stems_list)
    df_sub['fname'] = fname
    frames.append(df_sub)

# Concatenate once: DataFrame.append was removed in pandas 2.0 and copied the
# accumulated frame on every iteration.
df = pd.concat(frames) if frames else pd.DataFrame()

639it [00:00, 6240.20it/s]

0179MalikIbnAnas.Muwatta.xml


131338it [00:08, 15041.65it/s]
604it [00:00, 6031.10it/s]

0182AbuYusufYacqub.Kharaj.xml


63518it [00:05, 12419.39it/s]
530it [00:00, 5286.32it/s]

0264IbnYahyaMuzani.Mukhtasar.xml


168993it [00:11, 14840.28it/s]
932it [00:00, 9315.90it/s]

0274AhmadBarqi.Mahasin.xml


119926it [00:07, 15157.21it/s]
551it [00:00, 5495.79it/s]

0311AbuBakrKhallal.WuqufWaTarajjul.xml


28239it [00:02, 11107.63it/s]
658it [00:00, 6559.89it/s]

0334IbnHusaynKhiraqi.MukhtasarMinMasailIbnHanbal.xml


29891it [00:02, 12555.86it/s]
876it [00:00, 8758.00it/s]

0373AbuLaythSamarqandi.CuyunMasail.xml


48316it [00:04, 11680.61it/s]
494it [00:00, 4760.29it/s]

0381IbnBabawayh.Hidaya.xml


17888it [00:01, 14400.27it/s]
1254it [00:00, 12536.16it/s]

0386IbnAbiZaydQayrawani.NawadirWaZiyadat.xml


1566042it [01:27, 17855.02it/s]
754it [00:00, 7499.69it/s]

0415IbnMuhammadMahamili.LubabFiFiqhShafici.xml


26278it [00:01, 18296.80it/s]
1228it [00:00, 12172.08it/s]

0422QadiCabdWahhabThaclabi.IkmalMuCallim.xml


1131695it [01:01, 18478.54it/s]
1095it [00:00, 10938.61it/s]

0450AbuHasanMawardi.HawiKabir.xml


2563290it [03:11, 13409.37it/s]
845it [00:00, 8443.91it/s]

0458QadiAbuYacla.MasailFiqhyya.xml


152023it [00:12, 12266.07it/s]
627it [00:00, 6267.56it/s]

0460ShaykhTusi.Mabsut.xml


748994it [01:02, 12063.01it/s]
588it [00:00, 5495.58it/s]

0463IbnCabdBarr.KafiFiFiqh.xml


187213it [00:13, 13825.88it/s]
447it [00:00, 4080.49it/s]

0483IbnAhmadSarakhsi.Mabsut.xml


2368787it [03:14, 12190.92it/s]
985it [00:00, 9846.37it/s]

0507AbuBakrShashi.HilyaCulama.xml


93498it [00:05, 16387.77it/s]
720it [00:00, 7192.31it/s]

0587IbnMascudCalaDinKasani.BadaicSanaic.xml


1374076it [01:24, 16288.21it/s]
739it [00:00, 7074.88it/s]

0597IbnJawzi.TahqiqFiAhadithKhilaf.xml


183128it [00:09, 18765.70it/s]
780it [00:00, 7705.43it/s]

0600QutbdinBayhaqiKaydari.IdahShica.xml


63395it [00:03, 18478.42it/s]
1300it [00:00, 12764.13it/s]

0620IbnQudamaMaqdisi.MughniFiFiqh.xml


1675300it [01:33, 17892.16it/s]
1173it [00:00, 11726.88it/s]

0676IbnHasanMuhaqqiqHilli.SharaicIslam.xml


173686it [00:08, 19467.40it/s]
764it [00:00, 7599.04it/s]

0676Nawawi.ManahijTalibin.xml


73299it [00:03, 18537.75it/s]
1275it [00:00, 12746.40it/s]

0684ShihabDinQarafi.Thakhira.xml


1153964it [01:00, 18979.97it/s]
700it [00:00, 6990.42it/s]

0710IbnAhmadHafizDinNasafi.KanzDaqaiq.xml


27179it [00:01, 17315.43it/s]
1275it [00:00, 12745.30it/s]

0763IbnMuflihHanbaliMuqaddasi.FurucWaTashihFuruc..xml


763054it [00:43, 17351.62it/s]
1086it [00:00, 10856.57it/s]

0769IbnLuluShihabDinIbnNaqibShafici.CumdatSalik.xml


38891it [00:02, 17426.84it/s]
1248it [00:00, 12476.12it/s]

0776Aljundi.AltawdihFiShafh.xml


1097144it [00:56, 19256.76it/s]
761it [00:00, 7450.21it/s]

0786IbnMuhammadBabarti.CinayaSharhHidaya.xml


1126139it [01:02, 18126.64it/s]
1436it [00:00, 14338.75it/s]

0786ShahidAwwal.Durus.xml


190183it [00:09, 20032.66it/s]
1302it [00:00, 13013.28it/s]

0829TaqiDinDimashqiHisni.KifayatAkhyar.xml


185423it [00:09, 19675.07it/s]
1282it [00:00, 12813.29it/s]

0841JamalDinIbnFahdHilli.MuhadhdhabBaric.xml


336263it [00:19, 17378.27it/s]
1414it [00:00, 14134.55it/s]

0861IbnCabdWahidIbnHumamSiwasi.FathQadir.xml


1974742it [01:50, 17855.93it/s]
1180it [00:00, 11797.68it/s]

0884IbnMuflih.MubdicFiSharhMuqnic.xml


948556it [00:50, 18925.42it/s]
1104it [00:00, 11037.75it/s]

0897IbnYusufCabdariGharnati.TajWaIklilLiMukhtasarKhalil.xml


670267it [00:40, 16684.88it/s]
1386it [00:00, 13853.84it/s]

0940IbnHusaynMuhaqqiqThaniKaraki.JamicMaqasid.xml


1085712it [00:57, 18735.18it/s]
1386it [00:00, 13661.39it/s]

0970IbnIbrahimIbnNujaymMisri.BahrRaiq.xml


1640137it [01:32, 17769.30it/s]
1316it [00:00, 13156.47it/s]

0972IbnAhmadIbnNajjarHanbali.MuntahaIradat.xml


117570it [00:06, 18522.91it/s]
1365it [00:00, 13644.61it/s]

0977KhatibShirbini.MughniMuhtaj.xml


1135403it [01:01, 18602.10it/s]
1297it [00:00, 12840.94it/s]

0995Manjur.SharhManhajMuntakhab.xml


106457it [00:05, 20523.36it/s]
1596it [00:00, 15955.07it/s]

1004ShamsDinRamli.NihayatMuhtaj.xml


973638it [00:56, 17250.42it/s]
1358it [00:00, 13556.90it/s]

1051IbnYunusBuhutiHanbali.RawdMurbic.xml


142483it [00:07, 19057.97it/s]
1421it [00:00, 14055.20it/s]

1078ShaykhiZadahDamadAfandi.MajmacAnhur..xml


369086it [00:18, 20041.71it/s]
1142it [00:00, 11416.01it/s]

1091MuhammadMuhsinFaydKashani.MafatihSaraiC.xml


172694it [00:08, 21224.34it/s]
1369it [00:00, 13685.51it/s]

1122MuhammadZarqani.SharhCalaMuwatta.xml


880388it [00:51, 17252.06it/s]
1292it [00:00, 12916.44it/s]

1204SulaymanJamal.Hashiya.xml


2095952it [01:51, 18864.57it/s]
1163it [00:00, 11304.88it/s]

1205MuhammadBaqirWahidBahbahani.xml


820584it [00:44, 18527.06it/s]
1125it [00:00, 11247.97it/s]

1206IbnCabdWahhab.MajmuucAlhadith.xml


128344it [00:07, 17169.46it/s]
1194it [00:00, 11852.92it/s]

1230Dusuqi.SharhKabir.xml


1529755it [01:23, 18332.33it/s]
1258it [00:00, 12575.63it/s]

1231Tahtawi.Hashia.xml


309203it [00:15, 20186.87it/s]
1345it [00:00, 13265.28it/s]

1252IbnCabidinDimashqi.RaddMuhtar.xml


2417710it [02:16, 17705.62it/s]
1088it [00:00, 10877.34it/s]

1266MuhammadHasanNajafiJawhari.JawahirKalam.xml


4466424it [04:17, 17338.08it/s]
1322it [00:00, 13216.07it/s]

1310BakriDimyati.HashiyaIcanaTalibin.xml


832425it [00:46, 17754.40it/s]
1297it [00:00, 12965.13it/s]

1319IbnYasinLabadiNabulusi.HashiyatCalaNaylMaarib.xml


101820it [00:05, 18157.08it/s]
1281it [00:00, 12805.55it/s]

1335IbnCabdSamicAbiAzhari.ThamrDani.xml


141512it [00:07, 19189.19it/s]


In [38]:
# Write the raw per-word matches to CSV, without the index.
fname_out = '/media/sf_VBox_Shared/Arabic/Analyses/fiqh-reasoning-raw.csv'
df.to_csv(fname_out, index=False)

In [55]:
# Retrieve the stem we were originally interested in
def get_original_stem(stems):
    """Return the searched-for stem contained in a ``proposed_stem`` value.

    Parameters
    ----------
    stems : str
        Backslash-separated stem candidates.

    Returns
    -------
    str or None
        The candidate that also occurs in ``stems_list``. When several
        candidates qualify, the smallest in string order is returned so the
        choice is reproducible. ``None`` when no candidate qualifies.
    """
    overlap = set(stems.split('\\')).intersection(stems_list)
    # min() rather than list(overlap)[0]: the iteration order of a set of
    # strings can differ between interpreter runs.
    return min(overlap) if overlap else None


df['stem'] = df.proposed_stem.map(get_original_stem)

In [59]:
# Translate each stem into its category. Mapping with the dict itself leaves
# rows without a recognised stem (None) as NaN instead of raising KeyError.
df['category'] = df['stem'].map(stems_dict)

In [69]:
# Derive the book identifier by stripping the '.xml' extension. The pattern is
# anchored to the end so a '.xml' elsewhere in a name is left alone.
# NOTE(review): names ending in '..xml' keep a trailing dot — confirm the
# metadata uses the same spelling.
df['BookURI'] = df['fname'].str.replace(r'\.xml$', '', regex=True)

In [70]:
# Matched words per book (rows) and category (columns); combinations that do
# not occur are filled with 0.
counts_category = (
    df.groupby(['BookURI', 'category'])
    .size()
    .unstack(level='category')
    .fillna(0)
)

In [71]:
# Preview the first rows of the per-book category counts.
counts_category.head()

category,following,free
BookURI,Unnamed: 1_level_1,Unnamed: 2_level_1
0179MalikIbnAnas.Muwatta,2.0,11.0
0264IbnYahyaMuzani.Mukhtasar,3.0,10.0
0274AhmadBarqi.Mahasin,0.0,7.0
0334IbnHusaynKhiraqi.MukhtasarMinMasailIbnHanbal,0.0,3.0
0373AbuLaythSamarqandi.CuyunMasail,1.0,0.0


Merge with metadata:

In [132]:
metadata_fname = '/media/sf_VBox_Shared/Arabic/fiqh_corpus/Meta/Metadata_Fiqh.csv'
# The second CSV column becomes the index.
# NOTE(review): presumably this is the BookURI column — confirm against the file.
metadata = pd.read_csv(metadata_fname, index_col=1)
# Leading digits of the Century text. expand=False returns a Series for the
# single capture group instead of depending on the default of `expand`.
metadata['century_n'] = metadata.Century.str.extract(r'([0-9]*)', expand=False)
metadata.columns

  app.launch_new_instance()


Index(['Order', 'School', 'Number_of_tokens', 'AuthorAKA', 'Author',
       'AuthorBORNH', 'AuthorBORNC', 'AuthorDIEDH', 'AuthorDIEDC', 'Century',
       'Title', 'Geographical_area', 'Tagging', 'BookVOLS', 'century_n'],
      dtype='object')

In [133]:
# Metadata fields carried over into the exported tables.
metadata_columns = [
    'AuthorAKA',
    'AuthorBORNH',
    'AuthorBORNC',
    'AuthorDIEDH',
    'AuthorDIEDC',
    'Century',
    'School',
    'Geographical_area',
    'Number_of_tokens',
    'century_n',
]
metadata = metadata.loc[:, metadata_columns]

In [134]:
# Add the selected metadata columns to the per-book counts.
# NOTE(review): presumably relies on pandas aligning the assigned frame on the
# shared book index — confirm.
counts_category[metadata_columns] = metadata[metadata_columns]

In [135]:
# Matched words per (book, stem, category) combination, as a long Series.
counts_long = (
    df.groupby(by=['BookURI', 'stem', 'category'])
    .size()
)
counts_long.head()

BookURI                       stem    category 
0179MalikIbnAnas.Muwatta      اجتهاد  free          9
                              تقليد   following     2
                              مجتهد   free          2
0264IbnYahyaMuzani.Mukhtasar  اجتهاد  free         10
                              تقليد   following     3
dtype: int64

In [136]:
# Turn the long counts into a flat table with a 'count' column and attach the
# metadata of each book.
counts_long_merged = pd.merge(
    counts_long.to_frame('count').reset_index(),
    metadata.reset_index(),
    left_on='BookURI',
    right_on='BookURI',
)

In [137]:
# Write the per-book category counts (with their index) to CSV.
fname_out = '/media/sf_VBox_Shared/Arabic/Analyses/fiqh-reasoning-aggregated.csv'
counts_category.to_csv(fname_out)

In [138]:
# Write the long-format counts merged with metadata to CSV, without the index.
fname_out = '/media/sf_VBox_Shared/Arabic/Analyses/fiqh-reasoning-aggregated-long.csv'
counts_long_merged.to_csv(fname_out, index=False)

# Extract senses

In [76]:
# Roots to search for, one per line of the literal below.
senses_roots = {
    root
    for root in '''سمع
بصر
لمس
شمم
ذوق'''.splitlines()
}

In [None]:
## from XML
filepath = '/media/sf_VBox_Shared/Arabic/indices/20180424/merged/'
# Chain the full paths of all files found anywhere below filepath.
xml_file_names = itertools.chain.from_iterable(
    [os.path.join(d, f) for f in fnames]
    for d, dnames, fnames in os.walk(filepath)
)
# Only the first 20 files are analysed here.
metadata, matched_words = extract_from_xml(senses_roots, list(xml_file_names)[:20])

In [78]:
# from CSV
filepath = '/media/sf_VBox_Shared/Arabic/Fiqh/Fiqh-Alkhalil-csv/csv'
# sorted(): os.listdir returns names in arbitrary order; sorting makes the row
# order of df_total reproducible between runs.
csv_file_names = [os.path.join(filepath, fn) for fn in sorted(os.listdir(filepath))]
df_total = extract_from_csv(senses_roots, csv_file_names)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000


In [79]:
# Dimensions (rows, columns) of the combined matches.
df_total.shape

(29505, 7)

In [80]:
# Number of matched rows per root.
df_total['root'].value_counts()

سمع    18732
بصر     5296
لمس     2976
شمم     1860
ذوق      641
Name: root, dtype: int64

In [81]:
# English label of the sense each searched root stands for.
senses_dict = {
    'بصر': 'see',
    'سمع': 'hear',
    'لمس': 'touch',
    'شمم': 'smell',
    'ذوق': 'taste',
}

# Label every row by looking up its root; an unknown root raises KeyError.
df_total['sense'] = list(map(senses_dict.__getitem__, df_total['root']))

## Merged with metadata

In [82]:
# Metadata fields to attach to the matches.
metadata_fields = ['BookURI', 'Century', 'AuthorNAME', 'AuthorGeographicalArea', 'AuthorBORNH', 'AuthorBORNC', 'AuthorDIEDH', 'AuthorDIEDC',  'BookSUBJ', 'NumberOfTokens']

metadata_new = pd.read_csv('/media/sf_VBox_Shared/Arabic/Fiqh/merged_metadata.csv')

# Book name = old file name without its '.txt' extension. Raw string: '\.' in
# a normal string literal is an invalid escape sequence.
metadata_new['Bookname'] = metadata_new.filename_old.str.extract(r'(.*)\.txt', expand=False)

metadata_merged = metadata_new[['Bookname']+metadata_fields].copy()

metadata_merged.columns

Index(['Bookname', 'BookURI', 'Century', 'AuthorNAME',
       'AuthorGeographicalArea', 'AuthorBORNH', 'AuthorBORNC', 'AuthorDIEDH',
       'AuthorDIEDC', 'BookSUBJ', 'NumberOfTokens'],
      dtype='object')

In [83]:
# Attach the book metadata to every match (left join on the book name), then
# drop the two join keys.
df_merged = (
    df_total
    .merge(metadata_merged, how='left', left_on='title', right_on='Bookname')
    .drop(columns=['Bookname', 'title'])
)

In [84]:
# Write the matches with their metadata to CSV, without the index.
df_merged.to_csv('/media/sf_VBox_Shared/Arabic/Analyses/senses_fiqh.csv', index=False)

In [85]:
# Map each root to its 'tr_root' value, taken from the distinct (root, tr_root) pairs.
tr_dict = dict(
    df_total[['root', 'tr_root']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)
tr_dict

{'بصر': 'bSr', 'ذوق': '*wq', 'سمع': 'smE', 'شمم': '$mm', 'لمس': 'lms'}

In [88]:
# Also prepare aggregated csv: occurrences per book (rows) and sense (columns),
# with 0 for combinations that do not occur.
df_agg = df_total.groupby(['title', 'sense']).size().unstack(fill_value=0)

# Attach the book metadata (left join on the book name).
df_agg_merged = df_agg.reset_index().merge(metadata_merged, left_on='title', right_on='Bookname', how='left').drop(['Bookname'], axis=1)

senses_cols = df_agg.columns
senses_cols_relative = [c+'_p' for c in senses_cols]
# Relative frequency: each sense count divided by the book's number of tokens.
# Vectorised division replaces a row-wise apply over the mixed-dtype frame.
df_agg_merged[senses_cols_relative] = df_agg_merged[senses_cols].div(df_agg_merged['NumberOfTokens'], axis=0)

# NOTE(review): unlike the other exports, this one also writes the integer
# index as an unnamed first column — confirm whether index=False was intended.
df_agg_merged.to_csv('/media/sf_VBox_Shared/Arabic/Analyses/senses_fiqh_agg.csv')