In [None]:
%matplotlib inline
import os
import pandas as pd
import alfanous
import codecs
import matplotlib.pyplot as plt
import re
import numpy as np

In [None]:
# Locations of the input and output data.
# The root can be overridden with the FIQH_DATA_ROOT environment variable so the
# notebook also runs on machines without the original VirtualBox shared folder;
# the defaults are the original paths.
data_root = os.environ.get('FIQH_DATA_ROOT', '/media/sf_VBox_Shared/Arabic')
in_dir = os.path.join(data_root, 'Fiqh_quran/')                  # one quotes file per book
metadata_file = os.path.join(data_root, 'Fiqh', 'Metadata.csv')  # book metadata table
out_dir = os.path.join(data_root, 'networks/')                   # node/link CSVs are written here

In [None]:
# Read in the meta data of the books.
# The table is transposed right after loading, so the CSV's first column becomes
# the column labels (presumably one row per book afterwards -- confirm against the file).
metadata = pd.read_csv(metadata_file, encoding='utf-8', sep=';', index_col=0).T
# Strip the '#META# <number>.' prefix from the field names. A raw string is used so
# that `\d` and `\.` reach the regex engine instead of being invalid string escapes.
metadata.columns = [re.sub(r'#META# \d+\.', '', c) for c in metadata.columns]
# File names are assigned purely by row position: row i (1-based) -> 'Fiqh<i>.txt'.
metadata['filename'] = ['Fiqh{}.txt'.format(i) for i in range(1, len(metadata)+1)]

In [None]:
# Sanity check: (rows, columns) of the metadata table after the transpose.
metadata.shape

In [None]:
# Peek at the first rows to check the cleaned column names and the added 'filename' column.
metadata.head()

In [None]:
# Read in the quran citations (files with one quote per line).
# Each file becomes one small frame; all frames are concatenated once at the end
# instead of growing `df` with DataFrame.append inside the loop (quadratic, and
# removed in pandas 2.0).
quote_frames = []

# sorted() makes the row order independent of the file system's listing order.
for fname in sorted(os.listdir(in_dir)):
    with codecs.open(os.path.join(in_dir, fname), encoding='utf-8') as f:
        quotes_in_file = [line.strip() for line in f]
    # The 'quotes_' prefix is dropped from the file name stored in the 'file' column.
    quote_frames.append(pd.DataFrame({'quotes': quotes_in_file, 'file': fname.replace('quotes_', '')}))

# pd.concat raises on an empty list, so fall back to an empty frame for an empty directory.
df = pd.concat(quote_frames) if quote_frames else pd.DataFrame()

In [None]:
# Preview the combined table: one row per quote line, tagged with the file it came from.
df.head()

In [None]:
# Table size; the first entry is the total number of quote lines read from all files.
df.shape

In [None]:
# How many files do we have? (number of distinct values in the 'file' column)
df['file'].nunique()

In [None]:
# Length of every quote in tokens, where a token is anything between single spaces.
# `.values` drops the (non-unique) index of `df` so the result gets a fresh 0..n-1 index.
nr_tokens = pd.Series(df['quotes'].str.split(' ').str.len().values)

In [None]:
import numpy as np
from matplotlib.ticker import LinearLocator

def plot_int_hist(s, ax=None, maxVal=None):
    """Plot a histogram of integer values with one unit-wide bin per integer.

    Parameters
    ----------
    s : pandas.Series
        Integer-valued data; must provide a ``hist`` method and support
        ``min``/``max``.
    ax : matplotlib axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).
    maxVal : number, optional
        Exclusive upper limit for the bin edges, used to zoom in on small
        values. Defaults to ``max(s) + 1`` so that every value gets a bin.
    """
    if maxVal is None:
        maxVal = max(s)+1
    if ax is None:
        ax = plt.gca()
    # Bin edges lie on the half-integers (k - 0.5), so each bar is centred on an
    # integer. The call is made regardless of whether `ax` was supplied.
    s.hist(bins=np.arange(min(s)-0.5, maxVal), grid=False, ax=ax)

In [None]:
# (Removed a leftover scratch expression `l.set_bounds`: `l` is not defined by any
# cell in this notebook, so it failed on a fresh kernel and produced no result.)

In [None]:
# Distribution of the quote lengths (in tokens) over the full range, on a wide figure.
plt.figure(figsize=(15,8))
plot_int_hist(nr_tokens)
plt.show()

In [None]:
# Zoom in: same histogram, but with the bins cut off below `maxnr` tokens.
maxnr = 15
plot_int_hist(nr_tokens, maxVal=maxnr)
zoom_ax = plt.gca()  # the axes the histogram was just drawn on
zoom_ax.set_xlabel('Nr of tokens')
zoom_ax.set_ylabel('Count')
zoom_ax.set_xticks(range(maxnr));

In [None]:
# How many times does each quote appear?
# value_counts() is indexed by the quote text and sorted from most to least frequent.
nr_quotes = df['quotes'].value_counts()
# Show the 20 most frequent quotes.
nr_quotes.head(20)

In [None]:
# Histogram of the occurrence counts, with one unit-wide bin per integer count.
plt.figure(figsize=(15,8))
count_bins = np.arange(-0.5, max(nr_quotes)+1)
count_ax = nr_quotes.hist(bins=count_bins)  # Series.hist returns the axes it drew on
count_ax.set_xlabel('nr quotes')
count_ax.set_ylabel('count')
plt.show()

In [None]:
# Use alfanous to trace the quotes back to quran verses.
# Every distinct quote is wrapped in double quotes for the query (presumably an
# exact-phrase search -- confirm in the alfanous docs); results are keyed by quote text.
searches = {
    quote: alfanous.search('"'+quote+'"', view='minimal')
    for quote in df['quotes'].unique()
}

In [None]:
# Same double-quoted query per distinct quote, but with alfanous' fuzzy option switched on.
searches_fuzzy = {
    quote: alfanous.search('"'+quote+'"', view='minimal', fuzzy=True)
    for quote in df['quotes'].unique()
}

In [None]:
# Unquoted variant: the raw quote text is passed as the query without the surrounding
# double quotes. NOTE(review): `searches_words` is not used by any later visible cell.
searches_words = {
    quote: alfanous.search(quote, view='minimal')
    for quote in df['quotes'].unique()
}

In [None]:
# One row per distinct quote, indexed by the quote text.
q_df = pd.DataFrame(nr_quotes)
# Name the count column 'quotes' and leave the index unnamed explicitly. Older pandas
# produces exactly this, but pandas >= 2.0 names the value_counts() result 'count' and
# its index 'quotes', which would break the later `q_df['quotes']` lookups and
# `reset_index()`.
q_df.columns = ['quotes']
q_df.index = q_df.index.rename(None)  # Index.rename returns a new Index; nr_quotes is untouched
# Number of ayas in the result of the quoted search and of the fuzzy search.
q_df['nr_ayas'] = [len(searches[q]['search']['ayas']) for q in q_df.index]
q_df['nr_ayas_fuzzy'] = [len(searches_fuzzy[q]['search']['ayas']) for q in q_df.index]

In [None]:
# Number of quotes for which the fuzzy search returns more ayas than the quoted search.
fuzzy_finds_more = q_df['nr_ayas_fuzzy'] > q_df['nr_ayas']
int(fuzzy_finds_more.sum())

In [None]:
# How many ayas are found for each quote with the quoted (non-fuzzy) search?
plot_int_hist(q_df['nr_ayas'])

In [None]:
# How many ayas are found for each quote with the fuzzy search?
plot_int_hist(q_df['nr_ayas_fuzzy'])

In [None]:
# Same histogram, restricted to quotes that occur more than once
# (the 'quotes' column holds the occurrence count of each quote).
occurs_repeatedly = q_df['quotes'] > 1
plot_int_hist(q_df.loc[occurs_repeatedly, 'nr_ayas'])

In [None]:
# Number of distinct files (books) each quote occurs in; aligned to q_df on the quote text.
q_df['nr_books'] = df.groupby('quotes')['file'].nunique()

In [None]:
# Some examples of quotes for which the quoted search returned no aya at all.
no_aya_found = q_df['nr_ayas'] < 1
q_df.loc[no_aya_found].head(20)

We take the simplest approach: if multiple ayas are found for a quote, we take the first one. If none are found, we eventually disregard the quote.

In [None]:
# Identifier fields that are copied from the matched aya into q_df.
metafields_aya = ('aya_id', 'sura_id', 'sura_name', 'sura_arabic_name')

# For every quote keep the 'identifier' record of the result stored under key 1,
# or an empty dict when the search found nothing.
# NOTE(review): key 1 looks like the first hit (alfanous seems to number results from 1) -- confirm.
ayas = {}
for quote in q_df.index:
    hits = searches[quote]['search']['ayas']
    ayas[quote] = hits[1]['identifier'] if len(hits) > 0 else {}

# One column per identifier field; quotes without a match get None.
for field in metafields_aya:
    q_df[field] = [ayas[quote].get(field, None) for quote in q_df.index]

In [None]:
# Composite key (sura_id, aya_id) identifying the aya a quote was traced back to.
# zip() is materialised with list(): on Python 3 it is a lazy iterator without a
# length, which cannot be assigned as a column. On Python 2 the result is unchanged.
q_df['id'] = list(zip(q_df['sura_id'], q_df['aya_id']))

In [None]:
# Note that multiple quotes can come from the same aya, so 'id' values may repeat across rows.
q_df.head()

In [None]:
# Attach the aya information to every literal quote occurrence.
# Only quotes with at least one matching aya are kept (the default inner join drops the rest).
# q_df's own 'quotes' column (the occurrence count) is suffixed to 'quotes_cnt'.
matched_quotes = q_df[q_df['nr_ayas'] > 0]
links_merged = df.merge(
    matched_quotes,
    left_on='quotes',
    right_index=True,
    suffixes=('', '_cnt'),
)

In [None]:
# Edge list of the network: one row per (file, aya id) pair, weighted by the number
# of quote occurrences in that file that were traced to that aya.
links = (
    links_merged
    .groupby(['file', 'id'])['quotes']
    .count()
    .reset_index()
)
links.columns = ['Source', 'Target', 'Weight']

In [None]:
# Store the nodes and links.
# All three CSV files are written as UTF-8.

links.to_csv(os.path.join(out_dir, 'fiqh_quran_links.csv'), index=False, encoding='utf-8')

# Book nodes: the file name serves as the node id.
book_columns = ['id', 'AuthorAKA', 'AuthorNAME', 'AuthorBORNC', 'AuthorDIEDC',
                'Author Geographical Area', 'Century', 'BookTITLE', 'BookSUBJ']
# .copy() makes book_nodes an independent frame, so the column assignments below
# do not trigger a SettingWithCopyWarning.
book_nodes = metadata.rename(columns={'filename': 'id'})[book_columns].copy()
book_nodes['Type'] = 'book'
book_nodes['Label'] = book_nodes['BookTITLE']
book_nodes.to_csv(os.path.join(out_dir, 'fiqh_quran_book_nodes.csv'), index=False, encoding='utf-8')

# Aya nodes: one row per distinct aya that at least one quote was traced back to.
aya_columns = ['id', 'sura_id', 'aya_id', 'sura_name', 'sura_arabic_name']
quran_nodes = q_df[q_df['nr_ayas']>0].reset_index()[aya_columns].drop_duplicates().copy()
quran_nodes['aya_id'] = quran_nodes['aya_id'].astype('int')
quran_nodes['sura_id'] = quran_nodes['sura_id'].astype('int')
quran_nodes['Type'] = 'aya'
# Label is '<sura_name> <aya number>'.
quran_nodes['Label'] = quran_nodes['sura_name'] + [' {:.0f}'.format(s) for s in quran_nodes['aya_id']]
quran_nodes.to_csv(os.path.join(out_dir, 'fiqh_quran_aya_nodes.csv'), index=False, encoding='utf-8')

## Some more analysis

In [None]:
# Example: list every distinct quote that was traced back to one aya (sura 2, aya 282).
q_df[(q_df['sura_id']==2) & (q_df['aya_id']==282)]