In [1]:
# NB: alfanous is not compatible with Python 3, so need to use python 2 here.
%matplotlib inline
import os
import pandas as pd
import alfanous
import codecs
import matplotlib.pyplot as plt
import re
import numpy as np

In [2]:
# Input/output locations. The defaults are the original VirtualBox shared-folder
# paths; set the ARABIC_DATA_ROOT environment variable to run the notebook on
# another machine without editing this cell.
data_root = os.environ.get('ARABIC_DATA_ROOT', '/media/sf_VBox_Shared/Arabic')
quotes_file = os.path.join(data_root, 'Analyses', 'Fiqh_final', 'quotes', 'quran_quotes.csv')
metadata_file = os.path.join(data_root, 'fiqh_corpus', 'Meta', 'Metadata_Fiqh.csv')
# The trailing '' component keeps the final path separator of the original value.
out_dir = os.path.join(data_root, 'networks', '')

In [3]:
# Load the book metadata table; the first CSV column is used as the row index.
metadata = pd.read_csv(
    metadata_file,
    sep=',',
    encoding='utf-8',
    index_col=0,
)

In [4]:
# Size of the metadata table as (rows, columns).
metadata.shape

(56, 17)

In [5]:
# Preview the first rows of the metadata to check the available columns.
metadata.head()

Unnamed: 0_level_0,BookURI,BookSource,BookTITLE_SHORT,BookTITLE,BookSUBJ,Number_of_tokens,AuthorAKA,AuthorNAME,AuthorBORNH,AuthorBORNC,AuthorDIEDH,AuthorDIEDC,Century,All_tokens_per-century,Author_Geographical_Area,Tagging,BookVOLS
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1.0,0182AbuYusufYacqub.Kharaj,http://shamela.ws/index.php/book/26333,الخراج,الخراج,حنفي,76062.0,أبو يوسف,أبو يوسف يعقوب بن إبراهيم بن حبيب بن سعد بن حب...,113,798,182.0,731,2ndH/8thC and 3rdH/9thC century,,Fertile Crescent,Quran @QB@ @QE@ Hadith @HB@,1
2.0,0204Shafici.Umm,http://shamela.ws/index.php/book/1655,الأم,الأم,شافعي,1205588.0,الشافعي,أبو عبد الله محمد بن إدريس الشافعيّ المطَّلِبي...,150,767,204.0,820,2ndH/8thC and 3rdH/9thC century,,Fertile Crescent,Books ### | Chapters ### || Subchapters ### |...,8
3.0,0179MalikIbnAnas.Muwatta,http://shamela.ws/index.php/book/28107,الموطأ,الموطّأ,مالكي,151424.0,مالك بن أنس,مالك بن أنس بن مالك بن عامر الأصبحي المدني,93,711,179.0,795,2ndH/8thC and 3rdH/9thC century,,Fertile Crescent,Books ### | Chapters ### || Quran @QB@ @QE@,1
4.0,0311AbuBakrKhallal.WuqufWaTarajjul,http://shamela.ws/index.php/book/26883,الوقوف والترجل,الوقوف والترجل من الجامع لمسائل الإمام أحمد بن...,حنبلي,32349.0,الخلال,أبو بكر أحمد بن محمد بن هارون بن يزيد الخلال ا...,234,848,311.0,923,2ndH/8thC and 3rdH/9thC century,,Fertile Crescent,Chapters ### || Quran @QB@ @QE@ Hadith @HB@,1
5.0,0274AhmadBarqi.Mahasin,http://siratali.org/maktaba/mahasin1/,المحاسن,المحاسن,شيعي,145940.0,البرقي,أحمد بن محمد بن خالد البرقي,NODATA,NODATA,274.0,887,2ndH/8thC and 3rdH/9thC century,1611363.0,Fertile Crescent,Chapters ### ||,2


In [6]:
# Read in the quran citations (files with one quote per line)
df = pd.read_csv(quotes_file, encoding='utf-8')
# Cast the quote text to unicode strings; later cells use the quote text as
# dictionary keys for the search results.
df['quote'] = df['quote'].astype('unicode')

In [7]:
# Preview the first quotes (book identifier, position and quote text).
df.head()

Unnamed: 0,BookURI,position,quote
0,0179MalikIbnAnas.Muwatta,8038,والمرسلات عرفا
1,0179MalikIbnAnas.Muwatta,8775,مالك يوم الدين
2,0179MalikIbnAnas.Muwatta,8784,إياك نعبد وإياك نستعين
3,0179MalikIbnAnas.Muwatta,11439,وإذا تولى سعى في الأرض
4,0179MalikIbnAnas.Muwatta,11447,وأما من جاءك يسعى وهو يخشى


In [10]:
# Size of the quotes table as (rows, columns); one row per quote.
df.shape

(2088, 3)

In [11]:
# How many files do we have?
# (counted as the number of distinct BookURI values among the quotes)
df['BookURI'].nunique()

3

In [None]:
# Check the dtype of the quote column after the unicode cast above.
df.quote.dtype

In [None]:
# Length of every quote in tokens, where tokens are separated by single spaces.
nr_tokens = pd.Series(
    [len(tokens) for tokens in (text.split(' ') for text in df['quote'])]
)

In [None]:
import numpy as np
from matplotlib.ticker import LinearLocator

def plot_int_hist(s, ax=None, maxVal=None):
    """Plot a histogram of integer values with one bin per integer.

    Bin edges are placed at half-integers (min(s)-0.5, min(s)+0.5, ...), so
    each integer value falls in the middle of its own bin.

    Parameters
    ----------
    s : pandas.Series
        Integer values to plot; must be non-empty, since min()/max() are taken.
    ax : matplotlib axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).
    maxVal : number, optional
        Exclusive upper bound for the bin edges. Defaults to ``max(s) + 1``,
        which covers all values; a smaller value zooms in on the low range.
    """
    if maxVal is None:
        maxVal = max(s)+1
    if ax is None:
        ax = plt.gca()
    # Draw unconditionally. This call used to sit inside the `ax is None`
    # branch, so nothing was plotted when the caller supplied an axes.
    s.hist(bins=np.arange(min(s)-0.5, maxVal), grid=False, ax=ax)

In [None]:
# Histogram of the number of tokens per quote, drawn on a wide (15x8) figure.
plt.figure(figsize=(15,8))
plot_int_hist(nr_tokens)
plt.show()

In [None]:
# Same histogram, restricted to quotes shorter than `maxnr` tokens.
maxnr = 15
plot_int_hist(nr_tokens, maxVal=maxnr)
zoom_ax = plt.gca()
zoom_ax.set_xlabel('Nr of tokens')
zoom_ax.set_ylabel('Count')
zoom_ax.set_xticks(range(maxnr));

In [None]:
# How many times does each quote appear?
# value_counts() returns the counts indexed by quote text, most frequent first.
nr_quotes = df['quote'].value_counts()
nr_quotes.head(20)

In [None]:
# Distribution of how often a quote occurs, one bin per integer count.
plt.figure(figsize=(15,8))
count_bin_edges = np.arange(-0.5, max(nr_quotes)+1)
nr_quotes.hist(bins=count_bin_edges)
count_ax = plt.gca()
count_ax.set_xlabel('nr quotes')
count_ax.set_ylabel('count')
plt.show()

In [None]:
# Trace every distinct quote back to Quran verses with alfanous. The quote is
# wrapped in double quotes (presumably a phrase query; confirm against the
# alfanous query syntax) and the 'minimal' result view is requested.
searches = {
    quote: alfanous.search('"' + quote + '"', view='minimal')
    for quote in df['quote'].unique()
}

In [None]:
# Same double-quoted query per distinct quote, but with fuzzy=True.
searches_fuzzy = {
    quote: alfanous.search('"' + quote + '"', view='minimal', fuzzy=True)
    for quote in df['quote'].unique()
}

In [None]:
# Query with the bare quote text (no surrounding double quotes).
searches_words = {
    quote: alfanous.search(quote, view='minimal')
    for quote in df['quote'].unique()
}

In [None]:
# Turn the per-quote counts into a DataFrame indexed by quote text, so that
# further per-quote columns can be added next to the count.
# NOTE(review): later cells address the count column as 'quote'; presumably it
# inherits the name of the counted Series. Verify with the pandas version in use.
q_df = pd.DataFrame(nr_quotes)
q_df.head()

In [None]:
# To do: Encoding issues
# Number of ayas returned per quote, for the plain and the fuzzy search.
quote_keys = [unicode(quote) for quote in q_df.index]
q_df['nr_ayas'] = [len(searches[key]['search']['ayas']) for key in quote_keys]
q_df['nr_ayas_fuzzy'] = [len(searches_fuzzy[key]['search']['ayas']) for key in quote_keys]

In [None]:
# Number of quotes for which the fuzzy search returns more ayas than the plain one.
fuzzy_finds_more = q_df['nr_ayas_fuzzy'] > q_df['nr_ayas']
len(q_df[fuzzy_finds_more])

In [None]:
# How many ayas are found for each quote (non-fuzzy search)?
plot_int_hist(q_df['nr_ayas'])

In [None]:
# How many ayas are found for each quote with the fuzzy search?
plot_int_hist(q_df['nr_ayas_fuzzy'])

In [None]:
# Aya counts restricted to quotes that occur more than once in the corpus.
plot_int_hist(q_df.loc[q_df['quote'] > 1, 'nr_ayas'])

In [None]:
# Number of distinct books (BookURI) in which each quote occurs.
q_df['nr_books'] = df.groupby('quote')['BookURI'].nunique()

In [None]:
# A sample of quotes for which the search returned no aya at all.
has_no_aya = q_df['nr_ayas'] < 1
q_df[has_no_aya].head(20)

We take the simplest approach: if multiple ayas are found, we take the first one. If none are found, the quote is eventually disregarded.

In [None]:
# Identifier fields copied from the search result into q_df.
metafields_aya = ('aya_id', 'sura_id', 'sura_name', 'sura_arabic_name')

# Per quote, keep the 'identifier' of the result stored under key 1, or an
# empty dict when nothing was found.
# NOTE(review): key 1 is presumably the first-ranked result; confirm against
# the alfanous output format.
ayas = {}
for quote in q_df.index:
    found = searches[quote]['search']['ayas']
    ayas[quote] = found[1]['identifier'] if len(found) > 0 else {}

for field in metafields_aya:
    q_df[field] = [ayas[quote].get(field, None) for quote in q_df.index]

In [None]:
# Node id of an aya: the (sura_id, aya_id) pair.
q_df['id'] = list(zip(q_df['sura_id'], q_df['aya_id']))

In [None]:
# Note that multiple quotes can come from the same aya
# (q_df has one row per distinct quote, so an aya id may repeat across rows).
q_df.head()

In [None]:
# Attach the aya information to every individual quote occurrence. Quotes
# without any aya are dropped; overlapping q_df columns get the '_cnt' suffix.
quotes_with_aya = q_df[q_df['nr_ayas']>0]
links_merged = df.merge(
    quotes_with_aya,
    left_on='quote',
    right_index=True,
    suffixes=('', '_cnt'),
)

In [None]:
# Count, per (book, aya) pair, how many quote occurrences link that book to
# that aya. The quotes table identifies books by 'BookURI' and holds the text
# in 'quote'; the former 'file' and 'quotes' names do not exist in it and
# raised a KeyError.
links = pd.DataFrame(links_merged.groupby(['BookURI', 'id'])['quote'].count())
links = links.reset_index()
# Edge-list column names: source node (book), target node (aya), edge weight.
links.columns = ['Source', 'Target', 'Weight']

In [None]:
# Store the nodes and links: first the edge list.
links_path = os.path.join(out_dir, 'fiqh_quran_links.csv')
links.to_csv(links_path, index=False)

# Book nodes: one row per book in the metadata. The node id is the BookURI,
# the same identifier the quotes table uses for its books. The metadata has
# no 'filename' or 'Author Geographical Area' column (it has 'BookURI' and
# 'Author_Geographical_Area'), so the old selection raised a KeyError.
book_node_columns = ['id', 'AuthorAKA', 'AuthorNAME', 'AuthorBORNC', 'AuthorDIEDC',
                     'Author_Geographical_Area', 'Century', 'BookTITLE', 'BookSUBJ']
# rename(columns=...) works on every pandas version; .copy() makes the added
# columns below land on an independent frame rather than a slice of metadata.
book_nodes = metadata.rename(columns={'BookURI': 'id'})[book_node_columns].copy()
book_nodes['Type'] = 'book'
book_nodes['Label'] = book_nodes['BookTITLE']
book_nodes.to_csv(os.path.join(out_dir, 'fiqh_quran_book_nodes.csv'), index=False, encoding='utf-8')

# Aya nodes: one row per distinct aya that at least one quote was traced to.
aya_node_columns = ['id', 'sura_id', 'aya_id', 'sura_name', 'sura_arabic_name']
quran_nodes = q_df[q_df['nr_ayas']>0].reset_index()[aya_node_columns].drop_duplicates()
for id_column in ('aya_id', 'sura_id'):
    quran_nodes[id_column] = quran_nodes[id_column].astype('int')
quran_nodes['Type'] = 'aya'
# Label: sura name followed by a space and the aya number.
aya_suffixes = [' {:.0f}'.format(aya_nr) for aya_nr in quran_nodes['aya_id']]
quran_nodes['Label'] = quran_nodes['sura_name'] + aya_suffixes
aya_nodes_path = os.path.join(out_dir, 'fiqh_quran_aya_nodes.csv')
quran_nodes.to_csv(aya_nodes_path, index=False, encoding='utf-8')

## Some more analysis

In [None]:
# Example of one aya that is cited through several different quotes:
# list all distinct quotes traced to sura 2, aya 282.
in_sura_2 = q_df['sura_id'] == 2
is_aya_282 = q_df['aya_id'] == 282
q_df[in_sura_2 & is_aya_282]