In [8]:
import pandas as pd

hadiths = pd.read_csv('raw/kaggle_hadiths_clean.csv')
rawis = pd.read_csv('raw/kaggle_rawis.csv')

## Hadiths Cleanup

In [9]:
hadiths.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34441 entries, 0 to 34440
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          34441 non-null  int64 
 1   hadith_id   34441 non-null  int64 
 2   source      34441 non-null  object
 3   chapter_no  34441 non-null  int64 
 4   hadith_no   34441 non-null  object
 5   chapter     34441 non-null  object
 6   chain_indx  34318 non-null  object
 7   text_ar     34433 non-null  object
 8   text_en     33588 non-null  object
dtypes: int64(3), object(6)
memory usage: 2.4+ MB


In [10]:
hadiths.dropna(inplace=True)

In [11]:
hadiths.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33536 entries, 0 to 34440
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          33536 non-null  int64 
 1   hadith_id   33536 non-null  int64 
 2   source      33536 non-null  object
 3   chapter_no  33536 non-null  int64 
 4   hadith_no   33536 non-null  object
 5   chapter     33536 non-null  object
 6   chain_indx  33536 non-null  object
 7   text_ar     33536 non-null  object
 8   text_en     33536 non-null  object
dtypes: int64(3), object(6)
memory usage: 2.6+ MB


### Drop Duplicates

In [12]:
ids_with_duplicates = hadiths.hadith_id.value_counts()[hadiths.hadith_id.value_counts() > 1].index

In [13]:
hadiths.drop_duplicates(subset='hadith_id', inplace=True)
# Keep first occurence
hadiths[hadiths.hadith_id.isin(ids_with_duplicates)].shape[0] == len(ids_with_duplicates)


True

In [14]:
hadiths.hadith_id.value_counts().sort_values(ascending=False)

hadith_id
1        1
1029     1
4        1
5        1
6        1
        ..
54224    1
54225    1
54226    1
54213    1
54227    1
Name: count, Length: 32798, dtype: int64

In [15]:
hadiths.drop("hadith_no", axis=1, inplace=True)

In [16]:
hadiths.rename({"hadith_id": "hadith_no"}, axis=1, inplace=True)

In [17]:
hadiths.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32798 entries, 0 to 34440
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          32798 non-null  int64 
 1   hadith_no   32798 non-null  int64 
 2   source      32798 non-null  object
 3   chapter_no  32798 non-null  int64 
 4   chapter     32798 non-null  object
 5   chain_indx  32798 non-null  object
 6   text_ar     32798 non-null  object
 7   text_en     32798 non-null  object
dtypes: int64(3), object(5)
memory usage: 2.3+ MB


### Missing Sanad Data

In [19]:
hadith_id_to_scholar_array = dict()

def extract_scholars(row):
	hadith_id = row['hadith_no']
	chain = row['chain_indx']

	# seperate on commas and remove empty strings
	scholars = [s.strip() for s in chain.split(',') if s.strip() != '']
	# as int
	scholars = [int(s) for s in scholars]
	hadith_id_to_scholar_array[hadith_id] = scholars

hadiths.apply(extract_scholars, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
34436    None
34437    None
34438    None
34439    None
34440    None
Length: 32798, dtype: object

In [22]:
def find_gaps():
	gap_tuples = []
	for key in hadith_id_to_scholar_array:
		for index in range(len(hadith_id_to_scholar_array[key])):
		
			scholar_indx = hadith_id_to_scholar_array[key][index]
			# find in rawis
			gap = rawis[rawis['scholar_indx'] == scholar_indx].shape[0] < 1
			if gap:
				gap_tuples.append((key, scholar_indx))

	return gap_tuples

In [23]:
gap_tuples = find_gaps()
missing_narrator_ids = set([t[1] for t in gap_tuples])
hadiths_with_missing_narrators = set([t[0] for t in gap_tuples])

In [24]:
len(missing_narrator_ids), len(hadiths_with_missing_narrators)

(103, 1894)

In [25]:
# drop hadiths with missing narrators
hadiths = hadiths[~hadiths.hadith_no.isin(hadiths_with_missing_narrators)]

In [26]:
hadiths.info()
hadiths.to_csv('raw/complete_hadiths.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 30904 entries, 0 to 34440
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          30904 non-null  int64 
 1   hadith_no   30904 non-null  int64 
 2   source      30904 non-null  object
 3   chapter_no  30904 non-null  int64 
 4   chapter     30904 non-null  object
 5   chain_indx  30904 non-null  object
 6   text_ar     30904 non-null  object
 7   text_en     30904 non-null  object
dtypes: int64(3), object(5)
memory usage: 2.1+ MB


### Rename Columns

In [27]:
hadiths.rename(columns={
	'source': 'book',
	'chain_indx': 'chain',
}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hadiths.rename(columns={


In [28]:
hadiths.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30904 entries, 0 to 34440
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          30904 non-null  int64 
 1   hadith_no   30904 non-null  int64 
 2   book        30904 non-null  object
 3   chapter_no  30904 non-null  int64 
 4   chapter     30904 non-null  object
 5   chain       30904 non-null  object
 6   text_ar     30904 non-null  object
 7   text_en     30904 non-null  object
dtypes: int64(3), object(5)
memory usage: 2.1+ MB


#### Save

In [29]:
hadiths.to_csv('raw/complete_hadiths.csv', index=False)

## Narrators Cleanup

In [30]:
rawis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24326 entries, 0 to 24325
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   scholar_indx          24326 non-null  int64  
 1   name                  24326 non-null  object 
 2   grade                 24326 non-null  object 
 3   parents               3012 non-null   object 
 4   spouse                1010 non-null   object 
 5   siblings              2068 non-null   object 
 6   children              1583 non-null   object 
 7   birth_date_place      7555 non-null   object 
 8   places_of_stay        16728 non-null  object 
 9   death_date_place      24320 non-null  object 
 10  teachers              13436 non-null  object 
 11  students              6499 non-null   object 
 12  area_of_interest      24272 non-null  object 
 13  tags                  15666 non-null  object 
 14  books                 6 non-null      object 
 15  students_inds      

In [31]:
rawi_columns = [
	"scholar_indx",
	"name",
	"grade",
	"death_reason",
	"places_of_stay"
]

In [32]:
rawis = rawis[rawi_columns]

In [33]:
rawis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24326 entries, 0 to 24325
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   scholar_indx    24326 non-null  int64 
 1   name            24326 non-null  object
 2   grade           24326 non-null  object
 3   death_reason    24188 non-null  object
 4   places_of_stay  16728 non-null  object
dtypes: int64(1), object(4)
memory usage: 950.4+ KB


In [34]:
rawis.rename(columns={
	'scholar_indx': 'narrator_id'
}, inplace=True)

#### Save

In [35]:
rawis.to_csv('raw/complete_narrators.csv')

### String Formatting - Hadith

In [36]:

narrators = pd.read_csv("raw/complete_narrators.csv")
hadiths = pd.read_csv("raw/complete_hadiths.csv")
hadiths.dropna(inplace=True)

In [37]:
hadiths["book"] = hadiths["book"].str.strip()

#### Sampling

In [None]:
first_few = hadiths.iloc[20000:].head(15).text_en.values
with open('first_few.txt', 'w', encoding='utf-8') as f:
	for text in first_few:
		f.write(text + '\n')

#### clean chapters

In [38]:
# split each string on last occurence of hyphen
hadiths['chapter_en'] = hadiths['chapter'].str.rsplit('-', n=1, expand=True)[0].str.strip()
hadiths['chapter_ar'] = hadiths['chapter'].str.rsplit('-', n=1, expand=True)[1].str.strip()
hadiths.drop('chapter', axis=1, inplace=True)

#### clean texts

In [41]:
# make sure spaces only appear one time to avoid large gaps

for _ in range(20):
	hadiths['text_en'] = hadiths['text_en'].str.replace('  ', ' ') 
	hadiths['text_ar'] = hadiths['text_ar'].str.replace('  ', ' ')

In [42]:
def format_arabic_text(text):
	text = text.replace('\u200f', '')
	text = text.replace('\u200e', '')
	text = text.replace('\n', '')
	text = text.replace(' .', '.')
	text = text.strip()
	return text

In [43]:
hadiths["text_ar"] = hadiths.text_ar.apply(format_arabic_text)
hadiths["text_en"] = hadiths.text_en.str.strip()
hadiths["text_en"] = hadiths.text_en.str.replace('\n', '')

In [44]:
hadiths.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30904 entries, 0 to 30903
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          30904 non-null  int64 
 1   hadith_no   30904 non-null  int64 
 2   book        30904 non-null  object
 3   chapter_no  30904 non-null  int64 
 4   chain       30904 non-null  object
 5   text_ar     30904 non-null  object
 6   text_en     30904 non-null  object
 7   chapter_en  30904 non-null  object
 8   chapter_ar  30904 non-null  object
dtypes: int64(3), object(6)
memory usage: 2.1+ MB


### String Formatting - narrators

In [47]:
narrators = pd.read_csv("raw/complete_narrators.csv").drop("Unnamed: 0", axis=1)

In [48]:
narrators.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24326 entries, 0 to 24325
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   narrator_id     24326 non-null  int64 
 1   name            24326 non-null  object
 2   grade           24326 non-null  object
 3   death_reason    24188 non-null  object
 4   places_of_stay  16728 non-null  object
dtypes: int64(1), object(4)
memory usage: 950.4+ KB


In [49]:
def format_name(text):
	text = text.replace('\u200f', '')
	text = text.replace('\u200e', '')
	text = text.replace('\n', '')
	text = text.replace(' .', '.')

	text = text.replace('رضي الله عنها', '')
	text = text.replace('رضي الله عنه', '')
	text = text.replace('(', '')
	text = text.replace(')', '')
	text = text.strip()
	
	return text

In [50]:
def split_row_on_name(row):
	# split name on start of arabic chars into 2 columns
	name = row['name']
	# find an arabic char
	i = 0
	for char in name:
		if ord(char) > 1500:
			break
		
		i += 1

	row['name_en'] = name[:i].strip()
	row['name_ar'] = name[i:].strip()
	return row

In [51]:
narrators.name = narrators.name.apply(format_name)

In [52]:
narrators = narrators.apply(split_row_on_name, axis=1)
narrators.drop('name', axis=1, inplace=True)

In [None]:
first_few = narrators.iloc[2360:].head(15).places_of_stay.values
with open('first_few.txt', 'w', encoding="utf-8") as f:
	for text in first_few:
		f.write(text.replace('\u200f', '') + '\n')

In [54]:
def format_grade(text):
	text = text.replace('\u200f', '')
	text = text.replace('\u200e', '')
	text = text.replace('\n', '')
	text = text.replace(' .', '.')

	text = text.replace("Comp.", 'Companion ')
	text = text.replace("Follower", 'Follower ')
	text = text.replace("Succ.", 'Successor')
	text = text.strip()
	return text

In [55]:
narrators["grade"] = narrators.grade.apply(format_grade)

In [56]:
narrators.death_reason.value_counts()

death_reason
Natural     23876
Martyred      312
Name: count, dtype: int64

In [57]:
narrators.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24326 entries, 0 to 24325
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   narrator_id     24326 non-null  int64 
 1   grade           24326 non-null  object
 2   death_reason    24188 non-null  object
 3   places_of_stay  16728 non-null  object
 4   name_en         24326 non-null  object
 5   name_ar         24326 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.1+ MB


In [59]:
hadiths.to_csv('complete_hadiths_clean.csv', index=False)
narrators.to_csv('complete_narrators_clean.csv', index=False)

In [1]:
import pandas as pd
hadiths = pd.read_csv('hadiths_clean.csv')