In [None]:
import pandas as pd

hadiths = pd.read_csv('raw/kaggle_hadiths_clean.csv')
rawis = pd.read_csv('raw/kaggle_rawis.csv')

In [None]:
hadiths.head()

In [None]:
rawis.head()

## Hadiths Cleanup

In [None]:
hadiths.info()

In [None]:
hadiths.dropna(inplace=True)

In [None]:
hadiths.info()

### Drop Duplicates

In [None]:
ids_with_duplicates = hadiths.hadith_id.value_counts()[hadiths.hadith_id.value_counts() > 1].index

In [None]:
hadiths.drop_duplicates(subset='hadith_id', inplace=True)
# Keep first occurence
hadiths[hadiths.hadith_id.isin(ids_with_duplicates)].shape[0] == len(ids_with_duplicates)


In [None]:
hadiths.hadith_id.value_counts().sort_values(ascending=False)

In [None]:
hadiths.count()

### Missing Sanad Data

In [None]:
hadith_id_to_scholar_array = dict()

def extract_scholars(row):
	hadith_id = row['hadith_id']
	chain = row['chain_indx']

	# seperate on commas and remove empty strings
	scholars = [s.strip() for s in chain.split(',') if s.strip() != '']
	# as int
	scholars = [int(s) for s in scholars]
	hadith_id_to_scholar_array[hadith_id] = scholars

hadiths.apply(extract_scholars, axis=1)

In [None]:
def find_gaps():
	for key in hadith_id_to_scholar_array:
		for index in range(len(hadith_id_to_scholar_array[key])):
		
			scholar_indx = hadith_id_to_scholar_array[key][index]
			# find in rawis
			gap = rawis[rawis['scholar_indx'] == scholar_indx].shape[0] < 1
			if gap:
				gap_tuples.append((key, scholar_indx))

	return gap_tuples

In [None]:
gap_tuples = find_gaps()
missing_narrator_ids = set([t[1] for t in gap_tuples])
hadiths_with_missing_narrators = set([t[0] for t in gap_tuples])

In [None]:
len(missing_narrator_ids), len(hadiths_with_missing_narrators)

In [None]:
# drop hadiths with missing narrators
hadiths = hadiths[~hadiths.hadith_id.isin(hadiths_with_missing_narrators)]

In [None]:
hadiths.info()
hadiths.to_csv('raw/complete_hadiths.csv', index=False)

### Rename Columns

In [None]:
hadiths.drop('hadith_no', axis=1, inplace=True)

In [None]:
hadiths.rename(columns={
	'hadith_id': 'hadith_no',
	'source': 'book',
	'chain_indx': 'chain',
}, inplace=True)

In [None]:
hadiths.info()

#### Save

In [None]:
# hadiths.to_csv('raw/complete_hadiths.csv', index=False)

## Narrators Cleanup

In [None]:
rawis.info()

In [None]:
rawi_columns = [
	"scholar_indx",
	"name",
	"grade",
	"death_date_place",
	"area_of_interest",
	"death_reason",
	"places_of_stay"
]

In [None]:
rawis = rawis[rawi_columns]

In [None]:
rawis.info()

In [None]:
rawis.rename(columns={
	'scholar_indx': 'narrator_id'
}, inplace=True)

#### Save

In [None]:
rawis.to_csv('raw/complete_narrators.csv')

### String Formatting - Hadith

In [None]:

narrators = pd.read_csv("raw/complete_narrators.csv")
hadiths = pd.read_csv("raw/complete_hadiths.csv")
hadiths.dropna(inplace=True)

In [None]:
hadiths["book"] = hadiths["book"].str.strip()

#### Sampling

In [None]:
first_few = hadiths.iloc[20000:].head(15).text_en.values
with open('first_few.txt', 'w', encoding='utf-8') as f:
	for text in first_few:
		f.write(text + '\n')

#### clean chapters

In [None]:
# split each string on last occurence of hyphen
hadiths['chapter_en'] = hadiths['chapter'].str.rsplit('-', n=1, expand=True)[0].str.strip()
hadiths['chapter_ar'] = hadiths['chapter'].str.rsplit('-', n=1, expand=True)[1].str.strip()
hadiths.drop('chapter', axis=1, inplace=True)

#### clean texts

In [None]:
# make sure spaces only appear one time to avoid large gaps

for _ in range(20):
	hadiths['text_en'] = hadiths['text_en'].str.replace('  ', ' ') 
	hadiths['text_ar'] = hadiths['text_ar'].str.replace('  ', ' ')

In [None]:
def format_arabic_text(text):
	text = text.replace('\u200f', '')
	text = text.replace('\u200e', '')
	text = text.replace('\n', '')
	text = text.replace(' .', '.')
	text = text.strip()
	return text

In [None]:
hadiths["text_ar"] = hadiths.text_ar.apply(format_arabic_text)
hadiths["text_en"] = hadiths.text_en.str.strip()
hadiths["text_en"] = hadiths.text_en.str.replace('\n', '')

In [None]:
hadiths.info()

### String Formatting - narrators

In [252]:
narrators = pd.read_csv("raw/complete_narrators.csv")

In [253]:
narrators.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24326 entries, 0 to 24325
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        24326 non-null  int64 
 1   narrator_id       24326 non-null  int64 
 2   name              24326 non-null  object
 3   grade             24326 non-null  object
 4   death_date_place  24320 non-null  object
 5   area_of_interest  24272 non-null  object
 6   death_reason      24188 non-null  object
 7   places_of_stay    16728 non-null  object
dtypes: int64(2), object(6)
memory usage: 1.5+ MB


In [254]:
def format_name(text):
	text = text.replace('\u200f', '')
	text = text.replace('\u200e', '')
	text = text.replace('\n', '')
	text = text.replace(' .', '.')

	text = text.replace('رضي الله عنها', '')
	text = text.replace('رضي الله عنه', '')
	text = text.replace('(', '')
	text = text.replace(')', '')

	text = text.strip()
	
	return text

In [255]:
def split_row_on_name(row):
	# split name on start of arabic chars into 2 columns
	name = row['name']
	# find an arabic char
	i = 0
	for char in name:
		if ord(char) > 1500:
			break
		
		i += 1

	row['name_en'] = name[:i].strip()
	row['name_ar'] = name[i:].strip()
	return row

In [256]:
narrators.name = narrators.name.apply(format_name)

In [257]:
narrators = narrators.apply(split_row_on_name, axis=1)
narrators.drop('name', axis=1, inplace=True)

In [None]:
first_few = narrators.iloc[2360:].head(15).places_of_stay.values
with open('first_few.txt', 'w', encoding="utf-8") as f:
	for text in first_few:
		f.write(text.replace('\u200f', '') + '\n')

In [258]:
def format_grade(text):
	text = text.replace('\u200f', '')
	text = text.replace('\u200e', '')
	text = text.replace('\n', '')
	text = text.replace(' .', '.')

	text = text.replace("Comp.", 'Companion ')
	text = text.replace("Follower", 'Follower ')
	text = text.replace("Succ.", 'Successor')
	return text

In [259]:
narrators["grade"] = narrators.grade.apply(format_grade)

In [None]:
narrators.death_reason.value_counts()

In [260]:
narrators.drop('death_date_place', axis=1, inplace=True)
narrators.drop('area_of_interest', axis=1, inplace=True)

In [261]:
if 'Unnamed: 0' in narrators.columns:
	narrators.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
narrators.info()

In [262]:
hadiths.to_csv('hadiths_clean.csv', index=False)
narrators.to_csv('narrators_clean.csv', index=False)