In this notebook we will scrape the Medina Amana website for death records.


In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import ummalqura
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

%matplotlib inline

In [None]:
# Address of the death-record inquiry form that the browser session opens.
medina_url = (
    "https://services.amana-md.gov.sa/eservicesite/Inq/DeathInquiry.aspx"
)


I tried sending the queries to the website directly with POST requests, but there seems to be an issue with the protocol. Instead, I will use the Chrome webdriver and its "find_element" methods to locate elements in the DOM and scrape the data.

In [None]:
# Start a Chrome session through the chromedriver binary next to the notebook
# and open the inquiry form.
# NOTE(review): recent Selenium 4 releases expect the driver path to be passed
# through a Service object instead of positionally -- confirm against the
# installed Selenium version.
browser = webdriver.Chrome("./chromedriver")
browser.get(medina_url)
# Element lookups retry for up to 5 seconds before giving up.
browser.implicitly_wait(5)

# Query range as the option values of the form's drop-downs (presumably Hijri
# dates). select_by_value() returns None, so the date parts are kept in their
# own variables; edit the m_from_* values to resume an interrupted run.
# start parsing from 1340/1/1
m_from_day, m_from_month, m_from_year = '01', '01', '1340'
# to 1440/11/12
m_to_day, m_to_month, m_to_year = '12', '11', '1440'

# (drop-down element id, value to select), applied in this order.
date_fields = [
    ('ctl00_ContentPlaceHolder1_cboDFrom', m_from_day),
    ('ctl00_ContentPlaceHolder1_cboMFrom', m_from_month),
    ('ctl00_ContentPlaceHolder1_cboYFrom', m_from_year),
    ('ctl00_ContentPlaceHolder1_cboDTo', m_to_day),
    ('ctl00_ContentPlaceHolder1_cboMTo', m_to_month),
    ('ctl00_ContentPlaceHolder1_cboYTo', m_to_year),
]
for field_id, field_value in date_fields:
    # find_element(By.ID, ...) is available in both Selenium 3 and 4, unlike
    # the find_element_by_id shortcut.
    Select(browser.find_element(By.ID, field_id)).select_by_value(field_value)

# create a list to store pages (one parsed BeautifulSoup document per page)
page_store = []

browser.find_element(By.ID, 'ctl00_ContentPlaceHolder1_btnSubmit').click()

# make sure to wait (up to 10 s) for the results table to be present
WebDriverWait(browser, 10).until(
    lambda drv: drv.find_element(By.ID, 'ctl00_ContentPlaceHolder1_dgDeath')
)
# and store the first page of results
page_store.append(bs(browser.page_source, 'html.parser'))

In [None]:
# Now let's automate the navigation.
# IMPORTANT NOTE: this automation might stop at some point because of changes in the page's dynamic behaviour as you click. When it stops,
# restart the process from where it stopped by changing the start date in the section above (m_from_day, m_from_month and m_from_year).


In [None]:
# Walk the results pager: click every pager link in turn and keep a parsed
# copy of each page in page_store. The loop has no normal exit; it ends when a
# pager link can no longer be found or any other error is raised.

# define the indices of the pager links to click (1-based, as used by XPath)
spans = list(range(1, 22))

# text of every clicked link, kept to debug the process
span_text = []
# index of every requested link, kept to debug the index
span_index = []

# Number of completed passes over `spans`. The index list has to change when
# the second and the third pass finish; this counter controls that change.
change_index = 0

# XPath of the k-th link inside the pager row of the results table.
# NOTE(review): the row position 78 is hard-coded -- presumably the pager row
# of a full page; confirm against the live markup.
pager_link_xpath = '//*[@id="ctl00_ContentPlaceHolder1_dgDeath"]/tbody/tr[78]/td/a[%s]'

try:
    while True:

        for span in spans:
            span_index.append(span)
            # find_elements returns an empty list when the link is missing,
            # so [0] raises IndexError (handled below).
            span_holder = browser.find_elements(By.XPATH, pager_link_xpath % span)[0]
            # show us where we are
            print("span text: ", span_holder.text)
            print("span index: ", span)
            span_text.append(span_holder.text)
            # click
            span_holder.click()
            # Wait until the clicked link has been replaced (old page gone) and
            # the results table is present again, so the stored source is the
            # new page rather than the one just left.
            WebDriverWait(browser, 10).until(EC.staleness_of(span_holder))
            WebDriverWait(browser, 10).until(
                lambda drv: drv.find_element(By.ID, 'ctl00_ContentPlaceHolder1_dgDeath')
            )
            # append to page store
            page_store.append(bs(browser.page_source, 'html.parser'))

        if change_index == 1:
            print("change 1")
            # when the second pass finishes:
            # drop index 1 and add another 21 to access the next set of links
            spans = list(range(2, 22)) + [21]
        if change_index == 2:
            print("change 2")
            # when the third pass finishes:
            # drop the extra 21 that was added after the second pass
            spans = list(range(2, 22))

        change_index += 1
except IndexError:
    # No link at the requested position: either the last page was reached or
    # the pager layout differs from what the index list assumes.
    print("No pager link at index %s: last page reached or pager layout changed" % span)
except Exception as e:
    print(e)
    print('Something wrong')

### Extract the data from pages

In [None]:
# Accumulators for the scraped columns: one independent, initially empty list
# per field (name, sex, nationality, date of death, age).
names, sex, nationality, DOD, age = ([] for _ in range(5))



In [None]:
# Extract the record fields from every stored page and append them to the
# column lists. The lists are appended to, not reset, so running this cell
# twice on the same page_store duplicates the rows.

# Position of each field's <td> within a result row, paired with the list
# that collects it.
field_columns = [(0, names), (1, sex), (2, nationality), (3, age), (4, DOD)]

for page_number, p in enumerate(page_store):
    results_table = p.find('table', {'id': 'ctl00_ContentPlaceHolder1_dgDeath'})
    if results_table is None:
        # A stored page without the results table has nothing to extract;
        # report it and carry on instead of aborting the whole extraction.
        print("page %s has no results table, skipped" % page_number)
        continue

    table_rows = results_table.findAll('tr')

    # the data is contained here: everything but the first two rows and the
    # last row (presumably header rows and the pager row -- TODO confirm)
    container = table_rows[2:-1]

    for row in container:
        # look the cells up once per row instead of once per field
        cells = row.findAll('td')
        for column, target in field_columns:
            if column < len(cells):
                target.append(cells[column].text.strip())
            else:
                # fill with np.nan if the cell doesn't exist, which keeps
                # all column lists the same length
                target.append(np.nan)

Now save the data to a DataFrame and then to a CSV file.



In [None]:
# Assemble the column lists into one table (column order: name, sex,
# nationality, age, DOD) and write it to data_m.csv as UTF-8 without the
# row index.
df = pd.DataFrame(
    dict(name=names, sex=sex, nationality=nationality, age=age, DOD=DOD)
)
df.to_csv('data_m.csv', index=False, encoding='utf-8')

The End