## ICBF Webscraping Notebook

**Please DO NOT RUN this notebook.**

The code in this file shows the process that was followed on a laptop to scrape text from the FAQs section of the Colombian Institute for Family Welfare (ICBF, its acronym in Spanish). This collection of FAQ documents contains common questions that Colombian citizens might have about different legal procedures regarding child welfare, nutrition, child adoption, and others. The dataset is in Spanish.

In [0]:
# Importing required libraries
import asyncio
import re
import time
from typing import Optional
from urllib.parse import urljoin

import aiohttp
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

# Entry page of the ICBF FAQ section; it is the page loaded for automated scrolling.
source_url = "https://www.icbf.gov.co/servicios/preguntas-y-respuestas-frecuentes"
# Placeholder: replace with the local chromedriver executable path before running Selenium.
chrome_webdriver = "REPLACE-HERE-WITH-CHROMEDRIVER-PATH-FOR-SELENIUM"

In [0]:
# Function that performs automated scrolling on a website with infinite scrolling.
def automated_webscrolling(url: str, webdriver_path: str,
                           initial_wait: float = 10,
                           scroll_pause_time: float = 5
                           ) -> webdriver.chrome.webdriver.WebDriver:
    """Open ``url`` in Chrome and keep scrolling down until the page stops growing.

    The page is scrolled one screen height at a time. After every scroll the
    function sleeps and re-reads ``document.body.scrollHeight``; it stops as
    soon as the next scroll target would be past that height.

    Args:
        url: Address of the page to load.
        webdriver_path: Path of the chromedriver executable handed to Selenium.
        initial_wait: Seconds to sleep after loading the page, before the
            first scroll.
        scroll_pause_time: Seconds to sleep after each scroll step.

    Returns:
        The Selenium Chrome driver, still open on the fully scrolled page.
        The caller is responsible for calling ``quit()`` on it.

    Raises:
        Any exception raised while loading or scrolling is re-raised after
        the browser has been closed.
    """
    webservice = webdriver.ChromeService(executable_path=webdriver_path)
    driver = webdriver.Chrome(service=webservice)

    try:
        driver.get(url)
        time.sleep(initial_wait)
        screen_height = driver.execute_script("return window.screen.height;")
        i = 1

        while True:
            # Scroll one screen height each time.
            driver.execute_script(f"window.scrollTo(0, {screen_height}*{i});")
            i += 1
            time.sleep(scroll_pause_time)
            # Re-read the scroll height after every step, as it can change
            # once the page has been scrolled.
            scroll_height = driver.execute_script("return document.body.scrollHeight;")
            # Stop when the next scroll target is beyond the total scroll height.
            if screen_height * i > scroll_height:
                break
    except BaseException:
        # The driver is not handed back on failure, so close the browser here
        # instead of leaving it running.
        driver.quit()
        raise

    return driver


In [0]:
# Parsing the FAQs site of the Colombian Institute for Family Welfare (ICBF) to obtain the URL of each question's page.
def icbf_faq_urls(driver: webdriver.chrome.webdriver.WebDriver
                  ) -> list:
    """Return the link target of every ``faq-btn`` element in the loaded page.

    The driver's current page source is parsed with BeautifulSoup. Elements
    with the ``faq-btn`` class that carry no ``href`` attribute are skipped
    instead of raising ``KeyError``. Relative links are resolved against the
    driver's current URL; absolute links are kept as absolute URLs.

    Args:
        driver: Selenium driver whose ``page_source`` holds the FAQ listing.

    Returns:
        List of URLs, in document order.
    """
    soup = BeautifulSoup(driver.page_source, "html.parser")
    base_url = driver.current_url

    return [
        urljoin(base_url, element["href"])
        for element in soup.find_all(class_="faq-btn", href=True)
    ]

In [0]:
# Defining an async single GET request with optional retry
async def single_get_request(
    session: aiohttp.client.ClientSession, url: str,
    max_attempts: int = 1, retry_delay: float = 1.0
) -> Optional[str]:
    """Fetch ``url`` through ``session`` and return the response body as text.

    Every exception raised by an attempt is printed. With the default
    ``max_attempts=1`` the request is tried exactly once.

    Args:
        session: Open aiohttp client session used to issue the request.
        url: Address to download.
        max_attempts: Total number of attempts; values below 1 are treated
            as 1.
        retry_delay: Seconds to wait between two consecutive attempts.

    Returns:
        The body text, or ``None`` when every attempt failed.
    """
    attempts = max(1, max_attempts)

    for attempt in range(1, attempts + 1):
        try:
            async with session.get(url) as request:
                return await request.text()
        except Exception as e:
            # Best effort: report the failure and let the caller decide what
            # to do with a missing page.
            print(e)
            if attempt < attempts:
                await asyncio.sleep(retry_delay)

    return None

# Defining how to make multiple async GET requests from a URL list
async def multiple_get_requests(
    loop: asyncio.AbstractEventLoop, urls: list
) -> list:
    """Download every URL in ``urls`` concurrently through one shared session.

    Args:
        loop: Event loop handed to the aiohttp client session. It is
            annotated with the public ``asyncio.AbstractEventLoop`` rather
            than a private, Unix-only loop class, so the definition does not
            depend on the platform.
        urls: Addresses to download.

    Returns:
        One entry per URL, in the same order as ``urls``: whatever
        ``single_get_request`` returned for it, or the exception object if
        one escaped (``return_exceptions=True``).
    """
    async with aiohttp.ClientSession(loop=loop) as session:
        return await asyncio.gather(
            *(single_get_request(session, url) for url in urls),
            return_exceptions=True,
        )

# Function that performs async web scraping from the FAQs site of ICBF
async def scrapping_icbf_urls(urls: list,
                              event_loop: asyncio.AbstractEventLoop
                              ) -> tuple:
    """Download every FAQ page and extract its id, title and answer text.

    Pages that could not be downloaded or parsed are reported on standard
    output and left out of the result.

    Args:
        urls: Addresses of the FAQ pages.
        event_loop: Event loop forwarded to ``multiple_get_requests``.

    Returns:
        A ``(scraps, webcontents)`` tuple. ``scraps`` is a list of dicts with
        the keys ``'id'``, ``'title'``, ``'answer'`` and ``'url'``;
        ``webcontents`` is the raw list returned by ``multiple_get_requests``
        (one entry per URL).
    """
    webcontents = await multiple_get_requests(event_loop, urls)

    scraps = []

    for url, page in zip(urls, webcontents):
        # A failed download is not a string (it is None or an exception
        # object), so there is nothing to parse.
        if not isinstance(page, str):
            print(f"The following URL could not be retrieved: {url}")
            continue

        try:
            parsing_doc = BeautifulSoup(page, "html.parser")
            title = parsing_doc.find(class_="faq-title2").text
            fields = parsing_doc.find_all(class_="faq-txt")
            if not fields:
                fields = parsing_doc.find_all("td")  # Some entries are in table data cells
            answer = ' '.join(item.text for item in fields)
            faq_id = parsing_doc.find(class_="faq-code").text.strip()
            # Drops everything up to the last non-breaking space.
            # NOTE(review): the replacement is a single space, so a matched id
            # keeps a leading space - confirm this is intended.
            faq_id = re.sub('^.*\xa0', ' ', faq_id)
            scraps.append({'id': faq_id, 'title': title, 'answer': answer, 'url': url})
        except Exception:
            # Best effort: a page lacking the expected elements is skipped
            # rather than aborting the whole run.
            print(f"The following URL could not be retrieved: {url}")
            continue

    return scraps, webcontents

# Flattens a scraped answer into one continuous string and drops spurious trailing text
def text_preprocessing(input_string: str) -> str:
    """Return ``input_string`` cleaned up as a single line.

    Newlines are removed, runs of two or more spaces collapse into one
    space, and everything from 'Trámites relacionados' onwards is cut off.
    """
    without_newlines = input_string.replace('\n', '')
    single_spaced = re.sub(' {2,}', ' ', without_newlines)
    return re.sub('Trámites relacionados.*', '', single_spaced)

In [0]:
# Performing the automated scrolling and link retrieval. Only in a local workstation.
scroller = automated_webscrolling(source_url, chrome_webdriver)
try:
    links = icbf_faq_urls(scroller)
finally:
    # The page source has been read (or reading it failed), so the browser is
    # no longer needed; quit it instead of leaving Chrome running.
    scroller.quit()
print(f"The number of potential text sources is: {len(links)}")

The number of potential text sources is: 341


In [0]:
# Scrapping the FAQ's website
documents, webcontents = await scrapping_icbf_urls(links, asyncio.get_event_loop())
print(f"The number of actually retrieved text sources is: {len(documents)}")

for document in documents:
    document['answer'] = text_preprocessing(document['answer'])

The following URL could not be retrieved: https://www.icbf.gov.co/que-se-entiende-cuando-se-habla-de-trafico-de-influencias-de-servidor-publico
The number of actually retrieved text sources is: 340


In [0]:
# Printing every cleaned answer next to its list position for a visual integrity check
for position, record in enumerate(documents):
    print(f"Index of the entry: {position}. Text: {record['answer']}")

Index of the entry: 0. Text: En este caso, usted debe presentar una demanda ante un Juez de Familia, a fin de que este resuelva la controversia, mediante sentencia. 
Index of the entry: 1. Text: Es la tenencia física, el cuidado personal y directo del niño, niña o adolescente. Es un derecho de los niños y una obligación de los padres o representantes legales. 
Index of the entry: 2. Text: Usted puede solicitar la fijación o regulación de visitas en el Centro Zonal de Bienestar Familiar del lugar en donde se encuentre el niño, niña o adolescente, ya que por Ley es la autoridad competente para conocer y decidir sobre este asunto. Allí, deberá presentarse el registro civil de nacimiento del niño(a), copia de la cedula del solicitante, pruebas de los hechos fundamento de las pretensiones, entre otras. Las visitas se regulan de acuerdo con las necesidades del hijo(a) y a la capacidad del padre o madre de brindarle la atención, cuidado y protección requerida durante el término que duren. Sin

In [0]:
# Eliminating entries without response
# The list is rebuilt in one step instead of popping while iterating over it:
# removing an item mid-iteration shifts the remaining ones, so the entry right
# after each removed one would never be checked.
documents[:] = [
    document for document in documents
    if document['answer'] != '' and document['answer'] is not None
]

print(f"The final number of recovered documents is: {len(documents)}")

The final number of recovered documents is: 339


In [0]:
# Persisting the recovered documents as a pipe-delimited CSV file, without the row index
icbf_frame = pd.DataFrame(data=documents)
icbf_frame.to_csv(path_or_buf='icbf_knowledge_base.csv', index=False, sep='|')