# Scraping W3C working group data

This notebooks scraps the data from the W3C website.

In [None]:
import pandas as pd
import requests
from selenium.webdriver import Firefox
from time import sleep
from bs4 import BeautifulSoup
import re
from glob import glob

## Functions

In [2]:
def find_class(soup, class_attr):
    """
    Given a Beautiful Soup object `soup`, return its first element
    whose 'class' attribute is `class_attr` (str) as a Beautiful soup
    object.
    """
    
    return soup.find(attrs={'class':class_attr})


def get_class_text(soup, class_attr, strip=True):
    """
    Return the text inside the first element  in `soup` 
    (Beautiful Soup object) whose 'class' attribute 
    is `class_attr`. If `strip` is True, remove trailing 
    empty spaces.
    """
    element = find_class(soup, class_attr)
    if type(element) == type(None):
        return None
    
    text = element.text
    if strip == True:
        text = text.strip()
    
    return text


def find_all_class(soup, class_attr):
    """
    Given a Beautiful Soup object `soup`, return all its elements
    whose 'class' attribute is `class_attr` (str) as a list of 
    Beautiful soup objects.    
    """
    elements = soup.find_all(attrs={'class': class_attr})

    return elements


def source_to_class_elements(source, class_attr):
    """
    Given a HTML page source code `source` (str), return all 
    its elements whose 'class' attribute is `class_attr` (str) 
    as a list of Beautiful soup objects.    
    """
    
    soup = BeautifulSoup(source, features='lxml')
    elements = find_all_class(soup, class_attr)
    
    return elements


In [3]:
def scrap_group_card(group_card):
    """
    Given an HTML div containing info about a W3C working group
    (BeautifulSoup element Tag), return a dict with the group's
    name, url and description.
    """
    data = dict()
    
    data['name'] = group_card.find('h2').text.strip()
    data['url'] = root + group_card.find('a').attrs['href']
    data['description'] = group_card.find('p').text
    
    return data

In [4]:
def trim_whitespaces(string):
    """
    Remove excessive whitespace from string.
    """
    return ' '.join(string.strip().split())

In [5]:
def get_page_source(url):
    """
    Use Selenium to download the source of the webpage 
    at `url` (str). Return it as a str.
    """
    
    # Get page source:
    driver = Firefox()
    driver.get(url)
    sleep(5)
    source = driver.page_source
    
    driver.close()
    driver.quit()

    return source


In [6]:
def scrap_member(member_card, root, section=None):
    """
    Get data about an WG member from the HTML element 
    (Beautifulsoup element Tag), pass it to a dict.
    """
    # Initialize data about member:
    data = dict()
    data['name']     = None
    data['org_name'] = None
    data['org_url']  = None 
    data['email']    = None
    data['github']   = None
    data['picture']  = None
    data['role']     = section
    
    # Get name:
    data['name'] = trim_whitespaces(member_card.find('h3').text)
    
    # Get organization name and URL:
    org = member_card.find('p')
    data['org_name'] = trim_whitespaces(org.text)
    org_link = org.find('a')
    if org_link != None:
        data['org_url']  = root + org_link.attrs['href']
    
    # Get email and github:
    infos = find_all_class(member_card, 'with-icon--before')
    for info in infos:
        link = info.attrs['href']
        if link.find('mailto') != -1:
            data['email'] = link.split(':')[1]
        elif link.find('github') != -1:
            data['github'] = link

    # Get picture URL:
    data['picture'] = root + find_class(member_card, 'avatar').find('img').attrs['src']

    return data

In [7]:
def get_section_id(section):
    """
    Look for 'id' attribute in a 'h2' header, and return it.
    """
    header = section.find('h2')
    if header == None: 
        return None
    return header.attrs['id']

In [8]:
def scrap_wg_members(url, root, driver):
    """
    Scrap data about W3C WG members from the group's page.
    NOTE: You must perform the login in the `driver` browser
    before calling this function, in order for all information
    to be available.
    
    Parameters
    ----------
    url : str
        Address of the page.
    root : str
        website root URL.
    driver : selenium WebDriver
        Browser used in the scraping. It must have been 
        initialized and logged in already.

    Returns
    -------
    members_info : list of dict
        Data about the members.
    """
    
    # Get page source:
    driver.get(url)
    src = driver.page_source
    
    # Get participants categories:
    #section_ids = ['chairs', 'staff', 'participants']
    soup = BeautifulSoup(src)
    category_sections = find_all_class(soup, 'component--text')
    section_ids = [get_section_id(c) for c in category_sections]
    assert section_ids[0] == None
    
    # Loop over categories of members:
    members_info = []
    for section, section_id in zip(category_sections[1:], section_ids[1:]):
        assert section.find('h2').attrs['id'] == section_id
        # Scrap members infos for section (member category):
        member_cards = find_all_class(section, 'card--user')
        # Add to WG members list:
        members_info += [scrap_member(m, root, section_id) for m in member_cards]

    return members_info

In [9]:
def remove_accents(string, i=0):
    """
    Input: string
    
    Returns the same string, but without all portuguese-valid accents.
    """
    
    # Missing values case:
    if type(string) == type(np.NaN):
        return string
    
    accent_list = [('Ç','C'),('Ã','A'),('Á','A'),('À','A'),('Â','A'),('É','E'),('Ê','E'),('Í','I'),('Õ','O'),('Ó','O'),
                   ('Ô','O'),('Ú','U'),('Ü','U'),('ç','c'),('ã','a'),('á','a'),('à','a'),('â','a'),('é','e'),('ê','e'),
                   ('í','i'),('õ','o'),('ó','o'),('ô','o'),('ú','u'),('ü','u'),('È','E'),('Ö','O'),('Ñ','N'),('è','e'),
                   ('ö','o'),('ñ','n'),('Ë','E'),('ë','e'),('Ä','A'),('ä','a')]
    if i >= len(accent_list):
        return string
    else:
        string = string.replace(*accent_list[i])
        return remove_accents(string, i + 1)


def text2tag(text):
    """
    Simplify `text` to use it as part os filenames and so on
    (lowercase it, remove accents and spaces).
    """

    # Remove duplicated spaces:
    text = ' '.join(text.split())
    # Transform to tag:
    tag  = re.sub('[\.,;!:\(\)/]', '', remove_accents(text).lower().replace(' ', '_'))
    return tag

In [10]:
def split_name_email(value):
    """
    Split name from email in the format:
        John Smith ( john@email.com )
    Returns the name and the email.
    """
    parts = [trim_whitespaces(s) for s in value.split(')')[0].split('(')]
    return parts[0], parts[1]

In [11]:
def scrap_org_info(url, root, driver):
    """
    Scrap W3C member organization from W3C website URL.
    You must be logged in to access the webpages.

    Parameters
    ----------
    url : str
        Address of the organization's page.
    root : str
        website root URL.
    driver : selenium WebDriver
        Browser used in the scraping. It must have been 
        initialized and logged in already.

    Returns
    -------
    org_data : dict
        Data about the organization.
    """
    
    # GET HTML:
    driver.get(url)
    src = driver.page_source
    soup = BeautifulSoup(src)
    
    # Initialize data:
    org_data = dict()
    # Org. name:
    org_data['name'] = trim_whitespaces(soup.find('h1').text)
    # Org logo:
    details = find_class(soup, 'details')
    img = details.find('img')
    if img == None:
        org_data['logo'] = None
    else:
        org_data['logo'] = root + img.attrs['src']
    # Testimonial:
    test_element = details.find(attrs={'id':'en-testimonial'})
    if test_element is None:
        org_data['testimonial'] = None
    else:
        org_data['testimonial'] = test_element.text
    # Other info (from table):
    grid = find_class(details, 'grid')
    labels = [text2tag(trim_whitespaces(r.text)) for r in grid.find_all('dt')]
    values = [trim_whitespaces(r.text) for r in grid.find_all('dd')]
    raw_details = dict(zip(labels, values))
    
    # Parse representative's name and email:
    for role in ['ac_rep', 'alternate_ac_rep']:
        if role in labels:
            if trim_whitespaces(raw_details[role]) == 'None':
                name, email = None, None
            else:
                name, email = split_name_email(raw_details[role])
            raw_details[role + '_email'] = email
            raw_details[role] = name
            
    # Combine details with name and logo:
    org_data.update(raw_details)

    return org_data

## Scraping list of working groups (WGs)

In [86]:
# GET W3C WG page:
url  = 'https://www.w3.org/groups/wg/'
root = 'https://www.w3.org'
response = requests.get(url)
soup = BeautifulSoup(response.text)

In [8]:
# Extract data:
group_cards = find_all_class(soup, 'card--group')
group_data = [scrap_group_card(g) for g in group_cards]
group_df = pd.DataFrame(group_data)

In [38]:
# Add identifier:
group_df['tag'] = group_df['url'].str.split('/').str.slice(-2,-1).str.join('')
assert group_df['tag'].nunique() == len(group_df)

In [34]:
# Save to file:
#group_df.to_csv('../dados/brutos/w3c_wg_2024-07-04.csv', index=False)

## Scraping members of WGs

In [None]:
# Load list of WGs:
root = 'https://www.w3.org'
group_df = pd.read_csv('../dados/brutos/w3c_wg_2024-07-04.csv')

# Hard-coded URL paths:
members_path = 'participants/'

# Start browser:
driver = Firefox()
driver.get(root)

It is necessary to authenticate in the browser above to scrap information.

In [76]:
# Loop over WGs:
for i in range(len(group_df)):
#for i in range(26, 27):

    # Get page URL of the WG:
    group = group_df.iloc[i]
    print(i, group['name'])
    url = group['url'] + members_path
    
    # Scrap members:
    members_info = scrap_wg_members(url, root, driver)
    # Add group ID:
    members_df = pd.DataFrame(members_info)
    members_df['group_tag'] = group['tag']

    # Save to file:
    members_df.to_csv('../dados/brutos/w3c-wg-members_{}_2024-07-04.csv'.format(group['tag']), index=False)
    
    sleep(5)

26 Service Workers Working Group


In [64]:
# Groups with no chairs:
bug_groups = ['https://www.w3.org/groups/wg/service-workers/participants/']

## Scraping data from organizations

In [21]:
# Load all organizations:
all_orgs_df = pd.read_csv('../data/raw/api/w3c_organizations_list_2024-07-06.csv')
all_org_list = list('https://www.w3.org/organizations/' + all_orgs_df['href'].str.split('/').str.slice(-1).str.join(''))

In [173]:
# Load organizations in WGs list:
filenames = glob('../dados/brutos/w3c-wg-members_*.csv')
raw_members_df = pd.concat([pd.read_csv(f) for f in filenames])
org_list = raw_members_df['org_url'].dropna().unique()

In [20]:
# Start browser:
root = 'https://www.w3.org'
driver = Firefox()
driver.get(root)

The geckodriver version (0.33.0) detected in PATH at /home/hxavier/system/bin/geckodriver might not be compatible with the detected firefox version (127.0.2); currently, geckodriver 0.34.0 is recommended for firefox 127.*, so it is advised to delete the driver in PATH and retry


It is necessary to authenticate in the browser above to scrap information.

In [23]:
# Over all organizations:
org_data = []
for i in range(len(all_org_list)):
    print(i, all_org_list[i])
    data = scrap_org_info(all_org_list[i], root, driver)
    data['url'] = all_org_list[i]
    org_data.append(data)

    sleep(2)

0 https://www.w3.org/organizations/1001


AttributeError: 'NoneType' object has no attribute 'find'

In [175]:
# Over WG organizations:
org_data = []
for i in range(len(org_list)):
    print(i, org_list[i])
    data = scrap_org_info(org_list[i], root, driver)
    data['url'] = org_list[i]
    org_data.append(data)

    sleep(2)

2 https://www.w3.org/organizations/1066/
3 https://www.w3.org/organizations/112585/
4 https://www.w3.org/organizations/62028/
5 https://www.w3.org/organizations/1202/
6 https://www.w3.org/organizations/79706/
7 https://www.w3.org/organizations/131353/
8 https://www.w3.org/organizations/116778/
9 https://www.w3.org/organizations/46877/
10 https://www.w3.org/organizations/133378/
11 https://www.w3.org/organizations/1057/
12 https://www.w3.org/organizations/1219/
13 https://www.w3.org/organizations/119171/
14 https://www.w3.org/organizations/140732/
15 https://www.w3.org/organizations/61566/
16 https://www.w3.org/organizations/48024/
17 https://www.w3.org/organizations/69906/
18 https://www.w3.org/organizations/143173/
19 https://www.w3.org/organizations/93515/
20 https://www.w3.org/organizations/43576/
21 https://www.w3.org/organizations/141535/
22 https://www.w3.org/organizations/1092/
23 https://www.w3.org/organizations/114837/
24 https://www.w3.org/organizations/116845/
25 https://www

In [178]:
org_df = pd.DataFrame(org_data)

In [180]:
# Save data:
#org_df.to_csv('../dados/brutos/w3c_organizations_2024-07-04.csv', index=False)