## Prepare

In [49]:
#Import PyMySQL library
import pymysql

In [50]:
def make_connection(wiki, replica_type="analytics"):
    """Open a PyMySQL connection to the replica database of one wiki.

    The host name is built as ``{wiki}.{replica_type}.db.svc.wikimedia.cloud``
    and the selected database is ``{wiki}_p``. The file ``.my.cnf`` is handed
    to PyMySQL as its option file.

    Args:
        wiki: Database name of the wiki, e.g. ``"eswiki"``.
        replica_type: Either ``"analytics"`` (default) or ``"web"``.

    Returns:
        The connection object returned by ``pymysql.connect``.

    Raises:
        AssertionError: If ``replica_type`` is neither ``"web"`` nor
            ``"analytics"``.
    """
    assert replica_type in ("web", "analytics")
    connection_settings = {
        "host": f"{wiki}.{replica_type}.db.svc.wikimedia.cloud",
        "read_default_file": ".my.cnf",
        "database": f"{wiki}_p",
        "charset": "utf8",
    }
    return pymysql.connect(**connection_settings)

In [51]:
def query(conn, query, params=None):
    """Execute a SQL query against the connection, and return **all** the results.

    Args:
        conn: An open connection whose ``cursor()`` result can be used as a
            context manager (as PyMySQL connections can).
        query: The SQL text to execute.
        params: Optional values for the placeholders in ``query``. When given,
            they are passed to ``cursor.execute`` so the driver does the
            binding/escaping instead of the caller formatting values into the
            SQL string. When omitted, ``query`` is executed exactly as written.

    Returns:
        Whatever ``cursor.fetchall()`` returns for the executed statement.
    """
    with conn.cursor() as cur:
        if params is None:
            # Keep the original single-argument call for plain queries.
            cur.execute(query)
        else:
            cur.execute(query, params)
        return cur.fetchall()

## Execute

In [52]:
# Open the connection for the "eswiki" database (default "analytics" replica type).
eswiki_conn = make_connection(wiki="eswiki")


In [54]:
# Fetch a small sample of page titles and print each result row.
# The connection is closed in `finally`, so it is released even when the
# query or the printing raises.
try:
    results = query(
        eswiki_conn,
        "SELECT page_title FROM page LIMIT 10")

    for result in results:
        print(result)
finally:
    eswiki_conn.close()

(b'!',)
(b'!!',)
(b'!!!',)
(b'!!!_(\xc3\xa1lbum)',)
(b'!Kung',)
(b'!Kung_Ekoka',)
(b'!Queridos_Camaradas!',)
(b'!Revoluci\xc3\xb3n_del_arte_de_mujeres',)
(b'!Women_Art_Revolution',)
(b'!_(desambiguaci\xc3\xb3n)',)


## Implementation

### Get the translation counts by language

In [3]:
import sys
!{sys.executable} -m pip install selenium

Collecting selenium
  Using cached selenium-4.11.2-py3-none-any.whl (7.2 MB)
Collecting trio-websocket~=0.9
  Using cached trio_websocket-0.10.3-py3-none-any.whl (17 kB)
Collecting trio~=0.17
  Using cached trio-0.22.2-py3-none-any.whl (400 kB)
Collecting outcome
  Using cached outcome-1.2.0-py2.py3-none-any.whl (9.7 kB)
Collecting exceptiongroup>=1.0.0rc9
  Using cached exceptiongroup-1.1.2-py3-none-any.whl (14 kB)
Collecting wsproto>=0.14
  Using cached wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0
  Using cached h11-0.14.0-py3-none-any.whl (58 kB)
Installing collected packages: outcome, h11, exceptiongroup, wsproto, trio, trio-websocket, selenium
Successfully installed exceptiongroup-1.1.2 h11-0.14.0 outcome-1.2.0 selenium-4.11.2 trio-0.22.2 trio-websocket-0.10.3 wsproto-1.2.0


In [126]:
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Configuration for the Chrome instance driven by Selenium.
chrome_options = Options()
# Headless mode is currently switched off, so a browser window is opened.
# Uncomment the next line to run without a visible window.
#chrome_options.add_argument("--headless")
# Turn off Chrome's use of /dev/shm.
chrome_options.add_argument("--disable-dev-shm-usage")
# Run Chrome without its sandbox.
chrome_options.add_argument("--no-sandbox")
# Request US English as the browser language.
chrome_options.add_argument("--lang=en-US")
# Start Chrome with the options configured above.
wb = webdriver.Chrome(options=chrome_options)


In [62]:
# Get page #1 and obtain the languages and their Wikipedia codes
wb.get("https://en.wikipedia.org/wiki/List_of_Wikipedias")
title = wb.title
assert title == "List of Wikipedias - Wikipedia"

# XPath selecting the table of Wikipedia editions. It is defined once so the
# two lookups below always target the same table.
wikipedias_table_xpath = "//table[contains(@class,'wikitable plainrowheaders sortable static-row-numbers static-row-header-hash jquery-tablesorter')]"

# The row headers hold the language names; the second data cell of each row
# holds the Wikipedia code.
language_cells = wb.find_elements(By.XPATH, wikipedias_table_xpath + "//tr/th[contains(@scope, 'row')]")
wp_code_cells = wb.find_elements(By.XPATH, wikipedias_table_xpath + "//tr/td[2]")

# Extract the text into separately named lists. The WebElement lists keep their
# own names, so this step can be re-run without fetching the page again.
wp_codes = [cell.text for cell in wp_code_cells]
languages = [cell.text for cell in language_cells]

# The two lists are paired by position later on, so they must line up.
assert len(languages) == len(wp_codes), (
    f"found {len(languages)} languages but {len(wp_codes)} Wikipedia codes"
)

In [83]:
# construct dictionary: one entry per Wikipedia code

# `wp_codes` and `languages` are paired by position, so a length mismatch would
# attach languages to the wrong codes; fail early instead.
assert len(wp_codes) == len(languages), (
    f"found {len(wp_codes)} Wikipedia codes but {len(languages)} languages"
)

# Each key gets its own fresh dict. `dict.fromkeys(wp_codes, {})` would make
# every key share the same dict object.
wikis_cx_info = {
    wp_code: {"language": lang, "wp_code": wp_code}
    for wp_code, lang in zip(wp_codes, languages)
}

wikis_cx_info

332
332


{'en': {'language': 'English', 'wp_code': 'en'},
 'ceb': {'language': 'Cebuano', 'wp_code': 'ceb'},
 'de': {'language': 'German', 'wp_code': 'de'},
 'sv': {'language': 'Swedish', 'wp_code': 'sv'},
 'fr': {'language': 'French', 'wp_code': 'fr'},
 'nl': {'language': 'Dutch', 'wp_code': 'nl'},
 'ru': {'language': 'Russian', 'wp_code': 'ru'},
 'es': {'language': 'Spanish', 'wp_code': 'es'},
 'it': {'language': 'Italian', 'wp_code': 'it'},
 'arz': {'language': 'Egyptian Arabic', 'wp_code': 'arz'},
 'pl': {'language': 'Polish', 'wp_code': 'pl'},
 'ja': {'language': 'Japanese', 'wp_code': 'ja'},
 'zh': {'language': 'Chinese', 'wp_code': 'zh'},
 'vi': {'language': 'Vietnamese', 'wp_code': 'vi'},
 'uk': {'language': 'Ukrainian', 'wp_code': 'uk'},
 'war': {'language': 'Waray', 'wp_code': 'war'},
 'ar': {'language': 'Arabic', 'wp_code': 'ar'},
 'pt': {'language': 'Portuguese', 'wp_code': 'pt'},
 'fa': {'language': 'Persian', 'wp_code': 'fa'},
 'ca': {'language': 'Catalan', 'wp_code': 'ca'},
 'sr'

In [127]:
# Get page #2 and obtain the translation counts

# NOTE: even though we are going to the specific page of the English Wikipedia, we will get
# the data that is available in all of them
wb.get("https://en.wikipedia.org/wiki/Special:ContentTranslationStats")
title = wb.title
assert title == "Content Translation statistics - Wikipedia"

# Maximum time, in seconds, to wait for the element to appear.
timeout = 50
try:
    # wait until the element is present
    element_present = EC.presence_of_element_located((By.ID, "cx-stats-publishedtab-0"))
    WebDriverWait(wb, timeout).until(element_present)
except TimeoutException:
    # Re-raise after reporting: continuing would leave the two result variables
    # below undefined (or stale from an earlier run).
    print("Timed out waiting for page to load")
    raise

# Only reached when the wait succeeded.
translations_to_data = wb.find_elements(By.XPATH, "//div[contains(@id, 'cx-stats-publishedtab-0')]")
translations_from_data = wb.find_elements(By.XPATH, "//div[@id='cx-stats-publishedtab-1']")




In [128]:
# Show the two WebElement lists collected from the statistics page, one per line.
print(translations_to_data, translations_from_data, sep="\n")

[<selenium.webdriver.remote.webelement.WebElement (session="7fbb8be06164b2f79f48206c4b9eed0a", element="0A9C622ADEF8414C946AC968B228B9C8_element_54")>]
[<selenium.webdriver.remote.webelement.WebElement (session="7fbb8be06164b2f79f48206c4b9eed0a", element="0A9C622ADEF8414C946AC968B228B9C8_element_6589")>]


In [107]:
# quit() ends the whole WebDriver session (every window and the driver
# process); close() would only close the current window.
wb.quit()