## Prepare

In [49]:
#Import PyMySQL library
import pymysql

In [50]:
def make_connection(wiki, replica_type="analytics"):
    """Open a PyMySQL connection to the replica database of `wiki`.

    The host name is built from `wiki` and `replica_type`, the selected
    database is `<wiki>_p`, and `.my.cnf` is handed to PyMySQL as its
    `read_default_file`.

    `replica_type` must be "analytics" (the default) or "web"; any other
    value fails the assertion before a connection is attempted.
    """
    assert replica_type in ("web", "analytics")
    connection_settings = {
        "host": f"{wiki}.{replica_type}.db.svc.wikimedia.cloud",
        "read_default_file": ".my.cnf",
        "database": f"{wiki}_p",
        "charset": "utf8",
    }
    return pymysql.connect(**connection_settings)

In [51]:
def query(conn, query, params=None):
    """Execute a SQL query against the connection, and return **all** the results.

    Parameters
    ----------
    conn
        Connection object exposing `cursor()` as a context manager.
    query : str
        SQL text to execute. When `params` is given it may contain the
        driver's placeholders (e.g. `%s`) instead of interpolated values.
    params : tuple, list or dict, optional
        Values forwarded to `cursor.execute` so the driver does the escaping.
        Defaults to None, which executes `query` as-is.

    Returns
    -------
    Whatever `cursor.fetchall()` returns for the executed statement.
    """
    with conn.cursor() as cur:
        # Passing the values separately (rather than formatting them into the
        # SQL string) lets callers avoid hand-built SQL.
        cur.execute(query, params)
        return cur.fetchall()

In [52]:
# Connect to the `eswiki` replica with the default replica type ("analytics").
eswiki_conn = make_connection(wiki="eswiki")


In [54]:
# Smoke test: fetch ten page titles and print each returned row.
# Each row is printed as it comes back from the driver (a tuple per row).
try:
    results = query(
        eswiki_conn,
        "SELECT page_title FROM page LIMIT 10")

    for result in results:
        print(result)
finally:
    # Release the connection even when the query or the printing raises,
    # so a failed run does not leave it open.
    eswiki_conn.close()

(b'!',)
(b'!!',)
(b'!!!',)
(b'!!!_(\xc3\xa1lbum)',)
(b'!Kung',)
(b'!Kung_Ekoka',)
(b'!Queridos_Camaradas!',)
(b'!Revoluci\xc3\xb3n_del_arte_de_mujeres',)
(b'!Women_Art_Revolution',)
(b'!_(desambiguaci\xc3\xb3n)',)


## Implementation

### Get Wikipedia languages and their Wikipedia codes

In [15]:
import sys
import json
!{sys.executable} -m pip install selenium



In [3]:
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configure the Chrome instance that Selenium drives.
chrome_options = Options()

# Headless mode is left switched off, so a browser window is shown while the
# pages load. To hide it, also pass "--headless" to add_argument.
for chrome_flag in (
    "--disable-dev-shm-usage",  # disable the Dev SHM mode
    "--no-sandbox",             # disable the sandbox mode
    "--lang=en-US",             # browser language
):
    chrome_options.add_argument(chrome_flag)

# Start Chrome with the options collected above.
wb = webdriver.Chrome(options=chrome_options)


In [5]:
# Page #1: load the "List of Wikipedias" article and make sure we landed on it.
wb.get("https://en.wikipedia.org/wiki/List_of_Wikipedias")
title = wb.title
assert title == "List of Wikipedias - Wikipedia"

# Both lookups target the same table(s), so the table selector is written once.
table_xpath = "//table[contains(@class,'wikitable plainrowheaders sortable static-row-numbers static-row-header-hash jquery-tablesorter')]"

# Row-header cells (<th scope="row">) are read as the language names and the
# second <td> of every row as the Wikipedia code.
language_cells = wb.find_elements(By.XPATH, table_xpath + "//tr/th[contains(@scope, 'row')]")
code_cells = wb.find_elements(By.XPATH, table_xpath + "//tr/td[2]")

# Keep only the visible text of each cell.
wp_codes = [cell.text for cell in code_cells]
languages = [cell.text for cell in language_cells]

In [16]:
# Construct the dictionary {wp_code: {"language": ..., "wp_code": ...}}.
#
# `languages` and `wp_codes` are scraped by two independent lookups and are
# paired by position, so a length mismatch would silently shift or drop
# entries. Fail loudly instead of writing misaligned data.
assert len(languages) == len(wp_codes), (
    f"scraped {len(languages)} languages but {len(wp_codes)} codes"
)

# One fresh inner dict per code (no shared placeholder object between keys).
wikis_cx_info = {
    wp_code: {"language": lang, "wp_code": wp_code}
    for lang, wp_code in zip(languages, wp_codes)
}

# save to json
with open("wikipedia_language_codes.json", "w") as outfile:
    json.dump(wikis_cx_info, outfile)

In [13]:
# Get page #2 and obtain the translation counts
from selenium.common.exceptions import TimeoutException

# NOTE: even though we are going to the specific page of the English Wikipedia, we will get
# the data that is available in all of them
wb.get("https://en.wikipedia.org/wiki/Special:ContentTranslationStats")
title = wb.title
assert title == "Content Translation statistics - Wikipedia"

# Maximum time, in seconds, to wait for the statistics container to appear.
timeout = 160

# Start from empty results so both names are always defined after this cell,
# even when the wait below times out (and never hold values from an earlier run).
translations_to_data = []
translations_from_data = []

try:
    # wait until the first statistics container is present in the DOM
    element_present = EC.presence_of_element_located((By.ID, "cx-stats-publishedtab-0"))
    WebDriverWait(wb, timeout).until(element_present)
except TimeoutException:
    print("Timed out waiting for page to load")
else:
    # Only reached when the wait succeeded: grab the two statistics containers.
    translations_to_data = wb.find_elements(By.XPATH, "//div[@id='cx-stats-publishedtab-0']")
    translations_from_data = wb.find_elements(By.XPATH, "//div[@id='cx-stats-publishedtab-1']")




Timed out waiting for page to load


In [107]:
# End the whole WebDriver session: quit() closes every window and shuts the
# driver down, whereas close() only closes the current window.
wb.quit()

### Get the MT availability details by language

In [None]:
# Here we would fetch the results obtained by another library/tool that I created