# Wikimedia Research - Translation Imbalances: Testing hypothesis #2

# Tests

#### Initial test with dumps

In [2]:
# import libraries here
import gzip  # necessary for decompressing dump file into text format
import pandas as pd
import numpy as np

In [18]:
# Every language on Wikipedia has its own page restrictions table
# you can find all the dbnames (e.g., enwiki) here: https://www.mediawiki.org/w/api.php?action=sitematrix
# for example, you could replace the LANGUAGE parameter of 'enwiki' with 'arwiki' to study Arabic Wikipedia
LANGUAGE = 'eswiki'
MONTH = '2023-03'
# e.g., enwiki -> en.wikipedia (this is necessary for the API section)
SITENAME = LANGUAGE.replace('wiki', '.wikipedia')
# directory on PAWS server that holds Wikimedia dumps
# DUMP_DIR = f"/public/dumps/public/other/clickstream/{MONTH}/"
DUMP_DIR = f"data/"
CLICKSTREAM_FN = f'clickstream-{LANGUAGE}-{MONTH}.tsv.gz'

In [19]:
# English dataset
df_es = pd.read_csv(DUMP_DIR + CLICKSTREAM_FN,  sep = '\t', names = ['source', 'destination', 'type', 'n'], 
                    dtype ={'type': 'category', 'n': 'uint32'}, usecols=['source', 'destination', 'n'], nrows = 4000000)

In [15]:
df_es.head()

Unnamed: 0,source,destination,n
0,Paula,Santa_Paula,17
1,other-search,Emirato_de_Trarza,17
2,Acteón,Palacio_Real_de_Caserta,11
3,other-search,Sam_Bass,88
4,Luigi_Vanvitelli,Palacio_Real_de_Caserta,16


#### Dutch Wikimedia - page articles multistream data

In [20]:
DUMP_DIR = f"data/"
FILENAME = 'nlwiki-20230601-pages-articles-multistream-index5.txt-p2069271p3569270'
CLICKSTREAM_FN = f'{FILENAME}.bz2'

In [27]:
# Dutch dataset
df_nl = pd.read_csv(DUMP_DIR + CLICKSTREAM_FN,  sep = '\t', nrows = 40000000)

In [24]:
df_nl.head()

Unnamed: 0,606:2069272:Wereldkampioenschap handbal mannen 2011
0,606:2069273:Jacob van Artois
1,606:2069276:Gonzales Coques
2,606:2069278:Zech von Burkensroda
3,606:2069280:Sulfoleen
4,606:2069281:Butadieensulfon


In [14]:
DUMP_DIR = f"data/"
FILENAME = 'zhwiki-20230601-pages-articles-multistream-index4.txt-p1389649p2889648'
CLICKSTREAM_FN = f'{FILENAME}.bz2'

In [16]:
df_zh = pd.read_csv(DUMP_DIR + CLICKSTREAM_FN, sep = '\t', nrows = 40000000)

In [17]:
df_zh.head()

Unnamed: 0,631:1389652:陸奧號戰艦
0,631:1389657:先天性四肢切斷症
1,631:1389658:Category:以倫敦為背景的電影
2,631:1389659:尹桐阳
3,631:1389660:子癇
4,631:1389716:邓高镜


### Notebook example

__NOTE__ : In order for this to work, we need to have access to a Tools account t access the file needed for the "read_default_file" property: https://wikitech.wikimedia.org/wiki/Help:Toolforge/Quickstart#Get_access

In [None]:
import pymysql

'conda install -c conda-forge pymysql'

In [9]:
def make_connection(wiki, replica_type="analytics"):
    """Connects to a host and database of the same name.
    
    `replica_type` can be either "analytics" (default), or "web"."""
    assert replica_type == "web" or replica_type == "analytics"
    return pymysql.connect(
        host=f"{wiki}.{replica_type}.db.svc.wikimedia.cloud",
        read_default_file=".my.cnf",
        database=f"{wiki}_p",
        charset='utf8'
    )

In [10]:
def query(conn, query, args):
    """Execute a SQL query against the connection, and return **all** the results."""
    with conn.cursor() as cur:
        cur.execute(query, args=args)
        data = cur.fetchall()
        return data

In [None]:
commons_conn = make_connection("commonswiki")

In [None]:
results = query(
    commons_conn,
    "SELECT page_title FROM page WHERE page_title LIKE %s LIMIT 10",
    "%Alicante%"
)

for result in results:
    print('*', str(result[0], encoding="utf-8"))
    
commons_conn.close()

## Data collection

__NOTE__: The MediaWiki API can be language specific. We can use it to contruct the relevant datasets of the users we need. The following will yield different results:
- https://es.wikipedia.org/w/api.php?action=query&list=users&ususers=Adamw&usprop=editcount
- https://de.wikipedia.org/w/api.php?action=query&list=users&ususers=Adamw&usprop=editcount
- https://www.mediawiki.org/w/api.php?action=query&list=users&ususers=Adamw&usprop=editcount&format=json

In [12]:
import requests

url = "https://www.mediawiki.org/w/api.php?action=query&list=users&ususers=Adamw&usprop=editcount&format=json"

# Making a GET request
response = requests.get(url)
# See status code
print(response.status_code)
# See request text
print(response.text)

response.json()


200
{"batchcomplete":"","query":{"users":[{"userid":398607,"name":"Adamw","editcount":1506}]}}


{'batchcomplete': '',
 'query': {'users': [{'userid': 398607, 'name': 'Adamw', 'editcount': 1506}]}}

In [7]:

import requests

url = "https://en.wikipedia.org/w/rest.php/v1/search/page?q=earth&limit=1"

# Making a GET request
response = requests.get(url)
# See URL
response.url
# See status code
response.status_code
# See request header
response.text




'{"pages":[{"id":9228,"key":"Earth","title":"Earth","excerpt":"<span class=\\"searchmatch\\">Earth</span> is the third planet from the Sun and the only place known in the universe where life has originated and found habitability. <span class=\\"searchmatch\\">Earth</span> is the only planet","matched_title":null,"description":"Third planet from the Sun","thumbnail":{"mimetype":"image/jpeg","width":60,"height":60,"duration":null,"url":"//upload.wikimedia.org/wikipedia/commons/thumb/c/cb/The_Blue_Marble_%28remastered%29.jpg/60px-The_Blue_Marble_%28remastered%29.jpg"}}]}'

## Data processing

## Data analysis