# Wikimedia Research - Translation Imbalances: Testing hypothesis #2

# 1. Tests

#### Initial test with dumps

In [None]:
# import libraries here
import gzip  # necessary for decompressing dump file into text format
import pandas as pd
import numpy as np

__NOTE__: The MediaWiki API can be language specific. We can use it to contruct the relevant datasets of the users we need. The following will yield different results:
- https://es.wikipedia.org/w/api.php?action=query&list=users&ususers=Adamw&usprop=editcount
- https://de.wikipedia.org/w/api.php?action=query&list=users&ususers=Adamw&usprop=editcount
- https://www.mediawiki.org/w/api.php?action=query&list=users&ususers=Adamw&usprop=editcount&format=json

In [1]:
import requests

url = "https://www.mediawiki.org/w/api.php?action=query&list=users&ususers=Adamw&usprop=editcount&format=json"

# Making a GET request
response = requests.get(url)
# See status code
print(response.status_code)
# See request text
print(response.text)

response.json()


200
{"batchcomplete":"","query":{"users":[{"userid":398607,"name":"Adamw","editcount":1506}]}}


{'batchcomplete': '',
 'query': {'users': [{'userid': 398607, 'name': 'Adamw', 'editcount': 1506}]}}

In [2]:
import requests

url = "https://en.wikipedia.org/w/rest.php/v1/search/page?q=earth&limit=1"

# Making a GET request
response = requests.get(url)
# See URL
response.url
# See status code
response.status_code
# See request header
response.text


'{"pages":[{"id":9228,"key":"Earth","title":"Earth","excerpt":"<span class=\\"searchmatch\\">Earth</span> is the third planet from the Sun and the only place known in the universe where life has originated and found habitability. <span class=\\"searchmatch\\">Earth</span> is the only planet","matched_title":null,"description":"Third planet from the Sun","thumbnail":{"mimetype":"image/jpeg","width":60,"height":60,"duration":null,"url":"//upload.wikimedia.org/wikipedia/commons/thumb/c/cb/The_Blue_Marble_%28remastered%29.jpg/60px-The_Blue_Marble_%28remastered%29.jpg"}}]}'

# 2. Implementation

## Data collection

In [203]:
# import general libraries
import csv
import ast
import re
import requests
import pandas as pd

In [204]:
## Collect users and languages
from csv import DictReader

# open file in read mode and convert to list of dictionaries
with open('data/user_languages_user_template.csv', encoding="utf8") as f:
    users = csv.DictReader(f)
    users_lat = []
    # TODO: AW: coordinating a list index is very fragile, a lot can go wrong and it's hard to detect mistakes.
    # initialize list to help us associate index and users for future merge
    user_index = {}

    # iterate over results to initialize copy and set new columns
    for i, row in enumerate(users):
    
        # append row to copy
        users_lat.append(row)

        # initialize fields
        row['edit_counts'] = []
        row['levels'] = []
        row['langs'] = []

        # update mapper
        username = row['username'].split(':')[1]
        row['username'] = username
        user_index[username] = i 
    
    
# out
print(users_lat[:10])    
len(user_index)   

[{'username': 'Olivier LPB', 'language': "['fr', 'en-2']", 'edit_counts': [], 'levels': [], 'langs': []}, {'username': 'Gamesmasterg9', 'language': "['en', 'hi-4', 'bn-3', 'mr-1']", 'edit_counts': [], 'levels': [], 'langs': []}, {'username': 'Dvermeirre', 'language': "['fr', 'en-5', 'de-1']", 'edit_counts': [], 'levels': [], 'langs': []}, {'username': 'Jklamo', 'language': "['en-3', 'cs', 'sk', 'fr-1']", 'edit_counts': [], 'levels': [], 'langs': []}, {'username': 'Calliopejen1', 'language': "['en', 'es-3', 'fr-1']", 'edit_counts': [], 'levels': [], 'langs': []}, {'username': 'Jpbrenna', 'language': "['en', 'el-2', 'la-2', 'es-2', 'ar-1']", 'edit_counts': [], 'levels': [], 'langs': []}, {'username': 'WarKosign', 'language': "['en-3', 'he-4', 'ru']", 'edit_counts': [], 'levels': [], 'langs': []}, {'username': 'Appaches', 'language': "['fr', 'en-3']", 'edit_counts': [], 'levels': [], 'langs': []}, {'username': 'IYY', 'language': "['en-N', 'he-3', 'ru-3']", 'edit_counts': [], 'levels': [],

469

In [205]:
# we are adding this list because the CSV files includes items that are not in it
allowed_languages = ['aa', 'ab', 'ace', 'ady', 'af', 'ak', 'als', 'alt', 'am', 'ami', 'an', 'ang', 'ar', 'arc', 'ary',
                     'arz', 'as', 'ast', 'atj', 'av', 'avk', 'awa', 'ay', 'az', 'azb', 'ba', 'ban', 'bar', 'bat-smg',
                     'bcl', 'be', 'be-tarask', 'bg', 'bh', 'bi', 'bjn', 'blk', 'bm', 'bn', 'bo', 'bpy', 'br', 'bs',
                     'bug', 'bxr', 'ca', 'cbk-zam', 'cdo', 'ce', 'ceb', 'ch', 'cho', 'chr', 'chy', 'ckb', 'co', 'cr',
                     'crh', 'cs', 'csb', 'cu', 'cv', 'cy', 'da', 'dag', 'de', 'din', 'diq', 'dsb', 'dty', 'dv', 'dz',
                     'ee', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu', 'ext', 'fa', 'ff', 'fi', 'fiu-vro', 'fj', 'fo', 
                     'gur', 'fr', 'frp', 'frr', 'fur', 'fy', 'ga', 'gag', 'gan', 'gcr', 'gd', 'gl', 'glk', 'gn', 'gom',
                     'gor', 'got', 'gu', 'guw', 'gv', 'ha', 'hak', 'haw', 'he', 'hi', 'hif', 'ho', 'hr', 'hsb', 'ht', 
                     'hu', 'hy', 'hyw', 'hz', 'ia', 'id', 'ie', 'ig', 'ii', 'ik', 'ilo', 'inh', 'io', 'is', 'it', 'iu',
                     'ja', 'jam', 'jbo', 'jv', 'ka', 'kaa', 'kab', 'kbd', 'kbp', 'kcg', 'kg', 'ki', 'kj', 'kk', 'kl', 
                     'km', 'kn', 'ko', 'koi', 'kr', 'krc', 'ks', 'ksh', 'ku', 'kv', 'kw', 'ky', 'la', 'lad', 'lb', 'lbe', 
                     'lez', 'lfn', 'lg', 'li', 'lij', 'lld', 'lmo', 'ln', 'lo', 'lrc', 'lt', 'ltg', 'lv', 'mad', 'mai', 
                     'map-bms', 'mdf', 'mg', 'mh', 'mhr', 'mi', 'min', 'mk', 'ml', 'mn', 'mni', 'mnw', 'mr', 'mrj', 'ms',
                     'mt', 'mus', 'mwl', 'my', 'myv', 'mzn', 'na', 'nah', 'nap', 'nds', 'nds-nl', 'ne', 'new', 'ng', 'nia',
                     'nl', 'nn', 'no', 'nov', 'nqo', 'nrm', 'nso', 'nv', 'ny', 'oc', 'olo', 'om', 'or', 'os', 'pa', 'pag',
                     'pam', 'pap', 'pcd', 'pcm', 'pdc', 'pfl', 'pi', 'pih', 'pl', 'pms', 'pnb', 'pnt', 'ps', 'pt', 'pwn',
                     'qu', 'rm', 'rmy', 'rn', 'ro', 'roa-rup', 'roa-tara', 'ru', 'rue', 'rw', 'sa', 'sah', 'sat', 'sc',
                     'scn', 'sco', 'sd', 'se', 'sg', 'sh', 'shi', 'shn', 'si', 'simple', 'sk', 'skr', 'sl', 'smn', 'sm', 
                     'sn', 'so', 'sq', 'sr', 'srn', 'ss', 'st', 'stq', 'su', 'sv', 'sw', 'szl', 'szy', 'ta', 'tay', 'tcy',
                     'te', 'tet', 'tg', 'th', 'ti', 'tk', 'tl', 'tn', 'to', 'tpi', 'tr', 'trv', 'ts', 'tt', 'tum', 'tw', 'ty',
                     'tyv', 'udm', 'ug', 'uk', 'ur', 'uz', 've', 'vec', 'vep', 'vi', 'vls', 'vo', 'wa', 'war', 'guc', 'wo', 'wuu',
                     'xal', 'xh', 'xmf', 'yi', 'yo', 'za', 'zea', 'zh', 'zh-classical', 'zh-min-nan', 'zh-yue', 'zu']

## Data processing

Some possibilities is to have a dataframe that has 
[username],[language],[level],[edit count] 

In [238]:
## Clean initial data and update rows and prepare data for requests
from collections import defaultdict

# dictionary neede to make request by language and associated user list
language_match_dict = defaultdict(list)
    
for row in users_lat:
    username = row['username']
    languages = ast.literal_eval(row['language'])
    langs = []
    levels = []
    edit_keys = {}
    
    for lang in languages:
        # TODO: AW: Push this into a separate function that can be tested.
        # split by digits with a preceding character ('-') and include it
        lang_extract = list(filter(None, re.split('-(\d)|-N', lang)))
        # get the language
        lang = lang_extract[0]
        # get language level if present, else, assume native level
        level = lang_extract[1] if len(lang_extract) > 1 else 5 
        
        if (lang in allowed_languages) and (lang not in langs):
            # append
            # AW: It seems fragile to pass the level through separately in
            # a parallel structure.  I believe pandas will accept a column with
            # List(tuple) values eg. [("hi", 4), ("en", 6)].  You can still
            # transform the data before graphing to simplify visualization code.
            levels.append(level)
            langs.append(lang)
            # initialize key
            # AW: or use a `set` which has the same performance and slightly
            # more obvious semantics.
            edit_keys[lang] = 0

            # add to the user to the corresponding language group
            language_match_dict[lang].append(username)
    
    # update row columns after cleaning
    row['langs'] = langs
    row['levels'] = levels
    # AW: should we unset(row['languages']) ?
    row['edit_counts'] = edit_keys

# out
# print(language_match_dict)
print(f'\n {users_lat[:10]}')


 [{'username': 'Olivier LPB', 'language': "['fr', 'en-2']", 'edit_counts': {'fr': 0, 'en': 0}, 'levels': [5, '2'], 'langs': ['fr', 'en']}, {'username': 'Gamesmasterg9', 'language': "['en', 'hi-4', 'bn-3', 'mr-1']", 'edit_counts': {'en': 0, 'hi': 0, 'bn': 0, 'mr': 0}, 'levels': [5, '4', '3', '1'], 'langs': ['en', 'hi', 'bn', 'mr']}, {'username': 'Dvermeirre', 'language': "['fr', 'en-5', 'de-1']", 'edit_counts': {'fr': 0, 'en': 0, 'de': 0}, 'levels': [5, '5', '1'], 'langs': ['fr', 'en', 'de']}, {'username': 'Jklamo', 'language': "['en-3', 'cs', 'sk', 'fr-1']", 'edit_counts': {'en': 0, 'cs': 0, 'sk': 0, 'fr': 0}, 'levels': ['3', 5, 5, '1'], 'langs': ['en', 'cs', 'sk', 'fr']}, {'username': 'Calliopejen1', 'language': "['en', 'es-3', 'fr-1']", 'edit_counts': {'en': 0, 'es': 0, 'fr': 0}, 'levels': [5, '3', '1'], 'langs': ['en', 'es', 'fr']}, {'username': 'Jpbrenna', 'language': "['en', 'el-2', 'la-2', 'es-2', 'ar-1']", 'edit_counts': {'en': 0, 'el': 0, 'la': 0, 'es': 0, 'ar': 0}, 'levels': 

In [239]:
user_index['Fredericknoronha']
users_lat[10]

{'username': 'Fredericknoronha',
 'language': "['en-4', 'gom-2', 'hi-2', 'mr-1', 'pt-1', 'fr-0', 'de-0']",
 'edit_counts': {'en': 0,
  'gom': 0,
  'hi': 0,
  'mr': 0,
  'pt': 0,
  'fr': 0,
  'de': 0},
 'levels': ['4', '2', '2', '1', '1', '0', '0'],
 'langs': ['en', 'gom', 'hi', 'mr', 'pt', 'fr', 'de']}

In [240]:
user_index['Anamdas']
users_lat[19]

{'username': 'Anamdas',
 'language': "['hi', 'pa', 'en-3', 'pa-1', 'sa-1']",
 'edit_counts': {'hi': 0, 'pa': 0, 'en': 0, 'sa': 0},
 'levels': [5, 5, '3', '1'],
 'langs': ['hi', 'pa', 'en', 'sa']}

In [241]:
language_match_dict['bn']

['Gamesmasterg9', 'Che12Guevara', 'Debashish', 'UserNumber', 'RockyMasum']

In [242]:
language_match_dict['mr']

['Gamesmasterg9', 'Fredericknoronha', 'Wiki.editAnshu']

In [243]:
## Per each language, make request to get the data and merge it with larger dictionary
# Reference: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Busers
import traceback
from time import sleep

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# general
user_edit_counts = {}
langs = list(language_match_dict.keys())

# set requests parameters to retry on 'Max retries exceeded with url' and have enough wait time between exceptions
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('https://', adapter)

# used to track call number
api_call_number = 0

# initialize
slice_start = 0
slice_end = 50
lang = langs.pop()
next_lang = lang
users_len = len(language_match_dict[lang])

while len(langs) > 0:
    try: 
        print(lang)
        lang = next_lang
        # work with uses batches that make request per 50 users within a user language group
        users_l = language_match_dict[lang][slice_start:slice_end]
        users =  "|".join(users_l)
        url = f'https://{lang}.wikipedia.org/w/api.php?action=query&list=users&ususers={users}&usprop=editcount&format=json'
        
        print('\n')
        print("BEFORE REQUEST")
        print(f'LANG: {lang}')
        print(f'SLICE START: {slice_start}')
        print(f'SLICE END: {slice_end}')
        print(f'SLICE SIZE: {users_len}')
                    
        # check to update slice values
        if slice_end >= users_len:
            print("checking...")
            next_lang = langs.pop()

            slice_start = 0
            slice_end = 50
            users_len = len(language_match_dict[lang])
        else:
            slice_start = slice_start + 50
            slice_end =  slice_end + 50   

        # GET request
        response = session.get(url)
        if response.status_code != 200:
            raise Exception(f"The call failed: {response.status_code}")

        else:
            # See request JSON
            res = response.json()['query']['users']
            print(res)

            # save results to dictionary to process later

            # AW: Although I like the encapsulation of each user's fetched
            # information under data/users/, it's more efficient to swap
            # containment levels and instead save a csv for all users' edit
            # counts on a language wiki, at the end of this function.  This
            # is a much smaller number of files, and matches the structure of
            # iteration in program logic which becomes helpful if eg. the
            # application crashes and must be restarted.
            user_edit_counts[lang] = res
            print(res)

            api_call_number += 1
            print(f'\n\nLIMIT: {len(users_l)} \n\nCALL NUMBER: {api_call_number}'
                  
    except Exception as e:
        print(e)
        traceback.print_exc()
        continue






LANG: bar
SLICE SIZE: 2
bar


BEFORE REQUEST
LANG: bar
SLICE START: 0
SLICE END: 50
SLICE SIZE: 2
checking...
[{'name': 'MaterialWorks', 'missing': ''}, {'name': 'Cestgeorge', 'missing': ''}]
[{'name': 'MaterialWorks', 'missing': ''}, {'name': 'Cestgeorge', 'missing': ''}]


LIMIT: 2 

CALL NUMBER: 1
bar


BEFORE REQUEST
LANG: ga
SLICE START: 0
SLICE END: 50
SLICE SIZE: 2
checking...
[{'userid': 30643, 'name': 'Ser!', 'editcount': 11}]
[{'userid': 30643, 'name': 'Ser!', 'editcount': 11}]


LIMIT: 1 

CALL NUMBER: 2
ga


BEFORE REQUEST
LANG: sw
SLICE START: 0
SLICE END: 50
SLICE SIZE: 1
checking...
[{'name': 'PoetishBookwormus', 'missing': ''}]
[{'name': 'PoetishBookwormus', 'missing': ''}]


LIMIT: 1 

CALL NUMBER: 3
sw


BEFORE REQUEST
LANG: be
SLICE START: 0
SLICE END: 50
SLICE SIZE: 1
checking...
[{'userid': 51607, 'name': 'Artem.G', 'editcount': 2}, {'userid': 33259, 'name': 'Janka1410', 'editcount': 114}]
[{'userid': 51607, 'name': 'Artem.G', 'editcount': 2}, {'userid': 33259,

[{'userid': 630315, 'name': 'Rosguill', 'editcount': 0}, {'userid': 11, 'name': 'DHN', 'editcount': 103820}]
[{'userid': 630315, 'name': 'Rosguill', 'editcount': 0}, {'userid': 11, 'name': 'DHN', 'editcount': 103820}]


LIMIT: 2 

CALL NUMBER: 31
vi


BEFORE REQUEST
LANG: yi
SLICE START: 0
SLICE END: 50
SLICE SIZE: 2
checking...
[{'userid': 30969, 'name': 'Rosguill', 'editcount': 0}, {'userid': 3184, 'name': 'Yihyetov', 'editcount': 0}, {'userid': 60, 'name': 'Firespeaker', 'editcount': 1}]
[{'userid': 30969, 'name': 'Rosguill', 'editcount': 0}, {'userid': 3184, 'name': 'Yihyetov', 'editcount': 0}, {'userid': 60, 'name': 'Firespeaker', 'editcount': 1}]


LIMIT: 3 

CALL NUMBER: 32
yi


BEFORE REQUEST
LANG: simple
SLICE START: 0
SLICE END: 50
SLICE SIZE: 3
checking...
[{'userid': 1280586, 'name': 'Sailoratlantis', 'editcount': 0}, {'userid': 705631, 'name': 'Piotr Bart', 'editcount': 16}, {'userid': 728129, 'name': 'NewManila2000', 'editcount': 92}]
[{'userid': 1280586, 'name': 'Sailora

[{'userid': 28385, 'name': 'Gdominik100', 'editcount': 3}, {'userid': 5444, 'name': 'Firespeaker', 'editcount': 0}]
[{'userid': 28385, 'name': 'Gdominik100', 'editcount': 3}, {'userid': 5444, 'name': 'Firespeaker', 'editcount': 0}]


LIMIT: 2 

CALL NUMBER: 47
mn


BEFORE REQUEST
LANG: nn
SLICE START: 0
SLICE END: 50
SLICE SIZE: 2
checking...
[{'userid': 3157, 'name': 'Lillingen', 'editcount': 2}, {'userid': 128, 'name': 'Egil', 'editcount': 59}]
[{'userid': 3157, 'name': 'Lillingen', 'editcount': 2}, {'userid': 128, 'name': 'Egil', 'editcount': 59}]


LIMIT: 2 

CALL NUMBER: 48
nn


BEFORE REQUEST
LANG: ace
SLICE START: 0
SLICE END: 50
SLICE SIZE: 2
checking...
[{'userid': 408, 'name': 'Naval Scene', 'editcount': 425}, {'userid': 16285, 'name': 'Ivan Humphrey', 'editcount': 5}]
[{'userid': 408, 'name': 'Naval Scene', 'editcount': 425}, {'userid': 16285, 'name': 'Ivan Humphrey', 'editcount': 5}]


LIMIT: 2 

CALL NUMBER: 49
ace


BEFORE REQUEST
LANG: su
SLICE START: 0
SLICE END: 50
SLI

[{'userid': 33378, 'name': 'IBayern', 'editcount': 0}, {'userid': 57661, 'name': 'Briantin de Montrei', 'editcount': 6}]
[{'userid': 33378, 'name': 'IBayern', 'editcount': 0}, {'userid': 57661, 'name': 'Briantin de Montrei', 'editcount': 6}]


LIMIT: 2 

CALL NUMBER: 64
sco


BEFORE REQUEST
LANG: zh-classical
SLICE START: 0
SLICE END: 50
SLICE SIZE: 2
checking...
[{'userid': 59236, 'name': 'IBayern', 'editcount': 2}]
[{'userid': 59236, 'name': 'IBayern', 'editcount': 2}]


LIMIT: 1 

CALL NUMBER: 65
zh-classical


BEFORE REQUEST
LANG: fi
SLICE START: 0
SLICE END: 50
SLICE SIZE: 1
checking...
[{'userid': 258605, 'name': 'Sjomadhr', 'editcount': 12}, {'userid': 25352, 'name': 'Andsam', 'editcount': 118}, {'userid': 281180, 'name': 'Shadess', 'editcount': 118}, {'userid': 510234, 'name': 'Bsskchaitanya', 'editcount': 0}, {'userid': 323993, 'name': 'Nihonfreak', 'editcount': 0}, {'userid': 482537, 'name': 'CarrotPieFI', 'editcount': 52}]
[{'userid': 258605, 'name': 'Sjomadhr', 'editcount':

[{'userid': 4328, 'name': 'Guillermo2149', 'editcount': 619}]
[{'userid': 4328, 'name': 'Guillermo2149', 'editcount': 619}]


LIMIT: 1 

CALL NUMBER: 73
gn


BEFORE REQUEST
LANG: haw
SLICE START: 0
SLICE END: 50
SLICE SIZE: 1
checking...
[{'name': '丘明利', 'missing': ''}]
[{'name': '丘明利', 'missing': ''}]


LIMIT: 1 

CALL NUMBER: 74
haw


BEFORE REQUEST
LANG: hak
SLICE START: 0
SLICE END: 50
SLICE SIZE: 1
checking...
[{'userid': 10418, 'name': '丘明利', 'editcount': 4}]
[{'userid': 10418, 'name': '丘明利', 'editcount': 4}]


LIMIT: 1 

CALL NUMBER: 75
hak


BEFORE REQUEST
LANG: th
SLICE START: 0
SLICE END: 50
SLICE SIZE: 1
checking...
[{'userid': 210384, 'name': '丘明利', 'editcount': 2}, {'userid': 240425, 'name': 'Phokhamvone', 'editcount': 0}, {'userid': 263740, 'name': 'Tris T7', 'editcount': 11721}, {'userid': 353892, 'name': 'PointlessUsername', 'editcount': 207}, {'userid': 420542, 'name': 'Haidit', 'editcount': 14}, {'userid': 436139, 'name': 'Bujsaran', 'editcount': 203}]
[{'userid': 210

[{'userid': 37409, 'name': 'Guy of india', 'editcount': 1108}, {'userid': 66230, 'name': '丘明利', 'editcount': 0}, {'userid': 17123, 'name': '12afser12', 'editcount': 13}, {'userid': 80080, 'name': 'Jose Mathew C', 'editcount': 0}, {'userid': 29428, 'name': 'Balajijagadesh', 'editcount': 6545}, {'userid': 99318, 'name': 'Bsskchaitanya', 'editcount': 2}, {'userid': 173656, 'name': 'Ajeeshkumar4u', 'editcount': 5}]
[{'userid': 37409, 'name': 'Guy of india', 'editcount': 1108}, {'userid': 66230, 'name': '丘明利', 'editcount': 0}, {'userid': 17123, 'name': '12afser12', 'editcount': 13}, {'userid': 80080, 'name': 'Jose Mathew C', 'editcount': 0}, {'userid': 29428, 'name': 'Balajijagadesh', 'editcount': 6545}, {'userid': 99318, 'name': 'Bsskchaitanya', 'editcount': 2}, {'userid': 173656, 'name': 'Ajeeshkumar4u', 'editcount': 5}]


LIMIT: 7 

CALL NUMBER: 85
ta


BEFORE REQUEST
LANG: xh
SLICE START: 0
SLICE END: 50
SLICE SIZE: 7
checking...
[{'userid': 123, 'name': 'Greenman', 'editcount': 108}]
[

[{'userid': 840697, 'name': 'Roriromrack', 'editcount': 2}, {'userid': 1004065, 'name': '永続繁栄', 'editcount': 663}, {'userid': 1881, 'name': 'Miya', 'editcount': 33045}, {'userid': 779132, 'name': 'I JethroBT', 'editcount': 1}, {'userid': 804227, 'name': 'Lifeinfluxus', 'editcount': 95}, {'userid': 825262, 'name': '丘明利', 'editcount': 4}, {'userid': 1091862, 'name': 'IBayern', 'editcount': 8}, {'userid': 1082022, 'name': 'Miyika', 'editcount': 13}, {'userid': 1036472, 'name': 'ツバル', 'editcount': 1917}, {'userid': 483946, 'name': 'Mr. Stradivarius', 'editcount': 44}, {'userid': 1578, 'name': 'Sekicho', 'editcount': 178}, {'name': 'Thureinminnoo', 'missing': ''}, {'userid': 1042383, 'name': 'Hylblog', 'editcount': 25}, {'userid': 434417, 'name': 'Gunkarta', 'editcount': 43}, {'userid': 1281270, 'name': 'Mcampany', 'editcount': 66}, {'userid': 628293, 'name': 'Saung Tadashi', 'editcount': 4}, {'userid': 293269, 'name': 'Yasunorihayashi', 'editcount': 89}, {'userid': 897738, 'name': 'VulcanS

[{'userid': 134630, 'name': 'Ziko', 'editcount': 2174}, {'userid': 326438, 'name': 'Pensées de Pascal', 'editcount': 58}, {'userid': 277754, 'name': 'Greenman', 'editcount': 55}, {'userid': 536212, 'name': 'AndrewTheLott', 'editcount': 30}, {'userid': 671602, 'name': 'XPanettaa', 'editcount': 503}, {'userid': 323478, 'name': 'Keizers', 'editcount': 325}, {'userid': 8575, 'name': 'Effeietsanders', 'editcount': 35424}, {'userid': 641397, 'name': 'VulcanSphere', 'editcount': 328}, {'userid': 105062, 'name': 'Peter Isotalo', 'editcount': 26}, {'userid': 177945, 'name': 'NaidNdeso', 'editcount': 9}, {'userid': 792931, 'name': 'Pdekyvere', 'editcount': 161}, {'userid': 953655, 'name': 'Matbla1', 'editcount': 10}, {'userid': 528055, 'name': 'Tcr25', 'editcount': 4}, {'userid': 816920, 'name': 'Kreb', 'editcount': 3}, {'userid': 377852, 'name': 'Neumannk', 'editcount': 0}, {'userid': 887390, 'name': 'Dousaer', 'editcount': 61}, {'userid': 802847, 'name': 'Balonlon', 'editcount': 1}, {'userid':

[{'userid': 1302167, 'name': 'WarKosign', 'editcount': 0}, {'userid': 1570871, 'name': 'IYY', 'editcount': 0}, {'userid': 67470, 'name': 'Дмитрий Кошелев', 'editcount': 24126}, {'userid': 765013, 'name': 'Oleg Bor', 'editcount': 39404}, {'userid': 1546309, 'name': 'Kges1901', 'editcount': 102}, {'userid': 711284, 'name': 'Marko Sarajevo', 'editcount': 3}, {'userid': 10205, 'name': 'Л.П. Джепко', 'editcount': 25371}, {'userid': 1910370, 'name': 'IBayern', 'editcount': 1}, {'userid': 531608, 'name': 'Keizers', 'editcount': 88}, {'userid': 1990613, 'name': 'OriginalOldMan', 'editcount': 0}, {'userid': 989598, 'name': 'Gdominik100', 'editcount': 2}, {'userid': 980465, 'name': 'Nicolas Perrault III', 'editcount': 0}, {'userid': 1040556, 'name': 'Kaliforniyka', 'editcount': 507}, {'userid': 139606, 'name': 'Smihael', 'editcount': 16}, {'userid': 368601, 'name': 'GregZak', 'editcount': 7187}, {'userid': 2262190, 'name': 'ჯეო', 'editcount': 188}, {'userid': 50681, 'name': 'BokicaK', 'editcount

[{'userid': 1012387, 'name': 'Jpbrenna', 'editcount': 0}, {'userid': 657050, 'name': 'Keilana', 'editcount': 2}, {'userid': 358485, 'name': 'AhmedPS', 'editcount': 170}, {'userid': 5139, 'name': 'Fjmustak', 'editcount': 4401}, {'userid': 967492, 'name': '丘明利', 'editcount': 0}, {'userid': 595174, 'name': 'Zak-mieleur', 'editcount': 4}, {'userid': 1192549, 'name': 'Miyika', 'editcount': 6}, {'userid': 143096, 'name': 'Aboluay', 'editcount': 21260}, {'userid': 367136, 'name': 'Greyshark09', 'editcount': 18}, {'userid': 287008, 'name': 'Iranianson', 'editcount': 16}, {'userid': 197106, 'name': 'Ibrahim.ID', 'editcount': 39345}, {'userid': 715124, 'name': 'FæɹHaad', 'editcount': 70}, {'userid': 1345703, 'name': 'أنور', 'editcount': 3754}, {'userid': 250011, 'name': 'Braindot4', 'editcount': 846}, {'userid': 789339, 'name': 'Flycatchr', 'editcount': 53}, {'userid': 2622, 'name': 'MatthewS.', 'editcount': 773}, {'userid': 1497265, 'name': 'Nehme1499', 'editcount': 449}, {'userid': 237660, 'na

[{'userid': 579, 'name': 'Jklamo', 'editcount': 37}, {'userid': 93621, 'name': 'Pensées de Pascal', 'editcount': 0}, {'userid': 6572, 'name': 'Brankom', 'editcount': 300}]
[{'userid': 579, 'name': 'Jklamo', 'editcount': 37}, {'userid': 93621, 'name': 'Pensées de Pascal', 'editcount': 0}, {'userid': 6572, 'name': 'Brankom', 'editcount': 300}]


LIMIT: 3 

CALL NUMBER: 112
sk


BEFORE REQUEST
LANG: sk
SLICE START: 50
SLICE END: 100
SLICE SIZE: 120
[]
[]


LIMIT: 0 

CALL NUMBER: 113
sk


BEFORE REQUEST
LANG: sk
SLICE START: 100
SLICE END: 150
SLICE SIZE: 120
checking...
[]
[]


LIMIT: 0 

CALL NUMBER: 114
sk


BEFORE REQUEST
LANG: cs
SLICE START: 0
SLICE END: 50
SLICE SIZE: 3
checking...
[{'userid': 946, 'name': 'Jklamo', 'editcount': 12323}, {'userid': 28156, 'name': 'Glomerata', 'editcount': 443}, {'userid': 302945, 'name': 'Brankom', 'editcount': 4}, {'userid': 402494, 'name': 'Misha Wolf', 'editcount': 0}]
[{'userid': 946, 'name': 'Jklamo', 'editcount': 12323}, {'userid': 28156, 'nam

[{'userid': 194061, 'name': 'Gamesmasterg9', 'editcount': 3}, {'userid': 234373, 'name': 'Fredericknoronha', 'editcount': 0}, {'userid': 92838, 'name': 'Anamdas', 'editcount': 26163}, {'userid': 224112, 'name': 'Che12Guevara', 'editcount': 2}, {'userid': 130, 'name': 'Debashish', 'editcount': 1513}, {'userid': 233706, 'name': 'UserNumber', 'editcount': 5}, {'userid': 267983, 'name': 'Hammad', 'editcount': 397}, {'userid': 56364, 'name': 'Balajijagadesh', 'editcount': 27}, {'userid': 269989, 'name': 'Bsskchaitanya', 'editcount': 0}, {'userid': 2, 'name': 'Yann', 'editcount': 557}, {'userid': 580288, 'name': 'Ajeeshkumar4u', 'editcount': 4}, {'userid': 302819, 'name': 'Wiki.editAnshu', 'editcount': 5}, {'userid': 509601, 'name': 'AbhigyaDahal', 'editcount': 0}, {'userid': 697892, 'name': 'TheManishPanwar', 'editcount': 148}, {'userid': 391028, 'name': 'SandeepKumarMeena', 'editcount': 274}]
[{'userid': 194061, 'name': 'Gamesmasterg9', 'editcount': 3}, {'userid': 234373, 'name': 'Frederic

In [None]:
# TODO:  AW: Very cool!  Also consider pushing this into a regular python module and including dummy data as a test fixture.


# {"batchcomplete":"","query":{"users":[{"userid":398607,"name":"Adamw","editcount":1506}]}}
# NOTE: run it if you can run the previous two lines OR if you want to test dummy data 

# dummy data to use when there is not internet connection
user_edit_counts_temp = {
    'nl': [{"userid":198607,"name":"Ahn-nath","editcount":15}],
    'es': [{"userid":298607,"name":"Adamw","editcount":156}, 
           {"userid":198607,"name":"Ahn-nath","editcount":160},
           {"userid":298607,"name":"Galahad","editcount":4058}
          ], 
           
    'en': [{"userid":298607,"name":"Adamw","editcount":156}, 
           {"userid":198607,"name":"Ahn-nath","editcount":150},
           {"userid":398607,"name":"Galahad","editcount":4056},
           {"userid":498607,"name":"S9H","editcount":4056}
          ],
    'de': [{"userid":298607,"name":"Adamw","editcount":3506},
          {"userid":498607,"name":"S9H","editcount":24056}]
}

user_edit_counts = user_edit_counts_temp

In [248]:
## Process edit counts
for lang, lang_list in user_edit_counts.items(): 
    print(lang)
   
    for user_group in lang_list:
        # print("group:", user_group)
        
        if user_group.get('editcount', None):
            # find the user list based on the mapped username
            name = user_group['name']
            # AW: It's possible to eliminate this "index" and simplify.
            index = user_index[name]
            user_row = users_lat[index]

            # update the edits counts of the language
            user_row['edit_counts'][lang] = user_group.get('editcount', 0)

# out 
users_lat

bar
ga
sw
be
gan
tt
uz
kk
ky
wuu
ne
lv
gd
hy
gu
is
kn
te
frp
oc
cy
eu
lad
vls
bjn
bug
pnb
ilo
am
ckb
vi
yi
simple
vec
sh
bs
lo
as
ur
ka
uk
tr
pam
mt
bg
my
mn
nn
ace
su
jv
min
hr
ro
az
sl
ht
hu
ku
glk
tg
mzn
pl
sco
zh-classical
fi
arz
ary
si
tl
zh-yue
it
gn
haw
hak
th
id
ms
ca
gl
sr
als
crh
fa
ta
xh
af
mk
sq
nds
ml
sv
no
da
ja
zh
sa
pa
nl
eo
lt
ko
gom
pt
ru
he
ar
la
el
es
sk
cs
de
mr
bn










<<<<<<<<<<<ESPECIAL CASE
[{'userid': 101863, 'name': 'Gamesmasterg9', 'editcount': 1}, {'userid': 79353, 'name': 'Che12Guevara', 'editcount': 1472}, {'userid': 9091, 'name': 'Debashish', 'editcount': 1}, {'userid': 118025, 'name': 'UserNumber', 'editcount': 370}, {'userid': 49124, 'name': 'RockyMasum', 'editcount': 14006}]
hi
en


[{'username': 'Olivier LPB',
  'language': "['fr', 'en-2']",
  'edit_counts': {'fr': 0, 'en': 323},
  'levels': [5, '2'],
  'langs': ['fr', 'en']},
 {'username': 'Gamesmasterg9',
  'language': "['en', 'hi-4', 'bn-3', 'mr-1']",
  'edit_counts': {'en': 2753, 'hi': 3, 'bn': 1, 'mr': 0},
  'levels': [5, '4', '3', '1'],
  'langs': ['en', 'hi', 'bn', 'mr']},
 {'username': 'Dvermeirre',
  'language': "['fr', 'en-5', 'de-1']",
  'edit_counts': {'fr': 0, 'en': 550, 'de': 2},
  'levels': [5, '5', '1'],
  'langs': ['fr', 'en', 'de']},
 {'username': 'Jklamo',
  'language': "['en-3', 'cs', 'sk', 'fr-1']",
  'edit_counts': {'en': 12165, 'cs': 12323, 'sk': 0, 'fr': 0},
  'levels': ['3', 5, 5, '1'],
  'langs': ['en', 'cs', 'sk', 'fr']},
 {'username': 'Calliopejen1',
  'language': "['en', 'es-3', 'fr-1']",
  'edit_counts': {'en': 132858, 'es': 216, 'fr': 0},
  'levels': [5, '3', '1'],
  'langs': ['en', 'es', 'fr']},
 {'username': 'Jpbrenna',
  'language': "['en', 'el-2', 'la-2', 'es-2', 'ar-1']",
  'ed

In [252]:
## Create dataframe

# create series for each category and extend them with each objectb
user_list = []
langs = []
levels = []
edit_counts = []
 

# AW: I think pandas will accept a list of dictionaries: pd.DataFrame(users_lat)
for group_user in users_lat:
    # extend each list with each user's items
    general_len = len(group_user["langs"])
    user_list.extend([group_user['username']] * general_len)
    langs.extend(group_user['langs'])
    levels.extend(group_user['levels'])
    edit_counts.extend(group_user['edit_counts'].values())

    
# out
display(user_list[:10])
display(langs[:10])
display(levels[:10])
display(edit_counts[:10])

['Olivier LPB',
 'Olivier LPB',
 'Gamesmasterg9',
 'Gamesmasterg9',
 'Gamesmasterg9',
 'Gamesmasterg9',
 'Dvermeirre',
 'Dvermeirre',
 'Dvermeirre',
 'Jklamo']

['fr', 'en', 'en', 'hi', 'bn', 'mr', 'fr', 'en', 'de', 'en']

[5, '2', 5, '4', '3', '1', 5, '5', '1', '3']

[0, 323, 2753, 3, 1, 0, 0, 550, 2, 12165]

In [250]:
## Connect everything
  
# dictionary of lists 
dicti = {'username': user_list,'language': langs, 'level': levels,'edit_count':edit_counts} 

# AW: I get an error here which reinforces the idea that parallel lists are fragile:
#   ValueError: All arrays must be of the same length
df_users = pd.DataFrame(dicti)
df_users['level'] = pd.to_numeric(df_users['level'])

    
df_users

ValueError: Unable to parse string "L" at position 535

## Data analysis

### Questions we want to solve:
1. Do users contribute the most to the Wikipedia editions in languages they claim to be native in?
2. Do users contribute the most to the Wikipedia editions in languages they have a "sufficient" proficiency in? (3-4 levels)
3. What other questions can be asked and answered with this data?

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
df_users.groupby('username')['edit_count'].sum()

In [None]:
# selecting rows based on condition
rslt_df = df_users
rslt_df['edit count percentage'] = (rslt_df.loc[:, 'edit_count'] /  
                                    df_users.groupby('username')['edit_count'].transform('sum') * 100).round(2)

# levels to filter by
rslt_df1 = rslt_df[rslt_df['level'] == 5]
rslt_df2 = rslt_df[rslt_df['level'] == 4]

print(f'\nDisplaying table showing relationship between user language level and the edit count percentage by native level')
display(rslt_df1)

print(f'\nDisplaying table showing relationship between user language level and the edit count percentage by advanced level')
display(rslt_df2)

In [None]:
plt.scatter(df_users['edit_count'], df_users['level'], color = '#88c999')

plt.title("Relationship between the user language level and their edit count in that language")
plt.xlabel("edit count")
plt.ylabel("language level")
plt.show()

__Observations:__ []

In [None]:
## Display pie

native_high_count_group = df_users.loc[(df_users['level'] == 5) & (df_users['edit count percentage'] > 60.0)]
proficient_high_count_group =  df_users.loc[(df_users['level'] == 4) & (df_users['edit count percentage'] > 60.0)]
sufficient_high_count_group =  df_users.loc[(df_users['level'] == 3) & (df_users['edit count percentage'] > 60.0)]
low_high_count_group =  df_users.loc[(df_users['level'] < 3) & (df_users['edit count percentage'] > 60.0)]

# get counts
counts_list = [native_high_count_group, proficient_high_count_group, sufficient_high_count_group, low_high_count_group]
counts_list_language_edit_number = [len(c) for c in counts_list]

# set labels
summary_labels = ["5th level --> mother Wiki", 
                  " 4th level --> mother Wiki", 
                  "3rd level --> mother Wiki", 
                  "under 3rd level --> mother Wiki"]

plt.title("Grouping by language proficiency and the percentage represented by edits in these levels")
plt.pie(counts_list_language_edit_number)
plt.legend( summary_labels, loc='upper right',)
plt.show() 


__Observations:__ []

#### TODO:
#### classify based on absolute values for expert, beginner and medium. Use a bar chart

In [None]:
## Create a new category based on absolute numbers of the edit_count column
df_users['category'] = df_users.apply(lambda row: "beginner" if (row.edit_count < 1)
                                      else "intermediate" if (row.edit_count < 1006) 
                                      else "expert", axis = 1)

# out
df_users

In [None]:
x = df_users.groupby(['level'], as_index=False)['edit_count'].sum()

fig = plt.figure(figsize = (10, 5))
plt.bar(x["level"], x["edit_count"])

plt.title("Total edit count by language proficiency level")
plt.xlabel("edit count")
plt.ylabel("language level")

In [None]:
## Show distribution by specific language level
levels = a = df_users['level'].unique()

for level in levels:
    df = df_users.loc[(df_users['level'] == level)]
    data = df['category']
    values = data.value_counts().values.tolist()
    labels = data.value_counts().index.tolist()

    plt.title(f'Editor level distribution by speaker of a level proficiency equal to {level}')
    plt.pie(values, labels=labels, autopct='%.0f%%')
    plt.show()