In [1]:
!pip install --upgrade pip
!pip install pandas seaborn 

Requirement already up-to-date: pip in /home/staeiou/conda/lib/python3.5/site-packages


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import glob
import pickle
import numpy as np
%matplotlib inline

In [3]:
import datetime

In [4]:
start = datetime.datetime.now()

## Load data

In [5]:
!ls ../datasets/*.pickle

../datasets/df_all_2016.pickle
../datasets/df_all_comments_parsed_2016.pickle
../datasets/possible_botfights.pickle


In [6]:
with open("../datasets/df_all_2016.pickle", "rb") as f:
    df_all = pickle.load(f)

### Initial data format

In [7]:
df_all[0:2].transpose()

Unnamed: 0,0,1
archived,False,False
language,zh,zh
page_namespace,0,0
rev_deleted,False,False
rev_id,10764032,16265305
rev_minor_edit,True,True
rev_page,796015,796015
rev_parent_id,1.0169e+07,1.62328e+07
rev_revert_offset,1,1
rev_sha1,r64h9ccp8u81sv76rchiqlopzvbjz7y,e19g7y6aw3j2k4yw3k6q0o3vn3f69i7


# Comments analysis

### Comment parsing functions

There are two functions that are used to parse comments. `comment_categorization()` runs first and applies a series of pattern matching to comments. If a match is not found, then `interwiki_confirm()` is called, which checks for languages codes in certain patterns that indicate interwiki links.

In [8]:
def comment_categorization(row):
    """
    Takes a row from a pandas dataframe or dict and returns a string with a
    kind of activity based on metadata. Used with df.apply(). Mostly parses
    comments, but makes some use of usernames too.
    """
    
    reverting_user = str(row['reverting_user_text'])
    
    reverted_user = str(row['rev_user_text'])
    
    langcode = str(row['language'])
    
    if reverting_user.find("HBC AIV") >= 0:
        return 'AIV helperbot'
    
    try:
        comment = str(row['reverting_comment'])
    except Exception as e:
        return 'other'
    
    comment_lower = comment.lower().strip()
    comment_lower = " ".join(comment_lower.split())
 
    if comment == 'nan':
        return "deleted revision"
    
    if reverting_user == 'Cyberbot II' and reverted_user == 'AnomieBOT' and comment.find("tagging/redirecting to OCC") >= 0:
        return 'botfight: Cyberbot II vs AnomieBOT date tagging'
        
    if reverting_user == 'AnomieBOT' and reverted_user == 'Cyberbot II' and comment.find("{{Deadlink}}") >= 0:
        return 'botfight: Cyberbot II vs AnomieBOT date tagging'                

    if reverting_user == 'RussBot' and reverted_user == 'Cydebot':
        return 'botfight: Russbot vs Cydebot category renaming'  

    if reverting_user == 'Cydebot' and reverted_user == 'RussBot':
        return 'botfight: Russbot vs Cydebot category renaming'  
    
    elif comment.find("Undoing massive unnecessary addition of infoboxneeded by a (now blocked) bot") >= 0:
        return "botfight: infoboxneeded"
    
    elif comment_lower.find("commonsdelinker") >=0 and reverting_user.find("CommonsDelinker") == -1:
        return "botfight: reverting CommonsDelinker"
        
    elif comment.find("Reverted edits by [[Special:Contributions/ImageRemovalBot") >= 0:
        return "botfight: 718bot vs ImageRemovalBot"
    
    elif comment_lower.find("double redirect") >= 0:
        return "fixing double redirect"
    
    elif comment_lower.find("double-redirect") >= 0:
        return "fixing double redirect"

    elif comment_lower.find("has been moved; it now redirects to") >= 0:
        return "fixing double redirect"
    
    elif comment_lower.find("correction du redirect") >= 0:
        return "fixing double redirect"   
        
    elif comment_lower.find("redirect tagging") >= 0:
        return "redirect tagging/sorting"
    
    elif comment_lower.find("sorting redirect") >= 0:
        return "redirect tagging/sorting"
    
    elif comment_lower.find("redirecciones") >= 0 and comment_lower.find("categoría") >= 0:
        return "category redirect cleanup"    
    
    elif comment_lower.find("change redirected category") >= 0:
        return "category redirect cleanup"
    
    elif comment_lower.find("redirected category") >=0:
        return "category redirect cleanup"
    
    elif comment.find("[[User:Addbot|Bot:]] Adding ") >= 0:
        return "template tagging"
    
    elif comment_lower.find("interwiki") >= 0:
        return "interwiki link cleanup -- method1"
    
    elif comment_lower.find("langlinks") >= 0:
        return "interwiki link cleanup -- method1"
    
    elif comment_lower.find("iw-link") >= 0:
        return "interwiki link cleanup -- method1"
    
    elif comment_lower.find("changing category") >= 0:
        return "moving category"
    
    elif comment_lower.find("recat per") >= 0:
        return "moving category"
    
    elif comment_lower.find("moving category") >= 0:
        return "moving category"

    elif comment_lower.find("move category") >= 0:
        return "moving category"
    
    elif comment_lower.find("re-categorisation") >= 0:
        return "moving category"
    
    elif comment_lower.find("recatégorisation") >= 0:
        return "moving category"   
    
    elif comment_lower.find("Updating users status to") >= 0:
        return "user online status update"
    
    elif comment_lower.find("{{Copy to Wikimedia Commons}} either because the file") >= 0:
        return "template cleanup"
        
    elif comment_lower.find("removing a protection template") >= 0:
        return "protection template cleanup"
    
    elif comment_lower.find("removing categorization template") >= 0:
        return "template cleanup"    
    
    elif comment_lower.find("rm ibid template per") >= 0:
        return "template cleanup"      
    
    elif comment_lower.find("page is not protected") >= 0:
        return "template cleanup"          
    
    elif comment_lower.find("removing protection template") >= 0:
        return "template cleanup"    
    
    elif comment_lower.find("correcting transcluded template per tfd") >= 0:
        return "template cleanup"   
    
    elif comment_lower.find("removing orphan t") >= 0:
        return "template cleanup"
    
    elif comment_lower.find("non-applicable orphan") >= 0:
        return "template cleanup"
    
    elif comment_lower.find("plantilla") >= 0 and comment_lower.find("huérfano") >= 0:
        return "template cleanup"
    
    elif comment_lower.find("removed orphan t") >= 0:
        return "template cleanup"    
    
    elif comment_lower.find("sandbox") >= 0:
        return "clearing sandbox"
    
    elif comment_lower.find("archiving") >= 0:
        return "archiving"
    
    elif comment_lower.find("duplicate on commons") >= 0:
        return "commons image migration"
    
    elif comment_lower.find("user:mathbot/changes to mathlists") >= 0:
        return "botfight: mathbot mathlist updates"
    
    elif reverting_user == 'MathBot' or reverted_user == 'MathBot' >= 0:
        return "botfight: mathbot mathlist updates"
    
    elif comment_lower.find("link syntax") >= 0:
        return "fixing links"
    
    elif comment_lower.find("links syntax") >= 0:
        return "fixing links" 
    
    elif comment_lower.find("no broken #section links left") >= 0:
        return "fixing links"  
    
    elif comment_lower.find("removing redlinks") >= 0:
        return "fixing links" 
    
    elif comment_lower.find("to wikidata") >= 0:
        return "interwiki link cleanup -- method1"
    
    elif comment.find("言語間") >=0:
        return "interwiki link cleanup -- method1"
        
    elif comment_lower.find("interproyecto") >=0:
        return "interwiki link cleanup -- method1"    
        
    elif comment.find("语言链接") >=0:
        return "interwiki link cleanup -- method1"  
    
    elif comment.find("interling") >=0:
        return "interwiki link cleanup -- method1"  
    
    elif comment.find("interlang") >=0:
        return "interwiki link cleanup -- method1"      
    
    elif comment.find("双重重定向") >=0 or comment.find("雙重重定向") >= 0:
        return "fixing double redirect"   

    elif comment.find("二重リダイレクト") >=0:
        return "fixing double redirect"  
    
    elif comment_lower.find("doppelten redirect") >=0:
        return "fixing double redirect"  
    
    elif comment_lower.find("doppelte weiterleitung") >=0:
        return "fixing double redirect"      
    
    
    elif comment_lower.find("redirectauflösung") >=0:
        return "fixing double redirect"      
    
    elif comment_lower.find("doble redirección") >=0 or comment_lower.find("redirección doble") >= 0:
        return "fixing double redirect"  
    
    elif comment_lower.find("redireccionamento duplo") >=0:
        return "fixing double redirect"  

    elif comment_lower.find("duplo redirecionamento") >=0:
        return "fixing double redirect"      
    
    elif comment_lower.find("suppression bandeau") >= 0:
        return "template cleanup"
    
    elif comment_lower.find("archiviert") >= 0:
        return "archiving"

    elif comment_lower.find("revert") >= 0:
        return "other w/ revert in comment"  
    
    elif comment_lower.find("rv ") >= 0 or comment_lower.find("rv") == 0:
        return "other w/ revert in comment"  
    
    elif comment_lower.find(" per ") >= 0:
        return "other w/ per justification"  
    
    elif comment_lower.find(" según") >= 0:
        return "other w/ per justification"      
 
    elif comment_lower.find("suite à discussion") >= 0:
        return "other w/ per justification"  
    
    elif comment_lower.find("suite à conservation") >= 0:
        return "other w/ per justification"     
    
    elif comment_lower.find("conforme pedido") >= 0:
        return "other w/ per justification"
    
    else:
        return interwiki_confirm(comment, langcode)

In [9]:
def interwiki_confirm(comment, langcode):
    """
    Takes a comment string, searches for language codes bordered by 
    two punctuation marks from [](){},: or one punctuation mark and
    one space. Beginning and end of a comment string counts as a
    space, not a punctuation mark.
    
    Does not recognize the current langcode.
    """
    import string, re
    
    with open("../datasets/lang_codes.tsv", "r") as f:
        lang_codes = f.read().split("\n")
        
    lang_codes.pop() # a blank '' is in the list that gets returned
    
    lang_codes.remove(langcode)
    
    #print(langcode in lang_codes)
    
    try:
        comment = str(comment)
        comment = comment.lower()
        comment = comment.replace(": ", ":")
        comment = " " + comment + " "  # pad start and end of string with non-punctuation
        #print(comment)
        
    except Exception as e:
        return 'other'
    
    for lang_code in lang_codes:
        
        lang_code_pos = comment.find(lang_code)
        lang_code_len = len(lang_code)
        
        char_before = " "
        char_after = " "
        
        if lang_code_pos >= 0:
            char_before = comment[lang_code_pos-1]
        
            #print("Char before: '", char_before, "'", sep='')
             
            char_after = comment[lang_code_pos+lang_code_len]

            #print("Char after: '", char_after, "'", sep='')
            
            if char_before in string.punctuation and char_after in "[]{}(),:":
                #print(comment, lang_code)
                return 'interwiki link cleanup -- method2'
            
            elif char_after in string.punctuation and char_before in "[]{}(),:":
                #print(comment, lang_code)
                return 'interwiki link cleanup -- method2'
            
            elif char_before == " " and char_after in "[]{}(),:":
                #print(comment, lang_code)
                return 'interwiki link cleanup -- method2'
            
            elif char_after == " " and char_before in "[]{}(),:":
                #print(comment, lang_code)
                return 'interwiki link cleanup -- method2'               
    return 'other'
    

Testing interwiki confirm

In [10]:
tests_yes = ["Robot adding [[es:Test]]",
             "adding es:Test",
             "linking es, it, en",
             "modifying fr:",
             "modifying:zh",
             "modifying: ja"]

tests_no = ["test", 
            "discuss policies on enwiki vs eswiki", 
            "it is done", 
            "per [[en:WP:AIV]]",
            "it's not its", 
            "its not it's",
            "modifying it all",
            "modifying italy"]

print("Should return interwiki link cleanup -- method2")
for test in tests_yes:
    print("\t", interwiki_confirm(test, 'en'))

print("Should return other")
for test in tests_no:
    print("\t", interwiki_confirm(test, 'en'))

Should return interwiki link cleanup -- method2
	 interwiki link cleanup -- method2
	 interwiki link cleanup -- method2
	 interwiki link cleanup -- method2
	 interwiki link cleanup -- method2
	 interwiki link cleanup -- method2
	 interwiki link cleanup -- method2
Should return other
	 other
	 other
	 other
	 other
	 other
	 other
	 other
	 other


Apply categorization

In [11]:
%%time
df_all['bottype'] = df_all.apply(comment_categorization, axis=1)

CPU times: user 4min 3s, sys: 3.26 s, total: 4min 7s
Wall time: 4min 7s


### Consolidate groups

In [12]:
def bottype_group(bottype):
    if bottype == "interwiki link cleanup -- method2":
        return "interwiki link cleanup -- method2"
    
    elif bottype == "interwiki link cleanup -- method1":
        return "interwiki link cleanup -- method1"
    
    elif bottype.find("botfight") >= 0:
        return 'botfight'
    
    elif bottype == 'other':
        return 'not classified'
    
    elif bottype == 'fixing double redirect':
        return 'fixing double redirect'
    
    elif bottype == 'protection template cleanup':
        return 'protection template cleanup'
    
    elif bottype.find("category") >= 0:
        return 'category work'
    
    elif bottype.find("template") >= 0:
        return 'template work'
    
    elif bottype == "other w/ revert in comment":
        return "other w/ revert in comment"
    
    else:
        return "other classified"

In [13]:
df_all['bottype_group'] = df_all['bottype'].apply(bottype_group)

## Analysis

Much of what we're interested in are articles, which are in namespace 0. 

In [14]:
df_all_ns0 = df_all[df_all['page_namespace']==0].copy()

### Bottype counts and percentages across all languages in the dataset, articles only

In [15]:
type_counts = df_all_ns0['bottype'].value_counts().rename("count")
type_percent = df_all_ns0['bottype'].value_counts(normalize=True).rename("percent") * 100
type_percent = type_percent.round(2).astype(str) + "%"

pd.concat([type_counts, type_percent], axis=1)

Unnamed: 0,count,percent
interwiki link cleanup -- method2,244844,43.63%
interwiki link cleanup -- method1,159910,28.49%
fixing double redirect,129523,23.08%
other,13506,2.41%
protection template cleanup,2843,0.51%
other w/ revert in comment,2424,0.43%
botfight: Russbot vs Cydebot category renaming,2125,0.38%
moving category,1662,0.3%
template cleanup,1387,0.25%
other w/ per justification,776,0.14%


### Bottype counts and percentages for each language, articles only

In [16]:
counts_dict = {}
for lang in df_all_ns0['language'].unique():

    df_lang_ns0 = df_all_ns0[df_all_ns0['language']==lang]
    
    type_counts = df_lang_ns0['bottype'].value_counts().rename("count")
    type_percent = df_lang_ns0['bottype'].value_counts(normalize=True).rename("percent") * 100
    type_percent = type_percent.round(2).astype(str) + "%"

    counts_dict[lang]=pd.concat([type_counts, type_percent], axis=1)

In [17]:
df_all_ns0['language'].unique()

array(['zh', 'fr', 'pt', 'de', 'en', 'ja', 'es'], dtype=object)

In [18]:
counts_dict['en']

Unnamed: 0,count,percent
fixing double redirect,110086,45.04%
interwiki link cleanup -- method1,83761,34.27%
interwiki link cleanup -- method2,37102,15.18%
protection template cleanup,2837,1.16%
other,2619,1.07%
botfight: Russbot vs Cydebot category renaming,2116,0.87%
moving category,1387,0.57%
template cleanup,1248,0.51%
other w/ revert in comment,1007,0.41%
botfight: mathbot mathlist updates,514,0.21%


In [19]:
counts_dict['ja']

Unnamed: 0,count,percent
interwiki link cleanup -- method2,27631,79.85%
interwiki link cleanup -- method1,5044,14.58%
other,1617,4.67%
fixing double redirect,294,0.85%
other w/ revert in comment,11,0.03%
other w/ per justification,7,0.02%


In [20]:
counts_dict['zh']

Unnamed: 0,count,percent
interwiki link cleanup -- method2,23649,54.98%
interwiki link cleanup -- method1,14672,34.11%
fixing double redirect,3634,8.45%
other,794,1.85%
other w/ revert in comment,257,0.6%
other w/ per justification,6,0.01%
botfight: reverting CommonsDelinker,3,0.01%


In [21]:
counts_dict['de']

Unnamed: 0,count,percent
interwiki link cleanup -- method2,35883,65.35%
interwiki link cleanup -- method1,16565,30.17%
other,1410,2.57%
fixing double redirect,986,1.8%
other w/ revert in comment,21,0.04%
other w/ per justification,10,0.02%
botfight: Russbot vs Cydebot category renaming,9,0.02%
protection template cleanup,6,0.01%
moving category,6,0.01%
botfight: reverting CommonsDelinker,5,0.01%


In [22]:
counts_dict['fr']

Unnamed: 0,count,percent
interwiki link cleanup -- method2,41165,73.18%
interwiki link cleanup -- method1,10093,17.94%
fixing double redirect,3296,5.86%
other,1027,1.83%
other w/ per justification,398,0.71%
moving category,269,0.48%
other w/ revert in comment,3,0.01%
clearing sandbox,2,0.0%
botfight: reverting CommonsDelinker,2,0.0%


In [23]:
counts_dict['pt']

Unnamed: 0,count,percent
interwiki link cleanup -- method2,41588,69.27%
interwiki link cleanup -- method1,13629,22.7%
other,2846,4.74%
fixing double redirect,1908,3.18%
other w/ per justification,36,0.06%
other w/ revert in comment,26,0.04%
clearing sandbox,1,0.0%
botfight: reverting CommonsDelinker,1,0.0%


In [24]:
counts_dict['es']

Unnamed: 0,count,percent
interwiki link cleanup -- method2,37826,55.64%
interwiki link cleanup -- method1,16146,23.75%
fixing double redirect,9319,13.71%
other,3193,4.7%
other w/ revert in comment,1099,1.62%
other w/ per justification,140,0.21%
template cleanup,134,0.2%
category redirect cleanup,106,0.16%
botfight: reverting CommonsDelinker,16,0.02%
clearing sandbox,1,0.0%


### Condolidation results

In [25]:
gb_lang_bottype = df_all.query("page_namespace == 0").groupby(["language", "bottype_group"])

In [26]:
gb_lang_bottype['rev_id'].count().unstack()

bottype_group,botfight,category work,fixing double redirect,interwiki link cleanup -- method1,interwiki link cleanup -- method2,not classified,other classified,other w/ revert in comment,protection template cleanup,template work
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
de,15.0,6.0,986.0,16565.0,35883.0,1410.0,10.0,21.0,6.0,5.0
en,3429.0,1726.0,110086.0,83761.0,37102.0,2619.0,560.0,1007.0,2837.0,1272.0
es,16.0,106.0,9319.0,16146.0,37826.0,3193.0,141.0,1099.0,,134.0
fr,2.0,269.0,3296.0,10093.0,41165.0,1027.0,400.0,3.0,,
ja,,,294.0,5044.0,27631.0,1617.0,7.0,11.0,,
pt,1.0,,1908.0,13629.0,41588.0,2846.0,37.0,26.0,,
zh,3.0,,3634.0,14672.0,23649.0,794.0,6.0,257.0,,


## Final data format

In [27]:
df_all[0:2].transpose()

Unnamed: 0,0,1
archived,False,False
language,zh,zh
page_namespace,0,0
rev_deleted,False,False
rev_id,10764032,16265305
rev_minor_edit,True,True
rev_page,796015,796015
rev_parent_id,1.0169e+07,1.62328e+07
rev_revert_offset,1,1
rev_sha1,r64h9ccp8u81sv76rchiqlopzvbjz7y,e19g7y6aw3j2k4yw3k6q0o3vn3f69i7


## Export data

In [28]:
df_all.to_pickle("../datasets/df_all_comments_parsed_2016.pickle")

## How long did this take to run?

In [29]:
end = datetime.datetime.now()

In [30]:
time_to_run = end - start
minutes = int(time_to_run.seconds/60)
seconds = time_to_run.seconds % 60
print("Total runtime: ", minutes, "minutes, ", seconds, "seconds")

Total runtime:  4 minutes,  15 seconds


## Export possible botfights

In [31]:
def is_possible_botfight(bottype_str):
    
    if bottype_str == 'other':
        return True
    elif bottype_str == 'other w/ revert in comment':
        return True
    elif bottype_str.find('botfight') >= 0:
        return True
    else:
        return False

In [32]:
df_ttr_under_180_d = df_all.query("time_to_revert_days < 180")
df_possible_botfights_mask = df_ttr_under_180_d['bottype'].apply(is_possible_botfight)

In [33]:
df_possible_botfights = df_ttr_under_180_d[df_possible_botfights_mask]
df_possible_botfights_ns0 = df_possible_botfights[df_possible_botfights['page_namespace']==0]

In [34]:
df_possible_botfights['language'].value_counts()

en    18465
es     4717
de     2914
zh     2585
pt     1377
ja     1325
fr     1276
Name: language, dtype: int64

In [35]:
df_possible_botfights_ns0['language'].value_counts()

en    5835
es    3762
de    1370
ja    1195
pt    1065
fr     967
zh     600
Name: language, dtype: int64

In [36]:
df_possible_botfights[0:2].transpose()

Unnamed: 0,6,117
archived,False,False
language,zh,zh
page_namespace,0,0
rev_deleted,False,False
rev_id,38279206,37156722
rev_minor_edit,True,False
rev_page,13,797638
rev_parent_id,3.82792e+07,3.43412e+07
rev_revert_offset,1,3
rev_sha1,3ywr47fh7rv56ea9gdw4nwbpouan6mz,nvzboeb7p8ml7a1oav4vp7rqfqvr39z


In [37]:
df_possible_botfights.to_pickle("../datasets/possible_botfights.pickle")
df_possible_botfights.to_csv("../datasets/possible_botfights.tsv", sep="\t")

In [38]:
lang_l = 'fr'
other_count = 0
df_lang_ns0 = df_all_ns0[df_all_ns0['language']==lang_l]

for comment, count in df_lang_ns0[df_lang_ns0['bottype']=='other']['reverting_comment_nobracket'].value_counts().iteritems():
    if count > 5:
        print(count, "\t", comment)
    else:
        other_count = other_count + count
print(other_count, "\tOther cases")

115 	 robot Modifie:
96 	 r2.7.3)
96 	 robot Retire:
81 	 r2.7.1)
79 	 robot Retire :
46 	 r2.7.2)
44 	 Correction: créer n'était pas nécéssaire, puisque la plante est . Allez, on refait tout dans l'autre sens...
42 	 correction d'initiative malheureuse...
31 	 r2.6.4)
28 	 r2.7.2+)
27 	 liens avec le portail trop faible, Replaced: |Floride}} → }}
19 	 Robot : retire de Catégorie:Cacographie
17 	 r2.6.5)
14 	 Robot : Remplacement de texte automatisé
12 	 
11 	 a été renommé, ceci est maintenant une redirection vers
8 	 r2.6.3)
8 	 robot Ajoute:
8 	 -
7 	 r2.5.2)
6 	 Retrait du lien , supprimé sur Commons par ; motif : Copyright violation, see
6 	 Révocation des modifications de
6 	 Homonymie résolue à l’aide du robot: Championnat du Monde poids-moyens de la WWE - Modifications du lien pour
220 	Other cases
