In [1]:
import pandas as pd
import numpy as np

import datetime as dt
import time
from functools import reduce
from multiprocess import Pool, cpu_count

import matplotlib.pyplot as plt

import requests
from bs4 import BeautifulSoup
import re
from string import ascii_uppercase as auc

In [9]:
## LOAD DATA
df = pd.read_csv('wiki_data_all.csv', index_col=0)

for t in ['15', '30', '60']:
    with open(f'batches_{t}.npy', 'rb') as f:
        batches = np.load(f, allow_pickle=True)
        df_batches = pd.DataFrame(batches, columns=['revid', f'batchid_{t}'])
        df_batches['revid'] = df_batches['revid'].astype(int)
    
    df = df.set_index('revid').join(df_batches.set_index('revid')).reset_index()
    df[f'batchid_{t}'] = df[f'batchid_{t}'].fillna(df.revid)

In [10]:
df.head()

Unnamed: 0,revid,user,userid,userhidden,timestamp,anon,commenthidden,page,pageid,batchid_15,batchid_30,batchid_60
0,1063058808,Reywas92,1233313.0,False,2022-01-01 00:00:00,False,False,Betty_White,415045,1063058808,1063058808,1063058808
1,1063058809,99.237.103.147,0.0,False,2022-01-01 00:00:00,True,False,ARM_Cortex-A78,64280898,1063058809,1063058809,1063058809
2,1063058810,Dmoore5556,29278485.0,False,2022-01-01 00:00:01,False,False,2021_Cotton_Bowl_Classic,67976697,1063058810,1063058810,1063058810
3,1063058811,Extraordinary Writ,39795743.0,False,2022-01-01 00:00:00,False,False,Francis_Moorehouse,34931769,1063058811,1063058811,1063058811
4,1063058812,Twozenhauer,14955567.0,False,2022-01-01 00:00:01,False,False,Trouble_(1931_film),69644588,1063058812,1063058812,1063058812


In [11]:
df.timestamp.min(), df.timestamp.max()

('2022-01-01 00:00:00', '2022-02-01 00:00:00')

In [6]:
%%time
# put timestamps into datetime
df['dt_timestamp'] = df.timestamp.apply(lambda t: dt.datetime.strptime(t, '%Y-%m-%d %H:%M:%S'))

CPU times: user 37.8 s, sys: 327 ms, total: 38.1 s
Wall time: 39.8 s


In [22]:
g = df.groupby('batchid_15').pageid.nunique()
pct_15 = df[df.batchid_15.isin(g[g>1].index)].shape[0] / df.shape[0]

g = df.groupby('batchid_30').pageid.nunique()
pct_30 = df[df.batchid_30.isin(g[g>1].index)].shape[0] / df.shape[0]

g = df.groupby('batchid_60').pageid.nunique()
pct_60 = df[df.batchid_60.isin(g[g>1].index)].shape[0] / df.shape[0]

In [23]:
pct_15, pct_30, pct_60

(0.65825409492516, 0.702480254269096, 0.735061410509022)

In [7]:
g = df.groupby('user').pageid.nunique()

In [8]:
n_revisions = df.shape[0]
n_users = len(g)
pct_multiple_page_edits = len(g[g>1]) / len(g)
pct_anon = df[df.anon].shape[0] / n_revisions
pct_revisions_multi = g[g>1].sum() / g.sum()

n_revisions, n_users, pct_multiple_page_edits*100, pct_anon * 100, pct_revisions_multi * 100

(3872914, 359919, 25.127320313737258, 18.484479645042466, 88.80258119739803)

In [17]:
df[df.user == g[g>=5].index[3]].sort_values(by='timestamp')

Unnamed: 0,user,userid,userhidden,timestamp,anon,commenthidden,revid,page,pageid,dt_timestamp
9559,(Na(Boo)mBap),40672539.0,False,2022-01-09 13:08:22,False,False,1064644081,Everyone_Says_I_Love_You,42225,2022-01-09 13:08:22
100618,(Na(Boo)mBap),40672539.0,False,2022-01-10 13:20:58,False,False,1064840481,Alien_vs._Predator,511332,2022-01-10 13:20:58
11961,(Na(Boo)mBap),40672539.0,False,2022-01-16 17:05:03,False,False,1066064992,Diane_Morgan,32780992,2022-01-16 17:05:03
2225,(Na(Boo)mBap),40672539.0,False,2022-01-24 13:07:28,False,False,1067640917,Hannah_Murray,9279949,2022-01-24 13:07:28
2224,(Na(Boo)mBap),40672539.0,False,2022-01-24 13:08:15,False,False,1067641107,Hannah_Murray,9279949,2022-01-24 13:08:15
11691,(Na(Boo)mBap),40672539.0,False,2022-01-26 16:39:55,False,False,1068098036,Rosabell_Laurenti_Sellers,39522300,2022-01-26 16:39:55
20304,(Na(Boo)mBap),40672539.0,False,2022-01-28 09:11:26,False,False,1068417677,Lawnmower_Deth,2955008,2022-01-28 09:11:26


## Get math pages

In [None]:
BASE_MATH_URL = 'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Mathematics/List_of_mathematics_articles'

def get_all_math_articles():
    all_pages = df.page.unique()
    
    math_pages = []

    for i in auc:
        page = requests.get(f'{BASE_MATH_URL}_({i})') 
        soup = BeautifulSoup(page.content, 'html.parser') 
        
        links = soup.select("a[href*=\/wiki\/]")



        for link in links:
#             title = link.get('title')
            href = link.get('href')
            if match := re.search('\/wiki\/(.+)', href, re.IGNORECASE):
                page = match.group(1)
                if page is not None and page not in ['Privacy_policy']:
                    math_pages += [page]
        
        return math_pages

math_pages = np.unique(get_all_math_articles())
math_df = df[df.page.isin(math_pages) & (~df.user.isin(['Citation bot', 'WikiCleanerBot', 'AnomieBOT']))]

sum(math_df.groupby(['user'])['page'].nunique() > 1), math_df.user.nunique()