# Set-up

In [3]:
import hashlib
import json
import os
import time
from datetime import datetime

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from altair_saver import save
from sseclient import SSEClient as EventSource

# Lift the pandas display limits: None removes the cap on shown columns/rows.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

# Data collection

In [None]:
# Stream Wikimedia "recentchange" events for `time_threshold` minutes, keep the
# events of roughly 20% of users, and dump them to ../data/dataset_<t_0>.csv.
import hashlib  # local import so this cell also runs on its own

os.makedirs('../data', exist_ok=True)

url = 'https://stream.wikimedia.org/v2/stream/recentchange'
dataset = []
events_counter = 0      # number of successfully parsed 'message' events
time_threshold = 60.0   # collection duration, in minutes
t_0  = time.time()

for event in EventSource(url): # start streaming
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            # Payload is not valid JSON: skip it.
            continue

        events_counter += 1

        # Events without a 'user' key cannot be bucketed; skip them instead of
        # aborting the whole capture with a KeyError.
        user = change.get('user')
        if user is not None:
            # The built-in hash() of a str is salted per interpreter process, so
            # it would select a different set of users on every run. An MD5
            # digest gives every user the same bucket in every run.
            digest = hashlib.md5(str(user).encode('utf-8')).digest()
            user_hash = int.from_bytes(digest[:8], 'big')

            if user_hash % 10 < 2: # filter: keep buckets 0 and 1 out of 10
                dataset.append(change)

    # Stop once `time_threshold` minutes have elapsed. Checked for every event
    # type so that non-'message' events cannot postpone the stop.
    if (time.time() - t_0) / 60 >= time_threshold:
        break

df = pd.DataFrame(dataset)
df.to_csv(f'../data/dataset_{t_0}.csv')

## Import after preparation

In [4]:
# Load the prepared dataset: a gzip-compressed, tab-separated file on GitHub.
DATASET_URL = 'https://raw.githubusercontent.com/madmalewolf/stat-open-/main/all_data.tsv.gz'
df = pd.read_csv(DATASET_URL, compression='gzip', sep='\t')

# Report the frame's dimensions, then preview its first two rows.
print(f'df.shape = {df.shape}')
df.head(2)

df.shape = (120467, 26)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,$schema,meta,id,type,namespace,title,comment,timestamp,user,bot,minor,patrolled,length,revision,server_url,server_name,server_script_path,wiki,parsedcomment,log_id,log_type,log_action,log_params,log_action_comment
0,0,0,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q108828...,1556609000.0,edit,0,Q108828195,/* wbsetreference-add:2| */ [[Property:P570]]:...,1633634790,Quesotiotyo,False,False,True,"{'old': 28649, 'new': 29712}","{'old': 1509178938, 'new': 1509178965}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Ad...",,,,,
1,1,1,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q175855...,1556609000.0,edit,0,Q17585531,/* wbsetreference-set:2| */ [[Property:P141]]:...,1633634789,SuccuBot,True,False,True,"{'old': 16134, 'new': 16134}","{'old': 1458793714, 'new': 1509178961}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Fu...",,,,,


# Data exploration

## Number of changes done by user

In [5]:
os.makedirs('../charts', exist_ok=True) # create dir for charts

# user -> bot flag lookup, reused by the cells below. Built column-wise rather
# than with df.iterrows(); when a user occurs in several rows the last row
# wins, exactly as in a row-by-row dict build.
gd_dict = dict(zip(df['user'], df['bot']))

# Total number of changes per user, plus that user's bot flag.
df_counts = df.user.value_counts().reset_index()
df_counts.columns = ['user', 'changes']
df_counts['bot'] = df_counts.user.map(lambda user: gd_dict[user])

# Plot at most 400 users; the fixed seed keeps the sample reproducible.
np.random.seed(42)
sample_df = df_counts.sample(min(df_counts.shape[0], 400))

# Left panel: number of changes for each sampled user, coloured by bot flag.
chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user')]
).properties(
    width = 600,
    height = 400,
    title = 'Number of changes by user'
)

# Right panel: number of sampled users per bot flag. Axis labels are hidden,
# so the tooltip carries the flag value (as in the later cells of this
# notebook) to make the two groups distinguishable.
chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'),
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart = chart_counts | chart_bots

save(chart, '../charts/total_number_changes.html')
chart

## Avg number of changes done by user per timestamp

In [6]:
# For each user: count the changes sharing one timestamp value, then average
# those counts over the user's timestamps.
avg_changes_per_timestamp = df[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda user: gd_dict[user])

# Plot at most 400 users; the fixed seed keeps the sample reproducible.
np.random.seed(42)
sample_df = avg_changes_per_timestamp.sample(min(avg_changes_per_timestamp.shape[0], 400))

# Left panel: average changes per timestamp for each sampled user.
chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Avg number of changes by user per timestamp'
)

# Right panel: number of sampled users per bot flag. Axis labels are hidden,
# so the tooltip carries the flag value (as in the later cells of this
# notebook) to make the two groups distinguishable.
chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'),
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart = chart_counts | chart_bots

save(chart, '../charts/avg_number_changes_per_timestamp.html')
chart

## Avg number of changes done by user per timestamp (balanced by bot/not bot)

In [7]:
# For each user: count the changes sharing one timestamp value, then average
# those counts over the user's timestamps.
avg_changes_per_timestamp = df[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda user: gd_dict[user])

bots_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot]
not_bots_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot]

# Balanced sample: the same number of bots and non-bots. DataFrame.sample
# raises ValueError when asked for more rows than exist (without replacement),
# so the group size is capped by the smaller of the two groups.
np.random.seed(42)
num = min(bots_df.shape[0], not_bots_df.shape[0])
bot_sample_df = bots_df.sample(num)
not_bot_sample_df = not_bots_df.sample(num)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])

# Left panel: average changes per timestamp for each sampled user.
chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Avg number of changes by user per timestamp'
)

# Right panel: number of sampled users per bot flag.
chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart = chart_counts | chart_bots

save(chart, '../charts/avg_number_changes_per_timestamp_balanced.html')
chart

In [8]:
# Mean of changes_per_timestamp within each bot / non-bot group of the sample.
sample_df.groupby('bot')[['changes_per_timestamp']].mean()

Unnamed: 0_level_0,changes_per_timestamp
bot,Unnamed: 1_level_1
False,1.477535
True,1.655237


## Avg number of changes done by user per timestamp (balanced by bot/not bot) having more than 1 change

In [9]:
# For each user: count the changes sharing one timestamp value, then average
# those counts over the user's timestamps.
avg_changes_per_timestamp = df[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda user: gd_dict[user])

# Keep only users whose average exceeds one change per timestamp.
avg_changes_per_timestamp = avg_changes_per_timestamp[avg_changes_per_timestamp.changes_per_timestamp > 1.]

bots_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot]
not_bots_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot]

num_bots = bots_df.shape[0]
print(f'There are {num_bots} bots')

# Balanced sample: the same number of bots and non-bots. DataFrame.sample
# raises ValueError when asked for more rows than exist (without replacement),
# so the group size is capped by the smaller of the two groups.
num = min(num_bots, not_bots_df.shape[0])

np.random.seed(42)
bot_sample_df = bots_df.sample(num)
not_bot_sample_df = not_bots_df.sample(num)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])

# Left panel: average changes per timestamp for each sampled user.
chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Avg number of changes by user per timestamp with more than 1 change'
)

# Right panel: number of sampled users per bot flag.
chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart = chart_counts | chart_bots

save(chart, '../charts/avg_number_changes_per_timestamp_balanced_with_filter.html')
chart

There are 76 bots


In [10]:
# Mean of changes_per_timestamp within each bot / non-bot group of the sample.
sample_df.groupby('bot')[['changes_per_timestamp']].mean()

Unnamed: 0_level_0,changes_per_timestamp
bot,Unnamed: 1_level_1
False,2.738688
True,2.163909


In [11]:
# Show every change recorded for user 'Phediuk' at timestamp 1633643510.
phediuk_rows = df['user'].eq('Phediuk') & df['timestamp'].eq(1633643510)
df.loc[phediuk_rows]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,$schema,meta,id,type,namespace,title,comment,timestamp,user,bot,minor,patrolled,length,revision,server_url,server_name,server_script_path,wiki,parsedcomment,log_id,log_type,log_action,log_params,log_action_comment
15504,8066,8066,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Day_of_...,1430680000.0,edit,0,Day of the Figurines,,1633643510,Phediuk,False,False,,"{'old': 2292, 'new': 2426}","{'old': 987420945, 'new': 1048773585}",https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,,,,,,
15507,8069,8069,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,1430680000.0,categorize,14,Category:2005 video games,"[[:Day of the Figurines]] added to category, [...",1633643510,Phediuk,False,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Day_of_the_Figurines"" title=""Da...",,,,,
15508,8070,8070,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,1430680000.0,categorize,14,Category:Massively multiplayer online games,"[[:Day of the Figurines]] added to category, [...",1633643510,Phediuk,False,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Day_of_the_Figurines"" title=""Da...",,,,,
15510,8072,8072,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,1430680000.0,categorize,14,Category:Video games developed in the United K...,"[[:Day of the Figurines]] added to category, [...",1633643510,Phediuk,False,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Day_of_the_Figurines"" title=""Da...",,,,,


In [12]:
# Show every change recorded for user 'Prairie Astronomer' at timestamp 1633653280.
prairie_rows = df['user'].eq('Prairie Astronomer') & df['timestamp'].eq(1633653280)
df.loc[prairie_rows]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,$schema,meta,id,type,namespace,title,comment,timestamp,user,bot,minor,patrolled,length,revision,server_url,server_name,server_script_path,wiki,parsedcomment,log_id,log_type,log_action,log_params,log_action_comment
38386,30948,30948,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Talk:20...,1430713000.0,edit,1,Talk:2021 Balochistan earthquake,"Assessment: banner shell, Disaster management ...",1633653280,Prairie Astronomer,False,False,,"{'old': 230, 'new': 293}","{'old': 1048758694, 'new': 1048790189}",https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"Assessment: banner shell, Disaster management ...",,,,,
38392,30954,30954,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,1430713000.0,categorize,14,Category:Mid-importance Disaster management ar...,[[:Talk:2021 Balochistan earthquake]] added to...,1633653280,Prairie Astronomer,False,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Talk:2021_Balochistan_earthquak...",,,,,
38393,30955,30955,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,1430713000.0,categorize,14,Category:Low-importance Pakistan articles,[[:Talk:2021 Balochistan earthquake]] added to...,1633653280,Prairie Astronomer,False,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Talk:2021_Balochistan_earthquak...",,,,,
38394,30956,30956,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,1430713000.0,categorize,14,Category:Unknown-importance Disaster managemen...,[[:Talk:2021 Balochistan earthquake]] removed ...,1633653280,Prairie Astronomer,False,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Talk:2021_Balochistan_earthquak...",,,,,
38395,30957,30957,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,1430713000.0,categorize,14,Category:Unknown-importance Pakistan articles,[[:Talk:2021 Balochistan earthquake]] removed ...,1633653280,Prairie Astronomer,False,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Talk:2021_Balochistan_earthquak...",,,,,


## Change type analysis

In [13]:
# Number of rows for each value of the 'type' column.
df['type'].value_counts()

edit          75857
categorize    29798
log           11134
new            3653
142              25
Name: type, dtype: int64

In [14]:
# Number of rows for each (type, bot, minor) combination.
df.value_counts(subset=['type', 'bot', 'minor'])

type  bot    minor
edit  False  False    34630
      True   False    29800
      False  True      6379
      True   True      5048
new   False  False     2762
      True   False      891
dtype: int64

In [15]:
# Restrict the data to rows whose type is 'edit' and preview two of them.
is_edit = df['type'].eq('edit')
df_edits = df.loc[is_edit]
df_edits.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,$schema,meta,id,type,namespace,title,comment,timestamp,user,bot,minor,patrolled,length,revision,server_url,server_name,server_script_path,wiki,parsedcomment,log_id,log_type,log_action,log_params,log_action_comment
0,0,0,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q108828...,1556609000.0,edit,0,Q108828195,/* wbsetreference-add:2| */ [[Property:P570]]:...,1633634790,Quesotiotyo,False,False,True,"{'old': 28649, 'new': 29712}","{'old': 1509178938, 'new': 1509178965}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Ad...",,,,,
1,1,1,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q175855...,1556609000.0,edit,0,Q17585531,/* wbsetreference-set:2| */ [[Property:P141]]:...,1633634789,SuccuBot,True,False,True,"{'old': 16134, 'new': 16134}","{'old': 1458793714, 'new': 1509178961}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Fu...",,,,,


## Avg number of edits done by user per timestamp (balanced by bot/not bot) having more than 1 change

In [16]:
# For each user: count the edits sharing one timestamp value, then average
# those counts over the user's timestamps.
avg_changes_per_timestamp = df_edits[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda user: gd_dict[user])

# Keep only users whose average exceeds one edit per timestamp.
avg_changes_per_timestamp = avg_changes_per_timestamp[avg_changes_per_timestamp.changes_per_timestamp > 1.]

bots_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot]
not_bots_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot]

num_bots = bots_df.shape[0]
print(f'There are {num_bots} bots')

# Balanced sample: the same number of bots and non-bots. DataFrame.sample
# raises ValueError when asked for more rows than exist (without replacement),
# so the group size is capped by the smaller of the two groups.
num = min(num_bots, not_bots_df.shape[0])

np.random.seed(42)
bot_sample_df = bots_df.sample(num)
not_bot_sample_df = not_bots_df.sample(num)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])

# Left panel: average edits per timestamp for each sampled user.
chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Avg number of edits by user per timestamp with more than 1 change'
)

# Right panel: number of sampled users per bot flag.
chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart = chart_counts | chart_bots

save(chart, '../charts/avg_number_edits_per_timestamp_balanced_with_filter.html')
chart

There are 36 bots


In [17]:
# First five edit rows made by user 'Jamc2'.
jamc2_rows = df_edits['user'].eq('Jamc2')
df_edits.loc[jamc2_rows].head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,$schema,meta,id,type,namespace,title,comment,timestamp,user,bot,minor,patrolled,length,revision,server_url,server_name,server_script_path,wiki,parsedcomment,log_id,log_type,log_action,log_params,log_action_comment
37,37,37,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784906000.0,edit,6,File:Wild flowers in stubble field - geograph....,Copying from [[Category:United Kingdom photogr...,1633634797,Jamc2,False,True,True,"{'old': 6857, 'new': 6904}","{'old': 538914825, 'new': 596582011}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Copying from <a href=""/wiki/Category:United_Ki...",,,,,
38,38,38,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784906000.0,edit,6,File:View towards Howard's Hill - geograph.org...,Copying from [[Category:United Kingdom photogr...,1633634797,Jamc2,False,True,True,"{'old': 7362, 'new': 7409}","{'old': 538913003, 'new': 596582010}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Copying from <a href=""/wiki/Category:United_Ki...",,,,,
40,40,40,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784906000.0,edit,6,File:Where is the footpath - geograph.org.uk -...,Copying from [[Category:United Kingdom photogr...,1633634797,Jamc2,False,True,True,"{'old': 7550, 'new': 7597}","{'old': 538914589, 'new': 596582007}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Copying from <a href=""/wiki/Category:United_Ki...",,,,,
900,900,900,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784908000.0,edit,6,"File:Cragside - Leeds Road, Windhill - geograp...",Copying from [[Category:United Kingdom photogr...,1633634976,Jamc2,False,True,True,"{'old': 6494, 'new': 6533}","{'old': 578058026, 'new': 596583273}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Copying from <a href=""/wiki/Category:United_Ki...",,,,,
6132,6132,6132,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784922000.0,edit,6,File:Boyne Street - Hopwood Lane - geograph.or...,Copying from [[Category:United Kingdom photogr...,1633636307,Jamc2,False,True,True,"{'old': 6453, 'new': 6522}","{'old': 538920102, 'new': 596591492}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Copying from <a href=""/wiki/Category:United_Ki...",,,,,


In [18]:
# Mean of changes_per_timestamp within each bot / non-bot group of the sample.
sample_df.groupby('bot')[['changes_per_timestamp']].mean()

Unnamed: 0_level_0,changes_per_timestamp
bot,Unnamed: 1_level_1
False,1.863656
True,1.509182


## Max number of edits done by user per timestamp (balanced by bot/not bot) having more than 1 change

In [19]:
# For each user: count the edits sharing one timestamp value, then take the
# largest of those counts.
# NOTE: the variable keeps its historical `avg_` name although it holds a
# per-user maximum in this cell.
avg_changes_per_timestamp = df_edits[['user', 'timestamp']].value_counts().groupby('user').max().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda user: gd_dict[user])

# Keep only users with more than one edit at some single timestamp.
avg_changes_per_timestamp = avg_changes_per_timestamp[avg_changes_per_timestamp.changes_per_timestamp > 1.]

bots_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot]
not_bots_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot]

num_bots = bots_df.shape[0]
print(f'There are {num_bots} bots')

# Balanced sample: the same number of bots and non-bots. DataFrame.sample
# raises ValueError when asked for more rows than exist (without replacement),
# so the group size is capped by the smaller of the two groups.
num = min(num_bots, not_bots_df.shape[0])

np.random.seed(42)
bot_sample_df = bots_df.sample(num)
not_bot_sample_df = not_bots_df.sample(num)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])

# Left panel: maximum edits per timestamp for each sampled user.
chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Max number of edits by user per timestamp with more than 1 change'
)

# Right panel: number of sampled users per bot flag.
chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart = chart_counts | chart_bots

save(chart, '../charts/max_number_edits_per_timestamp_balanced_with_filter.html')
chart

There are 36 bots


In [20]:
# Largest changes_per_timestamp value within each bot / non-bot group.
sample_df.groupby('bot')[['changes_per_timestamp']].max()

Unnamed: 0_level_0,changes_per_timestamp
bot,Unnamed: 1_level_1
False,19
True,18


In [21]:
# Smallest changes_per_timestamp value within each bot / non-bot group.
sample_df.groupby('bot')[['changes_per_timestamp']].min()

Unnamed: 0_level_0,changes_per_timestamp
bot,Unnamed: 1_level_1
False,2
True,2
