# Data collection

In [2]:
import altair as alt
import json
import matplotlib.pyplot as plt
import os
import pandas as pd
import time
from sseclient import SSEClient as EventSource

pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
url = 'https://stream.wikimedia.org/v2/stream/recentchange'
dataset = []
users = set()
events_counter = 0
time_threshold = 5.0
t_0  = time.time()

for event in EventSource(url): # start streaming
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            continue
            
        events_counter += 1

        users.add(change['user'])
        user_hash = hash(change['user'])
        if user_hash % 10 < 2:
            dataset.append(change)
            
        if (time.time() - t_0)//60 > time_threshold:
            break
            
df = pd.DataFrame(dataset)
# df.to_csv(f'dataset_{t_0}.tsv')

In [9]:
len(dataset), events_counter, len(dataset)*100/events_counter

(2739, 9477, 28.901551123773345)

In [None]:
len(dataset), events_counter, len(set(dataset['user']))*100/events_counter

In [47]:
dfs = []
for f_ in ['dataset.tsv',
 'dataset_1633640647.5015893.tsv',
 'dataset_1633702613.0484471.tsv',
 'dataset_1633636846.3666255.tsv',
 'morning_data.csv']:
    dfs.append(pd.read_csv('data/' + f_))
    
all_data = pd.concat(dfs)
all_data.head(2)

Unnamed: 0.1,Unnamed: 0,$schema,meta,id,type,namespace,title,comment,timestamp,user,bot,minor,patrolled,length,revision,server_url,server_name,server_script_path,wiki,parsedcomment,log_id,log_type,log_action,log_params,log_action_comment
0,0,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q108828...,1556609000.0,edit,0,Q108828195,/* wbsetreference-add:2| */ [[Property:P570]]:...,1633634790,Quesotiotyo,False,False,True,"{'old': 28649, 'new': 29712}","{'old': 1509178938, 'new': 1509178965}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Ad...",,,,,
1,1,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q175855...,1556609000.0,edit,0,Q17585531,/* wbsetreference-set:2| */ [[Property:P141]]:...,1633634789,SuccuBot,True,False,True,"{'old': 16134, 'new': 16134}","{'old': 1458793714, 'new': 1509178961}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Fu...",,,,,
2,2,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q302623...,1556609000.0,edit,0,Q30262345,/* wbsetreference-set:2| */ [[Property:P3016]]...,1633634791,KrBot,True,False,True,"{'old': 11328, 'new': 11328}","{'old': 1304488366, 'new': 1509178977}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Из...",,,,,
3,3,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q108828...,1556609000.0,edit,0,Q108828195,/* wbsetreference-add:2| */ [[Property:P20]]: ...,1633634792,Quesotiotyo,False,False,True,"{'old': 29712, 'new': 30775}","{'old': 1509178965, 'new': 1509178984}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Ad...",,,,,
4,4,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q302623...,1556609000.0,edit,0,Q30262346,/* wbsetreference-set:2| */ [[Property:P3016]]...,1633634792,KrBot,True,False,True,"{'old': 15657, 'new': 15657}","{'old': 1304474435, 'new': 1509178983}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Из...",,,,,


In [52]:
all_data.shape

(120467, 25)

In [53]:
len(all_data.user.unique())

7109

In [57]:
all_data.bot.value_counts()

False    75046
True     45421
Name: bot, dtype: int64

In [61]:
len(all_data[all_data.bot].user.unique())

208

In [63]:
all_data.to_csv('all_data.tsv', sep='\t')

In [64]:
df = all_data.copy()

# Data exploration

## Number of changes done by user

In [65]:
# ! pip install altair
import altair as alt
import numpy as np
import pandas as pd

gd_dict = {row['user']: row['bot'] for _, row in df.iterrows()}
df_counts = df.user.value_counts().reset_index()
df_counts.columns = ['user', 'changes']
df_counts['bot'] = df_counts.user.map(lambda x: gd_dict[x])
df_counts.head()

np.random.seed(42)
sample_df = df_counts.sample(min(df_counts.shape[0], 400))

chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes:Q'),
    color = alt.Color('bot:N'),
    tooltip = [
        alt.Tooltip('user'), 
        alt.Tooltip('changes')
    ]
).properties(
    width = 600,
    height = 400,
    title = 'Number of changes by user'
)

chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [alt.Tooltip('count()', title='No of users')]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart_counts | chart_bots

In [32]:
df[(df.user == 'ST47ProxyBot')].head()

Unnamed: 0,$schema,meta,id,type,namespace,title,comment,timestamp,user,bot,minor,patrolled,length,revision,server_url,server_name,server_script_path,wiki,parsedcomment,log_id,log_type,log_action,log_params,log_action_comment,timestamp_parsed
3,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/User:41...,1430871000.0,log,2,User:41.217.27.208,{{blocked p2p proxy|ip=41.217.27.208|isp=Used ...,1633699044,ST47ProxyBot,True,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,{{blocked p2p proxy|ip=41.217.27.208|isp=Used ...,122507192.0,block,block,"{'duration': '71 hours', 'flags': 'nocreate', ...",blocked User:41.217.27.208 (account creation b...,2021-10-08 16:17:24
4,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/User:41...,1430871000.0,log,2,User:41.68.97.100,{{blocked p2p proxy|ip=41.68.97.100|isp=Vodafo...,1633699044,ST47ProxyBot,True,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,{{blocked p2p proxy|ip=41.68.97.100|isp=Vodafo...,122507191.0,block,block,"{'duration': '71 hours', 'flags': 'nocreate', ...",blocked User:41.68.97.100 (account creation bl...,2021-10-08 16:17:24
5,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/User:12...,1430871000.0,log,2,User:123.231.107.212,{{blocked p2p proxy|ip=123.231.107.212|isp=MTT...,1633699045,ST47ProxyBot,True,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,{{blocked p2p proxy|ip=123.231.107.212|isp=MTT...,122507193.0,block,block,"{'duration': '71 hours', 'flags': 'nocreate', ...",blocked User:123.231.107.212 (account creation...,2021-10-08 16:17:25
9,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/User:11...,1430871000.0,log,2,User:119.160.103.155,{{blocked p2p proxy|ip=119.160.103.155|isp=Mob...,1633699045,ST47ProxyBot,True,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,{{blocked p2p proxy|ip=119.160.103.155|isp=Mob...,122507196.0,block,block,"{'duration': '71 hours', 'flags': 'anononly,no...","blocked User:119.160.103.155 (anon. only, acco...",2021-10-08 16:17:25
12,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/User:41...,1430871000.0,log,2,User:41.107.0.80,{{blocked p2p proxy|ip=41.107.0.80|isp=Residen...,1633699045,ST47ProxyBot,True,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,{{blocked p2p proxy|ip=41.107.0.80|isp=Residen...,122507194.0,block,block,"{'duration': '71 hours', 'flags': 'nocreate', ...",blocked User:41.107.0.80 (account creation blo...,2021-10-08 16:17:25


## Avg number of changes done by user per timestamp

In [67]:
from datetime import datetime

df['timestamp_parsed'] = df.timestamp.map(lambda x: datetime.fromtimestamp(x))

avg_changes_per_timestamp = df[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda x: gd_dict[x])

np.random.seed(42)
sample_df = avg_changes_per_timestamp.sample(min(avg_changes_per_timestamp.shape[0], 400))

chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Number of changes by user per timestamp'
)

chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [alt.Tooltip('count()', title='No of users')]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart_counts | chart_bots

## Avg number of changes done by user per timestamp (balanced by bot/not bot)

In [68]:
from datetime import datetime

df['timestamp_parsed'] = df.timestamp.map(lambda x: datetime.fromtimestamp(x))

avg_changes_per_timestamp = df[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda x: gd_dict[x])

np.random.seed(42)
num = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].shape[0]
bot_sample_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].sample(num)
not_bot_sample_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot].sample(num)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])

chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Number of changes by user per timestamp'
)

chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart_counts | chart_bots

In [69]:
sample_df[['changes_per_timestamp', 'bot']][sample_df.changes_per_timestamp > 1].groupby('bot').mean()

Unnamed: 0_level_0,changes_per_timestamp
bot,Unnamed: 1_level_1
False,2.57237
True,2.163909


## Avg number of changes done by user per timestamp (balanced by bot/not bot and with more than 1 change)

In [70]:
from datetime import datetime

df['timestamp_parsed'] = df.timestamp.map(lambda x: datetime.fromtimestamp(x))

avg_changes_per_timestamp = df[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda x: gd_dict[x])

np.random.seed(42)

avg_changes_per_timestamp = avg_changes_per_timestamp[avg_changes_per_timestamp.changes_per_timestamp > 1.]

num = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].shape[0]
print(f'There are {num} bots')

bot_sample_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].sample(num)
not_bot_sample_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot].sample(num)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])

chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Number of changes by user per timestamp with more than 1 change'
)

chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart_counts | chart_bots

There are 76 bots


In [72]:
sample_df[['changes_per_timestamp', 'bot']].groupby('bot').mean()

Unnamed: 0_level_0,changes_per_timestamp
bot,Unnamed: 1_level_1
False,2.738688
True,2.163909


In [73]:
df[(df.user == 'Phediuk')&(df.timestamp == 1633643510)]

Unnamed: 0.1,Unnamed: 0,$schema,meta,id,type,namespace,title,comment,timestamp,user,bot,minor,patrolled,length,revision,server_url,server_name,server_script_path,wiki,parsedcomment,log_id,log_type,log_action,log_params,log_action_comment,timestamp_parsed
8066,8066,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Day_of_...,1430680000.0,edit,0,Day of the Figurines,,1633643510,Phediuk,False,False,,"{'old': 2292, 'new': 2426}","{'old': 987420945, 'new': 1048773585}",https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,,,,,,,2021-10-08 00:51:50
8069,8069,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,1430680000.0,categorize,14,Category:2005 video games,"[[:Day of the Figurines]] added to category, [...",1633643510,Phediuk,False,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Day_of_the_Figurines"" title=""Da...",,,,,,2021-10-08 00:51:50
8070,8070,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,1430680000.0,categorize,14,Category:Massively multiplayer online games,"[[:Day of the Figurines]] added to category, [...",1633643510,Phediuk,False,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Day_of_the_Figurines"" title=""Da...",,,,,,2021-10-08 00:51:50
8072,8072,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,1430680000.0,categorize,14,Category:Video games developed in the United K...,"[[:Day of the Figurines]] added to category, [...",1633643510,Phediuk,False,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Day_of_the_Figurines"" title=""Da...",,,,,,2021-10-08 00:51:50


In [74]:
df[(df.user == 'Prairie Astronomer')&(df.timestamp == 1633653280)]

Unnamed: 0.1,Unnamed: 0,$schema,meta,id,type,namespace,title,comment,timestamp,user,bot,minor,patrolled,length,revision,server_url,server_name,server_script_path,wiki,parsedcomment,log_id,log_type,log_action,log_params,log_action_comment,timestamp_parsed
30948,30948,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Talk:20...,1430713000.0,edit,1,Talk:2021 Balochistan earthquake,"Assessment: banner shell, Disaster management ...",1633653280,Prairie Astronomer,False,False,,"{'old': 230, 'new': 293}","{'old': 1048758694, 'new': 1048790189}",https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"Assessment: banner shell, Disaster management ...",,,,,,2021-10-08 03:34:40
30954,30954,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,1430713000.0,categorize,14,Category:Mid-importance Disaster management ar...,[[:Talk:2021 Balochistan earthquake]] added to...,1633653280,Prairie Astronomer,False,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Talk:2021_Balochistan_earthquak...",,,,,,2021-10-08 03:34:40
30955,30955,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,1430713000.0,categorize,14,Category:Low-importance Pakistan articles,[[:Talk:2021 Balochistan earthquake]] added to...,1633653280,Prairie Astronomer,False,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Talk:2021_Balochistan_earthquak...",,,,,,2021-10-08 03:34:40
30956,30956,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,1430713000.0,categorize,14,Category:Unknown-importance Disaster managemen...,[[:Talk:2021 Balochistan earthquake]] removed ...,1633653280,Prairie Astronomer,False,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Talk:2021_Balochistan_earthquak...",,,,,,2021-10-08 03:34:40
30957,30957,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,1430713000.0,categorize,14,Category:Unknown-importance Pakistan articles,[[:Talk:2021 Balochistan earthquake]] removed ...,1633653280,Prairie Astronomer,False,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Talk:2021_Balochistan_earthquak...",,,,,,2021-10-08 03:34:40


## Change type analysis

In [75]:
df.type.value_counts()

edit          75857
categorize    29798
log           11134
new            3653
142              25
Name: type, dtype: int64

In [76]:
df[['type', 'bot', 'minor']].value_counts()

type  bot    minor
edit  False  False    34630
      True   False    29800
      False  True      6379
      True   True      5048
new   False  False     2762
      True   False      891
dtype: int64

In [77]:
df_edits = df[df.type == 'edit']
df_edits.head()

Unnamed: 0.1,Unnamed: 0,$schema,meta,id,type,namespace,title,comment,timestamp,user,bot,minor,patrolled,length,revision,server_url,server_name,server_script_path,wiki,parsedcomment,log_id,log_type,log_action,log_params,log_action_comment,timestamp_parsed
0,0,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q108828...,1556609000.0,edit,0,Q108828195,/* wbsetreference-add:2| */ [[Property:P570]]:...,1633634790,Quesotiotyo,False,False,True,"{'old': 28649, 'new': 29712}","{'old': 1509178938, 'new': 1509178965}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Ad...",,,,,,2021-10-07 22:26:30
1,1,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q175855...,1556609000.0,edit,0,Q17585531,/* wbsetreference-set:2| */ [[Property:P141]]:...,1633634789,SuccuBot,True,False,True,"{'old': 16134, 'new': 16134}","{'old': 1458793714, 'new': 1509178961}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Fu...",,,,,,2021-10-07 22:26:29
2,2,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q302623...,1556609000.0,edit,0,Q30262345,/* wbsetreference-set:2| */ [[Property:P3016]]...,1633634791,KrBot,True,False,True,"{'old': 11328, 'new': 11328}","{'old': 1304488366, 'new': 1509178977}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Из...",,,,,,2021-10-07 22:26:31
3,3,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q108828...,1556609000.0,edit,0,Q108828195,/* wbsetreference-add:2| */ [[Property:P20]]: ...,1633634792,Quesotiotyo,False,False,True,"{'old': 29712, 'new': 30775}","{'old': 1509178965, 'new': 1509178984}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Ad...",,,,,,2021-10-07 22:26:32
4,4,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q302623...,1556609000.0,edit,0,Q30262346,/* wbsetreference-set:2| */ [[Property:P3016]]...,1633634792,KrBot,True,False,True,"{'old': 15657, 'new': 15657}","{'old': 1304474435, 'new': 1509178983}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Из...",,,,,,2021-10-07 22:26:32


In [78]:
# df_edits['timestamp_parsed'] = df_edits.timestamp.map(lambda x: datetime.fromtimestamp(x))

avg_changes_per_timestamp = df_edits[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda x: gd_dict[x])

np.random.seed(42)

avg_changes_per_timestamp = avg_changes_per_timestamp[avg_changes_per_timestamp.changes_per_timestamp > 1.]

num = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].shape[0]
print(f'There are {num} bots')

bot_sample_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].sample(num)
not_bot_sample_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot].sample(num)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])

chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Number of changes by user per timestamp with more than 1 change'
)

chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart_counts | chart_bots

There are 36 bots


In [79]:
sample_df[['changes_per_timestamp', 'bot']].groupby('bot').mean()

Unnamed: 0_level_0,changes_per_timestamp
bot,Unnamed: 1_level_1
False,1.863656
True,1.509182


In [80]:
temp = df_edits[['user', 'timestamp', 'bot']].value_counts().groupby('user').max().reset_index()
temp[temp.user == 'Raugeier']

Unnamed: 0,user,0
4903,Raugeier,22


In [81]:
df_edits[(df_edits.user == 'Raugeier')&(df_edits.timestamp == 1633642165)]

Unnamed: 0.1,Unnamed: 0,$schema,meta,id,type,namespace,title,comment,timestamp,user,bot,minor,patrolled,length,revision,server_url,server_name,server_script_path,wiki,parsedcomment,log_id,log_type,log_action,log_params,log_action_comment,timestamp_parsed
4005,4005,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784986000.0,edit,6,File:St. Nicolai in Gödringen -3351-Pano.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,True,True,"{'old': 4128, 'new': 4059}","{'old': 593273218, 'new': 596627307}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",,,,,,2021-10-08 00:29:25
4006,4006,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784986000.0,edit,6,File:St. Nicolai in Gödringen-4624-Pano.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,True,True,"{'old': 2402, 'new': 2332}","{'old': 590529766, 'new': 596627314}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",,,,,,2021-10-08 00:29:25
4007,4007,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784986000.0,edit,6,File:St. Nicolai in Gödringen-3384.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,True,True,"{'old': 2109, 'new': 2039}","{'old': 593271182, 'new': 596627319}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",,,,,,2021-10-08 00:29:25
4008,4008,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784986000.0,edit,6,File:St. Nicolai in Gödringen-3404.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,True,True,"{'old': 2152, 'new': 2082}","{'old': 593271297, 'new': 596627308}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",,,,,,2021-10-08 00:29:25
4009,4009,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784986000.0,edit,6,File:St. Nicolai in Gödringen-4620.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,True,True,"{'old': 2183, 'new': 2113}","{'old': 591791253, 'new': 596627316}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",,,,,,2021-10-08 00:29:25
4010,4010,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784986000.0,edit,6,File:St. Nicolai in Gödringen-3362.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,True,True,"{'old': 2155, 'new': 2085}","{'old': 590529927, 'new': 596627317}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",,,,,,2021-10-08 00:29:25
4011,4011,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784986000.0,edit,6,File:St. Nicolai in Gödringen-3259.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,True,True,"{'old': 2154, 'new': 2084}","{'old': 593270776, 'new': 596627310}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",,,,,,2021-10-08 00:29:25
4012,4012,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784986000.0,edit,6,File:St. Nicolai in Gödringen-3300.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,True,True,"{'old': 2161, 'new': 2091}","{'old': 590530180, 'new': 596627312}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",,,,,,2021-10-08 00:29:25
4013,4013,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784986000.0,edit,6,File:St. Nicolai in Gödringen-3327.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,True,True,"{'old': 2153, 'new': 2083}","{'old': 590528880, 'new': 596627324}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",,,,,,2021-10-08 00:29:25
4015,4015,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,1784986000.0,edit,6,File:Ehemaliges Parrhaus (Gödringen) -3409.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,True,True,"{'old': 3813, 'new': 3743}","{'old': 592631375, 'new': 596627325}",https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",,,,,,2021-10-08 00:29:25


In [91]:
from datetime import datetime

df_edits['timestamp_parsed'] = df_edits.timestamp.map(lambda x: datetime.fromtimestamp(x))

avg_changes_per_timestamp = df_edits[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda x: gd_dict[x])

np.random.seed(42)

avg_changes_per_timestamp = avg_changes_per_timestamp[avg_changes_per_timestamp.changes_per_timestamp > 1.]

num = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].shape[0]
print(f'There are {num} bots')

bot_sample_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].sample(num)
not_bot_sample_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot].sample(num)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])
sample_df = sample_df[sample_df.changes_per_timestamp < 4]

chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Max of changes by user per timestamp with more than 1 change'
)

chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart_counts | chart_bots

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_edits['timestamp_parsed'] = df_edits.timestamp.map(lambda x: datetime.fromtimestamp(x))


There are 36 bots


In [83]:
avg_changes_per_timestamp[avg_changes_per_timestamp.user == 'Raugeier']

Unnamed: 0,user,changes_per_timestamp,bot
4903,Raugeier,22,False
