# Data collection

In [16]:
import altair as alt
import json
import matplotlib.pyplot as plt
import os
import pandas as pd
import time
from sseclient import SSEClient as EventSource

pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [23]:
url = 'https://stream.wikimedia.org/v2/stream/recentchange'
dataset = []
events_counter = 0
time_threshold = 60.0
t_0  = time.time()

for event in EventSource(url): # start streaming
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            continue
            
        events_counter += 1
        user_hash = hash(change['user'])
        if user_hash % 10 < 2:
            dataset.append(change)
            
        if (time.time() - t_0)//60 > time_threshold:
            break
            
        if events_counter % 1000 == 0:
            print('saving data')
            df = pd.DataFrame(dataset)
            df.to_csv(f'dataset_{t_0}.csv')
            
df = pd.DataFrame(dataset)
df.to_csv(f'dataset_{t_0}.csv')

saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data

saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data

saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data

saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
saving data
s

In [217]:
len(dataset), events_counter, len(dataset)*100/events_counter

(36013, 305470, 11.789373751923266)

# Data exploration

## Number of changes done by user

In [166]:
# ! pip install altair
import altair as alt
import numpy as np
import pandas as pd

gd_dict = {row['user']: row['bot'] for _, row in df.iterrows()}
df_counts = df.user.value_counts().reset_index()
df_counts.columns = ['user', 'changes']
df_counts['bot'] = df_counts.user.map(lambda x: gd_dict[x])
df_counts.head()

np.random.seed(42)
sample_df = df_counts.sample(400)

chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user')]
).properties(
    width = 600,
    height = 400,
    title = 'Number of changes by user'
)

chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [alt.Tooltip('count()', title='No of users')]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart_counts | chart_bots

## Avg number of changes done by user per timestamp

In [186]:
from datetime import datetime

df['timestamp_parsed'] = df.timestamp.map(lambda x: datetime.fromtimestamp(x))

avg_changes_per_timestamp = df[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda x: gd_dict[x])

np.random.seed(42)
sample_df = avg_changes_per_timestamp.sample(400)

chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Number of changes by user per timestamp'
)

chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [alt.Tooltip('count()', title='No of users')]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart_counts | chart_bots

## Avg number of changes done by user per timestamp (balanced by bot/not bot)

In [200]:
from datetime import datetime

df['timestamp_parsed'] = df.timestamp.map(lambda x: datetime.fromtimestamp(x))

avg_changes_per_timestamp = df[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda x: gd_dict[x])

np.random.seed(42)
num = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].shape[0]
bot_sample_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].sample(num)
not_bot_sample_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot].sample(num)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])

chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Number of changes by user per timestamp'
)

chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart_counts | chart_bots

In [206]:
sample_df[['changes_per_timestamp', 'bot']][sample_df.changes_per_timestamp > 1].groupby('bot').mean()

Unnamed: 0_level_0,changes_per_timestamp
bot,Unnamed: 1_level_1
False,3.070322
True,1.930707


## Avg number of changes done by user per timestamp (balanced by bot/not bot and with more than 1 change)

In [219]:
from datetime import datetime

df['timestamp_parsed'] = df.timestamp.map(lambda x: datetime.fromtimestamp(x))

avg_changes_per_timestamp = df[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda x: gd_dict[x])

np.random.seed(42)

avg_changes_per_timestamp = avg_changes_per_timestamp[avg_changes_per_timestamp.changes_per_timestamp > 1.]

num = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].shape[0]
print(f'There are {num} bots')

bot_sample_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].sample(num)
not_bot_sample_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot].sample(num)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])

chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Number of changes by user per timestamp with more than 1 change'
)

chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart_counts | chart_bots

There are 27 bots


In [214]:
sample_df[['changes_per_timestamp', 'bot']].groupby('bot').mean()

Unnamed: 0_level_0,changes_per_timestamp
bot,Unnamed: 1_level_1
False,2.036153
True,1.930707


In [229]:
df[(df.user == 'Phediuk')&(df.timestamp == 1633643510)]

Unnamed: 0,$schema,meta,type,namespace,title,comment,timestamp,user,bot,log_id,log_type,log_action,log_params,log_action_comment,server_url,server_name,server_script_path,wiki,parsedcomment,id,minor,patrolled,length,revision,timestamp_parsed
8066,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Day_of_...,edit,0,Day of the Figurines,,1633643510,Phediuk,False,,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,,1430680000.0,False,,"{'old': 2292, 'new': 2426}","{'old': 987420945, 'new': 1048773585}",2021-10-08 00:51:50
8069,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,categorize,14,Category:2005 video games,"[[:Day of the Figurines]] added to category, [...",1633643510,Phediuk,False,,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Day_of_the_Figurines"" title=""Da...",1430680000.0,,,,,2021-10-08 00:51:50
8070,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,categorize,14,Category:Massively multiplayer online games,"[[:Day of the Figurines]] added to category, [...",1633643510,Phediuk,False,,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Day_of_the_Figurines"" title=""Da...",1430680000.0,,,,,2021-10-08 00:51:50
8072,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,categorize,14,Category:Video games developed in the United K...,"[[:Day of the Figurines]] added to category, [...",1633643510,Phediuk,False,,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Day_of_the_Figurines"" title=""Da...",1430680000.0,,,,,2021-10-08 00:51:50


In [243]:
df[(df.user == 'Prairie Astronomer')&(df.timestamp == 1633653280)]

Unnamed: 0,$schema,meta,type,namespace,title,comment,timestamp,user,bot,log_id,log_type,log_action,log_params,log_action_comment,server_url,server_name,server_script_path,wiki,parsedcomment,id,minor,patrolled,length,revision,timestamp_parsed
30948,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Talk:20...,edit,1,Talk:2021 Balochistan earthquake,"Assessment: banner shell, Disaster management ...",1633653280,Prairie Astronomer,False,,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"Assessment: banner shell, Disaster management ...",1430713000.0,False,,"{'old': 230, 'new': 293}","{'old': 1048758694, 'new': 1048790189}",2021-10-08 03:34:40
30954,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,categorize,14,Category:Mid-importance Disaster management ar...,[[:Talk:2021 Balochistan earthquake]] added to...,1633653280,Prairie Astronomer,False,,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Talk:2021_Balochistan_earthquak...",1430713000.0,,,,,2021-10-08 03:34:40
30955,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,categorize,14,Category:Low-importance Pakistan articles,[[:Talk:2021 Balochistan earthquake]] added to...,1633653280,Prairie Astronomer,False,,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Talk:2021_Balochistan_earthquak...",1430713000.0,,,,,2021-10-08 03:34:40
30956,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,categorize,14,Category:Unknown-importance Disaster managemen...,[[:Talk:2021 Balochistan earthquake]] removed ...,1633653280,Prairie Astronomer,False,,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Talk:2021_Balochistan_earthquak...",1430713000.0,,,,,2021-10-08 03:34:40
30957,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/Categor...,categorize,14,Category:Unknown-importance Pakistan articles,[[:Talk:2021 Balochistan earthquake]] removed ...,1633653280,Prairie Astronomer,False,,,,,,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Talk:2021_Balochistan_earthquak...",1430713000.0,,,,,2021-10-08 03:34:40


## Change type analysis

In [244]:
df.type.value_counts()

edit          19111
categorize     9958
log            5394
new            1535
142              15
Name: type, dtype: int64

In [302]:
df[['type', 'bot', 'minor']].value_counts()

type  bot    minor
edit  False  False    12118
      True   False     4161
      False  True      2226
new   False  False     1233
edit  True   True       606
new   True   False      302
dtype: int64

In [322]:
df_edits = df[df.type == 'edit']
df_edits.head(2)

Unnamed: 0,$schema,meta,type,namespace,title,comment,timestamp,user,bot,log_id,log_type,log_action,log_params,log_action_comment,server_url,server_name,server_script_path,wiki,parsedcomment,id,minor,patrolled,length,revision,timestamp_parsed
1,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q108828...,edit,0,Q108828321,/* wbsetreference-add:2| */ [[Property:P26]]: ...,1633640647,Quesotiotyo,False,,,,,,https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Ad...",1556634000.0,False,True,"{'old': 21609, 'new': 22654}","{'old': 1509204168, 'new': 1509204170}",2021-10-08 00:04:07
2,/mediawiki/recentchange/1.0.0,{'uri': 'https://ca.wikipedia.org/wiki/Tiriy%C...,edit,0,Tiriyó,/* Dialectes */,1633640646,Walden69,False,,,,,,https://ca.wikipedia.org,ca.wikipedia.org,/w,cawiki,"<span dir=""auto""><span class=""autocomment""><a ...",104135200.0,False,True,"{'old': 10137, 'new': 11828}","{'old': 28378881, 'new': 28378955}",2021-10-08 00:04:06


## Avg number of edits per user per timestamp (threshold of more than one change)

In [324]:
avg_changes_per_timestamp = df_edits[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda x: gd_dict[x])

np.random.seed(42)

# avg_changes_per_timestamp = avg_changes_per_timestamp[avg_changes_per_timestamp.changes_per_timestamp > 1.]

num = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].shape[0]
print(f'There are {num} bots')

bot_sample_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].sample(num)
not_bot_sample_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot].sample(num)

# sample_df = pd.concat([bot_sample_df, not_bot_sample_df])
sample_df = avg_changes_per_timestamp[avg_changes_per_timestamp.changes_per_timestamp > 1.]

chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Number of changes by user per timestamp with more than 1 change'
)

chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart_counts | chart_bots

There are 46 bots


In [325]:
df_edits[df_edits.user == 'Jamc2']

Unnamed: 0,$schema,meta,type,namespace,title,comment,timestamp,user,bot,log_id,log_type,log_action,log_params,log_action_comment,server_url,server_name,server_script_path,wiki,parsedcomment,id,minor,patrolled,length,revision,timestamp_parsed
145,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:Bridge piers of the old Border Counties R...,Copying from [[Category:United Kingdom photogr...,1633640682,Jamc2,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Copying from <a href=""/wiki/Category:United_Ki...",1784971000.0,True,True,"{'old': 6522, 'new': 6567}","{'old': 538920554, 'new': 596618286}",2021-10-08 00:04:42
146,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:Bridge pier of the old Border Counties Ra...,Copying from [[Category:United Kingdom photogr...,1633640682,Jamc2,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Copying from <a href=""/wiki/Category:United_Ki...",1784971000.0,True,True,"{'old': 6701, 'new': 6746}","{'old': 538920663, 'new': 596618279}",2021-10-08 00:04:42
147,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:Bridge pier of the old Border Counties Ra...,Copying from [[Category:United Kingdom photogr...,1633640682,Jamc2,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Copying from <a href=""/wiki/Category:United_Ki...",1784971000.0,True,True,"{'old': 6480, 'new': 6525}","{'old': 538920543, 'new': 596618281}",2021-10-08 00:04:42
149,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:Bridge piers of the old Border Counties R...,Copying from [[Category:United Kingdom photogr...,1633640682,Jamc2,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Copying from <a href=""/wiki/Category:United_Ki...",1784971000.0,True,True,"{'old': 6761, 'new': 6806}","{'old': 538920542, 'new': 596618289}",2021-10-08 00:04:42
153,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:Bridge carrying the A69 over the River Ty...,Copying from [[Category:United Kingdom photogr...,1633640682,Jamc2,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Copying from <a href=""/wiki/Category:United_Ki...",1784971000.0,True,True,"{'old': 6465, 'new': 6510}","{'old': 539055128, 'new': 596618278}",2021-10-08 00:04:42
155,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:Bridge piers of the old Border Counties R...,Copying from [[Category:United Kingdom photogr...,1633640682,Jamc2,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Copying from <a href=""/wiki/Category:United_Ki...",1784971000.0,True,True,"{'old': 6736, 'new': 6781}","{'old': 538920545, 'new': 596618287}",2021-10-08 00:04:42
156,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:Bridge piers of the old Border Counties R...,Copying from [[Category:United Kingdom photogr...,1633640682,Jamc2,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Copying from <a href=""/wiki/Category:United_Ki...",1784971000.0,True,True,"{'old': 6522, 'new': 6567}","{'old': 538920552, 'new': 596618288}",2021-10-08 00:04:42
162,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:Bridge carrying the A69 over the River Ty...,Copying from [[Category:United Kingdom photogr...,1633640682,Jamc2,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Copying from <a href=""/wiki/Category:United_Ki...",1784971000.0,True,True,"{'old': 6456, 'new': 6501}","{'old': 539055132, 'new': 596618285}",2021-10-08 00:04:42


In [320]:
sample_df[['changes_per_timestamp', 'bot']].groupby('bot').mean()

Unnamed: 0_level_0,changes_per_timestamp
bot,Unnamed: 1_level_1
False,1.095276
True,1.1634


In [321]:
sample_df.head()

Unnamed: 0,user,changes_per_timestamp,bot
1233,FuzzyBot,1.637255,True
858,AnomieBOT,1.0033,True
1872,RottenBot,1.000939,True
1973,SteinsplitterBot,1.169811,True
943,BotMultichillT,1.048507,True


In [318]:
temp = df_edits[['user', 'timestamp', 'bot']].value_counts().groupby('user').max().reset_index()
temp[temp.user == 'Raugeier']

Unnamed: 0,user,0
1823,Raugeier,22


In [319]:
df_edits[(df_edits.user == 'Raugeier')&(df_edits.timestamp == 1633642165)]

Unnamed: 0,$schema,meta,type,namespace,title,comment,timestamp,user,bot,log_id,log_type,log_action,log_params,log_action_comment,server_url,server_name,server_script_path,wiki,parsedcomment,id,minor,patrolled,length,revision,timestamp_parsed
4005,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:St. Nicolai in Gödringen -3351-Pano.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",1784986000.0,True,True,"{'old': 4128, 'new': 4059}","{'old': 593273218, 'new': 596627307}",2021-10-08 00:29:25
4006,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:St. Nicolai in Gödringen-4624-Pano.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",1784986000.0,True,True,"{'old': 2402, 'new': 2332}","{'old': 590529766, 'new': 596627314}",2021-10-08 00:29:25
4007,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:St. Nicolai in Gödringen-3384.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",1784986000.0,True,True,"{'old': 2109, 'new': 2039}","{'old': 593271182, 'new': 596627319}",2021-10-08 00:29:25
4008,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:St. Nicolai in Gödringen-3404.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",1784986000.0,True,True,"{'old': 2152, 'new': 2082}","{'old': 593271297, 'new': 596627308}",2021-10-08 00:29:25
4009,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:St. Nicolai in Gödringen-4620.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",1784986000.0,True,True,"{'old': 2183, 'new': 2113}","{'old': 591791253, 'new': 596627316}",2021-10-08 00:29:25
4010,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:St. Nicolai in Gödringen-3362.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",1784986000.0,True,True,"{'old': 2155, 'new': 2085}","{'old': 590529927, 'new': 596627317}",2021-10-08 00:29:25
4011,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:St. Nicolai in Gödringen-3259.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",1784986000.0,True,True,"{'old': 2154, 'new': 2084}","{'old': 593270776, 'new': 596627310}",2021-10-08 00:29:25
4012,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:St. Nicolai in Gödringen-3300.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",1784986000.0,True,True,"{'old': 2161, 'new': 2091}","{'old': 590530180, 'new': 596627312}",2021-10-08 00:29:25
4013,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:St. Nicolai in Gödringen-3327.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",1784986000.0,True,True,"{'old': 2153, 'new': 2083}","{'old': 590528880, 'new': 596627324}",2021-10-08 00:29:25
4015,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,edit,6,File:Ehemaliges Parrhaus (Gödringen) -3409.jpg,Removing from [[Category:Images from Lower Sax...,1633642165,Raugeier,False,,,,,,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"Removing from <a href=""/wiki/Category:Images_f...",1784986000.0,True,True,"{'old': 3813, 'new': 3743}","{'old': 592631375, 'new': 596627325}",2021-10-08 00:29:25


In [294]:
from datetime import datetime

df_edits['timestamp_parsed'] = df_edits.timestamp.map(lambda x: datetime.fromtimestamp(x))

avg_changes_per_timestamp = df_edits[['user', 'timestamp']].value_counts().groupby('user').max().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(lambda x: gd_dict[x])

np.random.seed(42)

avg_changes_per_timestamp = avg_changes_per_timestamp[avg_changes_per_timestamp.changes_per_timestamp > 1.]

num = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].shape[0]
print(f'There are {num} bots')

bot_sample_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot].sample(num)
not_bot_sample_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot].sample(num)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])

chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Max of changes by user per timestamp with more than 1 change'
)

chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart_counts | chart_bots

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_edits['timestamp_parsed'] = df_edits.timestamp.map(lambda x: datetime.fromtimestamp(x))


There are 11 bots


In [296]:
avg_changes_per_timestamp[avg_changes_per_timestamp.user == 'Raugeier']

Unnamed: 0,user,changes_per_timestamp,bot
1823,Raugeier,22,False
