# Set-up

In [3]:
import altair as alt
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import time

from altair_saver import save
from datetime import datetime
from sseclient import SSEClient as EventSource

# Lift pandas' display limits so wide / long frames are shown in full.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

# Data collection

In [None]:
# Collect a user-based sample of the Wikimedia recent-changes stream for a
# fixed time window and dump it to ../data/dataset_<start time>.csv.
import hashlib  # cell-local import: only needed for the stable bucketing below

os.makedirs('../data', exist_ok=True)

url = 'https://stream.wikimedia.org/v2/stream/recentchange'
dataset = []
events_counter = 0     # parsed 'message' events seen, whether sampled or not
time_threshold = 60.0  # length of the collection window, in minutes
t_0  = time.time()

for event in EventSource(url): # start streaming
    # Compare elapsed seconds with the window directly. The previous
    # `elapsed // 60 > time_threshold` test only fired once 61 whole minutes
    # had passed, and it was never evaluated for non-message or unparsable
    # events.
    if time.time() - t_0 > time_threshold * 60:
        break

    if event.event != 'message':
        continue

    try:
        change = json.loads(event.data)
    except ValueError:
        continue  # payload is not valid JSON; skip it

    events_counter += 1

    user = change.get('user')
    if user is None:
        continue  # no user to bucket on (previously a KeyError)

    # Keep the users that fall into 2 of 10 hash buckets. A digest is used
    # instead of the builtin hash() because str hashes are salted per
    # interpreter process, which made the selected users differ between runs.
    digest = hashlib.sha256(str(user).encode('utf-8')).digest()
    user_hash = int.from_bytes(digest[:8], 'big')

    if user_hash % 10 < 2: # filter
        dataset.append(change)

print(f'kept {len(dataset)} of {events_counter} events')

df = pd.DataFrame(dataset)
df.to_csv(f'../data/dataset_{t_0}.csv')

## Import after preparation

In [None]:
# Load the prepared dataset: a gzip-compressed, tab-separated file on GitHub.
df = pd.read_csv(
    'https://raw.githubusercontent.com/madmalewolf/stat-open-/main/all_data.tsv.gz',
    compression='gzip',
    sep='\t',
)
print(f'df.shape = {df.shape}')
df.head(n=2)

# Data exploration

## Number of changes done by user

In [None]:
# Count how many changes each user made, then chart a sample of up to 400
# users next to the bot / non-bot split of that same sample.
os.makedirs('../charts', exist_ok=True) # create dir for charts

# user -> bot flag lookup, reused by the later cells. zip() walks the two
# columns directly instead of materialising a Series per row via iterrows();
# for a user with several rows the last row still wins, as before.
gd_dict = dict(zip(df['user'], df['bot']))

df_counts = df.user.value_counts().reset_index()
df_counts.columns = ['user', 'changes']
df_counts['bot'] = df_counts.user.map(gd_dict)

# Seed before sampling so the same users are drawn on every run.
np.random.seed(42)
sample_df = df_counts.sample(min(df_counts.shape[0], 400))

# Left panel: one step per sampled user, coloured by the bot flag.
chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user')]
).properties(
    width = 600,
    height = 400,
    title = 'Number of changes by user'
)

# Right panel: number of sampled users per bot flag. The axis labels are
# hidden, so the tooltip carries the flag as well (as in the later cells).
chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'),
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart = chart_counts | chart_bots

save(chart, '../charts/total_number_changes.html')
chart

## Avg number of changes done by user per timestamp

In [None]:
# For every user, average the number of changes that share one `timestamp`
# value, then chart a sample of up to 400 users. The raw `timestamp` column is
# grouped on as-is; no datetime parsing is needed for counting.
avg_changes_per_timestamp = df[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
# gd_dict (user -> bot flag) comes from the "Number of changes done by user" cell.
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(gd_dict)

# Seed before sampling so the same users are drawn on every run.
np.random.seed(42)
sample_df = avg_changes_per_timestamp.sample(min(avg_changes_per_timestamp.shape[0], 400))

# Left panel: one step per sampled user, coloured by the bot flag.
chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Avg number of changes by user per timestamp'
)

# Right panel: number of sampled users per bot flag. The axis labels are
# hidden, so the tooltip carries the flag as well (as in the later cells).
chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'),
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart = chart_counts | chart_bots

save(chart, '../charts/avg_number_changes_per_timestamp.html')
chart

## Avg number of changes done by user per timestamp (balanced by bot/not bot)

In [None]:
# Same per-user average as the previous section, but charted on a sample that
# holds equally many bot and non-bot users.
avg_changes_per_timestamp = df[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
# gd_dict (user -> bot flag) comes from the "Number of changes done by user" cell.
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(gd_dict)

bots_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot]
non_bots_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot]

num = bots_df.shape[0]
# Cap both classes at the smaller one: asking sample() for more rows than a
# frame holds raises ValueError, which happened whenever non-bots were the
# minority. When bots are the minority this equals `num`, as before.
n_per_class = min(num, non_bots_df.shape[0])

# Seed before sampling so the same users are drawn on every run.
np.random.seed(42)
bot_sample_df = bots_df.sample(n_per_class)
not_bot_sample_df = non_bots_df.sample(n_per_class)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])

# Left panel: one step per sampled user, coloured by the bot flag.
chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Avg number of changes by user per timestamp'
)

# Right panel: number of sampled users per bot flag.
chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart = chart_counts | chart_bots

save(chart, '../charts/avg_number_changes_per_timestamp_balanced.html')
chart

In [None]:
# Mean changes-per-timestamp of the sampled users, split by the bot flag.
sample_df.groupby('bot')[['changes_per_timestamp']].mean()

## Avg number of changes done by user per timestamp (balanced by bot/not bot) having more than 1 change

In [None]:
# Per-user average number of changes per timestamp, restricted to users whose
# average is above 1, charted on a sample balanced between bots and non-bots.
avg_changes_per_timestamp = df[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
# gd_dict (user -> bot flag) comes from the "Number of changes done by user" cell.
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(gd_dict)

avg_changes_per_timestamp = avg_changes_per_timestamp[avg_changes_per_timestamp.changes_per_timestamp > 1.]

bots_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot]
non_bots_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot]

num = bots_df.shape[0]
print(f'There are {num} bots')

# Cap both classes at the smaller one: asking sample() for more rows than a
# frame holds raises ValueError, which happened whenever fewer non-bots than
# bots passed the filter. When bots are the minority this equals `num`.
n_per_class = min(num, non_bots_df.shape[0])

# Seed before sampling so the same users are drawn on every run.
np.random.seed(42)
bot_sample_df = bots_df.sample(n_per_class)
not_bot_sample_df = non_bots_df.sample(n_per_class)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])

# Left panel: one step per sampled user, coloured by the bot flag.
chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Avg number of changes by user per timestamp with more than 1 change'
)

# Right panel: number of sampled users per bot flag.
chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart = chart_counts | chart_bots

save(chart, '../charts/avg_number_changes_per_timestamp_balanced_with_filter.html')
chart

In [None]:
# Mean changes-per-timestamp of the sampled users, split by the bot flag.
sample_df.groupby('bot')[['changes_per_timestamp']].mean()

In [None]:
# Show every change recorded for this user at this exact timestamp.
df.loc[(df['user'] == 'Phediuk') & (df['timestamp'] == 1633643510)]

In [None]:
# Show every change recorded for this user at this exact timestamp.
df.loc[(df['user'] == 'Prairie Astronomer') & (df['timestamp'] == 1633653280)]

## Change type analysis

In [None]:
# How many changes there are of each change type.
df['type'].value_counts()

In [None]:
# Frequency of every (type, bot, minor) combination in the dataset.
df.value_counts(subset=['type', 'bot', 'minor'])

In [None]:
# Keep only the rows whose change type is 'edit'; later cells work on this subset.
df_edits = df.loc[df['type'] == 'edit']
df_edits.head(n=2)

## Avg number of edits done by user per timestamp (balanced by bot/not bot) having more than 1 change

In [None]:
# Per-user average number of edits per timestamp (edit-type changes only),
# restricted to users whose average is above 1, charted on a sample balanced
# between bots and non-bots.
avg_changes_per_timestamp = df_edits[['user', 'timestamp']].value_counts().groupby('user').mean().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
# gd_dict (user -> bot flag) comes from the "Number of changes done by user" cell.
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(gd_dict)

avg_changes_per_timestamp = avg_changes_per_timestamp[avg_changes_per_timestamp.changes_per_timestamp > 1.]

bots_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot]
non_bots_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot]

num = bots_df.shape[0]
print(f'There are {num} bots')

# Cap both classes at the smaller one: asking sample() for more rows than a
# frame holds raises ValueError, which happened whenever fewer non-bots than
# bots passed the filter. When bots are the minority this equals `num`.
n_per_class = min(num, non_bots_df.shape[0])

# Seed before sampling so the same users are drawn on every run.
np.random.seed(42)
bot_sample_df = bots_df.sample(n_per_class)
not_bot_sample_df = non_bots_df.sample(n_per_class)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])

# Left panel: one step per sampled user, coloured by the bot flag.
chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Avg number of edits by user per timestamp with more than 1 change'
)

# Right panel: number of sampled users per bot flag.
chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart = chart_counts | chart_bots

save(chart, '../charts/avg_number_edits_per_timestamp_balanced_with_filter.html')
chart

In [None]:
# First few edits recorded for this user.
df_edits.loc[df_edits['user'] == 'Jamc2'].head()

In [None]:
# Mean edits-per-timestamp of the sampled users, split by the bot flag.
sample_df.groupby('bot')[['changes_per_timestamp']].mean()

## Max number of edits done by user per timestamp (balanced by bot/not bot) having more than 1 change

In [None]:
# Per-user maximum number of edits sharing one timestamp (edit-type changes
# only), restricted to users whose maximum is above 1, charted on a sample
# balanced between bots and non-bots.
# NOTE: the variable keeps the name `avg_changes_per_timestamp` used by the
# previous cells, but in this cell it holds per-user maxima, not averages.
avg_changes_per_timestamp = df_edits[['user', 'timestamp']].value_counts().groupby('user').max().reset_index()
avg_changes_per_timestamp.columns = ['user', 'changes_per_timestamp']
# gd_dict (user -> bot flag) comes from the "Number of changes done by user" cell.
avg_changes_per_timestamp['bot'] = avg_changes_per_timestamp.user.map(gd_dict)

avg_changes_per_timestamp = avg_changes_per_timestamp[avg_changes_per_timestamp.changes_per_timestamp > 1.]

bots_df = avg_changes_per_timestamp[avg_changes_per_timestamp.bot]
non_bots_df = avg_changes_per_timestamp[~avg_changes_per_timestamp.bot]

num = bots_df.shape[0]
print(f'There are {num} bots')

# Cap both classes at the smaller one: asking sample() for more rows than a
# frame holds raises ValueError, which happened whenever fewer non-bots than
# bots passed the filter. When bots are the minority this equals `num`.
n_per_class = min(num, non_bots_df.shape[0])

# Seed before sampling so the same users are drawn on every run.
np.random.seed(42)
bot_sample_df = bots_df.sample(n_per_class)
not_bot_sample_df = non_bots_df.sample(n_per_class)

sample_df = pd.concat([bot_sample_df, not_bot_sample_df])

# Left panel: one step per sampled user, coloured by the bot flag.
chart_counts = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('user', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('changes_per_timestamp:Q'),
    color = alt.Color('bot:N'),
    tooltip = [alt.Tooltip('user'), alt.Tooltip('changes_per_timestamp:Q')]
).properties(
    width = 600,
    height = 400,
    title = 'Max number of edits by user per timestamp with more than 1 change'
)

# Right panel: number of sampled users per bot flag.
chart_bots = alt.Chart(sample_df).mark_area(
    interpolate='step'
).encode(
    x = alt.X('bot:N', axis=alt.Axis(labels=False, ticks=False)),
    y = alt.Y('count()', title='count'),
    tooltip = [
        alt.Tooltip('count()', title='No of users'), 
        alt.Tooltip('bot')
    ]
).properties(
    width = 200,
    height = 400,
    title = 'Number of bots'
)

chart = chart_counts | chart_bots

save(chart, '../charts/max_number_edits_per_timestamp_balanced_with_filter.html')
chart

In [None]:
# Largest per-user value among the sampled users, split by the bot flag.
sample_df.groupby('bot')[['changes_per_timestamp']].max()

In [None]:
# Smallest per-user value among the sampled users, split by the bot flag.
sample_df.groupby('bot')[['changes_per_timestamp']].min()