In [None]:
import logging
# Configure the root logger: show INFO and above, formatted as "<LEVEL> - <message>".
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')

In [None]:
import xml.etree.ElementTree as ET
import json
import pandas as pd

# Timestamp suffix of the Wikibase World XML export to analyse.
# exportdate = '20250216093424'
exportdate = '20250411161808'

filename = 'Wikibase+World-' + exportdate + '.xml'
tree = ET.parse(filename)

root = tree.getroot()

# All elements of the dump are looked up with this namespace prefix (Clark notation);
# keep it in one place instead of repeating the literal in every lookup.
MW_NS = '{http://www.mediawiki.org/xml/export-0.11/}'
ITEM_PREFIX = 'Item:'

# Maps an item id (the page title without the "Item:" prefix, e.g. 'Q386')
# to {"data": <JSON parsed from the page's revision text>}.
item_map = {}

for page in root.findall(MW_NS + 'page'):
    title = page.findtext(MW_NS + 'title') or ''
    if not title.startswith(ITEM_PREFIX):
        continue
    # Strip only the leading prefix (str.replace would remove every occurrence).
    item_key = title[len(ITEM_PREFIX):]
    text = page.findtext(MW_NS + 'revision/' + MW_NS + 'text')
    if not text:
        # json.loads(None) / json.loads('') would abort the whole load; skip and report instead.
        logging.warning(f"Skipping {title}: revision has no text")
        continue
    item_map[item_key] = {"data": json.loads(text)}

# print(item_map['Q386'])

def get_statements(data, key):
    """Collect the plain values of all statements an entity has for one property.

    Args:
        data: Parsed entity JSON (a dict). Its ``'claims'`` entry, when present,
            is looked up by property id.
        key: Property id to extract, e.g. ``'P3'``.

    Returns:
        ``{}`` when the entity has no claims for ``key``; otherwise
        ``{key: [values...]}`` where each value depends on the datavalue type:

        * ``string`` -> the string itself
        * ``wikibase-entityid`` -> the referenced entity id
        * ``time`` -> the result of ``pd.to_datetime`` (a ``00`` month or day is
          replaced by ``01`` first so the value is parsable)
        * ``quantity`` -> ``int`` when the amount is integral, else ``float``

        Claims whose main snak is ``somevalue``/``novalue`` are skipped, so the
        list may be empty. Other datavalue types are reported and skipped.
    """
    claims = data.get('claims', {})
    statements = {}
    if key not in claims:
        return statements

    values = statements[key] = []
    for claim in claims[key]:
        mainsnak = claim['mainsnak']
        # Filter out mainsnak -> snaktype somevalue or novalue (they carry no datavalue)
        if mainsnak['snaktype'] in ('somevalue', 'novalue'):
            continue
        datavalue = mainsnak['datavalue']
        value_type = datavalue['type']
        value = datavalue['value']
        if value_type == 'string':
            values.append(value)
        elif value_type == 'wikibase-entityid':
            values.append(value['id'])
        elif value_type == 'time':
            time_value = value['time']
            if time_value.startswith('+'):
                time_value = time_value[1:]
            # If the day is 00, set it to 01 so it is parsable
            if time_value[8:10] == '00':
                time_value = time_value[:8] + '01' + time_value[10:]
            # If month is 00, set it to 01 so it is parsable
            if time_value[5:7] == '00':
                time_value = time_value[:5] + '01' + time_value[7:]
            values.append(pd.to_datetime(time_value))
        elif value_type == 'quantity':
            amount = value['amount']
            # Amounts are strings such as "+12"; int() rejects decimal amounts
            # like "+1.5", so fall back to float instead of raising ValueError.
            try:
                values.append(int(amount))
            except ValueError:
                values.append(float(amount))
        else:
            print('Unknown type')
    return statements

# Copy selected statement values from each entity's JSON onto its item_map entry.
# (field name stored on the entry, property id it is read from), in insertion order.
field_properties = (
    ("url", 'P1'),
    ("host", 'P2'),
    ("instance", 'P3'),
    ("start_date", 'P5'),
    ("status", 'P13'),
    ("links_to", 'P55'),
    ("links_from", 'P56'),
    ("version", 'P57'),
    ("properties", 'P58'),
    ("edits", 'P59'),
    ("users", 'P60'),
    ("users_active", 'P61'),
    ("pages", 'P62'),
    ("highest_item", 'P67'),
)

for key, entry in item_map.items():
    data = entry['data']
    # Entities without any P3 claim get none of the fields below
    if 'P3' not in data.get('claims', {}):
        continue
    # Entities whose P3 values do not include Q10 additionally record them as "instance_of"
    p3 = get_statements(data, 'P3').get('P3', [])
    if 'Q10' not in p3:
        entry["instance_of"] = p3
    # TODO some of these statements we expect to only have 1 value? so dont get a list?
    # Every field is always set; it is an empty list when there is no usable statement.
    for field, prop in field_properties:
        entry[field] = get_statements(data, prop).get(prop, [])

# Attach the English label and description to every item
for key, existing in item_map.items():
    data = existing['data']
    labels = data.get('labels', {})
    descriptions = data.get('descriptions', {})

    # Rebuild the entry so "label" is its first key; the item id is the fallback label
    entry = {"label": key, **existing}
    if labels:
        entry["label"] = labels.get('en', {}).get('value', key)
        # logging.info(f"Updated label for {key}: {entry['label']}")

    # Missing/empty descriptions, or no English one, give an empty string
    entry["description"] = descriptions.get('en', {}).get('value', '') if descriptions else ''

    item_map[key] = entry

# Resolve the first host id of each item to that host item's label
for entry in item_map.values():
    host = entry.get('host')
    if not host:
        # No host field, or an empty host list: leave the entry without a host_label
        continue
    host_id = host[0]
    # Fall back to the raw id when the host item is not in the map (or has no label)
    entry["host_label"] = item_map.get(host_id, {}).get('label', host_id)
    # logging.info(f"Updated host_label for {host_id}: {entry['host_label']}")

def _is_populated_wikibase(entry):
    """Return True when ``entry`` should be kept for the analysis.

    An entry is kept when all of the following hold:
      * its ``instance`` values contain ``'Q10'`` (Wikibase site),
      * its ``properties`` list is not empty, and
      * the first ``properties`` value is not 0 (as this is probably a wiki
        with wikibase installed, but not data).
    """
    properties = entry.get('properties', [])
    return 'Q10' in entry.get('instance', []) and bool(properties) and properties[0] != 0


# Apply all three conditions in a single pass over the map
item_map = {k: v for k, v in item_map.items() if _is_populated_wikibase(v)}

# Spot-check one entry. Use .get() so the cell still runs (printing None) when
# Q384 was filtered out above or is not part of this export.
print(item_map.get('Q384'))

# print how many there are
print(len(item_map))

In [None]:
# Look at all wikibases, by their status
# Q54 online
# Q57 offline permanently
# Q72 offline indefinitely
statusToString = {
    'Q54': 'online',
    'Q57': 'offline permanently',
    'Q72': 'offline indefinitely'
}

# Tally the first status value of every item that has one
status_map = {}
for entry in item_map.values():
    statuses = entry.get('status', [])
    if not statuses:
        continue
    first_status = statuses[0]
    status_map[first_status] = status_map.get(first_status, 0) + 1
with_status = sum(status_map.values())

# convert the map keys (ids without a known name are kept as-is)
status_map = {statusToString.get(status_id, status_id): count for status_id, count in status_map.items()}
# Items without any status value are reported as 'unknown'
status_map['unknown'] = len(item_map) - with_status
print(status_map)

In [None]:
# At this point, remove everything that isnt online, or unknown.
# Items with no status value are assumed to be online (Q54). The extraction step
# stores "status" on every processed item, possibly as an empty list, so a
# `.get('status', default)` default would never apply and `[][0]` would raise
# IndexError; fall back explicitly whenever the list is missing OR empty.
ONLINE_STATUS = 'Q54'
item_map = {
    k: v
    for k, v in item_map.items()
    if (v.get('status') or [ONLINE_STATUS])[0] in [ONLINE_STATUS, 'unknown']
}

In [None]:
# Supplement with more info just for the graph

# Define specific colors for certain host (using host, not host_label)
specific_colors = {
    'Q4': '#F00000',  # Wikimedia Foundation
    'Q5': '#5be971',  # independently hosted Wikibase
    'Q6': '#f11fbe',  # Wikimedia Cloud Services
    'Q7': '#6283ca',  # The Wikibase Consultancy
    'Q8': '#ADD8E6',  # Wikibase Cloud
    'Q117': '#ff6347',  # WBStack
    'Q118': '#5cd45b',  # Miraheze
    'Q322': '#ffa500',  # WikiTide
    'Q323': '#8a2be2',  # WikiForge
    'Q1434': '#ff69b4',  # wikibase-docker
}

# Give every item a "graph_group" (legend group) and a "graph_color".
# Items whose first host has a colour above are grouped by host_label;
# everything else lands in the grey 'unknown' group.
for entry in item_map.values():
    host = entry.get('host', [])
    has_known_host = bool(host) and host[0] in specific_colors
    if has_known_host:
        group = entry.get('host_label', 'unknown')
        color = specific_colors[host[0]]
    else:
        group = 'unknown'
        color = '#D3D3D3'
    entry.update(graph_group=group, graph_color=color)
    if host and not has_known_host:
        # A host is set but has no colour yet -- you might want to add one above
        print(f"Unknown host for {host}")

In [None]:
# Export the item_map to a file (that could easily be loaded back up)
def convert_timestamps(obj):
    """``json.dump`` fallback serializer: turn a ``pd.Timestamp`` into its ISO 8601 string.

    Raises:
        TypeError: for any object that is not a ``pd.Timestamp``.
    """
    if not isinstance(obj, pd.Timestamp):
        raise TypeError("Type not serializable")
    return obj.isoformat()

# Write the whole map as indented JSON; pd.Timestamp values go through convert_timestamps
export_path = 'wikibase_world_' + exportdate + '.json'
with open(export_path, 'w') as f:
    json.dump(item_map, f, indent=4, default=convert_timestamps)
    logging.info(f"Exported {len(item_map)} items to {export_path}")

# Function that can load from a file to a map too
# def load_item_map(filename):
#     with open(filename, 'r') as f:
#         return json.load(f)

In [None]:
import plotly.graph_objects as go
import networkx as nx
import numpy as np

# Undirected graph that the following cell populates with nodes and edges.
# NOTE: re-running that cell without re-running this one adds to the existing graph.
G = nx.Graph()

In [None]:
# Make a filtered list for the graph
def _too_small_for_graph(entry):
    """Return True when an entry is dropped from the graph for being too small.

    Dropped when its first highest_item value is non-zero and below 250, or
    when its first pages value is below 250. Missing or empty lists never
    cause a drop.
    """
    highest_item = entry.get('highest_item')
    if highest_item and highest_item[0] != 0 and highest_item[0] < 250:
        return True
    pages = entry.get('pages')
    return bool(pages) and pages[0] < 250


# New dict that shares its entry objects with item_map; item_map itself is untouched
item_map_f = {key: entry for key, entry in item_map.items() if not _too_small_for_graph(entry)}

# Add every remaining item as a node, plus one edge per link.
# The link direction is recorded in the edge's 'direction' attribute.
for key, entry in item_map_f.items():
    G.add_node(key)
    # Links going out of this item
    for target in entry.get('links_to', []):
        G.add_edge(key, target, direction='to')
    # Links coming in to this item
    for source in entry.get('links_from', []):
        G.add_edge(source, key, direction='from')

# Calculate the positions of the nodes.
# spring_layout starts from random positions; pass a fixed seed so that
# re-running the notebook reproduces the same figure.
LAYOUT_SEED = 42
pos = nx.spring_layout(G, k=0.5, seed=LAYOUT_SEED)  # Increase the value of k to spread out the nodes more

# Push nodes with no edges outward slightly
for node in G.nodes():
    if G.degree(node) == 0:
        pos[node] *= 1.1  # Push the node outward by 10%

# Create the edge trace: all edges share one line trace, with a None after
# each pair of endpoints so consecutive edges are not joined together.
x_edges, y_edges = [], []
for source, target in G.edges():
    start, end = pos[source], pos[target]
    x_edges += [start[0], end[0], None]
    y_edges += [start[1], end[1], None]

edge_trace = go.Scatter(
    x=x_edges,
    y=y_edges,
    mode='lines',
    line={'width': 0.3, 'color': '#888'},
    hoverinfo='none',
    showlegend=False,
)

def _first_or(values, default):
    """Return ``values[0]`` when ``values`` is a non-empty list, else ``default``."""
    return values[0] if values else default


def _marker_size(count):
    """Map a count to a marker size.

    Missing, zero or negative counts give 4; 1-9 give ``count + 5``;
    10-99 give 20; 100-999 give 30; 1000 and above give 40.
    """
    if not count or count <= 0:
        return 4
    if count < 10:
        return count + 5
    if count < 100:
        return 20
    if count < 1000:
        return 30
    return 40


# Create a trace for each group
group_traces = {}
group_counts = {}
for node in G.nodes():
    # Nodes that only exist as link targets/sources are not in item_map_f; skip them
    if node not in item_map_f:
        continue
    info = item_map_f[node]
    group = info['graph_group']
    if group not in group_traces:
        group_traces[group] = {
            'x': [],
            'y': [],
            'hovertext': [],
            'color': info['graph_color'],
            'size': []
        }
        group_counts[group] = 0
    trace = group_traces[group]
    trace['x'].append(pos[node][0])
    trace['y'].append(pos[node][1])

    active_users = _first_or(info.get('users_active'), 0)
    properties = _first_or(info.get('properties'), 0)
    hover_text = (
        f"{info['label']}<br>"
        f"URL: {_first_or(info.get('url'), '')}<br>"
        f"Host: {info.get('host_label') or ''}<br>"
        f"Version: {_first_or(info.get('version'), '')}<br>"
        f"Start Date: {_first_or(info.get('start_date'), '')}<br>"
        f"Active Users: {active_users}<br>"
        f"Users: {_first_or(info.get('users'), '')}<br>"
        f"Properties: {properties}<br>"
        f"Edits: {_first_or(info.get('edits'), '')}<br>"
        f"Pages: {_first_or(info.get('pages'), '')}"
    )
    trace['hovertext'].append(hover_text)

    # Markers are sized by active users; use _marker_size(properties) to size by properties instead
    trace['size'].append(_marker_size(active_users))
    group_counts[group] += 1

# Create the node traces: one marker trace per group, labelled "<group> (<count>)" in the legend
node_traces = [
    go.Scatter(
        x=trace['x'],
        y=trace['y'],
        mode='markers',
        hovertext=trace['hovertext'],
        hoverinfo='text',
        showlegend=True,
        legendgrouptitle=dict(text="By host"),
        name=f"{group} ({group_counts[group]})",
        marker=dict(
            showscale=False,
            size=trace['size'],
            color=trace['color'],
            line_width=1,
            line_color='black',
        ),
    )
    for group, trace in group_traces.items()
]

# Assemble the figure: the edge lines first, then one marker trace per group on top.
fig = go.Figure(
    data=[edge_trace] + node_traces,
    layout=go.Layout(
        # `titlefont` is a deprecated alias that newer plotly releases reject;
        # `title.font` is the supported spelling.
        title=dict(
            text='Wikibases, and links, sized by # of active users',
            font=dict(size=16),
        ),
        showlegend=True,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        annotations=[dict(
            text="",
            showarrow=False,
            xref="paper", yref="paper")],
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=False, zeroline=False),
        height=800,  # Fixed height in pixels so the graph has room to spread out
    ),
)

# Show the plot
fig.show()

# save the figure as html
fig.write_html("wikibase_graph.html", include_plotlyjs='cdn')

In [None]:
from plotly.subplots import make_subplots

# Prepare data for the pie charts.
# group_counts is filled by the graph cell, so it only counts items that were drawn there.
labels = [f"{group} ({count})" for group, count in group_counts.items()]
values = list(group_counts.values())

# Same data without the Wikibase Cloud host
filtered_group_counts = {
    group: count for group, count in group_counts.items() if group != 'Wikibase Cloud'
}
filtered_labels = [f"{group} ({count})" for group, count in filtered_group_counts.items()]
filtered_values = list(filtered_group_counts.values())

# Two side-by-side pie charts: all hosts on the left, without Wikibase Cloud on the right
fig = make_subplots(rows=1, cols=2, specs=[[dict(type='domain'), dict(type='domain')]])
fig.add_trace(go.Pie(labels=labels, values=values, name="All Hosts"), row=1, col=1)
fig.add_trace(
    go.Pie(labels=filtered_labels, values=filtered_values, name="Excluding Wikibase Cloud"),
    row=1, col=2,
)

# Title, size and a caption above each pie
fig.update_layout(
    title_text="Number of wikibases by host",
    height=500,
    width=800,
    annotations=[
        dict(text='All Hosts', x=0.18, y=1, font_size=12, showarrow=False),
        dict(text='Excluding Wikibase Cloud', x=0.9, y=1, font_size=12, showarrow=False),
    ],
)

fig.show()
fig.write_html("wikibase_pie_host_wikibases.html", include_plotlyjs='cdn')



In [None]:
from plotly.subplots import make_subplots

# Calculate the number of active users per host
active_users_by_host = {}
for key, value in item_map.items():
    host_label = value.get('host_label', 'unknown')
    # "users_active" can be present but empty, in which case a `.get(..., [0])`
    # default is not used and `[][0]` raises IndexError; treat missing/empty as 0.
    active_users_count = (value.get('users_active') or [0])[0]
    active_users_by_host[host_label] = active_users_by_host.get(host_label, 0) + active_users_count

# Update labels to include the total number of active users per host
labels = [f"{k} ({v})" for k, v in active_users_by_host.items()]
values = list(active_users_by_host.values())

# Filter out the Wikimedia Foundation
filtered_active_users_by_host = {k: v for k, v in active_users_by_host.items() if k != 'Wikimedia Foundation'}

filtered_labels = [f"{k} ({v})" for k, v in filtered_active_users_by_host.items()]
filtered_values = list(filtered_active_users_by_host.values())

# Create subplots
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])

# Add the first pie chart
fig.add_trace(go.Pie(labels=labels, values=values, name="All Hosts"), 1, 1)

# Add the second pie chart
fig.add_trace(go.Pie(labels=filtered_labels, values=filtered_values, name="Excluding Wikimedia Foundation"), 1, 2)

# Update layout
fig.update_layout(
    title_text="Number of active users by host",
    height=500,
    width=800,
    annotations=[dict(text='All Hosts', x=0.18, y=1, font_size=12, showarrow=False),
                 dict(text='Excluding WMF', x=0.82, y=1, font_size=12, showarrow=False)]
)

fig.show()
fig.write_html("wikibase_pie_host_active_users.html", include_plotlyjs='cdn')


In [None]:
# Sum the first "properties" value of every item per host label
properties_by_host = {}
for entry in item_map.values():
    host_label = entry.get('host_label', 'unknown')
    properties_count = entry.get('properties', [0])[0]
    properties_by_host[host_label] = properties_by_host.get(host_label, 0) + properties_count

# Same totals without the Wikibase Cloud host
filtered_properties_by_host = {
    host: total for host, total in properties_by_host.items() if host != 'Wikibase Cloud'
}

# Labels carry the total in brackets, e.g. "<host> (<total>)"
labels = [f"{host} ({total})" for host, total in properties_by_host.items()]
values = list(properties_by_host.values())
filtered_labels = [f"{host} ({total})" for host, total in filtered_properties_by_host.items()]
filtered_values = list(filtered_properties_by_host.values())

# Two side-by-side pie charts: all hosts on the left, without Wikibase Cloud on the right
fig = make_subplots(rows=1, cols=2, specs=[[dict(type='domain'), dict(type='domain')]])
fig.add_trace(go.Pie(labels=labels, values=values, name="All Hosts"), row=1, col=1)
fig.add_trace(
    go.Pie(labels=filtered_labels, values=filtered_values, name="Excluding Wikibase Cloud"),
    row=1, col=2,
)

# Title, size and a caption above each pie
fig.update_layout(
    title_text="Number of properties by host",
    height=500,
    width=800,
    annotations=[
        dict(text='All Hosts', x=0.18, y=1, font_size=12, showarrow=False),
        dict(text='Excluding Wikibase Cloud', x=0.9, y=1, font_size=12, showarrow=False),
    ],
)

fig.show()
fig.write_html("wikibase_pie_host_properties.html", include_plotlyjs='cdn')


In [None]:
from plotly.subplots import make_subplots

def _group_by_major_minor(counts):
    """Sum version counts by the first two dot-separated parts (e.g. '1.39.7' -> '1.39')."""
    grouped = {}
    for version, count in counts.items():
        grouped_version = '.'.join(version.split('.')[:2])
        grouped[grouped_version] = grouped.get(grouped_version, 0) + count
    return grouped


# Version that is left out of the second and third pie (the most common one)
EXCLUDED_VERSION = '1.39.7'

# Extract the version information from item_map
version_counts = {}
for key, value in item_map.items():
    # "version" can be present but empty, in which case a `.get(..., default)`
    # default is not used and `[][0]` raises IndexError; count those as 'unknown'.
    version = (value.get('version') or ['unknown'])[0]
    version_counts[version] = version_counts.get(version, 0) + 1

# Prepare data for the pie charts
versions = list(version_counts.keys())
counts = list(version_counts.values())

# Filter out the most common version
filtered_version_counts = {k: v for k, v in version_counts.items() if k != EXCLUDED_VERSION}
filtered_versions = list(filtered_version_counts.keys())
filtered_counts = list(filtered_version_counts.values())

# Group versions by the first two parts for both filtered and unfiltered data
grouped_version_counts = _group_by_major_minor(version_counts)
grouped_version_counts_filtered = _group_by_major_minor(filtered_version_counts)

grouped_versions = list(grouped_version_counts.keys())
grouped_counts = list(grouped_version_counts.values())
grouped_versions_filtered = list(grouped_version_counts_filtered.keys())
grouped_counts_filtered = list(grouped_version_counts_filtered.values())

# Create subplots
fig = make_subplots(rows=1, cols=3, specs=[[{'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}]])

# Add the first pie chart
fig.add_trace(go.Pie(labels=versions, values=counts, name="All Versions", showlegend=False, textinfo='none'), 1, 1)

# Add the second pie chart
fig.add_trace(go.Pie(labels=filtered_versions, values=filtered_counts, name=f"Excluding {EXCLUDED_VERSION}", showlegend=False, textinfo='none'), 1, 2)

# Add the third pie chart. Labels and values must both come from the FILTERED
# grouping; pairing the unfiltered labels with the filtered values mislabels
# slices whenever the two groupings differ in keys or order.
fig.add_trace(go.Pie(labels=grouped_versions_filtered, values=grouped_counts_filtered, name="Grouped Versions", showlegend=False, textinfo='none'), 1, 3)

# Update layout
fig.update_layout(
    title_text="Spread of Wikibases by Version",
    height=500,
    width=800,
    annotations=[dict(text='All Versions', x=0.12, y=1.05, font_size=12, showarrow=False),
                 dict(text=f'Excluding {EXCLUDED_VERSION}', x=0.5, y=1.05, font_size=12, showarrow=False),
                 dict(text=f'Excluding {EXCLUDED_VERSION}, Grouped as x.xx', x=0.95, y=1.05, font_size=12, showarrow=False)]
)

fig.show()
fig.write_html("wikibase_pie_versions.html", include_plotlyjs='cdn')

In [None]:
# Table of the grouped versions and their counts (this grouping still includes 1.39.7)
df = pd.DataFrame({
    'Version': list(grouped_version_counts.keys()),
    'Count': list(grouped_version_counts.values()),
})
version_table = go.Table(
    header=dict(values=['Version', 'Count']),
    cells=dict(values=[df[column] for column in ('Version', 'Count')]),
)
fig = go.Figure(data=[version_table])
fig.show()

In [None]:
def _first_count(values):
    """Return ``values[0]`` for a non-empty list, else 0.

    The per-item fields can be present but empty, so ``.get(field, [0])[0]``
    would raise IndexError; this treats missing and empty the same way.
    """
    return values[0] if values else 0


# Item ids that the messages below treat as the Wikimedia Foundation wikibases
WMF_WIKIBASES = {'Q1', 'Q2'}

# Each list narrows the previous one, mirroring the wording of the printed messages
with_properties = [v for v in item_map.values() if _first_count(v.get('properties')) > 0]
with_pages = [v for v in with_properties if _first_count(v.get('pages')) > 10]
linking = [v for v in item_map.values() if v.get('links_to')]

print("Wikibases with properties: ", len(with_properties))
print("Wikibases with properties, and more than 10 pages: ", len(with_pages))
print("Wikibases with properties, and more than 10 pages, and 1 or more active users: ", sum(1 for v in with_pages if _first_count(v.get('users_active')) >= 1))
print("Wikibases with properties, and more than 10 pages, and 2 or more active users: ", sum(1 for v in with_pages if _first_count(v.get('users_active')) >= 2))
print("Wikibases that link to other wikibases: ", len(linking))
# None of the links point at a WMF wikibase
print("Wikibases that only link to non Wikimedia Foundation wikibases: ", sum(1 for v in linking if not set(v['links_to']) & WMF_WIKIBASES))
# for k, v in item_map.items():
#     if v.get('links_to') and not set(v.get('links_to')).intersection({'Q1', 'Q2'}):
#         print(" - ", k, v.get('label', k), v.get('links_to'))
# At least one link points at something other than a WMF wikibase
print("Wikibases that link to other wikibases, excluding Wikimedia Foundation: ", sum(1 for v in linking if set(v['links_to']) - WMF_WIKIBASES))
# for k, v in item_map.items():
#     if v.get('links_to') and len(set(v.get('links_to')) - {'Q1', 'Q2'}) > 0:
#         print(" - ", k, v.get('label', k), v.get('links_to'))

In [None]:
# Random suite ramblings and estimates below here...

In [None]:
import csv

# show how many different versions there are across all wikibases
versions = set()
for key in item_map:
    versions.update(item_map[key].get('version', []))
print("Different versions: ", len(versions))
# Convert the set to a sorted list (plain string order, so '1.10' sorts before '1.9')
sorted_versions = sorted(versions)

# Display the sorted versions. Only the LAST expression of a cell is rendered,
# so a bare `sorted_versions` here would show nothing; print it explicitly.
print(sorted_versions)

# and output how many wikibases are using each version
version_counts = {}
for key in item_map:
    for version in item_map[key].get('version', []):
        version_counts[version] = version_counts.get(version, 0) + 1

# Display the version counts (last expression of the cell, so it is rendered)
version_counts