In [65]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')

In [66]:
import xml.etree.ElementTree as ET
import json
import pandas as pd

tree = ET.parse('Wikibase+World-20250216093424.xml')

root = tree.getroot()

item_map = {}

for page in root.findall('{http://www.mediawiki.org/xml/export-0.11/}page'):
    title = page.find('{http://www.mediawiki.org/xml/export-0.11/}title').text
    if title.startswith('Item:'):
        item_key = title.replace('Item:', '')
        text = page.find('{http://www.mediawiki.org/xml/export-0.11/}revision').find('{http://www.mediawiki.org/xml/export-0.11/}text').text
        # item_map[item_key] = {"text": text}
        data = json.loads(text)
        item_map[item_key] = {"data": data}

print(item_map['Q386'])


{'data': {'type': 'item', 'id': 'Q386', 'labels': {'en': {'language': 'en', 'value': 'mythogram'}}, 'descriptions': [], 'aliases': {'en': [{'language': 'en', 'value': 'mythogram.wikibase.cloud'}]}, 'claims': {'P1': [{'mainsnak': {'snaktype': 'value', 'property': 'P1', 'hash': 'eb2ea55d71d68f51fa4ee8ae2096f1d1fc60b8a2', 'datavalue': {'value': 'https://mythogram.wikibase.cloud', 'type': 'string'}}, 'type': 'statement', 'id': 'Q386$4F8A363D-90F4-41CC-BB5D-9F028DCEEF86', 'rank': 'normal'}], 'P2': [{'mainsnak': {'snaktype': 'value', 'property': 'P2', 'hash': '158db2825a7eb0bb1bc5d9a53a8734e67868e01e', 'datavalue': {'value': {'entity-type': 'item', 'numeric-id': 8, 'id': 'Q8'}, 'type': 'wikibase-entityid'}}, 'type': 'statement', 'id': 'Q386$CB50B70F-A75B-48E6-86BF-8990E55DA842', 'rank': 'normal'}], 'P3': [{'mainsnak': {'snaktype': 'value', 'property': 'P3', 'hash': '10065a3bad46d2bf60504afdd276b581a0e8412f', 'datavalue': {'value': {'entity-type': 'item', 'numeric-id': 10, 'id': 'Q10'}, 'type

In [67]:
def get_statements(data, key):
    claims = data.get('claims', {})
    statements = {}
    if key in claims:
        statements[key] = []
        for claim in claims[key]:
            # Filter out mainsnak-> snaktype somevalue or novalue
            if claim['mainsnak']['snaktype'] == 'somevalue' or claim['mainsnak']['snaktype'] == 'novalue':
                continue
            if claim['mainsnak']['datavalue']['type'] == 'string':
                statements[key].append(claim['mainsnak']['datavalue']['value'])
            elif claim['mainsnak']['datavalue']['type'] == 'wikibase-entityid':
                statements[key].append(claim['mainsnak']['datavalue']['value']['id'])
            elif claim['mainsnak']['datavalue']['type'] == 'time':
                time_value = claim['mainsnak']['datavalue']['value']['time']
                if time_value.startswith('+'):
                    time_value = time_value[1:]
                # If the day is 00, set it to 01 so it is parsable
                if time_value[8:10] == '00':
                    time_value = time_value[:8] + '01' + time_value[10:]
                # If month is 00, set it to 01 so it is parsable
                if time_value[5:7] == '00':
                    time_value = time_value[:5] + '01' + time_value[7:]
                statements[key].append(pd.to_datetime(time_value))
            elif claim['mainsnak']['datavalue']['type'] == 'quantity':
                statements[key].append(int(claim['mainsnak']['datavalue']['value']['amount']))
            else:
                print('Unknown type')
    return statements

# Extract some statement values from the json
for key in item_map:
    data = item_map[key]['data']
    # Ignore things that are not P3 => Q10
    if 'P3' not in data.get('claims', {}):
        continue
    # Get the P3 value
    p3 = get_statements(data, 'P3').get('P3', [])
    if 'Q10' not in p3:
        item_map[key].update({"instance_of": p3})
    # TODO some of these statements we expect to only have 1 value? so dont get a list?
    item_map[key].update({"url": get_statements(data, 'P1').get('P1', [])})
    item_map[key].update({"host": get_statements(data, 'P2').get('P2', [])})
    item_map[key].update({"instance": get_statements(data, 'P3').get('P3', [])})
    item_map[key].update({"start_date": get_statements(data, 'P5').get('P5', [])})
    item_map[key].update({"links_to": get_statements(data, 'P55').get('P55', [])})
    item_map[key].update({"links_from": get_statements(data, 'P56').get('P56', [])})
    item_map[key].update({"version": get_statements(data, 'P57').get('P57', [])})
    item_map[key].update({"properties": get_statements(data, 'P58').get('P58', [])})
    item_map[key].update({"edits": get_statements(data, 'P59').get('P59', [])})
    item_map[key].update({"users": get_statements(data, 'P60').get('P60', [])})
    item_map[key].update({"users_active": get_statements(data, 'P61').get('P61', [])})
    item_map[key].update({"pages": get_statements(data, 'P62').get('P62', [])})

# Extract the en label and description of the items
for key in item_map:
    data = item_map[key]['data']
    labels = data.get('labels', {})
    item_map[key] = {"label": key, **item_map[key]}
    if labels:
        item_map[key].update({"label": labels.get('en', {}).get('value', key)})
    descriptions = data.get('descriptions', {})
    if descriptions:
        item_map[key].update({"description": descriptions.get('en', {}).get('value', '')})
    else:
        item_map[key].update({"description": ''})

# And then supplement some of the values with their labels..
for key in item_map:
    if 'host' in item_map[key]:
        host = item_map[key]['host']
        if host:
            host_label = item_map.get(host[0], {}).get('label', host[0])
            item_map[key].update({"host_label": host_label})
            # logging.info(f"Updated host_label for {key}: {host_label}")

# Remove everything that doesn't have instanceof containing the string `Q10` (Wikibase site)
item_map = {k: v for k, v in item_map.items() if 'Q10' in v.get('instance', [])}
# Remove everything that doesn't have anything in the properties list
item_map = {k: v for k, v in item_map.items() if v.get('properties', [])}
# And things that has properties[0] == 0 (As this is probably a wiki with wikibase installed, but not data)
item_map = {k: v for k, v in item_map.items() if v.get('properties', [0])[0] != 0}

print(item_map['Q384'])

# print how many there are
print(len(item_map))

{'label': 'Serbian Wikibase', 'data': {'type': 'item', 'id': 'Q384', 'labels': {'en': {'language': 'en', 'value': 'Serbian Wikibase'}}, 'descriptions': [], 'aliases': {'en': [{'language': 'en', 'value': 'serbian.wikibase.cloud'}]}, 'claims': {'P1': [{'mainsnak': {'snaktype': 'value', 'property': 'P1', 'hash': '04e64ee23f8c770d9a036c0a21e91f7471e26b66', 'datavalue': {'value': 'https://serbian.wikibase.cloud', 'type': 'string'}}, 'type': 'statement', 'id': 'Q384$9F15E830-8A29-4185-8D5F-736629F88346', 'rank': 'normal'}], 'P2': [{'mainsnak': {'snaktype': 'value', 'property': 'P2', 'hash': '158db2825a7eb0bb1bc5d9a53a8734e67868e01e', 'datavalue': {'value': {'entity-type': 'item', 'numeric-id': 8, 'id': 'Q8'}, 'type': 'wikibase-entityid'}}, 'type': 'statement', 'id': 'Q384$597EEE6D-20F8-4D57-9A23-2583FCF33301', 'rank': 'normal'}], 'P3': [{'mainsnak': {'snaktype': 'value', 'property': 'P3', 'hash': '10065a3bad46d2bf60504afdd276b581a0e8412f', 'datavalue': {'value': {'entity-type': 'item', 'nume

In [68]:
# TODO, export as a file, so it can be recorded historically?

In [88]:
# Supplement with more info just for the graph

# Define specific colors for certain host (using host, not host_label)
specific_colors = {
    'Q4': '#F00000',  # Wikimedia Foundation
    'Q5': '#5be971',  # independently hosted Wikibase
    'Q6': '#f11fbe',  # Wikimedia Cloud Services
    'Q7': '#6283ca',  # The Wikibase Consultancy
    'Q8': '#ADD8E6',  # Wikibase Cloud
    'Q117': '#ff6347',  # WBStack
    'Q118': '#5cd45b',  # Miraheze
    'Q322': '#ffa500',  # WikiTide
    'Q323': '#8a2be2',  # WikiForge
    'Q1434': '#ff69b4',  # wikibase-docker
}

# Add a "graph_group" to the item_map based on the host, including an unknown group
# Also add the graph_color based on the specific_colors, unknown is grey
for key in item_map:
    host = item_map[key].get('host', [])
    if host and host[0] in specific_colors:
        item_map[key].update({"graph_group": item_map[key].get('host_label', 'unknown')})
        item_map[key].update({"graph_color": specific_colors[host[0]]})
    else:
        item_map[key].update({"graph_group": 'unknown'})
        item_map[key].update({"graph_color": '#D3D3D3'})
        if host:
            print (f"Unknown host for {host}") # << you might wat to add a colour for this...

In [70]:
import plotly.graph_objects as go
import networkx as nx
import numpy as np

G = nx.Graph()

In [71]:
# Add directed edges to the graph
for key in item_map:
    # Add it as a node
    G.add_node(key)
    # Add the edges
    if 'links_to' in item_map[key]:
        for link in item_map[key]['links_to']:
            G.add_edge(key, link, direction='to')
    if 'links_from' in item_map[key]:
        for link in item_map[key]['links_from']:
            G.add_edge(link, key, direction='from')

# Calculate the positions of the nodes
pos = nx.spring_layout(G, k=0.5)  # Increase the value of k to spread out the nodes more

# Push nodes with no edges outward slightly
for node in G.nodes():
    if G.degree(node) == 0:
        pos[node] *= 1.1  # Push the node outward by 10%

# Create the edge trace
x_edges = []
y_edges = []
for edge in G.edges():
    x_edges.extend([pos[edge[0]][0], pos[edge[1]][0], None])
    y_edges.extend([pos[edge[0]][1], pos[edge[1]][1], None])

edge_trace = go.Scatter(
    x=x_edges, y=y_edges,
    line=dict(width=0.3, color='#888'),
    hoverinfo='none',
    mode='lines',
    showlegend=False)

# Create a trace for each group
group_traces = {}
group_counts = {}
for node in G.nodes():
    # Some things have been filtered out (like having no properties listed..)
    if node not in item_map:
        continue
    group = item_map[node]['graph_group']
    if group not in group_traces:
        group_traces[group] = {
            'x': [],
            'y': [],
            'hovertext': [],
            'color': item_map[node]['graph_color'],
            'size': []
        }
        group_counts[group] = 0
    group_traces[group]['x'].append(pos[node][0])
    group_traces[group]['y'].append(pos[node][1])
    active_users = item_map[node].get('users_active', [0])[0] if item_map[node].get('users_active') else 0
    properties = item_map[node].get('properties', [0])[0] if item_map[node].get('properties') else 0
    hover_text = (
        f"{item_map[node]['label']}<br>"
        f"URL: {item_map[node].get('url', [''])[0] if item_map[node].get('url') else ''}<br>"
        f"Host: {item_map[node].get('host_label', ['']) if item_map[node].get('host_label') else ''}<br>"
        f"Version: {item_map[node].get('version', [''])[0] if item_map[node].get('version') else ''}<br>"
        f"Start Date: {item_map[node].get('start_date', [''])[0] if item_map[node].get('start_date') else ''}<br>"
        f"Active Users: {active_users}<br>"
        f"Users: {item_map[node].get('users', [''])[0] if item_map[node].get('users') else ''}<br>"
        f"Properties: {properties}<br>"
        f"Edits: {item_map[node].get('edits', [''])[0] if item_map[node].get('edits') else ''}<br>"
        f"Pages: {item_map[node].get('pages', [''])[0] if item_map[node].get('pages') else ''}"
    )
    group_traces[group]['hovertext'].append(hover_text)

    size_active_users = 4
    if active_users:
        if active_users > 0 and active_users < 10:
            size_active_users = active_users + 5
        elif active_users >= 10 and active_users < 100:
            size_active_users = 20
        elif active_users >= 100 and active_users < 1000:
            size_active_users = 30
        elif active_users >= 1000:
            size_active_users = 40

    size_properties = 4
    if properties:
        if properties > 0 and properties < 10:
            size_properties = properties + 5
        elif properties >= 10 and properties < 100:
            size_properties = 20
        elif properties >= 100 and properties < 1000:
            size_properties = 30
        elif properties >= 1000:
            size_properties = 40

    group_traces[group]['size'].append(size_active_users)
    # group_traces[group]['size'].append(size_properties)
    group_counts[group] += 1

# Create the node traces
node_traces = []
for group, data in group_traces.items():
    group_name_with_count = f"{group} ({group_counts[group]})"
    node_trace = go.Scatter(
        x=data['x'], y=data['y'],
        mode='markers',
        hovertext=data['hovertext'],
        hoverinfo='text',
        showlegend=True,
        legendgrouptitle=dict(text="By host"),
        name=group_name_with_count,  # Use the graph_group string with count as the legend label
        marker=dict(
            showscale=False,
            size=data['size'],
            color=data['color'],
            line_width=1,
            line_color='black'
            ))
    node_traces.append(node_trace)

fig = go.Figure(data=[edge_trace] + node_traces,
                layout=go.Layout(
                    title='Wikibases, and links, sized by # of active users',
                    titlefont_size=16,
                    showlegend=True,
                    hovermode='closest',
                    margin=dict(b=20, l=5, r=5, t=40),
                    annotations=[dict(
                        text="",
                        showarrow=False,
                        xref="paper", yref="paper")],
                    xaxis=dict(showgrid=False, zeroline=False),
                    yaxis=dict(showgrid=False, zeroline=False),
                    height=800  # Set the height to twice the default (400)
                    ))

# Show the plot
fig.show()

# save the figure as html
fig.write_html("wikibase_graph.html", include_plotlyjs='cdn')

In [72]:
from plotly.subplots import make_subplots

# Filter out the Wikibase Cloud host
filtered_group_counts = {k: v for k, v in group_counts.items() if k != 'Wikibase Cloud'}

filtered_labels = [f"{k} ({v})" for k, v in filtered_group_counts.items()]
filtered_values = list(filtered_group_counts.values())

# Prepare data for the pie charts
labels = [f"{k} ({v})" for k, v in group_counts.items()]
values = list(group_counts.values())

# Create subplots
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])

# Add the first pie chart
fig.add_trace(go.Pie(labels=labels, values=values, name="All Hosts"), 1, 1)

# Add the second pie chart
fig.add_trace(go.Pie(labels=filtered_labels, values=filtered_values, name="Excluding Wikibase Cloud"), 1, 2)

# Update layout
fig.update_layout(
    title_text="Number of wikibases by host",
    height=500,
    width=800,
    annotations=[dict(text='All Hosts', x=0.18, y=1, font_size=12, showarrow=False),
                 dict(text='Excluding Wikibase Cloud', x=0.9, y=1, font_size=12, showarrow=False)]
)

fig.show()
fig.write_html("wikibase_pie_host_wikibases.html", include_plotlyjs='cdn')



In [73]:
from plotly.subplots import make_subplots

# Calculate the number of active users per host
active_users_by_host = {}
for key, value in item_map.items():
    host_label = value.get('host_label', 'unknown')
    active_users_count = value.get('users_active', [0])[0]
    if host_label not in active_users_by_host:
        active_users_by_host[host_label] = 0
    active_users_by_host[host_label] += active_users_count

# Update labels to include the total number of active users per host
labels = [f"{k} ({v})" for k, v in active_users_by_host.items()]
values = list(active_users_by_host.values())

# Filter out the Wikimedia Foundation
filtered_active_users_by_host = {k: v for k, v in active_users_by_host.items() if k != 'Wikimedia Foundation'}

filtered_labels = [f"{k} ({v})" for k, v in filtered_active_users_by_host.items()]
filtered_values = list(filtered_active_users_by_host.values())

# Create subplots
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])

# Add the first pie chart
fig.add_trace(go.Pie(labels=labels, values=values, name="All Hosts"), 1, 1)

# Add the second pie chart
fig.add_trace(go.Pie(labels=filtered_labels, values=filtered_values, name="Excluding Wikimedia Foundation"), 1, 2)

# Update layout
fig.update_layout(
    title_text="Number of active users by host",
    height=500,
    width=800,
    annotations=[dict(text='All Hosts', x=0.18, y=1, font_size=12, showarrow=False),
                 dict(text='Excluding WMF', x=0.82, y=1, font_size=12, showarrow=False)]
)

fig.show()
fig.write_html("wikibase_pie_host_active_users.html", include_plotlyjs='cdn')


In [74]:
# Calculate the number of properties per host
properties_by_host = {}
for key, value in item_map.items():
    host_label = value.get('host_label', 'unknown')
    properties_count = value.get('properties', [0])[0]
    if host_label not in properties_by_host:
        properties_by_host[host_label] = 0
    properties_by_host[host_label] += properties_count

# Filter out the Wikibase Cloud host
filtered_properties_by_host = {k: v for k, v in properties_by_host.items() if k != 'Wikibase Cloud'}

# Prepare data for the pie charts
labels = [f"{k} ({v})" for k, v in properties_by_host.items()]
values = list(properties_by_host.values())
filtered_labels = [f"{k} ({v})" for k, v in filtered_properties_by_host.items()]
filtered_values = list(filtered_properties_by_host.values())

# Create subplots
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])

# Add the first pie chart
fig.add_trace(go.Pie(labels=labels, values=values, name="All Hosts"), 1, 1)

# Add the second pie chart
fig.add_trace(go.Pie(labels=filtered_labels, values=filtered_values, name="Excluding Wikibase Cloud"), 1, 2)

# Update layout
fig.update_layout(
    title_text="Number of properties by host",
    height=500,
    width=800,
    annotations=[dict(text='All Hosts', x=0.18, y=1, font_size=12, showarrow=False),
                 dict(text='Excluding Wikibase Cloud', x=0.9, y=1, font_size=12, showarrow=False)]
)

fig.show()
fig.write_html("wikibase_pie_host_properties.html", include_plotlyjs='cdn')


In [75]:
from plotly.subplots import make_subplots

# Extract the version information from item_map
version_counts = {}
for key, value in item_map.items():
    version = value.get('version', ['unknown'])[0]
    if version not in version_counts:
        version_counts[version] = 0
    version_counts[version] += 1

# Prepare data for the pie charts
versions = list(version_counts.keys())
counts = list(version_counts.values())

# Filter out the most common version (for example, '1.39.7')
filtered_version_counts = {k: v for k, v in version_counts.items() if k != '1.39.7'}
filtered_versions = list(filtered_version_counts.keys())
filtered_counts = list(filtered_version_counts.values())

# Group versions by the first two parts for both filtered and unfiltered data
grouped_version_counts = {}
for version, count in version_counts.items():
    grouped_version = '.'.join(version.split('.')[:2])
    if grouped_version not in grouped_version_counts:
        grouped_version_counts[grouped_version] = 0
    grouped_version_counts[grouped_version] += count

grouped_version_counts_filtered = {}
for version, count in filtered_version_counts.items():
    grouped_version = '.'.join(version.split('.')[:2])
    if grouped_version not in grouped_version_counts_filtered:
        grouped_version_counts_filtered[grouped_version] = 0
    grouped_version_counts_filtered[grouped_version] += count

grouped_versions = list(grouped_version_counts.keys())
grouped_counts = list(grouped_version_counts.values())
grouped_versions_filtered = list(grouped_version_counts_filtered.keys())
grouped_counts_filtered = list(grouped_version_counts_filtered.values())

# Create subplots
fig = make_subplots(rows=1, cols=3, specs=[[{'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}]])

# Add the first pie chart
fig.add_trace(go.Pie(labels=versions, values=counts, name="All Versions", showlegend=False, textinfo='none'), 1, 1)

# Add the second pie chart
fig.add_trace(go.Pie(labels=filtered_versions, values=filtered_counts, name="Excluding 1.39.7", showlegend=False, textinfo='none'), 1, 2)

# Add the third pie chart
fig.add_trace(go.Pie(labels=grouped_versions, values=grouped_counts_filtered, name="Grouped Versions", showlegend=False, textinfo='none'), 1, 3)

# Update layout
fig.update_layout(
    title_text="Spread of Wikibases by Version",
    height=500,
    width=800,
    annotations=[dict(text='All Versions', x=0.12, y=1.05, font_size=12, showarrow=False),
                 dict(text='Excluding 1.39.7', x=0.5, y=1.05, font_size=12, showarrow=False),
                 dict(text='Excluding 1.39.7, Grouped as x.xx', x=0.95, y=1.05, font_size=12, showarrow=False)]
)

fig.show()
fig.write_html("wikibase_pie_versions.html", include_plotlyjs='cdn')

In [76]:
# Display a table underneath that just shows the grouped versions (including 1.39.7 thought) and the counts
df = pd.DataFrame(list(grouped_version_counts.items()), columns=['Version', 'Count'])
fig = go.Figure(data=[go.Table(header=dict(values=['Version', 'Count']),
                               cells=dict(values=[df['Version'], df['Count']]))
                     ])
fig.show()

In [77]:
print("Wikibases with properties: ", len({k: v for k, v in item_map.items() if v.get('properties', [0])[0] > 0}))
print("Wikibases with properties, and more than 10 pages: ", len({k: v for k, v in item_map.items() if v.get('properties', [0])[0] > 0 and v.get('pages', [0])[0] > 10}))
print("Wikibases with properties, and more than 10 pages, and 1 or more active users: ", len({k: v for k, v in item_map.items() if v.get('properties', [0])[0] > 0 and v.get('users_active', [0])[0] >= 1 and v.get('pages', [0])[0] > 10}))
print("Wikibases with properties, and more than 10 pages, and 2 or more active users: ", len({k: v for k, v in item_map.items() if v.get('properties', [0])[0] > 0 and v.get('users_active', [0])[0] >= 2 and v.get('pages', [0])[0] > 10}))
print("Wikibases that link to other wikibases: ", len({k: v for k, v in item_map.items() if v.get('links_to')}))
print("Wikibases that only link to non Wikimedia Foundation wikibases: ", len({k: v for k, v in item_map.items() if v.get('links_to') and not set(v.get('links_to')).intersection({'Q1', 'Q2'})}))
# for k, v in item_map.items():
#     if v.get('links_to') and not set(v.get('links_to')).intersection({'Q1', 'Q2'}):
#         print(" - ", k, v.get('label', k), v.get('links_to'))
print("Wikibases that link to other wikibases, excluding Wikimedia Foundation: ", len({k: v for k, v in item_map.items() if v.get('links_to') and len(set(v.get('links_to')) - {'Q1', 'Q2'}) > 0}))
# for k, v in item_map.items():
#     if v.get('links_to') and len(set(v.get('links_to')) - {'Q1', 'Q2'}) > 0:
#         print(" - ", k, v.get('label', k), v.get('links_to'))

Wikibases with properties:  777
Wikibases with properties, and more than 10 pages:  600
Wikibases with properties, and more than 10 pages, and 1 or more active users:  264
Wikibases with properties, and more than 10 pages, and 2 or more active users:  129
Wikibases that link to other wikibases:  194
Wikibases that only link to non Wikimedia Foundation wikibases:  5
Wikibases that link to other wikibases, excluding Wikimedia Foundation:  35


In [None]:
# Random suite ramblings and estimates below here...

In [None]:
import csv

# show how many different versions there are across all wikibases
versions = set()
for key in item_map:
    versions.update(item_map[key].get('version', []))
print("Different versions: ", len(versions))
# Convert the set to a sorted list
sorted_versions = sorted(versions)

# Display the sorted versions
sorted_versions

# and output how many wikibases are using each version
version_counts = {}
for key in item_map:
    for version in item_map[key].get('version', []):
        if version not in version_counts:
            version_counts[version] = 0
        version_counts[version] += 1

# Display the version counts
version_counts

Different versions:  37


{'1.44.0-wmf.16': 2,
 '1.39.7': 712,
 '1.39.1': 5,
 '1.39.4': 3,
 '1.39.11': 8,
 '1.37.0-alpha': 1,
 '1.39.10': 2,
 '1.42.4': 1,
 '1.35.2': 3,
 '1.44.0-wmf.15': 1,
 '1.39.5': 2,
 '1.35.1': 2,
 '1.42.1': 3,
 '1.32.5': 1,
 '1.27.0-rc.1': 1,
 '1.28.2': 1,
 '1.35.3': 1,
 '1.38.5': 6,
 '1.35.7': 1,
 '1.43.0-wmf.7': 1,
 '1.39.3': 1,
 '1.36.1': 1,
 '1.43.0': 2,
 '1.34.2': 1,
 '1.34.1': 1,
 '1.41.1': 2,
 '1.33.0': 1,
 '1.41.4': 1,
 '1.40.1': 1,
 '1.35.5': 2,
 '1.39.6': 1,
 '1.39.2': 1,
 '1.34.4': 1,
 '1.40.0-wmf.26': 1,
 '1.41.0': 1,
 '1.42.3': 1,
 '1.35.4': 1}

In [79]:
# curl -L -s 'https://registry.hub.docker.com/v2/repositories/wikibase/wikibase/tags?page_size=1024'|jq '."results"[]["name"]'
versions_list = [
    "mw1.39.10", "1.0", "1", "1.0.1", "mw1.42.3", "3.0", "3", "3.0.2", "mw1.42.1", "3.0.1",
    "1.0.0-mw1.39.8", "3.0.0-mw1.42.1", "1.0.0", "1.0.0-build20240715120214", "3.0.0", "3.0.0-build20240715120210",
    "2.0.0-mw1.41.2", "2", "2.0", "2.0.0", "2.0.0-build20240715114241", "1.39.7-wmde.18", "1.41.1-wmde.20",
    "1.40.3-wmde.19", "1.41.0-wmde.17", "1.40.2-wmde.16", "1.39.6-wmde.15", "1.40.1-wmde.14", "1.39.5-wmde.13",
    "1.38.7-wmde.12", "1.39.1-wmde.11", "1.38.5-wmde.10", "1.37.6-wmde.9", "1.36.4-wmde.8", "1.35.7-wmde.7",
    "1.35.7-wmdeprerelease", "1.35.5-wmde.6", "1.36.3-wmde.5", "1.36.3-wmde.4", "1.35.5-wmde.3", "1.35.4-wmde.2",
    "1.31-bundle", "1.31-base", "1.31", "1.35-bundle", "1.35-base", "1.35", "1.35.2-wmde.1", "1.35.0-wmde.0",
    "1.34-bundle", "1.34-base", "1.34", "1.33-bundle", "1.33-base", "1.33", "1.32-bundle", "1.32-base", "1.32",
    "1.30-bundle", "1.30-base", "1.30", "1.29"
]

# normalize the versions
# remove -base and -bundle
versions_list = [v.replace('-base', '').replace('-bundle', '') for v in versions_list]
# remove anything with "build" in it
versions_list = [v for v in versions_list if 'build' not in v]
# if the version has "mw" in it, just use the bit after it
versions_list = [v.split('mw')[-1] for v in versions_list]
# if there is -wmde. in it, just use the bit before
versions_list = [v.split('-wmde.')[0] for v in versions_list]
# remove things with -wmdeprerelease too
versions_list = [v for v in versions_list if 'wmdeprerelease' not in v]

# If we now have sometihng that matches only 1.\d.\d, then also add .0 to the end
# As main releases often didnt include the .0?
versions_list = [v + '.0' if len(v.split('.')) == 2 else v for v in versions_list]

# remove anything that is just 1 or 3 chars long
versions_list = [v for v in versions_list if len(v) > 3]
# remove anything that doesnt start with 1.
versions_list = [v for v in versions_list if v.startswith('1.')]
# everything already requires 2 .s in it
versions_list = [v for v in versions_list if v.count('.') == 2]
# remove anything that is 1.0*
versions_list = [v for v in versions_list if not v.startswith('1.0')]

# Make it unique
versions_list = list(set(versions_list))
# sort it
versions_list.sort()

versions_list

['1.29.0',
 '1.30.0',
 '1.31.0',
 '1.32.0',
 '1.33.0',
 '1.34.0',
 '1.35.0',
 '1.35.2',
 '1.35.4',
 '1.35.5',
 '1.35.7',
 '1.36.3',
 '1.36.4',
 '1.37.6',
 '1.38.5',
 '1.38.7',
 '1.39.1',
 '1.39.10',
 '1.39.5',
 '1.39.6',
 '1.39.7',
 '1.39.8',
 '1.40.1',
 '1.40.2',
 '1.40.3',
 '1.41.0',
 '1.41.1',
 '1.41.2',
 '1.42.1',
 '1.42.3']

In [80]:
# See how many of the items versions are in the versions_list, excluding Wikibase Cloud as the host_label
versions_in_list = {k: v for k, v in item_map.items() if v.get('version', ['unknown'])[0] in versions_list and v.get('host_label') != 'Wikibase Cloud'}
print("Wikibases with versions in the list, excluding Wikibase Cloud: ", len(versions_in_list))
# Also show the number of wikibases without versions in the list (excluding Wikibase Cloud)
versions_not_in_list = {k: v for k, v in item_map.items() if v.get('version', ['unknown'])[0] not in versions_list and v.get('host_label') != 'Wikibase Cloud'}
print("Wikibases without versions in the list, excluding Wikibase Cloud: ", len(versions_not_in_list))
# And show how many wikibase cloud instances there are (that we then excluded)
wikibase_cloud = {k: v for k, v in item_map.items() if v.get('host_label') == 'Wikibase Cloud'}
print("Wikibase Cloud instances: ", len(wikibase_cloud))

Wikibases with versions in the list, excluding Wikibase Cloud:  33
Wikibases without versions in the list, excluding Wikibase Cloud:  33
Wikibase Cloud instances:  711


In [81]:
import plotly.graph_objects as go

# Define the data
labels = ['Suite Installations?', 'Other Methods?', 'Wikibase Cloud']
values = [16, 50, 711]

# Create the pie chart
fig = go.Figure(data=[go.Pie(labels=labels, values=values, textinfo='label+percent', insidetextorientation='radial')])

# Update layout
fig.update_layout(
    title_text="Estimated Distribution of Wikibase Installations",
    height=500,
    width=800
)

# Show the plot
fig.show()
fig.write_html("wikibase_pie_installations.html", include_plotlyjs='cdn')