In [None]:
import json

def load_item_map(filename):
    """Load a JSON file and return its decoded contents.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's
        top-level value.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; pass the encoding explicitly so the
    # result does not depend on the platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)
    
# Snapshot identifiers; each one names a 'wikibase_world_<id>.json' file and
# is later parsed as a timestamp with the format '%Y%m%d%H%M%S'.
dates = ['20250216093424', '20250411161808']

# One loaded item map per snapshot, keyed by its identifier.
maps = dict(
    (stamp, load_item_map(f'wikibase_world_{stamp}.json'))
    for stamp in dates
)

In [None]:
import pandas as pd

# Tally, for every snapshot in `maps`, how many items carry each host label.
# Items without a 'host_label' key are grouped under 'unknown'.
host_counts = []
for snapshot, entries in maps.items():
    tally = {}
    for entry in entries.values():
        label = entry.get('host_label', 'unknown')
        if label in tally:
            tally[label] += 1
        else:
            tally[label] = 1
    # One wide row per snapshot: the 'date' column first, then one column per host.
    row = {'date': snapshot}
    row.update(tally)
    host_counts.append(row)

# Hosts missing from a snapshot come out of the constructor as NaN; store 0 instead.
df_hosts = pd.DataFrame(host_counts).fillna(0)

print(df_hosts)


In [None]:
import plotly.express as px

# Reshape the wide per-snapshot host table (one column per host) into long
# form, one row per (date, host) pair, which is the layout px.line consumes.
# The frame is built here from df_hosts so this cell runs on a fresh kernel
# and can be re-run without depending on leftover state.
df_melted = df_hosts.melt(id_vars=['date'], var_name='host', value_name='count')

# Convert date to datetime for better plotting
df_melted['date'] = pd.to_datetime(df_melted['date'], format='%Y%m%d%H%M%S')

# Create a line plot using Plotly: one line per host, counts over time
fig = px.line(
    df_melted,
    x='date',
    y='count',
    color='host',
    markers=True,
    title='Host Counts Over Time',
    labels={'date': 'Date', 'count': 'Count', 'host': 'Host'}
)

# Update layout for better visualization.
# NOTE: hosts absent from a snapshot have count 0 (df_hosts is zero-filled),
# and 0 has no position on a logarithmic axis.
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Count',
    legend_title='Host',
    xaxis=dict(tickangle=45),
    yaxis_type='log',  # Apply log scale to y-axis
    template='plotly_white'
)

# Save the plot as an HTML file (plotly.js is referenced from the CDN, not embedded)
fig.write_html('host_counts_over_time.html', include_plotlyjs='cdn')

# Show the plot
fig.show()

In [None]:
# Count, for every snapshot in `maps`, how many items report each full
# version string, then plot those counts over time.

version_counts = []
for date, items in maps.items():
    version_count = {}
    for item_id, item_data in items.items():
        # Fall back to 'unknown' when the key is missing, None, or an empty
        # list; indexing [0] on an empty list would otherwise raise IndexError.
        # Presumably 'version' is a list of strings -- TODO confirm against the data.
        versions = item_data.get('version') or ['unknown']
        version = versions[0]
        version_count[version] = version_count.get(version, 0) + 1
    version_counts.append({'date': date, **version_count})
# Create a DataFrame (one row per snapshot, one column per version)
df_versions = pd.DataFrame(version_counts)
# Versions absent from a snapshot are NaN after construction; store 0 instead
df_versions = df_versions.fillna(0)
# Melt the DataFrame for easier plotting: one row per (date, version) pair
df_melted_versions = df_versions.melt(id_vars=['date'], var_name='version', value_name='count')
# Convert date to datetime for better plotting
df_melted_versions['date'] = pd.to_datetime(df_melted_versions['date'], format='%Y%m%d%H%M%S')
# Create a line plot using Plotly
fig_versions = px.line(
    df_melted_versions,
    x='date',
    y='count',
    color='version',
    markers=True,
    title='Version Counts Over Time',
    labels={'date': 'Date', 'count': 'Count', 'version': 'Version'}
)
# Update layout for better visualization
fig_versions.update_layout(
    xaxis_title='Date',
    yaxis_title='Count',
    legend_title='Version',
    xaxis=dict(tickangle=45),
    yaxis_type='log',  # Apply log scale to y-axis
    template='plotly_white'
)
# Save the plot as an HTML file
fig_versions.write_html('version_counts_over_time.html', include_plotlyjs='cdn')
# Show the plot
fig_versions.show()

In [None]:
# Count, for every snapshot in `maps`, how many items fall under each
# major.minor version (the first two dot-separated components of the first
# reported version string), then plot those counts over time.

version_counts2 = []
for date, items in maps.items():
    version_count = {}
    for item_id, item_data in items.items():
        # Fall back to 'unknown' when the key is missing, None, or an empty
        # list; indexing [0] on an empty list would otherwise raise IndexError.
        # Presumably 'version' is a list of strings -- TODO confirm against the data.
        versions = item_data.get('version') or ['unknown']
        version = versions[0]
        # Keep only "major.minor"; a label without dots (e.g. 'unknown') is kept whole
        version_major_minor = '.'.join(version.split('.')[:2])
        version_count[version_major_minor] = version_count.get(version_major_minor, 0) + 1
    version_counts2.append({'date': date, **version_count})
# Create a DataFrame (one row per snapshot, one column per major.minor version)
df_versions2 = pd.DataFrame(version_counts2)
# Versions absent from a snapshot are NaN after construction; store 0 instead
df_versions2 = df_versions2.fillna(0)
# Melt the DataFrame for easier plotting: one row per (date, version) pair
df_melted_versions2 = df_versions2.melt(id_vars=['date'], var_name='version', value_name='count')
# Convert date to datetime for better plotting
df_melted_versions2['date'] = pd.to_datetime(df_melted_versions2['date'], format='%Y%m%d%H%M%S')
# Create a line plot using Plotly
fig_versions2 = px.line(
    df_melted_versions2,
    x='date',
    y='count',
    color='version',
    markers=True,
    title='Major.Minor Version Counts Over Time',
    labels={'date': 'Date', 'count': 'Count', 'version': 'Major.Minor Version'}
)
# Update layout for better visualization
fig_versions2.update_layout(
    xaxis_title='Date',
    yaxis_title='Count',
    legend_title='Major.Minor Version',
    xaxis=dict(tickangle=45),
    yaxis_type='log',  # Apply log scale to y-axis
    template='plotly_white'
)
# Save the plot as an HTML file
fig_versions2.write_html('major_minor_version_counts_over_time.html', include_plotlyjs='cdn')
# Show the plot
fig_versions2.show()


In [None]:
# Build a markdown summary table: one row per major.minor version, one column
# per snapshot date, plus the change between the two most recent snapshots.


def _version_sort_key(label):
    """Return a tuple of ints so dotted version labels sort numerically.

    Components that are not decimal numbers (for example the 'unknown'
    placeholder) map to -1, which places them below every numeric version.
    """
    return tuple(
        int(part) if part.isdecimal() else -1
        for part in str(label).split('.')
    )


# Pivot the DataFrame to have versions as rows and dates as columns
df_pivoted = df_melted_versions2.pivot(index='version', columns='date', values='count').reset_index()

# Rename columns for better readability
df_pivoted.columns.name = None  # Remove the name of the columns
date_columns = [col.strftime('%Y-%m-%d') for col in df_pivoted.columns[1:]]
df_pivoted.columns = ['Version'] + date_columns

# Calculate the delta and add an arrow emoji for direction.
# A delta needs two snapshots; with only one, the second-to-last column would
# be the textual 'Version' column and the subtraction would fail.
if len(date_columns) >= 2:
    delta = (df_pivoted.iloc[:, -1] - df_pivoted.iloc[:, -2]).astype(int)
    df_pivoted['Delta'] = delta.apply(lambda x: f"{x} {'⬆️' if x > 0 else '⬇️' if x < 0 else ''}")

# Sort by version, newest first. The comparison is numeric per component so
# that e.g. '1.39' ranks above '1.9', which a plain string sort gets wrong.
df_pivoted = df_pivoted.sort_values(
    by='Version',
    ascending=False,
    key=lambda col: col.map(_version_sort_key),
)

# Output the sorted pivoted DataFrame as a markdown table
markdown_table = df_pivoted.to_markdown(index=False)
print(markdown_table)