In [36]:
import json
import requests
import pandas as pd
import plotly.express as px
from dash import html, dcc
from dash.dependencies import Input, Output
from jupyter_dash import JupyterDash
import numpy as np
import sqlite3
# Make pandas treat +/-inf as missing (NA) values in its NA-aware operations.
# NOTE(review): this option is deprecated in recent pandas releases — confirm
# it is still supported by the installed version and actually needed here.
pd.options.mode.use_inf_as_na = True

In [2]:
# Centre of the geosearch; the two values are joined with "|" into the API's
# `gscoord` parameter (presumably latitude, longitude of the Brandenburg Gate).
base_coord = (52.5162829, 13.3777240)

Retrieve all geolocated German-Wikipedia articles within 1,000 m of the Brandenburg Gate and store them in a DataFrame with one page per row.

In [16]:
# Endpoint of the German-language Wikipedia API (also used by later cells).
url = "https://de.wikipedia.org/w/api.php"

# Geosearch request: articles with coordinates around `base_coord`.
params = {
    "action": "query",
    "format": "json",
    "list": "geosearch",
    "formatversion": "2",
    "gscoord": "|".join(map(str, base_coord)),
    "gsradius": "1000",
    "gslimit": "500"
}

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, params=params, timeout=30)
print(response)
# Fail loudly on HTTP errors instead of trying to parse an error page.
response.raise_for_status()
response_dict = response.json()

geosearch_hits = response_dict["query"]["geosearch"]
n_results = len(geosearch_hits)
print(f"Found {n_results} articles in this area.")
# NOTE(review): if n_results equals gslimit the result list may be truncated
# — confirm against the API limits.

# One row per article, indexed by page id (built without in-place mutation so
# re-running the cell is idempotent).
pages_df = pd.json_normalize(geosearch_hits).set_index("pageid")
pages_df.head()

<Response [200]>
Found 360 articles in this area.


Unnamed: 0_level_0,ns,title,lat,lon,dist,primary
pageid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
11830550,0,Berliner Siegesparade 1871,52.516389,13.377778,12.3,True
11349,0,Brandenburger Tor,52.516389,13.377778,12.3,True
10900841,0,Berliner Siegesparade 1945,52.516389,13.377778,12.3,True
613667,0,Radweg Berlin–Kopenhagen,52.516389,13.377778,12.3,True
1710722,0,Platz des 18. März,52.51625,13.377286,29.9,True


Extract the page IDs and pass them, in batches, to a pageviews query.

In [32]:
# Request template for the pageviews lookup; "pageids" starts empty and is
# filled in for every batch of ids that gets queried.
params = dict(
    action="query",
    format="json",
    prop="pageviews",
    pvipdays="30",
    pageids="",
    formatversion="2",
)

# Work list of all page ids found by the geosearch.
page_ids = pages_df.index.tolist()

# returns a df [pageid | views] with each row a page with id from params:
def query_views_to_df(params):
    """Query the API at `url` with `params` and return total views per page.

    Parameters
    ----------
    params : dict
        Query parameters for a `prop=pageviews` request, including the
        "pageids" to look up.

    Returns
    -------
    pandas.DataFrame
        Indexed by "pageid" with a single column "views": the row-wise sum of
        every flattened response column whose name contains "pageviews"
        (presumably one column per day; missing values are skipped by the sum).

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    KeyError
        If the response carries no ["query"]["pages"] payload.
    """
    # Timeout + status check: never hang silently, never parse an error page.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    # NOTE(review): a "continue" key in the response is not followed here —
    # confirm that one request returns pageviews for every requested page.
    pages = pd.json_normalize(response.json()["query"]["pages"])

    # Build a fresh frame rather than assigning into a slice of `pages`; this
    # avoids SettingWithCopyWarning and does not rely on column positions.
    views = pd.DataFrame({
        "pageid": pages["pageid"],
        "views": pages.filter(regex="pageviews").sum(axis=1),
    })
    return views.set_index("pageid")

# Fetch the pageviews batch by batch. The ids are sent in groups of 50 per
# request (NOTE(review): presumably the API's cap on multi-value parameters —
# confirm before raising it).
PAGEIDS_PER_REQUEST = 50

chunk_frames = []
for start in range(0, len(page_ids), PAGEIDS_PER_REQUEST):
    chunk_ids = page_ids[start:start + PAGEIDS_PER_REQUEST]
    # Copy the template instead of mutating the shared `params` dict.
    chunk_params = {**params, "pageids": "|".join(map(str, chunk_ids))}
    chunk_frames.append(query_views_to_df(chunk_params))

# Concatenate once at the end (cheaper than growing a frame inside the loop);
# fall back to an empty, correctly shaped frame when there was nothing to query.
if chunk_frames:
    page_views_df = pd.concat(chunk_frames, axis=0)
else:
    page_views_df = pd.DataFrame(
        columns=["views"],
        index=pd.Index([], name="pageid"),
        dtype="float64",
    )

In [139]:
# Combine article metadata with view counts in one chain. No in-place steps,
# so the cell is idempotent and no column is assigned onto a filtered slice
# (which would raise SettingWithCopyWarning).
df = (
    pages_df
    .join(page_views_df)
    .sort_values("views", ascending=False)
    # both less wasteful and we avoid trouble with log and the histogram
    .loc[lambda frame: frame["views"] > 0]
    .assign(log_views=lambda frame: np.log2(frame["views"]))
)


def opacity(selected):
    """Map a selection flag to a bar opacity: 1.0 when truthy, 0.4 otherwise."""
    return 1. if selected else .4

# Histogram of log2(views): 20 equal-width bins, one row per bin in hist_df.
counts, bins = np.histogram(df.log_views, bins=20)

# Lower and upper edge of every bin, plus the midpoint between them.
binlefts, binrights = bins[:-1], bins[1:]
bincenters = (binlefts + binrights) / 2

# Every bin starts out selected.
hist_df = pd.DataFrame(
    {
        "binleft": binlefts,
        "binright": binrights,
        "bincenter": bincenters,
        "count": counts,
        "selected": True,
    }
)

def update_selection_from_hist(df, hist_df):
    """Return the rows of `df` whose log_views fall into a selected bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns title, lat, lon, dist, views and log_views.
    hist_df : pandas.DataFrame
        One row per histogram bin with the columns binleft, binright and
        selected (truthy for active bins).

    Returns
    -------
    pandas.DataFrame
        Columns title, lat, lon, dist, views, log_views and a boolean
        `selected` column (always True), with a fresh 0..n-1 index and the
        rows in the same order as in `df`.

    Notes
    -----
    Bin membership is inclusive on both edges. A value that sits exactly on an
    edge shared by two selected bins is returned once, not once per bin, and
    no database connection is opened (and therefore none is left unclosed).
    """
    out_columns = ["title", "lat", "lon", "dist", "views", "log_views"]

    active_bins = hist_df.loc[hist_df["selected"].astype(bool)]
    lefts = active_bins["binleft"].to_numpy()
    rights = active_bins["binright"].to_numpy()

    # Broadcast (n_rows, 1) against (n_active_bins,) -> (n_rows, n_active_bins)
    # membership matrix, then keep rows that hit at least one active bin.
    values = df["log_views"].to_numpy()[:, None]
    in_active_bin = ((values >= lefts) & (values <= rights)).any(axis=1)

    out = df.loc[in_active_bin, out_columns].reset_index(drop=True)
    # Explicit bool dtype so the column is well-typed even when `out` is empty.
    out["selected"] = pd.Series(True, index=out.index, dtype=bool)
    return out

# Plot

In [156]:
# Colours shared by both scales, ordered from the low end to the high end.
_PALETTE = (
    "#0187c2",
    "#5837ff",
    "#8f50dc",
    "#b162ae",
    "#ff7674",
    "#ffaf72",
    "#fff96b",
)

# Earlier stop positions; not referenced by the code visible in this notebook.
cscale_old = list(zip((.000, .260, .577, .668, .846, .954, 1.00), _PALETTE))

# Continuous colour scale as (position, colour) stops, used for map and histogram.
cscale = list(zip((.00, .46, .58, .75, .84, .95, 1.0), _PALETTE))

In [159]:
app = JupyterDash(__name__)


def _panel_style(**extra):
    """Return the translucent rounded-box style of a sidebar panel.

    Keyword arguments are appended as additional CSS properties.
    """
    style = {
        "backgroundColor": "rgba(255,255,255, .3)",
        "padding": "15px 15px 15px 15px",
        "borderRadius": "5px",
        "marginTop": "15px",
    }
    style.update(extra)
    return style


# The slider starts on the full histogram range; its track extends 0.1 beyond
# the outermost bin edges on both sides.
_slider_low = min(hist_df.binleft)
_slider_high = max(hist_df.binright)

# Map background
_map_container = html.Div(
    style={"width": "99%", "height": "300px"},
    children=[dcc.Graph(id="map")],
)

# Histogram with the range slider that drives the bin selection
_histogram_panel = html.Div(
    id="hist-plot",
    style=_panel_style(),
    children=[
        dcc.Graph(id="histogram"),
        dcc.RangeSlider(
            _slider_low - .1,
            _slider_high + .1,
            step=.001,
            value=[_slider_low, _slider_high],
            marks={},
            id="slider",
        ),
    ],
)

# Sidebar pinned to the right edge: histogram panel plus two text panels
_sidebar = html.Div(
    id="sidebar",
    style={
        "position": "fixed",
        "width": "250px",
        "right": "0px",
        "top": "15px",
        "marginRight": "30px",
    },
    children=[
        _histogram_panel,
        html.Div(id="displayA", style=_panel_style(color="black")),
        html.Div(id="displayB", style=_panel_style()),
    ],
)

app.layout = html.Div([_map_container, _sidebar])

def _histogram_figure(hist_sel):
    """Build the bar chart of the view histogram; unselected bins are dimmed."""
    hist = px.bar(
        hist_sel,
        x="bincenter",
        y="count",
        color=hist_sel.bincenter,
        color_continuous_scale=cscale,
        opacity=list(map(opacity, hist_sel.selected)),
        template="plotly_dark",
        height=150,
    )

    # Strip margins, axis titles/ticks and backgrounds so only the bars remain.
    hist.update_layout(
        margin=dict(t=0, r=0, b=0, l=0),
        xaxis=dict(title=None, tickvals=[]),
        yaxis=dict(title=None, tickvals=[]),
        coloraxis_showscale=False,
        plot_bgcolor="rgba(0,0,0,0)",
        paper_bgcolor="rgba(0,0,0,0)",
        bargap=0,
    )

    # Hover text shows the bin edges converted back from log2 to plain views.
    hist_hover = pd.DataFrame([np.round(np.exp2(binlefts), 0),
                               np.round(np.exp2(binrights), 0)]).transpose()
    hist.update_traces(
        marker_line_width=0,
        customdata=hist_hover,
        hovertemplate="%{customdata[0]}-%{customdata[1]} Aufrufe: %{y} versch. Orte",
    )
    return hist


def _map_figure(plot_df):
    """Build the scatter map: one marker per article, sized and coloured by views."""
    fig = px.scatter_mapbox(
        plot_df,
        lat="lat",
        lon="lon",
        color="log_views",
        color_continuous_scale=cscale,
        size="views",
        hover_name="title",
        hover_data=["views"],
        mapbox_style="carto-darkmatter",
        zoom=14,
    )
    fig.update_layout(
        margin=dict(t=0, r=0, b=0, l=0),
        height=800,
        coloraxis_showscale=False,
    )
    fig.update_traces(
        marker_sizemode="area",
        marker_sizeref=5,
        marker_sizemin=3,
    )
    fig.update_geos(fitbounds=False)
    fig["data"][0]["hovertemplate"] = "<b>%{hovertext}</b><br><br>Aufrufe in den letzten 30 Tagen: %{marker.size}<extra></extra>"
    fig['layout']['uirevision'] = 'something'  # sort of keep zoom/position on data changes
    return fig


@app.callback(
    Output(component_id="map", component_property="figure"),
    Output(component_id="histogram", component_property="figure"),
    Output(component_id="displayA", component_property="children"),
    Output(component_id="displayB", component_property="children"),
    Input(component_id="slider", component_property="value"),
    Input(component_id="map", component_property="relayoutData"),
    Input(component_id="map", component_property="clickData"),
)
def update_app(slider, relayout, click):
    """Re-render map and histogram whenever the slider or the map changes.

    Parameters
    ----------
    slider : sequence of two numbers
        Lower and upper bound chosen on the range slider (log2 of views).
    relayout : dict or None
        The map's relayoutData; echoed as text into "displayB".
    click : dict or None
        The map's clickData; echoed as text into "displayA".

    Returns
    -------
    tuple
        (map figure, histogram figure, str(click), str(relayout)).
    """
    lower, upper = slider

    # A bin counts as selected only if it lies completely inside the slider
    # range. Work on a per-call copy: writing the flags into the module-level
    # hist_df would share mutable state between callbacks and sessions.
    hist_sel = hist_df.assign(
        selected=(hist_df.binleft >= lower) & (hist_df.binright <= upper)
    )

    hist = _histogram_figure(hist_sel)

    # Restrict the plotted articles to those inside the selected bins.
    plot_df = update_selection_from_hist(df, hist_sel)
    fig = _map_figure(plot_df)

    return fig, hist, f"{click}", f"{relayout}"


# Start the Dash server when this code runs as the main module;
# mode="inline" asks JupyterDash to show the app in the notebook output.
if __name__ == '__main__':
    app.run_server(mode = "inline")
