In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from typing import List, Tuple
from itertools import product
from functools import lru_cache


from flaskapp.bigquery_interface import (
    BigQueryInterface,
    query_nodes,
    query_edges,
    query_nearby,
    query_nearby_extended, 
    query_neighbors
)

from flaskapp.wikidata_interfaces import PageidFinder

import pandas as pd
import networkx as nx

In [3]:
# Project id handed to every BigQueryInterface opened in this notebook.
PROJECT_ID = "tourguide-388723"
# Default for the `num_to_keep` argument forwarded to `query_nearby`.
NUM_NEARBY_TO_KEEP = 25


@lru_cache()
def find_nearby_pageids(lon: float, lat: float, num_to_keep: int = NUM_NEARBY_TO_KEEP) -> Tuple[int, ...]:
    """Return the page ids found around a coordinate.

    The lookup runs three queries inside a single BigQuery session:
    ``query_nearby`` for the coordinate itself, ``query_nearby_extended`` on
    the ids that came back, and finally ``query_nodes`` on the union of both
    id sets. The ``page_id`` column of that last result is what is returned.

    Results are memoised with ``lru_cache``, keyed on the arguments, so the
    coordinate actually queried must be the one passed in (and never a
    notebook-level global).

    Args:
        lon: Longitude, forwarded to ``query_nearby`` as ``lon``.
        lat: Latitude, forwarded to ``query_nearby`` as ``lat``.
        num_to_keep: Forwarded to ``query_nearby`` as ``num_to_keep``.

    Returns:
        Tuple of the ``page_id`` values returned by ``query_nodes``.
    """
    with BigQueryInterface(project_id=PROJECT_ID) as bq:
        # Use the function's own arguments; the cache key is built from them.
        df_nearby = query_nearby(bq, lat=lat, lon=lon, num_to_keep=num_to_keep)
        nearby_ids = df_nearby["page_id"].tolist()
        df_nearby_extended = query_nearby_extended(bq, nearby_ids)

        # The extended result may come back without a "page_id" column;
        # in that case there is nothing to add.
        if "page_id" in df_nearby_extended:
            extra_ids = df_nearby_extended["page_id"].tolist()
        else:
            extra_ids = []

        all_nodes = list(set(nearby_ids).union(extra_ids))
        df_geo_nodes = query_nodes(bq, all_nodes)

    return tuple(df_geo_nodes["page_id"].tolist())

@lru_cache()
def get_topic_pageids(topics: Tuple[str, ...]) -> Tuple[int, ...]:
    """Look up the page ids for a collection of topics.

    The topics are handed to ``PageidFinder().get_payload``; its result is
    turned into a dict and the dict's values are returned in insertion order.
    Presumably the payload is a sequence of (topic, page id) pairs -- confirm
    against ``PageidFinder``.

    Args:
        topics: Any number of topic names. Must be a tuple (hashable) because
            the function is wrapped in ``lru_cache``.

    Returns:
        Tuple with the values of the topic -> page id mapping.
    """
    topic_to_pageid = dict(PageidFinder().get_payload(topics))
    return tuple(topic_to_pageid.values())


def _get_edgedir(g, id0, id1):
    """Return the arrow describing how ``id0`` and ``id1`` are linked in ``g``.

    ``"->"`` means only the edge ``id0 -> id1`` exists, ``"<-"`` means only
    ``id1 -> id0`` exists, and ``"--"`` covers every other case (edges in both
    directions, or no edge at all).
    """
    # has_edge is a direct adjacency lookup; scanning the neighbors iterator
    # with `in` walks every neighbour of the node.
    forward = g.has_edge(id0, id1)
    backward = g.has_edge(id1, id0)

    if forward and not backward:
        return "->"
    if backward and not forward:
        return "<-"
    return "--"


def make_readable_path(graph, path, df_nodes):
    """Render a path of page ids as ``"Title -> Title -- Title"``.

    Args:
        graph: Graph used to decide the arrow between consecutive path nodes
            (see ``_get_edgedir``).
        path: Sequence of page ids.
        df_nodes: DataFrame with ``page_id`` and ``title`` columns.

    Returns:
        The titles of the path nodes joined by direction arrows. A page id
        with no row in ``df_nodes`` is shown as the id itself instead of
        raising. An empty path gives an empty string.
    """
    path = list(path)

    # One pass over the frame for the whole path instead of one scan per node.
    matches = df_nodes.loc[df_nodes["page_id"].isin(path), ["page_id", "title"]]
    title_by_id = {}
    for page_id, title in zip(matches["page_id"], matches["title"]):
        # Keep the first title seen for an id, like the original `.iloc[0]`.
        title_by_id.setdefault(page_id, title)

    names = [title_by_id[p] if p in title_by_id else str(p) for p in path]
    connectors = [_get_edgedir(graph, p0, p1) for p0, p1 in zip(path, path[1:])]
    # zip stops at the shorter list, so the last name is appended separately.
    return " ".join([f"{n} {c}" for n, c in zip(names, connectors)] + names[-1:])

In [38]:
# Location to explore as a (latitude, longitude) pair: later cells read
# latlon[0] as `lat` and latlon[1] as `lon`.
latlon = (44.8113, -91.4985)
# Alternative location, kept for quick switching:
#latlon = (42.460021, -74.647030)

In [39]:
# Pages whose out-links are collected below as candidate genre pages.
genre_pageid = (
    559484,
    559485,
    559486,
    559487,
)

# Build the IN list by hand: interpolating the tuple's repr into the query
# would emit a trailing comma ("(559484,)") as soon as only one id is listed,
# which is not valid SQL. At least one id is still required.
_genre_id_sql = ", ".join(str(int(page_id)) for page_id in genre_pageid)

with BigQueryInterface(project_id=PROJECT_ID) as bq:
    # Every distinct page linked from any of the seed pages above.
    _df_genre = bq.query(f"""
    SELECT DISTINCT g AS node
    FROM pages.links, UNNEST(out_links) AS g
    WHERE page_id IN ({_genre_id_sql})
    """)

    genre_pageids = _df_genre["node"].tolist()
    df_genres = query_nodes(bq, nodes=genre_pageids)

# Keep rows whose type root is "Creation" plus rows with no type root at all.
idx = (
    (df_genres["page_typeroot"] == "Creation")
    | df_genres["page_typeroot"].isna()
)

all_genres = df_genres.loc[idx, "title"].tolist()

In [40]:
# Topic names to connect to the location; converted to a tuple and passed to
# get_topic_pageids in the next cell.
topics = ["Psychedelic rock", "Rock music", "Jazz"]

In [41]:
# Seed set 1: page ids around the chosen coordinate (latlon is (lat, lon)).
geo_pageids = find_nearby_pageids(
    num_to_keep=NUM_NEARBY_TO_KEEP, lat=latlon[0], lon=latlon[1]
)

# Seed set 2: page ids of the chosen topics. The list is converted to a tuple
# because get_topic_pageids is wrapped in lru_cache and needs a hashable argument.
topic_pageids = get_topic_pageids(tuple(topics))

In [42]:
# Candidate whitelist of page types. Currently only referenced by the
# commented-out filter variant further down in this cell.
good_types = {
    "Person/Musician",
    "Organization/MusicalGroup",
    "Creation/CreativeWork",
}

with BigQueryInterface(project_id=PROJECT_ID) as bq:
    # Expand both seed sets: extended-nearby pages for the geo seeds and
    # neighbours for the topic seeds.
    df_geo_neighbors = query_nearby_extended(bq, nodes=geo_pageids)
    df_topic_neighbors = query_neighbors(bq, nodes=topic_pageids)

    # Active filter: keep topic neighbours that have a type root and whose
    # page_type is not "Place".
    _idx_good_topic = ~(df_topic_neighbors["page_typeroot"].isna() | df_topic_neighbors["page_type"].isin(["Place"]))
    # Alternative filters that were tried (whitelist by type / any typed page):
    #_idx_good_topic = df_topic_neighbors["page_type"].isin(good_types)
    #_idx_good_topic = ~df_topic_neighbors["page_type"].isna()
    
    # De-duplicated union of all geo neighbours, the filtered topic
    # neighbours and both seed sets. Built as a tuple via a set, so the
    # element order is whatever the set iteration yields.
    all_node_ids = tuple(set(
        df_geo_neighbors.loc[:, "page_id"].tolist() +
        df_topic_neighbors.loc[_idx_good_topic, "page_id"].tolist() +
        list(geo_pageids) +
        list(topic_pageids)
    ))

    # Node attributes and edges for that id set; both feed the graph built
    # in the following cells.
    df_nodes = query_nodes(bq, nodes=all_node_ids)
    df_edges = query_edges(bq, nodes=all_node_ids)

In [43]:
# page_id -> link-count lookups taken straight from the df_nodes columns.
# zip over the columns avoids building a Series per row (as iterrows does)
# three times over; a repeated page_id keeps its last row, as before.
weightin_dict = dict(zip(df_nodes["page_id"], df_nodes["num_in_links"]))
weightout_dict = dict(zip(df_nodes["page_id"], df_nodes["num_out_links"]))
weighttot_dict = dict(zip(df_nodes["page_id"], df_nodes["degree"]))

In [44]:
# Directed edges as (from_node, to_node, weight). The weight is the out-link
# count of the source page, defaulting to 1 when the source has no entry in
# weightout_dict. Iterating the two columns directly avoids the per-row Series
# that iterrows would build.
weighted_dir_edges = [
    (from_node, to_node, weightout_dict.get(from_node, 1))
    for from_node, to_node in zip(df_edges["from_node"], df_edges["to_node"])
]

# Collapse both directions of a link into one undirected edge, keyed by the
# ordered id pair, keeping the smaller of the two directed weights.
weighted_undir_edges = {}
for e0, e1, w in weighted_dir_edges:
    k = (min(e0, e1), max(e0, e1))
    weighted_undir_edges[k] = min(w, weighted_undir_edges.get(k, w))

# Directed graph: used later to render link direction in readable paths.
graph = nx.DiGraph()
graph.add_weighted_edges_from(weighted_dir_edges, weight="weight")

# Undirected graph: used later for the path search itself.
graph_undir = nx.Graph()
graph_undir.add_weighted_edges_from(
    (e0, e1, w) for (e0, e1), w in weighted_undir_edges.items()
)

In [45]:
# Every graph node that is neither a geo seed nor a topic seed; these are the
# only nodes allowed as intermediate hops.
nonseed_nodes = set(graph.nodes).difference(geo_pageids, topic_pageids)
paths = []

for geo_id, topic_id in product(geo_pageids, topic_pageids):
    # Search graph for this pair: all non-seed nodes plus the two endpoints,
    # so a path can never pass through another seed.
    _graph = graph_undir.subgraph(nonseed_nodes | {geo_id, topic_id})
    try:
        # cutoff=2 limits the search to paths of at most two edges
        # (endpoint - optional intermediate - endpoint).
        _weighted_paths = [
            (nx.path_weight(_graph, _path, weight="weight"), _path)
            for _path in nx.all_simple_paths(
                _graph, source=geo_id, target=topic_id, cutoff=2
            )
        ]
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # An endpoint is missing from the graph: nothing to record for this pair.
        continue
    paths.extend(_weighted_paths)

# Show the paths from lightest to heaviest; arrows come from the directed graph.
sorted(
    ((w, make_readable_path(graph, p, df_nodes)) for w, p in paths),
    key=lambda x: x[0],
)

[(87, 'University of Wisconsin–Eau Claire -> Jazz'),
 (194, 'University of Wisconsin–Eau Claire -> Lewis Nash -- Jazz'),
 (234, 'University of Wisconsin–Eau Claire -> Benny Goodman -- Jazz'),
 (262, 'University of Wisconsin–Eau Claire -> Gary Burton -- Jazz'),
 (277,
  'University of Wisconsin–Eau Claire -> Chris Potter (jazz saxophonist) -- Jazz'),
 (355, 'University of Wisconsin–Eau Claire -> Count Basie -- Jazz'),
 (476, 'Eau Claire, Wisconsin <- Bruce Hornsby -- Rock music'),
 (526,
  "St. Patrick's Church (Eau Claire, Wisconsin) -> Catholic Church <- Jazz"),
 (601, 'University of Wisconsin–Eau Claire -> The New York Times <- Jazz'),
 (601, 'University of Wisconsin–Eau Claire -> Woody Herman <- Jazz'),
 (601, 'University of Wisconsin–Eau Claire -> Charlie Byrd <- Jazz'),
 (601, 'University of Wisconsin–Eau Claire -> Bill Evans <- Jazz'),
 (789, 'Eau Claire, Wisconsin -> Catholic Church <- Jazz'),
 (789, 'Eau Claire, Wisconsin -> World War II <- Jazz'),
 (793, 'Lake Altoona (Wiscons

In [46]:
# Raw (weight, [page_id, ...]) tuples in the order they were found (unsorted).
paths

[(476, [151296, 619068, 25423]),
 (1061, [151296, 5247, 25423]),
 (789, [151296, 606848, 15613]),
 (789, [151296, 32927, 15613]),
 (808, [2895089, 3434750, 25423]),
 (1078, [2895089, 3434750, 15613]),
 (277, [496729, 2029104, 15613]),
 (87, [496729, 15613]),
 (601, [496729, 30680, 15613]),
 (601, [496729, 284282, 15613]),
 (601, [496729, 1599087, 15613]),
 (601, [496729, 155974, 15613]),
 (355, [496729, 68090, 15613]),
 (194, [496729, 669656, 15613]),
 (234, [496729, 53855, 15613]),
 (262, [496729, 588791, 15613]),
 (833, [2049193, 3434750, 25423]),
 (1103, [2049193, 3434750, 15613]),
 (818, [3843690, 3434750, 25423]),
 (1088, [3843690, 3434750, 15613]),
 (793, [36492409, 3434750, 25423]),
 (1063, [36492409, 3434750, 15613]),
 (526, [34636869, 606848, 15613]),
 (880, [23332726, 3434750, 25423]),
 (1150, [23332726, 3434750, 15613])]