In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
from typing import List, Tuple
from itertools import product
from functools import lru_cache


from flaskapp.bigquery_interface import (
    BigQueryInterface,
    query_nodes,
    query_edges,
    query_nearby,
    query_nearby_extended, 
    query_neighbors
)

from flaskapp.wikidata_interfaces import PageidFinder

import pandas as pd
import networkx as nx

In [90]:
# Project id handed to every BigQueryInterface opened in this notebook.
PROJECT_ID = "tourguide-388723"
# Default for the `num_to_keep` argument forwarded to query_nearby.
NUM_NEARBY_TO_KEEP = 25


@lru_cache()
def find_nearby_pageids(lon: float, lat: float, num_to_keep: int = NUM_NEARBY_TO_KEEP) -> Tuple[int, ...]:
    """Return the page ids found around a coordinate.

    The lookup runs three queries inside one ``BigQueryInterface`` session:
    ``query_nearby`` for the coordinate, ``query_nearby_extended`` on those
    page ids, and ``query_nodes`` on the de-duplicated union of both results.

    Args:
        lon: Longitude forwarded to ``query_nearby``.
        lat: Latitude forwarded to ``query_nearby``.
        num_to_keep: Forwarded to ``query_nearby`` (presumably a cap on the
            number of nearby pages -- confirm in ``bigquery_interface``).

    Returns:
        The ``page_id`` column of the ``query_nodes`` result, as a tuple.

    Results are memoised by ``lru_cache`` on the argument values, so the body
    must depend only on its arguments.
    """
    with BigQueryInterface(project_id=PROJECT_ID) as bq:
        # Use the function's own arguments rather than a notebook-level
        # `latlon`; otherwise the cache key and the executed query can disagree.
        df_nearby = query_nearby(bq, lat=lat, lon=lon, num_to_keep=num_to_keep)
        nearby_ids = df_nearby["page_id"].tolist()
        df_nearby_extended = query_nearby_extended(bq, nearby_ids)
        # De-duplicate before asking for node rows.
        all_nodes = list(set(nearby_ids + df_nearby_extended["page_id"].tolist()))
        df_geo_nodes = query_nodes(bq, all_nodes)

    return tuple(df_geo_nodes["page_id"].tolist())

@lru_cache()
def get_topic_pageids(topics: Tuple[str, ...]) -> Tuple[int, ...]:
    """Resolve topic names to page ids through ``PageidFinder``.

    Args:
        topics: Topic names as a tuple of any length. A tuple (not a list) is
            required because ``lru_cache`` hashes its arguments.

    Returns:
        The values of the mapping built from
        ``PageidFinder().get_payload(topics)``, in that mapping's order.
    """
    # presumably get_payload yields (topic, page_id) pairs or a mapping -- TODO confirm
    topic_to_pageid = dict(PageidFinder().get_payload(topics))
    return tuple(topic_to_pageid.values())


def _get_edgedir(g, id0, id1):
    forward = id1 in g.neighbors(id0)
    backward = id0 in g.neighbors(id1)

    if forward and not backward:
        return "->"
    elif backward and not forward:
        return "<-"
    else:
        return "--"


def make_readable_path(graph, path, df_nodes):
    """Render a path of page ids as ``"Title A -> Title B <- Title C"``.

    Args:
        graph: Graph passed to ``_get_edgedir`` to pick the connector between
            each consecutive pair of nodes.
        path: Sequence of page ids.
        df_nodes: Frame with ``page_id`` and ``title`` columns.

    Returns:
        Titles joined by direction markers; an empty string for an empty path.
        A page id with no row in ``df_nodes`` is shown as the id itself
        instead of raising ``IndexError``.
    """
    def _title_for(page_id):
        matches = df_nodes.loc[df_nodes["page_id"] == page_id, "title"]
        # Not every node id is guaranteed to have a row in df_nodes; fall back
        # to the raw id so one unknown node does not abort the formatting.
        return str(matches.iloc[0]) if len(matches) else str(page_id)

    names = [_title_for(p) for p in path]
    connectors = [_get_edgedir(graph, p0, p1) for p0, p1 in zip(path[:-1], path[1:])]
    return " ".join([f"{n} {c}" for n, c in zip(names[:-1], connectors)] + names[-1:])

In [23]:
# Starting coordinate as (lat, lon): later cells read latlon[0] as the latitude
# and latlon[1] as the longitude.
latlon = (44.8113, -91.4985)
# Alternative starting point, currently disabled:
#latlon = (42.460021, -74.647030)

In [24]:
# Topic titles to connect the location to. The list is converted to a tuple
# before lookup because get_topic_pageids is wrapped in lru_cache, which needs
# hashable arguments.
topics = [
    "List of cryptids",
    "Rock music",
    "American Revolution"
]

In [25]:
# Seed page ids for the location and for the topics. Both helpers are wrapped in
# lru_cache, which may treat different call shapes (positional vs. keyword,
# keyword order, omitted defaults) as separate cache entries -- keep these call
# forms stable so re-running the cell hits the cache.
geo_pageids = find_nearby_pageids(lon=latlon[1], lat=latlon[0], num_to_keep=NUM_NEARBY_TO_KEEP)
topic_pageids = get_topic_pageids(tuple(topics))

In [42]:
# One BigQuery session for all four queries of this cell.
with BigQueryInterface(project_id=PROJECT_ID) as bq:
    # query_neighbors result for each seed set (presumably the pages linked
    # to/from the seeds -- confirm in bigquery_interface).
    df_geo_neighbors = query_neighbors(bq, nodes=geo_pageids)
    df_topic_neighbors = query_neighbors(bq, nodes=topic_pageids)

    # De-duplicated union of the neighbour ids and both seed sets.
    all_node_ids = tuple(set([
        *df_geo_neighbors["page_id"].tolist(),
        *df_topic_neighbors["page_id"].tolist(),
        *geo_pageids,
        *topic_pageids,
    ]))

    # Node rows and edge rows for that combined id set.
    df_nodes = query_nodes(bq, nodes=all_node_ids)
    df_edges = query_edges(bq, nodes=all_node_ids)

In [64]:
# page_id -> link-count lookups, built straight from the columns instead of
# three row-by-row iterrows() passes (iterrows builds a Series per row and does
# not preserve column dtypes). For a repeated page_id the last row wins, exactly
# as with a dict comprehension.
weightin_dict = dict(zip(df_nodes["page_id"].tolist(), df_nodes["num_in_links"].tolist()))
weightout_dict = dict(zip(df_nodes["page_id"].tolist(), df_nodes["num_out_links"].tolist()))
weighttot_dict = dict(zip(df_nodes["page_id"].tolist(), df_nodes["degree"].tolist()))

In [94]:
from math import sqrt

In [169]:
# Directed edge weight: harmonic mean of the source's out-link count and the
# target's in-link count. Ids missing from the lookup dicts count as 1.
weighted_dir_edges = [
    (
        row["from_node"],
        row["to_node"],
        2 / (
            (1 / weightout_dict.get(row["from_node"], 1)) +
            (1 / weightin_dict.get(row["to_node"], 1))
        ),
    )
    for _idx, row in df_edges.iterrows()
]

# Collapse both directions of a node pair onto one (low id, high id) key and
# keep the smaller of the weights seen for that pair.
weighted_undir_edges = {}
for e0, e1, w in weighted_dir_edges:
    k = (min(e0, e1), max(e0, e1))
    if k not in weighted_undir_edges or not (weighted_undir_edges[k] < w):
        weighted_undir_edges[k] = w

# Directed graph: used to report link direction.
graph = nx.DiGraph()
graph.add_weighted_edges_from(weighted_dir_edges, weight="weight")

# Undirected graph: used for the path search.
graph_undir = nx.Graph()
graph_undir.add_weighted_edges_from(
    [(u, v, pair_weight) for (u, v), pair_weight in weighted_undir_edges.items()]
)

In [170]:
# Nodes that are neither a location seed nor a topic seed; only these may serve
# as intermediate hops.
nonseed_nodes = set(graph.nodes) - set(geo_pageids + topic_pageids)
paths = []

for geo_id, topic_id in product(geo_pageids, topic_pageids):
    # Restrict the undirected graph to this seed pair plus all non-seed nodes,
    # so a path cannot pass through another seed.
    _graph = graph_undir.subgraph(nonseed_nodes.union({geo_id, topic_id}))
    try:
        # cutoff=2 limits the search to paths of at most two edges.
        _paths = nx.all_simple_paths(_graph, geo_id, topic_id, cutoff=2)
        _weighted_paths = [
            (nx.path_weight(_graph, p, weight="weight"), p)
            for p in _paths
        ]
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Best effort: a pair that triggers either error contributes no paths.
        continue
    paths.extend(_weighted_paths)

In [171]:
# Inspect the df_nodes rows (link counts included) for two specific titles.
df_nodes.loc[df_nodes["title"].isin(["Curt Boettcher", "Classic rock"]), :]

Unnamed: 0,page_id,title,has_place_category,num_in_links,num_out_links,degree
714,1035241,Curt Boettcher,True,55,105,160
2116,294091,Classic rock,False,899,78,977


In [174]:
# Inspect the df_nodes row for a single title.
df_nodes.loc[df_nodes["title"].isin(["Canon law of the Episcopal Church in the United States"]), :]

Unnamed: 0,page_id,title,has_place_category,num_in_links,num_out_links,degree
1174,12188132,Canon law of the Episcopal Church in the Unite...,False,3,104,107


In [178]:
# Look up one page_id directly; the recorded output shows no matching row,
# i.e. this id is not present in df_nodes.
df_nodes.loc[df_nodes["page_id"] == 69803556, :]

Unnamed: 0,page_id,title,has_place_category,num_in_links,num_out_links,degree


In [177]:
# List every title in df_nodes that contains "Eau ".
df_nodes.loc[df_nodes["title"].str.contains("Eau "), "title"].tolist()

['Christ Church Cathedral (Eau Claire)',
 'Immanuel Lutheran College (Eau Claire)',
 'Foster, Eau Claire County, Wisconsin',
 'Eau Claire/Chippewa Falls',
 'Owen Park, Eau Claire, Wisconsin',
 'Lincoln, Eau Claire County, Wisconsin',
 'Eau Claire Bears',
 'Eau Claire/Chippewa Falls metropolitan area',
 'Eau Claire River (Chippewa River tributary)',
 'Eau Claire Wisconsin',
 'Greater Eau Claire',
 'Eau Claire County, WI',
 'Eau Claire City Council',
 'Eau Galle River',
 'Chippewa Falls/Eau Claire',
 'University of Wisconsin, Eau Claire',
 'Otter Creek, Eau Claire County, Wisconsin',
 'Eau Claire County, Wisconsin',
 'UW-Eau Claire',
 'Episcopal Diocese of Eau Claire',
 'Seymour, Eau Claire County, Wisconsin',
 'Eau Claire, Wisconsin',
 'Eau Claire - Chippewa Falls metropolitan area',
 'Eau Claire Masonic Temple',
 'Eau Claire metropolitan area',
 'Eau Claire-Menomonie, WI CSA',
 'University of Wisconsin-Eau Claire',
 'Wilson, Eau Claire County, Wisconsin',
 'Memorial High School (Eau Cl

In [172]:
# Rank the discovered paths by ascending weight and render each one as text.
[
    (w, make_readable_path(graph, p, df_nodes))
    for w, p in sorted(paths, key=lambda weighted: weighted[0])
]

[(60.757345511606324, 'Eau Claire, Wisconsin -- Eaux Claires -> Rock music'),
 (115.95060362173038,
  'Episcopal Diocese of Eau Claire -> Jackson Kemper -> American Revolution'),
 (179.5331064874212,
  'Chippewa River (Wisconsin) <- Wisconsin Central Railroad (1871–1899) -> American Revolution'),
 (222.94098564903916,
  'Eau Claire, Wisconsin <- Nathaniel P. Tallmadge -> American Revolution'),
 (240.1341302072302,
  'Episcopal Diocese of Eau Claire <- Canon law of the Episcopal Church in the United States -> American Revolution'),
 (268.4709688417238,
  'Wisconsin Department of Natural Resources <- Open-fields doctrine -> American Revolution'),
 (269.19234484717543,
  'Eau Claire, Wisconsin -> Roman Catholic Diocese of La Crosse -> American Revolution'),
 (289.2173668259406,
  'Eau Claire County, Wisconsin <- Roman Catholic Diocese of La Crosse -> American Revolution'),
 (384.45641770458315, 'Eau Claire, Wisconsin <- Curt Boettcher -> Rock music'),
 (427.99627003669946,
  'Eau Claire M

In [290]:
class ArticleNetwork:
    """Seed page ids that tie a geographic point to a set of topics.

    On construction the instance stores the coordinate and topics, then looks
    up ``nearby_pageids`` via ``find_nearby_pageids`` and ``topic_pageids`` via
    ``get_topic_pageids``.
    """

    def __init__(self, latlon: Tuple[float, float], topics: List[str], **kwargs):
        """Store the inputs and resolve both seed sets.

        Args:
            latlon: ``(lat, lon)`` pair.
            topics: Topic names; kept as a tuple so they can be passed to the
                cached lookup.
            **kwargs: Accepted but not used by the visible code.

        Raises:
            AssertionError: If either lookup returns no page ids.
        """
        lat, lon = latlon
        self.lat = lat
        self.lon = lon
        self.topics = tuple(topics)
        self.nearby_pageids = find_nearby_pageids(lon=self.lon, lat=self.lat)
        self.topic_pageids = get_topic_pageids(topics=self.topics)

        # Both seed sets must be non-empty for the instance to be usable.
        assert len(self.nearby_pageids) > 0
        assert len(self.topic_pageids) > 0

    def __str__(self):
        """Two-line summary: the coordinate, then the topics."""
        return f"FROM ({self.lat}, {self.lon})\nTO: {self.topics}"

    def _build_graph(self):
        """Placeholder; has no behaviour yet and returns ``None``."""
        return None
        

In [291]:
# Build the network for the current `latlon` and `topics`; the constructor runs
# both seed lookups and asserts that each returned at least one page id.
an = ArticleNetwork(latlon, topics)

In [292]:
# print() goes through ArticleNetwork.__str__; the class defines no __repr__,
# so a bare `an` would show the default object repr instead.
print(an)

FROM (42.460021, -74.64703)
TO: ('List of cryptids', 'Rock music', 'American Revolution')
