In [1]:
%load_ext autoreload
%autoreload 2

In [66]:
from typing import List, Tuple
from itertools import product
from functools import lru_cache


from flaskapp.bigquery_interface import (
    BigQueryInterface,
    query_nodes,
    query_edges,
    query_nearby,
    query_nearby_extended
)

from flaskapp.wikidata_interfaces import PageidFinder

import pandas as pd
import networkx as nx


In [75]:
# Scratch lookup around `latlon`: nearby pages, then their extended set, then node rows for the union.
# This is the same query sequence that find_nearby_pageids wraps further down.
# NOTE(review): PROJECT_ID and latlon are only assigned in a later cell of the visible notebook,
# so this cell depends on out-of-order execution — confirm it survives Restart & Run All.
num_to_keep = 25
with BigQueryInterface(project_id=PROJECT_ID) as bq:
    df_nearby = query_nearby(bq, lat=latlon[0], lon=latlon[1], num_to_keep=num_to_keep)
    df_nearby_extended = query_nearby_extended(bq, df_nearby["page_id"].to_list())
    # Union of both page_id columns; set() removes duplicates and does not preserve order.
    all_nodes = list(set(
        df_nearby["page_id"].tolist() +
        df_nearby_extended["page_id"].tolist()
    ))
    df_nodes = query_nodes(bq, all_nodes)


In [76]:
df_nearby_extended

Unnamed: 0,page_id
0,151296
1,29404189
2,90978
3,139014


In [55]:
# Re-key both frames by page_id so the merge below can align them on the index.
# NOTE(review): df_nodes and df_nearby are rebound in place, so this cell is not idempotent —
# a second run presumably fails because "page_id" is no longer a column; confirm before re-running.
df_nodes = df_nodes.set_index("page_id")
df_nearby = df_nearby.set_index("page_id")
# Outer join on the index; indicator=True adds a "_merge" column telling which frame(s) each row came from.
_df = df_nodes.merge(df_nearby, how="outer", indicator=True, left_index=True, right_index=True)

In [63]:
from collections import Counter

In [65]:
df_nearby["

58261930
17861837
28210924
62033129
34735047
26565617
34730833
58213440
29351821
3916777
3918191


In [289]:
# Project id handed to BigQueryInterface(project_id=...).
PROJECT_ID = "tourguide-388723"
# Default value of `num_to_keep` in find_nearby_pageids.
NUM_NEARBY_TO_KEEP = 15

# (lat, lon) origin; ArticleNetwork unpacks it in that order.
# NOTE(review): the first assignment is overwritten by the very next line, so only the second takes effect.
latlon = (44.8113, -91.4985)
latlon = (42.460021, -74.647030)

# Topic titles; ArticleNetwork passes them (as a tuple) to get_topic_pageids.
topics = [
    "List of cryptids",
    "Rock music",
    "American Revolution"
]

@lru_cache()
def find_nearby_pageids(lon: float, lat: float, num_to_keep: int = NUM_NEARBY_TO_KEEP) -> Tuple[int, ...]:
    """Return the page ids of articles around a coordinate, memoized per argument set.

    Runs ``query_nearby`` for the point, feeds the resulting page ids to
    ``query_nearby_extended``, then fetches node rows for the de-duplicated
    union of both id lists with ``query_nodes``.

    Args:
        lon: Longitude, forwarded to ``query_nearby`` as ``lon``.
        lat: Latitude, forwarded to ``query_nearby`` as ``lat``.
        num_to_keep: Forwarded to ``query_nearby`` as ``num_to_keep``.

    Returns:
        The ``page_id`` column of the ``query_nodes`` result, as a tuple.
    """
    with BigQueryInterface(project_id=PROJECT_ID) as client:
        nearby = query_nearby(client, lat=lat, lon=lon, num_to_keep=num_to_keep)
        extended = query_nearby_extended(client, nearby["page_id"].to_list())

        # set() drops ids present in both frames; the resulting order is arbitrary.
        combined_ids = nearby["page_id"].tolist() + extended["page_id"].tolist()
        unique_ids = list(set(combined_ids))

        nodes = query_nodes(client, unique_ids)

    page_ids = nodes["page_id"].tolist()
    return tuple(page_ids)

@lru_cache()
def get_topic_pageids(topics: Tuple[str, ...]) -> Tuple[int, ...]:
    """Resolve topic titles to page ids, memoized per ``topics`` tuple.

    Args:
        topics: Topic titles. Must be hashable (a tuple, not a list) because
            the call is cached with ``lru_cache``.

    Returns:
        The values of the mapping built from
        ``PageidFinder().get_payload(topics)``, in that mapping's order.
    """
    # The payload is passed to dict(), so it is presumably a mapping or an
    # iterable of (topic, page_id) pairs — TODO confirm against PageidFinder.
    topic_to_pageid = dict(PageidFinder().get_payload(topics))
    return tuple(topic_to_pageid.values())


In [290]:
class ArticleNetwork:
    """Pairs a geographic origin with a set of topics and resolves both to page ids.

    Attributes set by ``__init__``:
        lat, lon: The two components of ``latlon``, in that order.
        topics: ``topics`` converted to a tuple.
        nearby_pageids: Result of ``find_nearby_pageids`` for the origin.
        topic_pageids: Result of ``get_topic_pageids`` for the topics.
    """

    def __init__(self, latlon: Tuple[float, float], topics: List[str], **kwargs):
        """Run both page-id lookups and check that each returned something.

        Args:
            latlon: ``(lat, lon)`` pair.
            topics: Topic titles.
            **kwargs: Accepted but not used.

        Raises:
            AssertionError: If either lookup returns no page ids.
        """
        lat, lon = latlon
        self.lat = lat
        self.lon = lon
        # Kept as a tuple so it is hashable for the lru_cache-wrapped lookup below.
        self.topics = tuple(topics)

        self.nearby_pageids = find_nearby_pageids(lon=self.lon, lat=self.lat)
        self.topic_pageids = get_topic_pageids(topics=self.topics)

        # Nearby ids are checked first, then topic ids.
        for found in (self.nearby_pageids, self.topic_pageids):
            assert len(found) > 0

    def __str__(self):
        """Two-line summary: origin coordinates, then the topic tuple."""
        origin = f"FROM ({self.lat}, {self.lon})"
        destination = f"TO: {self.topics}"
        return "\n".join((origin, destination))

    def _build_graph(self):
        """Placeholder; currently does nothing."""
        return None
        

In [291]:
# Constructing the network runs the nearby-page and topic lookups (both are lru_cache-memoized).
an = ArticleNetwork(latlon, topics)

In [292]:
print(an)

FROM (42.460021, -74.64703)
TO: ('List of cryptids', 'Rock music', 'American Revolution')


In [293]:
# Seeds are the nearby pages plus the topic pages (tuple concatenation; duplicates are not removed).
seed_nodes = an.nearby_pageids + an.topic_pageids
with BigQueryInterface(project_id=PROJECT_ID) as bq:
    # presumably returns edges touching the seed nodes — query_edges is not visible here; confirm.
    df_edges = query_edges(bq, seed_nodes)
    # Every page id that appears as either endpoint of a returned edge.
    nodes_in_graph = set(df_edges["from_node"].tolist() + df_edges["to_node"].tolist())
    # NOTE(review): a set is passed here while other calls to query_nodes pass a list — confirm both are accepted.
    df_nodes = query_nodes(bq, nodes_in_graph).set_index("page_id")

In [294]:
print(df_nodes.shape)
# NOTE(review): nodes_to_keep is not assigned in any visible cell; this line relies on leftover kernel state.
print(len(nodes_to_keep))

(28108, 5)
612


In [295]:
# Per-node link statistics keyed by page_id (df_nodes is indexed by page_id).
# Series.to_dict() builds each mapping in one vectorized step instead of a
# row-by-row iterrows() pass per dictionary.
weightin_dict = df_nodes["num_in_links"].to_dict()
weightout_dict = df_nodes["num_out_links"].to_dict()

weight_dict = df_nodes["degree"].to_dict()

# Divisor applied to the summed endpoint degrees to get an edge weight.
EDGE_WEIGHT_SCALE = 1000

# Lazily yields (from, to, weight) triples; weight is the scaled sum of the two
# endpoints' degrees. Edges with an endpoint missing from df_nodes are dropped.
# All three dicts share df_nodes' index as keys, so testing membership in
# weight_dict is equivalent to testing the in/out dicts and matches the lookup.
weighted_edges = (
    (src, dst, (weight_dict[src] + weight_dict[dst]) / EDGE_WEIGHT_SCALE)
    for src, dst in zip(df_edges["from_node"].tolist(), df_edges["to_node"].tolist())
    if src in weight_dict and dst in weight_dict
)

# Directed graph; the generator above is consumed (and exhausted) here.
graph = nx.DiGraph()
graph.add_weighted_edges_from(weighted_edges, weight="weight")

In [296]:
def _get_edgedir(g, id0, id1):
    forward = id1 in g.neighbors(id0)
    backward = id0 in g.neighbors(id1)

    if forward and not backward:
        return "->"
    elif backward and not forward:
        return "<-"
    else:
        return "--"

def make_readable_path(graph, path, df_nodes):
    """Render a node path as titles joined by direction arrows.

    Args:
        graph: Graph passed to ``_get_edgedir`` to pick the arrow between
            each consecutive pair of nodes.
        path: Sequence of node ids (must support slicing).
        df_nodes: Table indexed by node id with a ``"title"`` column,
            accessed through ``.loc``.

    Returns:
        A single string such as ``"A -> B <- C"``; an empty string for an
        empty path.
    """
    titles = [df_nodes.loc[page_id, "title"] for page_id in path]

    pieces = []
    # Each title except the last is followed by the arrow towards its successor.
    for title, src, dst in zip(titles, path, path[1:]):
        arrow = _get_edgedir(graph, src, dst)
        pieces.append(f"{title} {arrow}")

    # Slice rather than index so an empty path contributes nothing.
    pieces.extend(titles[-1:])
    return " ".join(pieces)
    
# Undirected copy of the link graph; with reciprocal=False an edge is kept when a link exists in either direction.
graph_undir = graph.to_undirected(reciprocal=False)
# Nodes that are neither a nearby page nor a topic page.
nonseed_nodes = set(graph.nodes) - set(an.nearby_pageids + an.topic_pageids)
# Accumulates (total_weight, path) tuples across all (nearby, topic) pairs.
paths = []

for geo_id, topic_id in product(an.nearby_pageids, an.topic_pageids):
    # Restrict to non-seed nodes plus this pair's two endpoints, so a path cannot pass through another seed.
    _graph = graph_undir.subgraph(nonseed_nodes.union({geo_id, topic_id}))
    try:
        # cutoff=3 bounds the search depth, i.e. paths of at most three edges (per networkx docs).
        _paths = list(nx.algorithms.simple_paths.all_simple_paths(
            _graph,
            geo_id,
            topic_id,
            cutoff=3
        ))

        # Score each path by the sum of its edges' "weight" attribute.
        _weighted_paths = [
            (nx.path_weight(_graph, p, weight="weight"), p)
            for p in _paths
        ]
        if _weighted_paths:
            paths += _weighted_paths

    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Best-effort: a pair whose endpoint is absent from the subgraph (or that has no path) is skipped silently.
        pass

In [297]:
df_nodes.loc[an.nearby_pageids, :]

Unnamed: 0_level_0,title,has_place_category,num_in_links,num_out_links,degree
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
126355,"Harpersfield, New York",True,22,29,51
56505,"Schoharie County, New York",True,275,140,415
127145,"Jefferson, New York",True,15,28,43
54167,"Delaware County, New York",True,334,110,444
11631432,Parker 13-Sided Barn,True,1,3,4
520282,"Oneonta, New York",True,286,75,361
734128,Mount Jefferson (New York),False,5,10,15
1182620,List of towns in New York,True,172,940,1112
231223,Susquehanna River,True,1354,199,1553


In [298]:
# Rank every candidate path by total weight (ascending), rendering each as a readable title chain.
sorted([
    (w, make_readable_path(graph, p, df_nodes))
    for w, p in paths
], key=lambda x: x[0])

[(5.498, 'Jefferson, New York -> American Revolution'),
 (5.5680000000000005,
  'Jefferson, New York <- Summit, New York -> American Revolution'),
 (5.601999999999999,
  'Jefferson, New York -- Blenheim, New York -> American Revolution'),
 (5.654,
  'Harpersfield, New York <- Franklin, Delaware County, New York -> American Revolution'),
 (5.754,
  'Harpersfield, New York <- Tryon County, New York -> American Revolution'),
 (5.886, 'Oneonta, New York <- Summit, New York -> American Revolution'),
 (5.92, 'Oneonta, New York <- Blenheim, New York -> American Revolution'),
 (5.934,
  'Schoharie County, New York <- Daniel F. Bakeman -> American Revolution'),
 (5.94,
  'Schoharie County, New York -- Summit, New York -> American Revolution'),
 (5.94,
  'Schoharie County, New York <- Prattsville (town), New York -> American Revolution'),
 (5.95,
  'Schoharie County, New York <- Ashland, Greene County, New York -> American Revolution'),
 (5.9639999999999995,
  'Oneonta, New York <- Franklin, Del