In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [88]:
from typing import List, Tuple, Dict
from itertools import product
from functools import lru_cache
from math import log1p, exp

from flaskapp.bigquery_interface import (
    BigQueryInterface,
    create_s2_table,
    query_nearby,
    query_mutual,
    query_bridges,
)

from flaskapp.wikidata_interfaces import PageidFinder

import pandas as pd
import networkx as nx

In [113]:
PROJECT_ID = "tourguide-388723"

@lru_cache()
def find_nearby_pageids(lon: float, lat: float) -> Tuple[Dict[str, int], Dict[int, int]]:
    with BigQueryInterface(project_id=PROJECT_ID) as bq:
        df_nearby = query_nearby(bq, lat=latlon[0], lon=latlon[1])

    output_ids = {}
    output_weights = {}
    if not df_nearby.empty:
        output_ids = df_nearby.set_index("page_id")["title"].to_dict()
        output_weights = df_nearby.set_index("page_id")["num_in_links"].to_dict()
    return output_ids, output_weights

@lru_cache()
def get_topic_pageids(topic: str) -> Tuple[Dict[str, int], Dict[int, int]]:
    pageid = PageidFinder().get_payload([topic])[0][1]
    output_ids = {pageid: topic}
    output_weights = {pageid: 1}

    # with BigQueryInterface(project_id=PROJECT_ID) as bq:
    #     df_topics = query_mutual(bq, pageid)

    # output_ids = {}
    # output_weights = {}
    # if not df_topics.empty:
    #     output_ids = df_topics.set_index("page_id")["title"].to_dict()
    #     output_weights = df_topics.set_index("page_id")["num_in_links"].to_dict()
    return output_ids, output_weights


In [114]:
# with BigQueryInterface(project_id=PROJECT_ID) as bq:
#     create_s2_table(bq)

In [115]:
#latlon = (44.8113, -91.4985)
#latlon = (44.79631638245842, -91.47084979690264)
#latlon = (42.460021, -74.647030)
#latlon = (30.274776, -97.740307)
latlon = (30.267222, -97.743056)


topic = "Psychedelic rock"  #, "Rock music", "Jazz"

In [116]:
nearby_pages, nearby_weights = find_nearby_pageids(latlon[1], latlon[0])
topic_pages, topic_weights = get_topic_pageids(topic)

In [117]:
with BigQueryInterface(project_id=PROJECT_ID) as bq:

    df_bridges = query_bridges(
        bq,
        nodes0=nearby_pages.keys(),
        nodes1=topic_pages.keys()
    )

df_bridges["weight"] = [
    exp(
        log1p(row["num_in_links"]) + -log1p(row["num_out_links"]) + 
        sum(-log1p(nearby_weights[p]) for p in row["links0"]) +
        sum(-log1p(topic_weights[p]) for p in row["links1"])
    )
    for _, row in df_bridges.iterrows()
]

df_bridges = df_bridges.sort_values("weight", ascending=False)

for _, row in df_bridges.iterrows():
    name = row["title"]
    geo_titles = "; ".join(nearby_pages[i] for i in row["links0"])
    topic_titles = "; ".join(topic_pages[i] for i in row["links1"])
    print(f"{row['weight']:.2f}: ({geo_titles}) <- ({name}) -> ({topic_titles})")


0.00: (Austin, Texas) <- (The Rolling Stones) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (Caetano Veloso) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (Janis Joplin) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (The Flaming Lips) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (Donovan) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (MGMT) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (Big Brother and the Holding Company) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (Dwight Yoakam) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (Paul Leary) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (White Denim) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (Ginger Baker) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (Meat Puppets) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (Gram Parsons) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (Roky Erickson) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (Gilbert Shelton) -> (Psychedelic rock)
0.00: (Austin, Texas) <- (Rip Off Press) -> (P