# Interactive Innovation Mapping with Python

Bring some life to your innovation mapping notebook analysis with interactive data viz! 🕹

---

This tutorial covers a few examples of interactive data visualisation with Python that can be used to create rich analysis notebooks or as inline prototypes for web visualisations.

In this tutorial, we build on Bokeh and make use of HoloViews, GeoViews and Datashader. There are, however, many other options for interactive data visualisation with Python, including Altair, Plotly, Dash, and even Matplotlib, so try them out too!

## Preamble

In [None]:
%load_ext autoreload
%autoreload 2
# install the im_tutorials package
# !pip install git+https://github.com/nestauk/im_tutorials.git

In [None]:
# useful Python tools
import itertools
import collections

# matplotlib for static plots
import matplotlib.pyplot as plt
# networkx for networks
import networkx as nx
# numpy for mathematical functions
import numpy as np
# pandas for handling tabular data
import pandas as pd
# seaborn for pretty statistical plots
import seaborn as sns

# Use the fully qualified option name: the bare 'max_columns' pattern can
# match more than one option in recent pandas (e.g. the Styler render
# option as well), which makes set_option raise an OptionError.
pd.set_option('display.max_columns', 99)

# basic bokeh imports for an interactive scatter plot or line chart
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Circle, Line

# NB: If using Google Colab, this function must be run at 
# the end of any cell that you want to display a bokeh plot.
# If using Jupyter, then this line need only appear once at
# the start of the notebook.
output_notebook()

## Import Data

In [None]:
# The im_tutorials datasets module can be used to easily load datasets.
from im_tutorials.data.gtr import gtr_table, gtr_link_table, gtr_table_list 

In [None]:
gtr_table_list()

In [None]:
gtr_projects_df = gtr_table('projects')
gtr_funds_df = gtr_table('funds')
gtr_funds_link_table = gtr_link_table('funds')

In [None]:
gtr_orgs_df = gtr_table('organisations')
gtr_org_locs_df = gtr_table('organisations_locations')

- Join the funding table to the link table to get project ids. Group by project to get the start and end dates and the sum of funding.
- Group leads and collaborators and create a network.
- Join with project descriptions and make a collaboration network with SDGs.

In [None]:
from pandas import Grouper

In [None]:
# Attach the link-table columns to each funding record via the shared 'id' key.
gtr_funds_df = gtr_funds_df.merge(gtr_funds_link_table, on='id')

In [None]:
gtr_funds_df.shape

In [None]:
gtr_funds_df.head()

In [None]:
gtr_funds_df = gtr_funds_df.drop_duplicates(['project_id', 'amount'])

In [None]:
print('Earliest start date:', gtr_funds_df['start'].min())
print('Earliest end date:', gtr_funds_df['end'].min())
print('\n')
print('Latest start date:', gtr_funds_df['start'].max())
print('Latest end date:', gtr_funds_df['end'].max())

In [None]:
gtr_funds_df['start'].dt.year.value_counts()[:15]

In [None]:
from im_tutorials.data.sdg import sdg_web_articles

In [None]:
gtr_funds_df['end'].dt.year.value_counts()

In [None]:
gtr_projects_df.set_index('id').loc[gtr_funds_df[gtr_funds_df['end'].dt.year == 2121]['project_id']]

In [None]:
gtr_funds_df[gtr_funds_df['end'].dt.year == 2100]

In [None]:
gtr_projects_df.set_index('id').loc[gtr_funds_df[gtr_funds_df['end'].dt.year == 2100]['project_id']]

In [None]:
# Keep only funding records whose start year is after 2005 and before 2019.
start_year = gtr_funds_df['start'].dt.year
gtr_funds_df = gtr_funds_df[(start_year > 2005) & (start_year < 2019)]

In [None]:
gtr_funds_df['end'].max()

In [None]:
duration = gtr_funds_df['end'] - gtr_funds_df['start']

In [None]:
plt.hist(duration.dt.days / 365.25, bins=100);

In [None]:
gtr_projects_funds_df = gtr_projects_df.merge(
    gtr_funds_df, left_on='id', right_on='project_id', how='left')

In [None]:
gtr_project_funds_df = gtr_projects_funds_df.drop_duplicates(subset=['project_id'])

In [None]:
# Build the column with assign() so a new frame is returned instead of
# writing into the result of drop_duplicates(), which pandas may flag with
# a SettingWithCopyWarning.
gtr_project_funds_df = gtr_project_funds_df.assign(
    start_year=gtr_project_funds_df['start_y'].dt.year)

In [None]:
hv.Scatter(gtr_project_funds_df, 'start_year', 'amount')

In [None]:
# Sum award amounts per year-start bucket of 'start_y' and per lead funder,
# keep the 2006-01-01 .. 2018-01-01 buckets, and pivot to one column per funder.
grouper = Grouper(key='start_y', freq='YS')
first_bucket = pd.to_datetime('2006-01-01')
last_bucket = pd.to_datetime('2018-01-01')
amount_year_sum = (
    gtr_project_funds_df
    .groupby([grouper, 'leadFunder'])['amount']
    .sum()
    .loc[first_bucket:last_bucket]
    .unstack()
)

In [None]:
amount_year_sum_rolling = amount_year_sum.rolling(3).mean()

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(amount_year_sum_rolling, marker='o')
ax.legend(amount_year_sum_rolling.columns);

In [None]:
# Frequencies to sweep over. The name was used below without ever being
# defined; these values follow the HoloViews NdOverlay documentation example.
frequencies = [0.5, 0.75, 1.0, 1.25]


def sine_curve(phase, freq):
    """Return an hv.Curve of sin(phase + freq * x) for x = 0.0, 0.1, ..., 9.9."""
    xvals = 0.1 * np.arange(100)
    return hv.Curve((xvals, np.sin(phase + freq * xvals)))


# One curve per frequency, keyed by the frequency value.
curve_dict = {f: sine_curve(0, f) for f in frequencies}

In [None]:
def make_multi_line_plot():
    """Return an NdOverlay with one hv.Path per column of `amount_year_sum`.

    Each path plots a column's values against the frame's index values.
    """
    x_values = amount_year_sum.index.values
    paths = {c: hv.Path((x_values, amount_year_sum[c]))
             for c in amount_year_sum.columns}
    # Options are keyed by element type: the overlay holds Path elements, so
    # a 'Histogram' spec would not apply to anything in it.
    return hv.NdOverlay(paths).opts(
        'Path', width=1000, alpha=0.8, muted_alpha=0.1)


make_multi_line_plot()

In [None]:
ndoverlay = hv.NdOverlay(curve_dict, kdims='frequency')

In [None]:
from bokeh.palettes import Category10_9
from bokeh.models import PrintfTickFormatter, HoverTool

In [None]:
HoverTool??

In [None]:
# Hover tooltip for the funding lines. The formatter key carries the same
# '@' prefix as the field it formats, as current Bokeh expects.
hover = HoverTool(tooltips=[('Amount', '£@y{( 0.00 a)}'),
                           ('Year', '@x{%F}'),
                           ('Funder', '$name')],
                 line_policy='nearest',
                 formatters={'@x': 'datetime'}
                 )

p = figure(width=750, height=400, x_axis_type='datetime',
          title='Total Awards by Funder over Time')

years = amount_year_sum_rolling.index.values
# Cycle the palette so more than nine funders cannot raise an IndexError.
for c, color in zip(amount_year_sum.columns, itertools.cycle(Category10_9)):
    funding = amount_year_sum_rolling[c]
    # `legend_label` replaces the deprecated (and later removed) `legend` keyword.
    p.line(x=years, y=funding,
           legend_label=c, color=color, line_width=2, alpha=0.7, name=c,
           muted_alpha=0.1, muted_color=color)
    p.circle(x=years, y=funding,
             legend_label=c, color=color, name=c,
             muted_alpha=0.1, muted_color=color)

p.xaxis.axis_label = 'Year'
p.yaxis.axis_label = 'Total Funding'
p.yaxis[0].formatter = PrintfTickFormatter(format="£%.1e")
# Clicking a legend entry mutes (fades) that funder's glyphs.
p.legend.click_policy = "mute"
p.legend.location = 'top_left'
p.legend.label_text_font_size = '8pt'

p.add_tools(hover)

show(p)

In [None]:
ds = hv.Dataset(amount_year_sum)

In [None]:
amount_year_sum = amount_year_sum.reset_index()

In [None]:
scatter = hv.Curve(amount_year_sum, 'start_y', 'EPSRC')

In [None]:
np.arange(NLINES)[np.newaxis, :]

In [None]:
# After the earlier reset_index() the x values live in the 'start_y' column;
# otherwise they are still the frame's index. 'start_y' itself is not a line.
if 'start_y' in amount_year_sum.columns:
    x_values = amount_year_sum['start_y'].values
else:
    x_values = amount_year_sum.index.values
line_columns = [c for c in amount_year_sum.columns if c != 'start_y']

# Options are keyed by element type: the overlay holds Path elements, so a
# 'Histogram' spec would not apply to anything in it.
hv.NdOverlay(
    {c: hv.Path((x_values, amount_year_sum[c])) for c in line_columns}).opts(
    'Path', width=1000, alpha=0.8, muted_alpha=0.1)

In [None]:
# NOTE(review): `aamount_year_sum` is not defined anywhere in the visible
# notebook — presumably a typo for `amount_year_sum`; confirm. Also check the
# argument order here (data first, then dimensions) against hv.help(hv.Path).
hv.Path(aamount_year_sum.columns, amount_year_sum)

In [None]:
(np.arange(N), np.random.rand(N, NLINES) + np.arange(NLINES)[np.newaxis, :])

In [None]:
hv.Path

In [None]:
hv.help(hv.Path)

In [None]:
# Histogram of log10(total funding per project), using positive amounts only.
positive_funds = gtr_funds_df[gtr_funds_df['amount'] > 0]
project_totals = positive_funds.groupby('project_id')['amount'].sum()

fig, ax = plt.subplots()
ax.hist(np.log10(project_totals), bins=100);
# ax.set_xscale('log')

### Datashader Map

In [None]:
import datashader as ds, datashader.transfer_functions as tf, numpy as np
from datashader import spatial
from functools import partial
from datashader.utils import export_image
from datashader.colors import colormap_select, Greys9
from IPython.core.display import HTML, display

In [None]:
from colorcet import fire
from datashader.utils import lnglat_to_meters as webm

In [None]:
gtr_org_locs_df['easting'], gtr_org_locs_df['northing'] = webm(gtr_org_locs_df['longitude'], gtr_org_locs_df['latitude'])

In [None]:
import holoviews as hv
from holoviews.element import tiles
from holoviews.operation.datashader import datashade
hv.extension('bokeh', logo=False)

In [None]:
from holoviews.streams import RangeXY

In [None]:
from colorcet import kbc

In [None]:
cmap = kbc

In [None]:
import geoviews as gv
import cartopy.crs as crs

In [None]:
url = 'https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{Z}/{Y}/{X}.jpg'

map_tiles = gv.WMTS(url, crs=crs.GOOGLE_MERCATOR)

In [None]:
from holoviews.operation.datashader import datashade, dynspread


In [None]:
width=600
height=600

opts = dict(width=width, height=height, x_sampling=1, y_sampling=1, cmap=cmap, dynamic=False)
tile_opts  = dict(width=width, height=height, xaxis=None, yaxis=None, bgcolor='white', show_grid=False)


def make_view(x_range, y_range, **kwargs):
    """Return the datashaded organisation points overlaid on the map tiles.

    `x_range` and `y_range` are forwarded to `datashade`; any extra keyword
    arguments are accepted but not used.
    """
    org_points = hv.Points(gtr_org_locs_df, ['easting', 'northing'])
    shaded = datashade(org_points, x_range=x_range, y_range=y_range, **opts)
    spread = dynspread(shaded, shape='circle', threshold=.2)
    background = map_tiles.options(alpha=0.5, **tile_opts)
    return spread * background

In [None]:
dmap = hv.DynamicMap(make_view, streams=[RangeXY()])
plot = hv.renderer('bokeh').instance(mode='server').get_plot(dmap)
dmap

In [None]:
# Keep only rows whose 'rel' value is one of the three organisation codes.
org_relations = ['LEAD_ORG', 'COLLAB_ORG', 'PARTICIPANT_ORG']
df_gtr = df_gtr[df_gtr['rel'].isin(org_relations)]

In [None]:
df_gtr.head()

In [None]:
np.sum(df_gtr['id'].value_counts() > 50)

In [None]:
df_gtr['rel'].value_counts()

In [None]:
# Drop rows that have no research topics recorded.
gtr_df = gtr_df[gtr_df['research_topics'].notna()]

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from itertools import chain

from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

from annoy import AnnoyIndex

In [None]:
# One-hot encode each row's research topics, then transpose so that rows of
# `df_topics` are topics.
all_topics = chain.from_iterable(gtr_df['research_topics'])
topics = sorted(set(all_topics))
mlb = MultiLabelBinarizer(classes=topics)
topic_matrix = mlb.fit_transform(gtr_df['research_topics'])
df_topics = pd.DataFrame(topic_matrix, columns=topics).T

In [None]:
# Reduce each topic row to 30 components with truncated SVD.
svd = TruncatedSVD(n_components=30)
svd_vecs = svd.fit_transform(df_topics)

# Embed the reduced vectors in two dimensions with t-SNE.
tsne = TSNE(n_components=2)
tsne_vecs = tsne.fit_transform(svd_vecs)

In [None]:
from gensim.sklearn_api.phrases import PhrasesTransformer

In [None]:
# Index every topic's SVD vector for nearest-neighbour lookup by angular
# distance. The vector length is read from `svd_vecs` so the index stays in
# step with the number of SVD components.
t = AnnoyIndex(svd_vecs.shape[1], 'angular')
for i, vec in enumerate(svd_vecs):
    t.add_item(i, vec)

t.build(500)  # 500 trees

In [None]:
# Despite the name, this is the largest distance at which two topics are
# still linked (see the `<=` test below).
min_dist = 0.9

dists = {}
edges = []
for i, source in enumerate(df_topics.index):
    # Query the neighbours once. The item itself is skipped explicitly rather
    # than assuming it is always returned in first position.
    for n in t.get_nns_by_item(i, 5):
        if n == i:
            continue
        dist = t.get_distance(i, n)
        if dist <= min_dist:
            ns = df_topics.index[n]
            # Sort the pair so (a, b) and (b, a) map to the same edge key.
            edge = tuple(sorted([source, ns]))
            edges.append(edge)
            dists[edge] = dist



In [None]:
from collections import Counter
import networkx as nx

In [None]:
# `dists` holds exactly one distance per unique edge, so the edge list is
# built from it directly; each edge carries its distance as 'weight'.
edge_list = [[a, b, {'weight': d}] for (a, b), d in dists.items()]

g = nx.Graph()
g.add_edges_from(edge_list)

# `weight` is not a drawing keyword (newer networkx rejects unknown keywords),
# so only the node size is passed.
nx.draw(g, node_size=25)

In [None]:
plt.scatter(tsne_vecs[:, 0], tsne_vecs[:, 1])

In [None]:
# Count of 'project_id' per start year and funder, one column per funder,
# restricted to index labels 2006..2016.
funders_time_df = (
    gtr_df.groupby(['start_year', 'funder_name'])['project_id']
    .count()
    .unstack()
    .loc[2006:2016]
)
# Wrap the frame in a real ColumnDataSource; from_df() only returns a plain
# dict of columns.
cds = ColumnDataSource(funders_time_df)

In [None]:
funders_time_df.shape

In [None]:
from bokeh.palettes import Category20_11
from bokeh.models import HoverTool

In [None]:
# Take a list copy: a later cell calls cmap.pop(), which would otherwise
# mutate the shared bokeh palette (or fail if the palette is a tuple).
cmap = list(Category20_11)

In [None]:
x = cmap.pop()

In [None]:
# One tooltip row per funder column. The braces in '@{...}' keep the field
# reference intact when a column name contains spaces.
tooltips = [(f'{c}', f'@{{{c}}}') for c in funders_time_df.columns]
hover = HoverTool(tooltips=tooltips, mode='vline')

p = figure()

for i, c in enumerate(funders_time_df.columns):
    # Wrap around the palette so extra columns cannot raise an IndexError.
    color = Category20_11[i % len(Category20_11)]
    p.line(source=cds, x='start_year', y=c, line_width=2, alpha=0.9, color=color,
          name='x')
p.add_tools(hover)

show(p)

In [None]:
p = figure