In [2]:
%load_ext autoreload
%autoreload 2
from utils import load_grants
import pandas as pd
import plotly.express as px
import utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Wrap whatever utils.load_grants() returns in a DataFrame, then tally the
# rows per start year (value_counts skips missing years by default).
grants = pd.DataFrame(utils.load_grants())
grants_by_year = grants["start_year"].value_counts()

# Bar chart of the tally: start year along x, number of grants along y.
fig = px.bar(
    x=grants_by_year.index,
    y=grants_by_year.values,
    title="Grants by Year",
    labels=dict(x="Year", y="Number of Grants"),
)
fig.show()

In [10]:
# Wrap whatever utils.load_keywords() returns in a DataFrame for the cells below.
keywords = pd.DataFrame(data=utils.load_keywords())

0           1
1          10
2           2
3           3
4         217
         ... 
392462      1
392463      1
392464      1
392465      1
392466      1
Name: grants, Length: 392467, dtype: int64

In [24]:


# Record, for every keyword row, how many entries its `grants` value holds,
# then rank all keywords by that size (largest first) under a fresh 0..n-1 index.
keywords.loc[:, 'grants_len'] = keywords['grants'].map(len)
keywords_sorted = (
    keywords
    .sort_values('grants_len', ascending=False)
    .reset_index(drop=True)
)

# Number of keywords to chart.
N = 20

# Keep the N largest keywords, then flip the row order for top-to-bottom
# display in the horizontal bar chart.
top_keywords = keywords_sorted[['term', 'grants_len']].head(N).copy().iloc[::-1]

# Horizontal bars, one per keyword; the figure grows taller with N so the
# keyword labels stay readable.
fig_keywords = px.bar(
    top_keywords,
    y='term',
    x='grants_len',
    orientation='h',
    title=f'Top {N} Keywords by Number of Grants',
    labels={'grants_len': 'Number of Grants', 'term': 'Keyword'},
    height=400 + 20 * N,
)
# Wide left margin leaves room for long keyword labels; each bar is
# annotated with its own count.
fig_keywords.update_layout(margin={'l': 200, 'r': 20, 't': 60, 'b': 20})
fig_keywords.update_traces(textposition='auto', text=top_keywords['grants_len'])
fig_keywords.show()

In [None]:
# Display the grants DataFrame as this cell's output to inspect its columns.
grants

Unnamed: 0,id,title,grant_summary,funding_amount,start_year,end_year,funder,for_primary,for
0,arc/DP220100606,Australia's variable rainfall - how dry or wet...,Australia's variable rainfall - how dry or wet...,353000.00,2022.0,2025.0,arc.gov.au,406.0,0401040406040605040608
1,arc/LP0347670,Interplay of Microbiological Corrosion and All...,Interplay of Microbiological Corrosion and All...,88000.00,2004.0,2008.0,arc.gov.au,912.0,0912
2,arc/DP1097177,"The economics of happiness, public policy, and...","The economics of happiness, public policy, and...",367607.00,2010.0,2014.0,arc.gov.au,1402.0,1402
3,arc/DP0210999,Asynchronous Continuous Time Conditioning.,Asynchronous Continuous Time Conditioning. Met...,211164.00,2002.0,2008.0,arc.gov.au,806.0,0806
4,arc/DP130101838,Turning homogeneous catalysts into heterogeneo...,Turning homogeneous catalysts into heterogeneo...,390000.00,2013.0,2016.0,arc.gov.au,399.0,0306030306060399039904
...,...,...,...,...,...,...,...,...,...
60880,nhmrc/254727,Synthesis and purification of flavivirus-speci...,In this proposal we suggest to develop an anti...,140000.00,2003.0,2004.0,National Health and Medical Research Council,,
60881,nhmrc/932028,Efficacy of conductive education for children ...,,125733.60,1993.0,1995.0,National Health and Medical Research Council,,
60882,nhmrc/960887,How physiological factors influence the side e...,,165174.22,1996.0,1998.0,National Health and Medical Research Council,,
60883,nhmrc/316928,Costimulatory mechanisms for enhancing CD8 T c...,"Following an infection, a person's immune syst...",438750.00,2005.0,2007.0,National Health and Medical Research Council,,


In [26]:
# Display the keywords DataFrame as this cell's output to inspect its columns.
keywords

Unnamed: 0,term,type,description,grants,grants_len
0,animal ethics,Methodology,A framework for evaluating the moral and socie...,[arc/ANZCCART],1
1,research ethics,Methodology,Incorporates ethical review and compliance fra...,"[arc/ANZCCART, arc/DP0343014, arc/DP110103193,...",10
2,consensus-building,Methodology,A process through which diverse stakeholders (...,"[arc/ANZCCART, arc/DP0667042]",2
3,scientific integrity,Application,Enhancing the quality and fairness of scientif...,"[arc/ANZCCART, arc/DP160102523, arc/FT110100658]",3
4,community engagement,Methodology,Active involvement of community members in the...,"[arc/ANZCCART, arc/DE130100400, arc/DE13010051...",217
...,...,...,...,...,...
392462,Superiority trial design comparing central BP‑...,Methodology,Randomised superiority framework to test wheth...,[nhmrc/http://purl.org/au-research/grants/nhmr...,1
392463,aortic pressure,Methodology,"Direct measurement of pressure in the aorta, u...",[nhmrc/http://purl.org/au-research/grants/nhmr...,1
392464,clinical application of central BP,Application,Use of central blood pressure data in diagnosi...,[nhmrc/http://purl.org/au-research/grants/nhmr...,1
392465,cardiac workload,Application,Assessment of the strain placed on the heart d...,[nhmrc/http://purl.org/au-research/grants/nhmr...,1


In [28]:
# Build a long-format table (term, year, count): for each of the top-N
# keywords, how many of its grants start in each year.
selected_terms = top_keywords['term'].tolist()

# Loop-invariant: only grants with a known start year can be counted per
# year, so drop the undated ones once instead of on every iteration.
grants_with_year = grants[['id', 'start_year']].dropna(subset=['start_year'])

records = []
for term in selected_terms:
    row = keywords[keywords['term'] == term]
    if row.empty:
        continue
    # If several keyword rows share this term, only the first one is used.
    grant_ids = row['grants'].iat[0]
    # Guard: a bare string is treated as a single grant id rather than being
    # handed to isin() as-is.
    if isinstance(grant_ids, str):
        grant_ids = [grant_ids]
    # Grants referenced by this keyword that also have a start year.
    g = grants_with_year[grants_with_year['id'].isin(grant_ids)].copy()
    if g.empty:
        continue
    g['year'] = g['start_year'].astype(int)
    counts = g['year'].value_counts().sort_index()
    records.extend(
        {'term': term, 'year': int(year), 'count': int(cnt)}
        for year, cnt in counts.items()
    )

# Explicit columns so the frame always has the expected schema, even when no
# keyword matched any dated grant and `records` is empty.
df_kw_time = pd.DataFrame(records, columns=['term', 'year', 'count'])

In [None]:
# Pivot to wide format: rows = year, cols = keyword, cells = number of grants
# (0 where a keyword has no grant starting in that year).
df_pivot = (
    df_kw_time
    .pivot(index='year', columns='term', values='count')
    .fillna(0)
    .sort_index()
)

# The pivot only contains years in which at least one selected keyword has a
# grant. Re-index onto the full contiguous year range so that years with no
# grants at all appear as explicit zeros instead of being skipped, which would
# make the lines interpolate straight across the gap.
if not df_pivot.empty:
    all_years = pd.RangeIndex(
        int(df_pivot.index.min()),
        int(df_pivot.index.max()) + 1,
        name='year',
    )
    df_pivot = df_pivot.reindex(all_years, fill_value=0)

# Keep the keyword order of `selected_terms`, limited to terms that actually
# produced a column in the pivot.
present_terms = [t for t in selected_terms if t in df_pivot.columns]

# Line plot (one line per keyword).
fig_kw_time = px.line(
    df_pivot.reset_index(),
    x='year',
    y=present_terms,
    labels={'value': 'Number of Grants', 'variable': 'Keyword', 'year': 'Year'},
    title=f'Keyword mentions over time (top {len(present_terms)} keywords)'
)
# Park the vertical legend just outside the right edge of the plotting area.
fig_kw_time.update_layout(legend=dict(orientation='v', yanchor='top', y=0.95, xanchor='left', x=1.02))
fig_kw_time.show()


In [31]:

# Compact overview of the same per-year counts as a heatmap: the pivot is
# transposed so keywords run down the y-axis and years along the x-axis.
fig_kw_heat = px.imshow(
    df_pivot.transpose(),
    title=f'Heatmap: Keyword mentions over time (top {len(present_terms)} keywords)',
    labels={'x': 'Year', 'y': 'Keyword', 'color': 'Number of Grants'},
    aspect='auto',
)
# Scale the height with the number of keyword rows and leave a wide left
# margin for the keyword labels.
fig_kw_heat.update_layout(
    margin={'l': 200, 'r': 20, 't': 60, 'b': 20},
    height=400 + 20 * len(present_terms),
)
# Place x ticks at a fixed step instead of automatically chosen positions.
fig_kw_heat.update_xaxes(tickmode='linear')
fig_kw_heat.show()