In [None]:
import time
import requests as rq
import bs4
import pandas as pd
from io import StringIO

URL = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"

# Sent with the request so the client identifies itself to the server.
HEADERS = {
    "User-Agent": "JHU-DS4BME-HW5/1.0 (contact: bmcinty4@jh.edu)"
}

# Bracketed footnote markers such as "[1]" or "[n 2]", plus surrounding whitespace.
FOOTNOTE_PATTERN = r"\s*\[.*?\]\s*"

# Single session carrying the headers above; the context manager closes it afterwards.
# NOTE: no retry logic is configured -- a failed request raises immediately.
with rq.Session() as s:
    s.headers.update(HEADERS)
    resp = s.get(URL, timeout=30)
    resp.raise_for_status()

soup = bs4.BeautifulSoup(resp.text, "html.parser")
tables = soup.find_all("table", {"class": "wikitable"})
if not tables:
    # Fail with a readable message instead of an IndexError on tables[0].
    raise ValueError(f"No 'wikitable' table found at {URL}")

# parse with pandas from the HTML string (avoids pandas doing its own fetch)
gdp = pd.read_html(StringIO(str(tables[0])))[0]
gdp = gdp.dropna(how="all")

# Strip footnote markers from the header generically, e.g.
# "IMF (2025)[1][6]" -> "IMF (2025)". A hard-coded rename map would silently stop
# matching as soon as the reference numbers on the page change.
gdp = gdp.set_axis(
    gdp.columns.astype(str).str.replace(FOOTNOTE_PATTERN, "", regex=True).str.strip(),
    axis=1,
)

# Apply the same footnote clean-up to the cell values of the text columns.
str_cols = gdp.select_dtypes(include="object").columns
gdp[str_cols] = gdp[str_cols].apply(
    lambda col: col.str.replace(FOOTNOTE_PATTERN, "", regex=True).str.strip()
)

gdp.head()


Unnamed: 0,Country/Territory,IMF (2025),World Bank (2022–24),United Nations (2023)
0,World,113795678,111326370,100834796
1,United States,30507217,29184890,27720700
2,China,19231705,18743803,17794782
3,Germany,4744804,4659929,4525704
4,India,4187017,3912686,3575778


In [None]:
import pandas as pd
import plotly.express as px
import numpy as np

# Make the IMF column numeric so ranking and bar heights use numbers rather than strings.
# Everything except digits, "." and "-" is stripped first; whatever still cannot be
# parsed (empty strings, a lone "-" placeholder) becomes NaN instead of raising.
gdp["IMF (2025)"] = pd.to_numeric(
    gdp["IMF (2025)"].astype(str).str.replace(r"[^0-9.\-]", "", regex=True),
    errors="coerce",
).astype(float)

# Top 5 by value. The "World" row is an aggregate, not a country, so it is excluded;
# otherwise it would take the first slot and dwarf every other bar.
top5 = gdp.loc[gdp["Country/Territory"] != "World"].nlargest(5, "IMF (2025)")

fig = px.bar(top5, x="Country/Territory", y="IMF (2025)", title="Top 5 by value")
# nlargest already returns rows in descending order, so its order is the axis order.
fig.update_xaxes(categoryorder="array",
                 categoryarray=top5["Country/Territory"].tolist())
fig.show()




In [None]:
!pip install country_converter
import country_converter as coco

country_col = "Country/Territory"
value_col   = "IMF (2025)"

cc = coco.CountryConverter()
gdp = gdp.copy()
gdp["continent"] = cc.convert(names=gdp[country_col], to="continent")

gdp["continent"] = gdp["continent"].replace("not found", pd.NA)

fig = px.bar(
    gdp.dropna(subset=["continent", value_col]),
    x="continent",
    y=value_col,
    color=country_col,
    title="GDP by Continent (stacked by Country)",
)
fig.update_layout(barmode="stack", xaxis_title="Continent", yaxis_title=value_col)
fig.show()


Collecting country_converter
  Downloading country_converter-1.3.1-py3-none-any.whl.metadata (25 kB)
Downloading country_converter-1.3.1-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.3/47.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: country_converter
Successfully installed country_converter-1.3.1




Part 2 - Recap

In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
# Read the kirby21 ROI volume table and discard its "Unnamed: 0" column.
dat = (
    pd.read_csv(
        "https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21.csv"
    )
    .drop(columns=["Unnamed: 0"])
)
dat.head()

Unnamed: 0,id,roi,volume
0,127,Telencephalon_L,531111
1,127,Telencephalon_R,543404
2,127,Diencephalon_L,9683
3,127,Diencephalon_R,9678
4,127,Mesencephalon,10268


In [None]:
# Add a string copy of the subject id and use it as the x axis of a bar chart
# of volume, coloured by ROI.
dat = dat.assign(id_char=lambda frame: frame["id"].astype(str))
fig = px.bar(
    data_frame=dat,
    x="id_char",
    y="volume",
    color="roi",
)
fig.show()

In [None]:
# Per-subject total: the sum of all ROI volumes for each id, stored as "icv".
icv = (
    dat.groupby("id", as_index=False)["volume"]
    .sum()
    .rename(columns={"volume": "icv"})
)

# Drop icv/comp columns left over from a previous execution of this cell. Without this,
# re-running the cell merges a second "icv" column, pandas suffixes them as icv_x/icv_y,
# and the dat.icv lookup below fails.
dat = (
    dat.drop(columns=["icv", "comp"], errors="ignore")
    .merge(icv, on="id", validate="many_to_one")
)

# comp = each ROI's share of its subject's total volume.
dat = dat.assign(comp=dat["volume"] / dat["icv"])
dat.head()

Unnamed: 0,id,roi,volume,id_char,icv,comp
0,127,Telencephalon_L,531111,127,1378295,0.385339
1,127,Telencephalon_R,543404,127,1378295,0.394258
2,127,Diencephalon_L,9683,127,1378295,0.007025
3,127,Diencephalon_R,9678,127,1378295,0.007022
4,127,Mesencephalon,10268,127,1378295,0.00745


In [None]:
# Bar chart of the comp column per subject, coloured by ROI.
fig = px.bar(
    data_frame=dat,
    x="id_char",
    y="comp",
    color="roi",
)
fig.show()

In [None]:
# Average the remaining numeric columns within each ROI after removing the id/icv columns.
roi_mean = (
    dat.drop(columns=["id", "id_char", "icv"])
    .groupby("roi")
    .mean()
    .reset_index()
)
fig = px.bar(data_frame=roi_mean, x="roi", y="comp")
fig.show()

In [None]:
## Load the hierarchy lookup table (tab-separated) that maps each ROI to its parent levels
url = "https://raw.githubusercontent.com/bcaffo/MRIcloudT1volumetrics/master/inst/extdata/multilevel_lookup_table.txt"
multilevel_lookup = (
    pd.read_csv(url, sep="\t")
    .drop(columns=["Level5"])
    # Give the "modify*" columns the level names used in the rest of the notebook.
    .rename(
        columns={
            "modify": "roi",
            "modify.1": "level4",
            "modify.2": "level3",
            "modify.3": "level2",
            "modify.4": "level1",
        }
    )
    # Keep only the renamed columns, ordered from the finest level to the coarsest.
    .loc[:, ['roi', 'level4', 'level3', 'level2', 'level1']]
)
multilevel_lookup.head()

Unnamed: 0,roi,level4,level3,level2,level1
0,SFG_L,SFG_L,Frontal_L,CerebralCortex_L,Telencephalon_L
1,SFG_R,SFG_R,Frontal_R,CerebralCortex_R,Telencephalon_R
2,SFG_PFC_L,SFG_L,Frontal_L,CerebralCortex_L,Telencephalon_L
3,SFG_PFC_R,SFG_R,Frontal_R,CerebralCortex_R,Telencephalon_R
4,SFG_pole_L,SFG_L,Frontal_L,CerebralCortex_L,Telencephalon_L


In [None]:
## Now load in the subject data
# NOTE: `id` shadows the Python builtin of the same name; the name is kept so that
# any other cell reading it keeps working.
id = 127
subjectData = pd.read_csv("https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv")

# Keep only type 1, level 5 rows of the selected subject. The result must be assigned
# back to subjectData -- otherwise every subject/level would flow into the merge and
# the composition below would be computed over the wrong total.
subjectData = subjectData.loc[
    (subjectData["type"] == 1)
    & (subjectData["level"] == 5)
    & (subjectData["id"] == id)
]
subjectData = subjectData[['roi', 'volume']]

## Merge the subject data with the multilevel data
subjectData = pd.merge(subjectData, multilevel_lookup, on="roi")
# Constant root label shared by every row, used as the top level of the hierarchy plots.
subjectData = subjectData.assign(icv="ICV")
# comp = each ROI's share of the summed volume of the rows kept above.
subjectData = subjectData.assign(comp=subjectData["volume"] / subjectData["volume"].sum())
subjectData.head()

Unnamed: 0,roi,volume,level4,level3,level2,level1,icv,comp
0,SFG_L,12926,SFG_L,Frontal_L,CerebralCortex_L,Telencephalon_L,ICV,0.00935
1,SFG_R,10050,SFG_R,Frontal_R,CerebralCortex_R,Telencephalon_R,ICV,0.00727
2,SFG_PFC_L,12783,SFG_L,Frontal_L,CerebralCortex_L,Telencephalon_L,ICV,0.009247
3,SFG_PFC_R,11507,SFG_R,Frontal_R,CerebralCortex_R,Telencephalon_R,ICV,0.008324
4,SFG_pole_L,3078,SFG_L,Frontal_L,CerebralCortex_L,Telencephalon_L,ICV,0.002227


In [None]:
# Sunburst of comp over the hierarchy, from the icv root down to the individual ROI.
fig = px.sunburst(
    data_frame=subjectData,
    path=['icv', 'level1', 'level2', 'level3', 'level4', 'roi'],
    values='comp',
    width=800,
    height=800,
)
fig.show()

Part 2 - HW

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Reusable helper: turn a sunburst-style hierarchy into a Sankey diagram.
#   df   : the dataframe
#   path : the hierarchy columns, ordered from top to bottom
def sunburst_to_sankey(df, path, value='comp', title='Sankey from hierarchy'):
    """Build a Plotly Sankey figure from hierarchical columns of a DataFrame.

    Every distinct value in every level of ``path`` becomes a node. For each pair of
    adjacent levels, ``value`` is summed per (parent, child) combination and drawn as
    a link between the two nodes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``path`` plus the ``value`` column.
    path : list or tuple of str
        Hierarchy column names from the top level to the bottom level.
    value : str, default 'comp'
        Column whose values are summed to size the links.
    title : str
        Figure title.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure holding a single Sankey trace.

    Raises
    ------
    KeyError
        If a column in ``path`` or ``value`` is missing from ``df``.
    """
    # Accept any sequence (list, tuple, ...) for the path.
    levels = list(path)

    # Rows with a missing hierarchy level cannot be placed in the diagram.
    data = df[levels + [value]].dropna(subset=levels).copy()

    # 1) Collect nodes in order of levels, then appearance.
    #    Nodes are keyed by (level, value) so the same label in two different levels
    #    yields two distinct nodes.
    node_index = {}
    labels = []
    for level in levels:
        for node in pd.unique(data[level]):
            key = (level, node)
            if key not in node_index:
                node_index[key] = len(labels)
                labels.append(str(node))

    # 2) Build links between each adjacent pair of levels.
    sources, targets, values = [], [], []
    for parent, child in zip(levels, levels[1:]):
        # observed=True keeps only combinations that occur in the data, so categorical
        # levels do not produce links to nodes that were never collected above.
        flows = data.groupby([parent, child], as_index=False, observed=True)[value].sum()
        # Iterate column-wise rather than with iterrows(): iterrows() converts each row
        # to a single dtype, so integer level values would come back as floats
        # (1 -> 1.0) and no longer match the keys collected in step 1.
        for parent_node, child_node, amount in zip(flows[parent], flows[child], flows[value]):
            sources.append(node_index[(parent, parent_node)])
            targets.append(node_index[(child, child_node)])
            values.append(float(amount))

    # 3) Plot
    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15, thickness=20,
            line=dict(color="black", width=0.5),
            label=labels
        ),
        link=dict(source=sources, target=targets, value=values)
    )])

    fig.update_layout(title_text=title, font_size=10)
    return fig

# Build the Sankey for subjectData, from the icv root down to the individual ROI,
# then enlarge the canvas before displaying it.
sankey_fig = sunburst_to_sankey(
    df=subjectData,
    path=['icv', 'level1', 'level2', 'level3', 'level4', 'roi'],
    value='comp',
    title='ICV → Level1 → Level2 → Level3 → Level4 → ROI (Sankey)',
)
sankey_fig.update_layout(
    width=1600,
    height=800,
)
sankey_fig.show()

