In [1]:
import pandas as pd
import altair as alt

In [7]:
# Literacy vs. GDP Growth

# Single source of truth for the input file: the path below is the one that is
# actually read. (Previously `csv_path` held a different, unused relative path.)
# NOTE(review): this is a machine-specific absolute path; consider moving it to
# a shared config cell so the notebook runs on other machines.
csv_path = "/Users/arumehta/Downloads/education-economy-data.csv"

df = pd.read_csv(csv_path)

# Flexible column picking 
def pick(dframe, *cands):
    """Return the real column name in ``dframe`` for the first matching candidate.

    Matching is case-insensitive and ignores leading/trailing whitespace on
    both the DataFrame's column labels and the candidate names, so a CSV
    header such as ``" Literacy Rate "`` is found by ``'literacy rate'``.
    Non-string column labels are compared via their ``str()`` form.

    Parameters
    ----------
    dframe : object with a ``.columns`` iterable (e.g. a pandas DataFrame)
    *cands : candidate column names, tried in the order given

    Returns
    -------
    The original (un-normalised) column label of the first candidate that
    matches, or ``None`` when no candidate matches.
    """
    def _norm(name):
        # Normalise a label for comparison only; the original label is returned.
        return str(name).strip().lower()

    # If two columns normalise to the same key, the later column wins.
    lookup = {_norm(col): col for col in dframe.columns}
    for cand in cands:
        key = _norm(cand)
        if key in lookup:
            return lookup[key]
    return None

# Resolve the actual column names present in the CSV (None when absent).
country  = pick(df, 'country')
continent = pick(df, 'continent')
literacy = pick(df, 'literacy rate','literacy_rate','literacy')
growth   = pick(df, 'gdp growth (% annual)','gdp_growth (% annual)','gdp_growth','gdp growth')
phys     = pick(df, 'physician density','physician_density')
unemp    = pick(df, 'unemployment rate (%)','unemployment_rate (%)','unemployment rate','unemployment_rate')
gdppc    = pick(df, 'gdp per capita (current usd)','gdp_per_capita (current usd)','gdp per capita','gdp_per_capita')

# Literacy and growth are the two axes of the chart; fail loudly if either is
# missing instead of passing None into the Altair encodings.
missing_required = [
    label for label, col in (('literacy', literacy), ('GDP growth', growth))
    if col is None
]
if missing_required:
    raise KeyError(
        "Required column(s) not found in the CSV: " + ", ".join(missing_required)
    )

# Coerce numerics: strip thousands separators and percent signs from any
# column that was not parsed as numeric. Checking "not numeric" (rather than
# `dtype == object`) also covers pandas' dedicated string dtypes.
for c in [literacy, growth, phys, unemp, gdppc]:
    if c and not pd.api.types.is_numeric_dtype(df[c]):
        df[c] = pd.to_numeric(
            df[c].astype(str)
                 .str.replace(',', '', regex=False)
                 .str.replace('%', '', regex=False),
            errors='coerce'
        )

# Keep only rows that have both axis values; optional columns may still be NaN.
scatter_df = df.dropna(subset=[literacy, growth]).copy()

chart = (
    alt.Chart(scatter_df)
      .mark_circle(opacity=0.85)
      .encode(
          x=alt.X(literacy, title='Literacy Rate (%)', scale=alt.Scale(zero=False)),
          y=alt.Y(growth, title='GDP Growth (% annual)', scale=alt.Scale(zero=False)),
          # Optional encodings fall back to constant values when the column is absent.
          color=alt.Color(continent, title='Continent') if continent else alt.value('steelblue'),
          size=alt.Size(phys, title='Physicians per 1,000', scale=alt.Scale(range=[30, 500])) if phys else alt.value(80),
          tooltip=[c for c in [country, continent, literacy, growth, phys, unemp, gdppc] if c]
      )
      .properties(
          title='Literacy vs. GDP Growth (Colored by Continent; Bubble Size = Physician Density)',
          width=650, height=420
      )
      .interactive()
)

# NOTE(review): machine-specific absolute output path; consider a configurable
# output directory so the notebook is portable.
chart.save("/Users/arumehta/Desktop/DS 4200/literacy_vs_gdp_growth_scatter.html")
print("Saved: literacy_vs_gdp_growth_scatter.html")


Saved: literacy_vs_gdp_growth_scatter.html


In [6]:
# GDP Per Continent Boxplot

# Single source of truth for the input file: the path below is the one that is
# actually read. (Previously `csv_path` held a different, unused relative path.)
# NOTE(review): this is a machine-specific absolute path; consider moving it to
# a shared config cell so the notebook runs on other machines.
csv_path = "/Users/arumehta/Downloads/education-economy-data.csv"

df = pd.read_csv(csv_path)

def pick(dframe, *cands):
    """Return the real column name in ``dframe`` for the first matching candidate.

    Matching is case-insensitive and ignores leading/trailing whitespace on
    both the DataFrame's column labels and the candidate names, so a CSV
    header such as ``" Continent "`` is found by ``'continent'``.
    Non-string column labels are compared via their ``str()`` form.

    Parameters
    ----------
    dframe : object with a ``.columns`` iterable (e.g. a pandas DataFrame)
    *cands : candidate column names, tried in the order given

    Returns
    -------
    The original (un-normalised) column label of the first candidate that
    matches, or ``None`` when no candidate matches.
    """
    def _norm(name):
        # Normalise a label for comparison only; the original label is returned.
        return str(name).strip().lower()

    # If two columns normalise to the same key, the later column wins.
    lookup = {_norm(col): col for col in dframe.columns}
    for cand in cands:
        key = _norm(cand)
        if key in lookup:
            return lookup[key]
    return None

# Resolve the actual column names present in the CSV (None when absent).
continent = pick(df, 'continent')
gdppc     = pick(df, 'gdp per capita (current usd)','gdp_per_capita (current usd)','gdp per capita','gdp_per_capita')
gdppc_cat = pick(df, 'gdp per capita category','gdp_per_capita_category','income_group')
country   = pick(df, 'country')

group_col = continent or gdppc_cat or country  # graceful fallback

# The boxplot needs a value column and a grouping column; fail loudly if either
# is missing instead of passing None into the Altair encodings (and instead of
# mislabelling the axis 'Continent' when every grouping candidate is absent).
if gdppc is None:
    raise KeyError("Required column not found in the CSV: GDP per capita")
if group_col is None:
    raise KeyError(
        "No grouping column found in the CSV (tried continent, income group, country)"
    )

# Coerce numeric: strip thousands separators when the column was not parsed as
# numeric. Checking "not numeric" (rather than `dtype == object`) also covers
# pandas' dedicated string dtypes.
if not pd.api.types.is_numeric_dtype(df[gdppc]):
    df[gdppc] = pd.to_numeric(
        df[gdppc].astype(str).str.replace(',', '', regex=False),
        errors='coerce'
    )

# Axis title reflects which fallback grouping column was actually chosen.
title_x = 'Continent' if group_col == continent else ('Income Group' if group_col == gdppc_cat else 'Country')

box_df = df.dropna(subset=[gdppc, group_col]).copy()

box = (
    alt.Chart(box_df)
      # NOTE(review): with extent='min-max' the whiskers presumably span the
      # full data range, so `outliers=True` may have no visible effect; confirm
      # against the Vega-Lite boxplot docs.
      .mark_boxplot(outliers=True, extent='min-max')
      .encode(
          x=alt.X(group_col, title=title_x, sort='-y'),
          y=alt.Y(gdppc, title='GDP per Capita (USD)', scale=alt.Scale(zero=False)),
          color=alt.Color(group_col, legend=None)
      )
      .properties(
          title='GDP per Capita Distribution by Region/Group',
          width=650, height=420
      )
)

# NOTE(review): machine-specific absolute output path; consider a configurable
# output directory so the notebook is portable.
box.save("/Users/arumehta/Desktop/DS 4200/gdp_per_capita_by_continent_box.html")
print("Saved: gdp_per_capita_by_continent_box.html")

Saved: gdp_per_capita_by_continent_box.html
