# Visualizations Notebook

This notebook presents visualizations about the state of education in South America; the data shown here covers higher education in Chile.

### Set up of libraries

In [1]:
import pandas as pd
import altair as alt

In [2]:
from utils import filters_countries_cols, translate_english_degrees
from utils import translate_area_del_conocimiento, translates_degree

In [3]:
# Load the raw enrollment and graduation tables.
# Both files are read with the same options: latin1 decoding and automatic
# delimiter detection (sep=None is only supported by the python engine).
csv_options = {"encoding": "latin1", "sep": None, "engine": "python"}
raw = pd.read_csv("../data/raw/matricula.csv", **csv_options)
titulados = pd.read_csv("../data/raw/titulados.csv", **csv_options)


In [4]:
# Both raw tables in one list so the same clean-up can be applied to each
raw_dfs = [raw, titulados]

In [5]:
# Normalise both raw tables in place: lowercase column names, integer year.
# Column names are lowercased first and the year is then read from "año", so
# re-running this cell is harmless instead of raising KeyError on "AÑO".
for df in raw_dfs:
    # columns in lowercase
    df.columns = [col.lower() for col in df.columns]

    # year column to number: keep the last four characters of the raw value
    df["año"] = df["año"].astype(str).str[-4:].astype(int)

# **First Chart: Evolution of total undergraduate and graduate students, 2007-2025**

This chart shows the total number of enrolled undergraduate, graduate and diploma students
in Chile for the period 2007-2025.

In [32]:
# For the following chart I used ChatGPT to assist me in debugging and assisting
# me in how to develop some features of the chart

var_1 = "total matrícula"
var_11 = "total titulaciones"

# --- Undergraduate enrollment per year, in thousands ---
cols_1 = ["año"]
df_1 = raw[raw["nivel global"] == "Pregrado"]
df_1 = df_1.groupby(cols_1)[var_1].sum().reset_index()
df_1[var_1] = df_1[var_1] / 1000

# Every second year between the first and last year is used as an axis tick
values = list(range(min(df_1["año"]), max(df_1["año"]) + 1, 2))

# No explicit scale is passed for x: the channel is ordinal, so its domain is
# taken from the data. Positional arguments to alt.Scale do not set a domain.
chart_1 = alt.Chart(df_1).mark_bar(color="#f0ad2f").encode(
    x=alt.X("año:O", axis=alt.Axis(title="Year", values=values)),
    y=alt.Y(f"{var_1}:Q", axis=alt.Axis(title="Total enrolled (thousands)")),
).properties(
    title="Total Undergrads enrolled students for 2007-2025",
    height=300,
    width=400,
)

# --- Undergraduate degrees awarded per year, in thousands ---
# The aggregation runs on the filtered frame so that only "Pregrado" rows are
# counted, matching the enrollment series above.
df_11 = titulados[titulados["nivel global"] == "Pregrado"]
df_11 = df_11.groupby(["año"])[var_11].sum().reset_index()
df_11[var_11] = df_11[var_11] / 1000

# The field name is kept in a variable: nesting double quotes inside a
# double-quoted f-string is a syntax error before Python 3.12.
chart_11 = alt.Chart(df_11).mark_bar(color="#f0ad2f").encode(
    x=alt.X("año:O", axis=alt.Axis(title="Year", values=values)),
    y=alt.Y(f"{var_11}:Q", axis=alt.Axis(title="Total titulaciones (thousands)")),
).properties(
    # This chart plots degrees awarded, not enrollment
    title="Total Undergrads degrees awarded for 2007-2025",
    height=300,
    width=400,
)

chart_1.save("../png_charts/total_enrollment.png")
final_11 = (chart_1 | chart_11).configure_axis(grid=False)

# --- Enrollment and degrees side by side in long format ---
# 2025 is dropped before plotting both indicators together.
df_final = df_1.merge(df_11, on="año", how="left")
df_final = df_final.query("año != 2025")
df_final = df_final.melt(
    id_vars="año",
    value_vars=[var_1, var_11],
    var_name="indicador",
    value_name="total",
)

hhh = alt.Chart(df_final).mark_area().encode(
    x="año",
    y=alt.Y("total"),
    color="indicador",
)
hhh

#final_11

# **Second Chart: Share of professional and technical degrees among undergrads**

The following chart shows, among undergrads, the share of students enrolled in
technical careers versus professional careers.


In [37]:
# For the following chart I used ChatGPT to assist me in debugging and assisting
# me in how to develop some features of the chart

# Degrees to show
degrees = ["Carreras Profesionales", "Carreras Técnicas"]

# Total enrollment per year and degree classification
cols_2 = ["año", "carrera clasificación nivel 2"]
df_2 = raw.groupby(cols_2)["total matrícula"].sum().reset_index()

# Keep only the two undergrad degree types. The explicit copy makes the
# column assignment below act on an independent frame, which avoids pandas'
# SettingWithCopyWarning.
df_2 = df_2[df_2["carrera clasificación nivel 2"].isin(degrees)].copy()
# translates_degree comes from utils and receives each row (axis=1)
df_2["carrera clasificación nivel 2"] = df_2.apply(translates_degree, axis=1)

# Bars are stacked and normalized, so the y axis is a share, not a total
chart_2 = alt.Chart(df_2).mark_bar().encode(
    x=alt.X("año:O", axis=alt.Axis(title="Year")),
    y=alt.Y("sum(total matrícula)",
            axis=alt.Axis(title="Share of total enrolled")).stack("normalize"),
    color=alt.Color("carrera clasificación nivel 2",
                    legend=alt.Legend(title="Degree type of undergrads")),
).properties(
    title="Share of Technical and Professional enrolled undergrads"
)

chart_2.save("../png_charts/profs_techs.png")
chart_2


# **Third Chart: Evolution of total male and female enrolled students**

The following chart shows the growth in the number of male and female students during 2007-2025


In [38]:
# For the following chart I used ChatGPT to assist me in debugging and assisting
# me in how to develop some features of the chart

men = "total matrícula hombres"
women = "total matrícula mujeres"

# Total enrolled men and women per year
cols_3 = ["año"]
df_3 = raw.groupby(cols_3)[[men, women]].sum().reset_index()

# Axis ticks every second year, derived from this chart's own data so the
# cell does not depend on a frame built for a different chart.
values = list(range(min(df_3["año"]), max(df_3["año"]) + 1, 2))

# Long format: one row per (year, gender)
df_3 = df_3.melt(id_vars="año",
                 value_vars=[men, women],
                 var_name="enrolled type",
                 value_name="total matrícula")

df_3["enrolled type"] = df_3["enrolled type"].map({men: "Men", women: "Women"})
df_3["total matrícula"] = df_3["total matrícula"] / 1000  # thousands

chart_3 = alt.Chart(df_3).mark_line().encode(
    x=alt.X("año:O", axis=alt.Axis(title="Year", values=values)),
    y=alt.Y("sum(total matrícula)",
            axis=alt.Axis(title="Total enrolled (thousands)")),
    color=alt.Color("enrolled type",
                    legend=alt.Legend(title="Gender"),
                    scale=alt.Scale(
                        domain=["Men", "Women"],
                        range=["#93c3b3", "#7900f1"],
                    )),
).properties(
    title="Evolution of enrolled Men and Women in universities' Degrees 2007-2025",
).configure_axis(
    grid=False
)

chart_3.save("../png_charts/men_women.png")
chart_3


# **Fourth Chart: Differences of study areas between men and women**

The following chart ranks knowledge areas by number of enrolled students,
comparing men and women for the 2007-2025 period

In [39]:
# For the following chart I used ChatGPT to assist me in debugging and assisting
# me in how to develop some features of the chart

# For Women

# Total enrolled women per year and knowledge area
cols_4 = ["año", "área del conocimiento"]
col_add = "total matrícula mujeres"
df_4 = raw.groupby(cols_4)[col_add].sum().reset_index()

# translate_area_del_conocimiento comes from utils and receives each row
df_4["área del conocimiento"] = df_4.apply(translate_area_del_conocimiento, axis=1)

# Rank the areas within each year (rank 1 = most enrolled). A grouped rank
# replaces concatenating one frame per year onto an empty seed frame: it is
# a single pass and leaves the column dtypes untouched.
df_4_final = df_4.assign(
    rank=df_4.groupby("año")[col_add].rank(ascending=False)
)

chart_4 = alt.Chart(df_4_final).mark_line().encode(
    x=alt.X("año:O").title("Year"),
    y=alt.Y("rank").title("Rank"),
    color=alt.Color("área del conocimiento",
                    legend=alt.Legend(title="Study Field")),
).properties(
    title="Study field preference by WOMEN"
)

In [40]:
# For the following chart I used ChatGPT to assist me in debugging and assisting
# me in how to develop some features of the chart

# For Men

# Total enrolled men per year and knowledge area
cols_44 = ["año", "área del conocimiento"]
col_add_44 = "total matrícula hombres"
df_44 = raw.groupby(cols_44)[col_add_44].sum().reset_index()

# translate_area_del_conocimiento comes from utils and receives each row
df_44["área del conocimiento"] = df_44.apply(translate_area_del_conocimiento, axis=1)

# Rank the areas within each year (rank 1 = most enrolled). A grouped rank
# replaces concatenating one frame per year onto an empty seed frame: it is
# a single pass and leaves the column dtypes untouched.
df_44_final = df_44.assign(
    rank=df_44.groupby("año")[col_add_44].rank(ascending=False)
)

# Axis titles match the women's chart so the two panels read consistently
chart_44 = alt.Chart(df_44_final).mark_line().encode(
    x=alt.X("año:O").title("Year"),
    y=alt.Y("rank").title("Rank"),
    color=alt.Color("área del conocimiento",
                    legend=alt.Legend(title="Study Field")),
).properties(
    title="Study field preference by MEN"
)

# Women's and men's rankings side by side
final = chart_4 | chart_44

final.save("../png_charts/ranks.png")
final

# **Fifth Chart: Where in Chile are they studying at 2025**

The following chart shows the number of university students in each region of
Chile, ordered from north to south, in 2025

In [41]:
# For the following chart I used ChatGPT to assist me in debugging and assisting
# me in how to develop some features of the chart

cols_5 = ["año", "región"]
value_5 = "total matrícula"

# Total enrollment per region for 2025. The year filter runs before the
# aggregation so only the rows that are plotted get grouped.
df_5 = raw[raw["año"] == 2025].groupby(cols_5)[value_5].sum().reset_index()

# Regions listed from north to south; the first entry is drawn at the top.
# NOTE(review): the names must match the values of the "región" column
# exactly for the custom sort to apply - confirm against the data.
order_regions = ['Arica y Parinacota',
                 'Tarapacá',
                 'Antofagasta',
                 'Atacama',
                 'Coquimbo',
                 'Valparaíso',
                 'Metropolitana',
                 "Lib. Gral. B. O'Higgins",
                 'Maule',
                 'Ñuble',
                 'Biobío',
                 'La Araucanía',
                 'Los Ríos',
                 'Los Lagos',
                 'Aysén',
                 'Magallanes']

chart_5 = alt.Chart(df_5).mark_bar().encode(
    x=alt.X("total matrícula").title("Total enrolled"),
    # The axis title states the direction in which order_regions is listed
    y=alt.Y("región", sort=order_regions).title("Regions of Chile (North → South)")
).properties(
    title="Total enrolled university students by region for 2025",
    width=400,
    height=400
)
chart_5.save("../png_charts/regions.png")
chart_5

# **Sixth Chart: Heat Map for career length and Study field for 2025**

The following chart shows how long careers are in each field of study for Professional
Careers of Undergrads in 2025

In [42]:
# For the following chart I used ChatGPT to assist me in debugging and assisting
# me in how to develop some features of the chart

x = "área del conocimiento"
y = "duración total de carrera"

# Row filters: 2025, universities only, undergraduate level.
# NOTE(review): the chart title mentions "Professional Careers", but no filter
# on the career classification is applied here - confirm this is intended.
is_2025 = raw["año"] == 2025
is_university = raw["clasificación institución nivel 1"] == "Universidades"
is_undergrad = raw["nivel global"] == "Pregrado"

# Keep only the two columns used for the heat map
df_6 = raw.loc[is_2025 & is_university & is_undergrad, [y, x]]

# Number of rows for every (field, career length) combination
df_6 = df_6.groupby([x, y])[y].count().reset_index(name="count")

# translate_area_del_conocimiento comes from utils and receives each row
df_6[x] = df_6.apply(translate_area_del_conocimiento, axis=1)

# Log colour scale on the counts
color_6 = alt.Color(
    "count",
    scale=alt.Scale(scheme="blues", type="log"),
    title="Number of careers",
)

chart_6 = alt.Chart(df_6).mark_rect().encode(
    x=alt.X(x, title="Field Study"),
    y=alt.Y(y, type="ordinal", sort="descending",
            title="Total Length of Career (Semesters)"),
    color=color_6,
    tooltip=[x, y, "count"],
).properties(
    title="Distribution of length of careers by field study for Professional Careers in 2025"
)

chart_6.save("../png_charts/long_careers.png")
chart_6


# **Seventh Chart: Share of Men and Women per degree type in 2025**

The following chart shows the distribution of men and women per degree type in 2025

In [43]:
# For the following chart I used ChatGPT to assist me in debugging and assisting
# me in how to develop some features of the chart

men = "total matrícula hombres"
women = "total matrícula mujeres"

# Enrolled men and women per degree type, 2025 only
cols_7 = ["año", "carrera clasificación nivel 2"]
df_7 = raw[raw["año"] == 2025]
df_7 = df_7.groupby(cols_7)[[men, women]].sum().reset_index()

# Long format: one row per (degree type, gender)
df_7 = df_7.melt(id_vars=cols_7,
                 value_vars=[men, women],
                 var_name="enrolled type",
                 value_name="total matrícula")

df_7["enrolled type"] = df_7["enrolled type"].apply(lambda x: "Men" if x == men else "Women")
# Degree labels before and after translation (translates_degree is from utils)
print(df_7["carrera clasificación nivel 2"].unique())
df_7["carrera clasificación nivel 2"] = df_7.apply(translates_degree, axis=1)

# Horizontal reference line at the 50% mark
rule_50 = pd.DataFrame({"y": [0.50]})
line_50 = alt.Chart(rule_50).mark_rule(color="black").encode(
    y="y:Q").properties(
        title="50%"
    )

# Custom x-axis order. Every entry must equal a translated label exactly,
# otherwise that category is not placed by the custom sort; the translated
# non-degree label is "Non-Degree (Grad)".
order = [
    "Technical (Undergrad)",
    "Professionals (Undergrad)",
    "Non-Degree (Grad)",
    "Masters (Grad)",
    "PhD (Grad)"
]
print(df_7["carrera clasificación nivel 2"].unique())

# Bars are stacked and normalized, so the y axis is a share, not a total
chart_7 = alt.Chart(df_7).mark_bar().encode(
    x=alt.X("carrera clasificación nivel 2:N",
            sort=order,  # enforces the custom order above
            axis=alt.Axis(title="Degree Type")),
    y=alt.Y("sum(total matrícula)",
            stack="normalize",
            axis=alt.Axis(title="Share of total enrolled")),
    color=alt.Color("enrolled type",
                    legend=alt.Legend(title="Gender"))
).properties(
    title="Share of Men and Women per degree in 2025",
    width=400,
    height=400
)

# Overlay the 50% rule on the stacked bars
chart_77 = chart_7 + line_50
chart_77.save("../png_charts/gender_degree.png")
chart_77

['Carreras Profesionales' 'Carreras Técnicas' 'Doctorado' 'Magister'
 'Postítulo']
['Professionals (Undergrad)' 'Technical (Undergrad)' 'PhD (Grad)'
 'Masters (Grad)' 'Non-Degree (Grad)']


# **Eighth Chart: Evolution of spending as % of GDP per education level**

The following Chart shows how much Chile has been spending in primary, secondary,
and tertiary education as part of its GDP for period 2007-2021

In [None]:
# For the following chart I used ChatGPT to assist me in debugging and assisting
# me in how to develop some features of the chart

# loading data
# url = https://ourworldindata.org/grapher/education-spending?country=%7ECHL&overlay=download-data&spending_type=gdp_share&level=level_side_by_side

# Short names for the year and the per-level spending columns
new_names = {
    'Year': 'year',
    'Government spending on tertiary education as share of GDP': 'Tertiary',
    'Government spending on upper secondary education as share of GDP': 'secondary_1',
    'Government spending on lower secondary education as share of GDP': 'secondary_2',
    'Government spending on primary education as share of GDP': 'Primary',
    'Government spending on pre-primary education as share of GDP': 'Pre-Primary',
}

# Read, rename, derive the combined columns and drop the helper columns in
# one chain:
#   Secondary    = upper secondary + lower secondary
#   Other levels = Primary + Pre-Primary + Secondary
spend = (
    pd.read_csv("../data/raw/chile_spending.csv")
    .rename(columns=new_names)
    .assign(Secondary=lambda d: d["secondary_1"] + d["secondary_2"])
    .assign(**{"Other levels": lambda d: d["Primary"] + d["Pre-Primary"] + d["Secondary"]})
    .drop(columns=["Entity", "Code", "secondary_1", "secondary_2"])
)

Unnamed: 0,year,Tertiary,Primary,Pre-Primary,Secondary,Other levels
0,1998,0.54913,1.38028,0.28741,1.1063,2.77399
1,2000,0.54608,1.61622,0.30723,1.28754,3.21099
2,2002,0.56794,1.70221,0.34935,1.42931,3.48087
3,2003,0.51535,1.55932,0.33989,1.50412,3.40333
4,2004,0.53562,1.33845,0.32233,1.34668,3.00746
5,2005,0.4581,1.22703,0.31589,1.27915,2.82207
6,2006,0.45029,1.09116,0.32507,1.17362,2.58985
7,2007,0.49502,1.14624,0.35159,1.24031,2.73814
8,2008,0.55037,1.3804,0.42518,1.43594,3.24152
9,2009,0.66382,1.54817,0.54372,1.49906,3.59095


In [63]:
# Education levels to plot, each paired with its line colour
level_colors = {
    "Pre-Primary": "#bee0f9",
    "Primary": "#84c8f5",
    "Secondary": "#43abed",
    "Tertiary": "#0C2945",
}
levels = list(level_colors)
colors = list(level_colors.values())

# Long format: one row per (year, level); the values land in the default
# "value" column
spend_long = spend.melt(
    id_vars=["year"],
    value_vars=levels,
    var_name="level",
)

In [64]:


# One line per education level, showing spending as a share of GDP over time.
# The year is encoded as ordinal, like the year axes of the other charts.
chart_8 = alt.Chart(spend_long).mark_line().encode(
    x=alt.X("year:O").title("Year"),
    y=alt.Y("value").title("% of GDP"),
    color=alt.Color("level",
                    scale=alt.Scale(
                        domain=levels,
                        range=colors,
                    )).title("Education Level"),
).properties(
    title="% of GDP spent on education level",
    width=500,
    height=400,
).configure_axis(
    grid=False
)
chart_8.save("../png_charts/spending.png")
chart_8