In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Read the raw student records; the source file is semicolon-delimited.
# The path is relative to the notebook's working directory (adjust if needed).
df = pd.read_csv("data/data.csv", delimiter=";")

# Preview the first rows to confirm the columns were parsed as expected.
df.head(5)


Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [1]:
import pandas as pd
import numpy as np

# Re-read the raw semicolon-delimited export so the cleaning starts from untouched data.
df = pd.read_csv("data/data.csv", delimiter=";")

# The 12 source columns the visualizations rely on.
cols = [
    # financial situation
    "Debtor",
    "Tuition fees up to date",
    "Scholarship holder",
    # prior academic record
    "Admission grade",
    "Previous qualification (grade)",
    # first-semester performance
    "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (grade)",
    # parental background
    "Mother's qualification",
    "Father's qualification",
    # student outcome
    "Target",
]

# Work on an independent copy restricted to those columns (order follows `cols`).
df = df.loc[:, cols].copy()

# ---- Grade columns: a recorded 0 is treated as "missing" and becomes NaN ----
grade_cols = ["Admission grade", "Previous qualification (grade)"]
df[grade_cols] = df[grade_cols].replace({0: np.nan})

# ---- Financial flags: recode 0/1 into "No"/"Yes" (used in hover text only) ----
# Any value other than 0 or 1 is not in the map and therefore becomes NaN.
binary_map = {0: "No", 1: "Yes"}
for flag_col in ("Debtor", "Tuition fees up to date", "Scholarship holder"):
    df[flag_col] = df[flag_col].map(binary_map)

# ---- Target (student outcome): normalise spacing/casing, keep missing as missing ----
# astype(str) alone would turn a missing value into the text "nan" ("Nan" after
# title-casing), which a later dropna(subset=[..., "Target"]) could not remove.
# Masking with the original null positions keeps those entries as real NaN.
target_raw = df["Target"]
df["Target"] = (
    target_raw.astype(str).str.strip().str.title().where(target_raw.notna())
)

# ---- Parental qualification: numeric codes are kept, readable labels are added ----
# Lookup from the dataset's qualification code to a human-readable description.
qualification_map = {
    1:  "Secondary Education - 12th Year",
    2:  "Bachelor's Degree",
    3:  "Higher Education - Degree",
    4:  "Master's Degree",
    5:  "Doctorate",
    6:  "Incomplete Higher Education",
    9:  "12th Year - Not Completed",
    10: "11th Year - Not Completed",
    11: "7th Year (Old Program)",
    12: "11th Year - Other",
    14: "10th Year of Schooling",
    18: "General Commerce Course",
    19: "Basic Education - 3rd Cycle",
    22: "Technical-Professional Course",
    26: "7th Year of Schooling",
    27: "General High School - 2nd Cycle",
    29: "9th Year - Not Completed",
    30: "8th Year of Schooling",
    34: "Unknown",
    35: "Cannot Read or Write",
    36: "Can Read (No 4th Year)",
    37: "Basic Education - 1st Cycle (4th/5th)",
    38: "Basic Education - 2nd Cycle (6th/7th/8th)",
    39: "Technological Specialization Course",
    40: "Higher Education - Degree (1st cycle)",
    41: "Specialized Higher Studies Course",
    42: "Professional Higher Technical Course",
    43: "Master's - 2nd cycle",
    44: "Doctorate - 3rd cycle"
}

parents = ("Mother", "Father")

# First pass: copy the raw numeric codes into dedicated columns for plotting.
for parent in parents:
    df[f"{parent}_qual_code"] = df[f"{parent}'s qualification"]

# Second pass: attach the readable label for hover text / tables.
# (Two passes keep the column order: both code columns, then both label columns.)
for parent in parents:
    df[f"{parent}_qual_label"] = df[f"{parent}'s qualification"].map(qualification_map)

# ---- Write the cleaned frame to disk (row index omitted) and confirm ----
df.to_csv("data/cleaned_students.csv", index=False)
print("Cleaning completed, clean file saved as data/cleaned_students.csv")


Cleaning completed, clean file saved as data/cleaned_students.csv


In [7]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go


# Lookup from qualification code to a human-readable description
# (same content as the mapping used in the cleaning cell).
qualification_map = {
    1:  "Secondary Education - 12th Year",
    2:  "Bachelor's Degree",
    3:  "Higher Education - Degree",
    4:  "Master's Degree",
    5:  "Doctorate",
    6:  "Incomplete Higher Education",
    9:  "12th Year - Not Completed",
    10: "11th Year - Not Completed",
    11: "7th Year (Old Program)",
    12: "11th Year - Other",
    14: "10th Year of Schooling",
    18: "General Commerce Course",
    19: "Basic Education - 3rd Cycle",
    22: "Technical-Professional Course",
    26: "7th Year of Schooling",
    27: "General High School - 2nd Cycle",
    29: "9th Year - Not Completed",
    30: "8th Year of Schooling",
    34: "Unknown",
    35: "Cannot Read or Write",
    36: "Can Read (No 4th Year)",
    37: "Basic Education - 1st Cycle (4th/5th)",
    38: "Basic Education - 2nd Cycle (6th/7th/8th)",
    39: "Technological Specialization Course",
    40: "Higher Education - Degree (1st cycle)",
    41: "Specialized Higher Studies Course",
    42: "Professional Higher Technical Course",
    43: "Master's - 2nd cycle",
    44: "Doctorate - 3rd cycle"
}

# Ordered rank: lower = less schooling, higher = more schooling.
# Every code present in qualification_map must have a rank here: a code without
# one maps to NaN and the row is then dropped from the SES view further down.
qual_rank_map = {
    35: 0,  # Cannot Read or Write
    36: 1,  # Can Read (No 4th Year)

    37: 2,  # Basic Education - 1st Cycle (4th/5th)
    26: 3,  # 7th Year of Schooling
    11: 3,  # 7th Year (Old Program)
    38: 4,  # Basic Education - 2nd Cycle (6th/7th/8th)
    30: 5,  # 8th Year of Schooling
    29: 6,  # 9th Year - Not Completed
    19: 7,  # Basic Education - 3rd Cycle

    14: 8,   # 10th Year of Schooling
    10: 9,   # 11th Year - Not Completed
    12: 10,  # 11th Year - Other
    18: 11,  # General Commerce Course
    27: 12,  # General High School - 2nd Cycle
    9:  13,  # 12th Year - Not Completed
    1:  14,  # Secondary Education - 12th Year

    22: 15,  # Technical-Professional Course
    39: 16,  # Technological Specialization Course
    # Previously unranked. Placed above completed secondary and below any completed
    # higher-education qualification; it shares a rank so the 0-25 scale is unchanged.
    6:  16,  # Incomplete Higher Education

    42: 17,  # Professional Higher Technical Course
    40: 18,  # Higher Education - Degree (1st cycle)
    3:  19,  # Higher Education - Degree
    2:  20,  # Bachelor's Degree
    41: 21,  # Specialized Higher Studies Course

    4:  22,  # Master's Degree
    43: 23,  # Master's - 2nd cycle

    5:  24,  # Doctorate
    44: 25,  # Doctorate - 3rd cycle

    34: 12   # Unknown -> roughly middle
}

# Fail loudly if a labelled code has no rank instead of silently losing rows later.
_unranked_codes = sorted(set(qualification_map) - set(qual_rank_map))
assert not _unranked_codes, f"qualification codes without a rank: {_unranked_codes}"

# Readable labels and ordinal ranks for each parent's qualification code.
df["Mother_qual_label"] = df["Mother's qualification"].map(qualification_map)
df["Father_qual_label"] = df["Father's qualification"].map(qualification_map)

df["Mother_qual_rank"] = df["Mother's qualification"].map(qual_rank_map)
df["Father_qual_rank"] = df["Father's qualification"].map(qual_rank_map)

# SES proxy: mean of the two parental ranks (NaN when either rank is missing).
df["SES_score"] = (df["Mother_qual_rank"] + df["Father_qual_rank"]) / 2

# Filter into a separate frame instead of overwriting `df`: other cells read `df`,
# and dropping rows here would make their results depend on whether this cell ran.
df_ses = df.dropna(subset=["Mother_qual_rank", "Father_qual_rank"]).copy()

# Report exclusions rather than dropping them silently.
n_excluded = len(df) - len(df_ses)
if n_excluded:
    print(
        f"Note: {n_excluded} students excluded from the SES view "
        "(parental qualification code without a rank)"
    )

# Tercile-style cut points on the SES score (33rd and 66th percentiles).
q1, q2 = df_ses["SES_score"].quantile([0.33, 0.66])

# Dropdown groups: the full ranked set plus three non-overlapping SES bands.
ses_groups = {
    "All SES": df_ses,
    "Low SES": df_ses[df_ses["SES_score"] <= q1],
    "Mid SES": df_ses[(df_ses["SES_score"] > q1) & (df_ses["SES_score"] < q2)],
    "High SES": df_ses[df_ses["SES_score"] >= q2],
}
ses_labels = list(ses_groups.keys())


fig_fin = go.Figure()

# (outcome, marker colour, marker opacity) in the order the traces are added.
# The dropdown logic relies on each SES group contributing the same trace sequence:
# one heatmap followed by one scatter per outcome listed here.
ses_outcome_styles = [
    ("Dropout", "red", 0.85),
    ("Graduate", "green", 0.8),
    ("Enrolled", "orange", 0.8),
]

for label in ses_labels:
    sub = ses_groups[label]
    # Only the "All SES" group is shown when the figure first renders.
    is_initial_group = label == "All SES"

    # 1) Density heatmap of mother rank vs. father rank
    fig_fin.add_trace(
        go.Histogram2d(
            x=sub["Mother_qual_rank"],
            y=sub["Father_qual_rank"],
            colorscale="Blues",
            zsmooth="best",
            nbinsx=25,
            nbinsy=25,
            name=f"{label} - student density",
            opacity=0.65,
            colorbar=dict(title="Student Density"),
            # Enable the colour scale on every group's heatmap. Only one heatmap is
            # visible at a time, so only one colorbar is drawn; previously it
            # vanished as soon as a subgroup was selected.
            showscale=True,
            visible=is_initial_group,
        )
    )

    # 2) One marker layer per outcome, with the parents' labels in the hover box
    for outcome, color, alpha in ses_outcome_styles:
        sub_outcome = sub[sub["Target"] == outcome]
        fig_fin.add_trace(
            go.Scatter(
                x=sub_outcome["Mother_qual_rank"],
                y=sub_outcome["Father_qual_rank"],
                mode="markers",
                name=f"{outcome} students ({color})",
                marker=dict(color=color, size=7, opacity=alpha),
                customdata=np.stack(
                    [sub_outcome["Mother_qual_label"], sub_outcome["Father_qual_label"]],
                    axis=-1
                ),
                # Only the first piece is an f-string; the %{...} placeholders
                # below are plain literals interpreted by Plotly.
                hovertemplate=(
                    f"Outcome: {outcome}<br>"
                    "Mother qual: %{customdata[0]}<br>"
                    "Father qual: %{customdata[1]}<extra></extra>"
                ),
                # Initial view highlights dropouts only; the other outcomes
                # appear once a dropdown entry is chosen.
                visible=is_initial_group and outcome == "Dropout",
            )
        )


total_traces = len(fig_fin.data)
# Traces were added group by group, so each SES group owns one contiguous run.
# Derive the run length from the figure instead of hard-coding it.
n_per_group = total_traces // len(ses_labels)
buttons = []

for i, label in enumerate(ses_labels):
    start = i * n_per_group
    end = start + n_per_group
    # Show exactly this group's traces and hide all others.
    visible = [start <= j < end for j in range(total_traces)]
    buttons.append(
        dict(
            label=label,
            method="update",
            args=[
                {"visible": visible},
                # Update only the title text so the font configured on the layout
                # is kept; a bare string would replace the whole title object.
                # The <b> markup matches the initial title.
                {"title.text": f"<b>Financial Stability Landscape ({label})</b>"},
            ],
        )
    )

# Figure-level settings: title, size, margins, legend strip, SES dropdown.
fig_fin.update_layout(
    title={
        "text": "<b>Financial Stability Landscape (All SES, Dropout Highlighted)</b>",
        "font": {"size": 16},
    },
    hovermode="closest",
    width=1400,
    height=650,
    margin={"l": 70, "r": 150, "t": 120, "b": 70},
    # Horizontal legend anchored just above the plotting area.
    legend={
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.03,
        "xanchor": "left",
        "x": 0,
        "bgcolor": "rgba(255,255,255,0.7)",
        "bordercolor": "lightgray",
        "borderwidth": 1,
    },
    # Dropdown that switches between the SES groups.
    updatemenus=[
        {
            "type": "dropdown",
            "showactive": True,
            "x": 1.08,
            "y": 1.25,
            "buttons": buttons,
        }
    ],
)

# Axis titles and tick styling (x = mother's rank, y = father's rank).
fig_fin.update_xaxes(
    title={
        "text": "<b>Mother's Qualification Rank (higher = more education)</b>",
        "font": {"size": 16},
    },
    tickfont={"size": 12},
)
fig_fin.update_yaxes(
    title={
        "text": "<b>Father's Qualification Rank (higher = more education)</b>",
        "font": {"size": 16},
    },
    tickfont={"size": 12},
)

# Render inline in the notebook.
fig_fin.show()

import plotly.io as pio

# Export a standalone HTML page; plotly.js is loaded from the CDN rather than
# embedded, and the file is not opened in a browser automatically.
pio.write_html(
    fig_fin,
    "viz1_ses.html",
    include_plotlyjs="cdn",
    full_html=True,
    auto_open=False,
)



In [None]:
import numpy as np
import plotly.graph_objects as go
import plotly.io as pio

# Columns plotted on the risk map: 1st-semester grade (x) and approved units (y).
grade_col = "Curricular units 1st sem (grade)"
units_col = "Curricular units 1st sem (approved)"

# Keep only rows where both plotted values and the outcome are present.
df_acad = df.dropna(subset=[grade_col, units_col, "Target"]).copy()

# Medians define the quadrant boundaries drawn on the chart.
grade_thr = df_acad[grade_col].median()
units_thr = df_acad[units_col].median()

# Optional financial fields for the hover box, included only if the column exists.
hover_cols = [
    candidate
    for candidate in ("Tuition fees up to date", "Debtor", "Scholarship holder")
    if candidate in df_acad.columns
]

fig_risk = go.Figure()

# Order in which outcomes are added to the figure, and the colour of each.
outcomes_order = ["Dropout", "Graduate", "Enrolled"]
outcome_colors = dict(Dropout="red", Graduate="green", Enrolled="orange")

# Position of each outcome's scatter / trend-line trace inside fig_risk.data,
# filled in while the traces are created and used later to build the dropdown.
scatter_indices, line_indices = {}, {}

for outcome in outcomes_order:
    color = outcome_colors[outcome]
    sub = df_acad[df_acad["Target"] == outcome].copy()
    # Only the dropout layer is visible when the figure first renders.
    shown_initially = outcome == "Dropout"

    # Build one hover string per student from column values (same row order as `sub`).
    grades = sub[grade_col].tolist()
    units = sub[units_col].tolist()
    extras = [sub[c].tolist() for c in hover_cols]
    hover_text = [
        "<br>".join(
            [
                f"Outcome: {outcome}",
                f"1st Sem Grade: {grade:.2f}",
                f"Units Approved: {approved}",
            ]
            + [f"{c}: {values[pos]}" for c, values in zip(hover_cols, extras)]
        )
        for pos, (grade, approved) in enumerate(zip(grades, units))
    ]

    # Marker layer for this outcome.
    fig_risk.add_trace(
        go.Scatter(
            x=sub[grade_col],
            y=sub[units_col],
            mode="markers",
            name=outcome,
            marker=dict(
                color=color,
                size=9,
                opacity=0.75,
                line=dict(width=0.4, color="black"),
            ),
            text=hover_text,
            hovertemplate="%{text}<extra></extra>",
            visible=shown_initially,
        )
    )
    scatter_indices[outcome] = len(fig_risk.data) - 1

    # A straight-line fit needs at least two points; otherwise record "no trend line".
    if len(sub) < 2:
        line_indices[outcome] = None
        continue

    ordered = sub.sort_values(grade_col)
    grade_values = ordered[grade_col].values
    unit_values = ordered[units_col].values

    # Least-squares line: approved units as a linear function of grade,
    # drawn across this outcome's observed grade range.
    slope, intercept = np.polyfit(grade_values, unit_values, 1)
    x_line = np.linspace(grade_values.min(), grade_values.max(), 100)
    y_line = slope * x_line + intercept

    fig_risk.add_trace(
        go.Scatter(
            x=x_line,
            y=y_line,
            mode="lines",
            name=f"{outcome} trend",
            line=dict(color=color, width=2),
            showlegend=False,
            visible=shown_initially,
        )
    )
    line_indices[outcome] = len(fig_risk.data) - 1

# Observed extent of both plotted variables, used for guide lines and label placement.
x_min, x_max = df_acad[grade_col].min(), df_acad[grade_col].max()
y_min, y_max = df_acad[units_col].min(), df_acad[units_col].max()

# Vertical dashed guide at the median grade, spanning the full range of approved units.
fig_risk.add_shape(
    type="line",
    x0=grade_thr, x1=grade_thr,
    y0=y_min, y1=y_max,
    line={"color": "grey", "width": 2, "dash": "dash"},
)

# Horizontal dashed guide at the median number of approved units, spanning the grade range.
fig_risk.add_shape(
    type="line",
    x0=x_min, x1=x_max,
    y0=units_thr, y1=units_thr,
    line={"color": "grey", "width": 2, "dash": "dash"},
)

# Label centred in the low-grade / low-approvals quadrant.
fig_risk.add_annotation(
    x=(x_min + grade_thr) / 2,
    y=(y_min + units_thr) / 2,
    text="Low grade<br>Low approvals<br><b>Higher dropout risk</b>",
    showarrow=False,
    font={"size": 11, "color": "firebrick"},
    bgcolor="rgba(255,230,230,0.85)",
)

# Label centred in the high-grade / high-approvals quadrant.
fig_risk.add_annotation(
    x=(x_max + grade_thr) / 2,
    y=(y_max + units_thr) / 2,
    text="High grade<br>High approvals<br><b>Lower dropout risk</b>",
    showarrow=False,
    font={"size": 11, "color": "darkgreen"},
    bgcolor="rgba(230,255,230,0.85)",
)

total_traces = len(fig_risk.data)

# First dropdown entry: every trace switched on.
visible_all = [True] * total_traces
buttons = [
    dict(
        label="All outcomes",
        method="update",
        args=[
            {"visible": visible_all},
            {"title": {"text": "<b>Academic Risk Map (All Outcomes)</b>"}},
        ],
    )
]

# One entry per outcome: show only that outcome's scatter and, if present, its trend line.
for outcome in outcomes_order:
    # line_indices may hold None (no trend line); None never matches a trace position.
    shown = {scatter_indices[outcome], line_indices[outcome]}
    visible = [position in shown for position in range(total_traces)]

    buttons.append(
        dict(
            label=f"{outcome} only",
            method="update",
            args=[
                {"visible": visible},
                {"title": {"text": f"<b>Academic Risk Map ({outcome} Only)</b>"}},
            ],
        )
    )

# Axis titles and tick styling.
fig_risk.update_xaxes(
    title={"text": "<b>1st Semester Average Grade</b>", "font": {"size": 16}},
    tickfont={"size": 12},
    rangeslider={"visible": False},
)
fig_risk.update_yaxes(
    title={"text": "<b>Units Approved in 1st Semester</b>", "font": {"size": 16}},
    tickfont={"size": 12},
)

# Figure-level settings: title, size, margins, legend strip, outcome dropdown.
fig_risk.update_layout(
    title={
        "text": "<b>Academic Risk Map (Dropout Only)</b>",
        "font": {"size": 20},
    },
    hovermode="closest",
    width=1100,
    height=650,
    margin={"l": 60, "r": 40, "t": 100, "b": 60},
    # Horizontal legend anchored just above the plotting area.
    legend={
        "title": "Student Outcome",
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "left",
        "x": 0,
        "bgcolor": "rgba(255,255,255,0.8)",
        "bordercolor": "lightgray",
        "borderwidth": 1,
    },
    updatemenus=[
        {
            "type": "dropdown",
            "showactive": True,
            "x": 1.12,
            "y": 1.15,
            "buttons": buttons,
            # Entry 1 is "Dropout only", matching the traces visible at first render.
            "active": 1,
        }
    ],
)

# Render inline in the notebook.
fig_risk.show()

# Export a standalone HTML page; plotly.js is loaded from the CDN rather than
# embedded, and the file is not opened in a browser automatically.
pio.write_html(
    fig_risk,
    "viz2_academic_risk.html",
    include_plotlyjs="cdn",
    full_html=True,
    auto_open=False,
)
