<a href="https://colab.research.google.com/github/YufeiM28/CAPP_Static_Project/blob/main/code/Milestone4_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
!pip -q install --upgrade altair==5.4.1 typing_extensions==4.12.2 vl-convert-python
import altair as alt
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [23]:
import altair as alt
import pandas as pd
import json
import urllib.request
# Turn off Altair's max-rows check on chart data (also done in the setup cell;
# repeated here so this cell works on its own).
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [24]:
# Load the three per-year school tables (file names suggest they are pre-cleaned extracts).
df_1112, df_1819, df_2324 = (
    pd.read_csv(csv_path)
    for csv_path in ("1112_clean.csv", "1819_clean.csv", "2324_clean.csv")
)

# Plot 1

In [25]:
# Rate columns that Plot 1 averages for each academic year.
RATE_COLUMNS = ("College_Enrollment_Rate_School", "Graduation_Rate_School")

for df in (df_1112, df_1819, df_2324):
    # Strip stray whitespace from the header names; assigning to .columns
    # updates the frame itself, so later cells see the cleaned names.
    df.columns = df.columns.str.strip()
    # No separate "NDA" replacement is needed: errors="coerce" turns "NDA" and any
    # other non-numeric entry into NaN, which .mean() skips. (Rebinding the loop
    # variable with `df = df.replace(...)` would not modify the frames anyway.)
    for col in RATE_COLUMNS:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# One row per academic year with the mean of each rate across schools.
mean_df = pd.DataFrame({
    "Academic_Year": ["2011–12", "2018–19", "2023–24"],
    "College Enrollment Rate": [
        frame["College_Enrollment_Rate_School"].mean()
        for frame in (df_1112, df_1819, df_2324)],
    "Graduation Rate": [
        frame["Graduation_Rate_School"].mean()
        for frame in (df_1112, df_1819, df_2324)]
    })

In [26]:
# plot
# Long form: one row per (academic year, rate type) so the bars can be grouped.
plot_df = mean_df.melt(id_vars="Academic_Year", var_name="Rate Type", value_name="Average Rate")

year_axis = alt.X("Academic_Year:N", title="Academic Year", axis=alt.Axis(labelAngle=0))
rate_axis = alt.Y(
    "Average Rate:Q",
    title="Average Rate (%)",
    axis=alt.Axis(grid=False),
    scale=alt.Scale(domain=[0, 100]),
)
rate_color = alt.Color(
    "Rate Type:N",
    title="",
    scale=alt.Scale(
        domain=["College Enrollment Rate", "Graduation Rate"],
        range=["#90bded", "#1b9e77"],
    ),
    legend=alt.Legend(titleFontSize=13, labelFontSize=12),
)

# Grouped bars: xOffset places the two rate types side by side within each year.
chart = (
    alt.Chart(plot_df)
    .mark_bar()
    .encode(
        x=year_axis,
        xOffset=alt.XOffset("Rate Type:N"),
        y=rate_axis,
        color=rate_color,
    )
    .properties(
        width=560,
        height=380,
        title=alt.TitleParams(
            text="Trends in High School Graduation and College Enrollment Rates"
        ),
    )
)

# Value labels, shifted 6px upward (dy=-6) from the bar's y value.
labels = (
    alt.Chart(plot_df)
    .mark_text(dy=-6, fontSize=11)
    .encode(
        x="Academic_Year:N",
        xOffset="Rate Type:N",
        y="Average Rate:Q",
        text=alt.Text("Average Rate:Q", format=".1f"),
        detail="Rate Type:N",
    )
)

# Layer bars and labels, and remove the frame around the plotting area.
plot1 = alt.layer(chart, labels).configure_view(strokeWidth=0)

plot1.save("plot1.svg")

plot1


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(

Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(


# Plot 2

In [27]:
# data clean
year_labels = ["2011-12", "2018-19", "2023-24"]
attendance_cols = ["Student_Attendance_Avg", "Teacher_Attendance_Avg"]

# Tag each year and drop schools missing either attendance figure.
# .assign() and .dropna() return new frames, so df_1112 / df_1819 / df_2324 are
# left untouched: an in-place dropna here would silently remove those schools
# from every later plot that reads the same frames.
dfs = [
    frame.assign(School_Year=label).dropna(subset=attendance_cols)
    for frame, label in zip((df_1112, df_1819, df_2324), year_labels)
]

# Stack the three years and keep a CSV copy of the wide table.
merged_wide = pd.concat(dfs, ignore_index=True)
merged_wide.to_csv("attendance_merged_wide.csv", index=False)

# Long form: one row per (school, year, role) with a readable role label.
merged_long = merged_wide.melt(
    id_vars=["School_ID", "School_Year"],
    value_vars=attendance_cols,
    var_name="Role", value_name="Attendance"
    ).replace({"Role": {"Student_Attendance_Avg": "Student",
    "Teacher_Attendance_Avg": "Teacher"}})

# Non-numeric attendance entries become NaN and are dropped together with any
# row lacking a year or role.
merged_long["Attendance"] = pd.to_numeric(merged_long["Attendance"], errors="coerce")
merged_long = merged_long.dropna(subset=["Attendance", "School_Year", "Role"])

In [28]:
# ChatGPT was used to help me learn how to build a jitter plot in Altair.
# Each school year is mapped to its position in year_labels (xIndex) and then
# nudged by a random horizontal offset (jitter) so overlapping points spread out.
jittered = alt.Chart(merged_long).transform_calculate(
    xIndex=f"indexof({year_labels!r}, datum.School_Year)",
    jitter="(random() - 0.5) * 0.45",
    xj="datum.xIndex + datum.jitter",
)

# Ticks only at the three year positions, relabelled with the year strings.
year_axis = alt.Axis(
    title="School Year",
    grid=False,
    values=[0, 1, 2],
    labelExpr=f"{year_labels!r}[datum.value]",
)
attendance_y = alt.Y(
    "Attendance:Q",
    title="Attendance (%)",
    scale=alt.Scale(domain=[0, 100]),
    axis=alt.Axis(titleFontSize=13, labelFontSize=12),
)
role_color = alt.Color(
    "Role:N",
    scale=alt.Scale(domain=["Student", "Teacher"], range=["#99cdee", "#ee9999"]),
    legend=alt.Legend(titleFontSize=13, labelFontSize=12),
)

plot2 = (
    jittered
    .mark_circle(size=50, opacity=0.7)
    .encode(
        x=alt.X("xj:Q", axis=year_axis),
        y=attendance_y,
        color=role_color,
    )
    .properties(
        width=560,
        height=380,
        title=alt.TitleParams(
            text="Student and Teacher Attendance across School Year"
        ),
    )
)

plot2.save("plot2.svg")

plot2


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(

Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(


# Plot 3

In [29]:
# get the map of chicago
# GeoJSON of Chicago community areas, used as the base layer of the map.
url = "https://raw.githubusercontent.com/RandomFractals/ChicagoCrimes/master/data/chicago-community-areas.geojson"

# A timeout keeps a stalled connection from hanging the cell indefinitely;
# urlopen raises instead once 30 seconds pass without a response.
with urllib.request.urlopen(url, timeout=30) as response:
    chicago_geo = json.load(response)

# Only schools with both coordinates can be placed on the map.
schools = df_2324[["School_Latitude", "School_Longitude"]].dropna()

# Base layer: shapes drawn from the "features" array of the GeoJSON document.
base = (alt.Chart(alt.Data(values=chicago_geo, format=alt.DataFormat(property="features")))
    .mark_geoshape(fill="#f7f7f7", stroke="#bdbdbd", strokeWidth=0.5))

# One circle per school, positioned by longitude/latitude.
points = (alt.Chart(schools)
    .mark_circle(size=50, opacity=0.75, color="#90bded")
    .encode(longitude="School_Longitude:Q", latitude="School_Latitude:Q"))

# Both layers share one mercator projection so the points line up with the shapes.
plot3 = (base + points).project(type="mercator").properties(
    width=650,
    height=550,
    title=alt.TitleParams(
        text="Chicago Public High Schools (2023–2024)"))

plot3.save("plot3.svg")


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(


# Plot 4

In [30]:
# clean
def coerce_numeric_series(s):
    """Convert a Series of number-like text to numeric values.

    Every entry is cast to a string, surrounding whitespace is trimmed and any
    "%" characters are removed before parsing. Entries that still cannot be
    parsed as numbers come back as NaN.
    """
    cleaned = s.astype(str).str.strip().str.replace("%", "", regex=False)
    return pd.to_numeric(cleaned, errors="coerce")

def prep(df, year_label):
    """Build the SAT vs. college-enrollment table for one school year.

    Takes School_ID plus the two measure columns from ``df`` (on a copy),
    converts both measures with ``coerce_numeric_series``, adds a constant
    ``School_Year`` column set to ``year_label`` and returns only the rows
    where both measures are present.
    """
    measure_cols = ["SAT_Grade_11_Score_School_Avg", "College_Enrollment_Rate_School"]
    subset = df[["School_ID", *measure_cols]].copy()
    for col in measure_cols:
        subset[col] = coerce_numeric_series(subset[col])
    subset["School_Year"] = year_label
    return subset.dropna(subset=measure_cols)

# Year labels and their matching point colours for Plot 4.
year_order = ["2018-19", "2023-24"]
colors = ["#fdbe6b", "#fd6b6b"]

# Prepare each year separately, then stack them into one scatter table.
df_1819_sc, df_2324_sc = (
    prep(frame, label)
    for frame, label in zip((df_1819, df_2324), year_order)
)

scatter_df = pd.concat([df_1819_sc, df_2324_sc], ignore_index=True)

In [31]:
# plot
# Axes use fixed domains: SAT averages on 600-1600, enrollment rate on 0-100.
sat_x = alt.X(
    "SAT_Grade_11_Score_School_Avg:Q",
    title="School Averaged SAT Scores",
    scale=alt.Scale(domain=[600, 1600]),
)
enrollment_y = alt.Y(
    "College_Enrollment_Rate_School:Q",
    title="College Enrollment Rate (%)",
    scale=alt.Scale(domain=[0, 100]),
)
# Colour separates the two school years, using the year_order / colors pairing.
year_color = alt.Color(
    "School_Year:N",
    title="School Year",
    scale=alt.Scale(domain=year_order, range=colors),
)

plot4 = (
    alt.Chart(scatter_df)
    .mark_circle(size=50, opacity=0.7)
    .encode(x=sat_x, y=enrollment_y, color=year_color)
    .properties(
        title="Relationship between College Enrollment Rate and SAT Score",
        width=560,
        height=300,
    )
)

plot4.save("plot4.svg")

plot4


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(

Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(


# Plot 5

In [32]:
# clean data
cols_scores = ["Family_Involvement_Score", "Environment Score", "Instruction Score"]
keep = ["School_ID", "Student_Attendance_Avg"] + cols_scores

# Work on a copy of the 2011-12 columns and coerce every measure to numeric;
# entries that cannot be parsed become NaN.
scores_wide = df_1112[keep].copy()
scores_wide = scores_wide.assign(**{
    col: pd.to_numeric(scores_wide[col], errors="coerce")
    for col in cols_scores + ["Student_Attendance_Avg"]
})

# Long form: one row per (school, score domain), keeping only complete pairs,
# with the raw column names swapped for display labels.
scores_long = (
    scores_wide
    .melt(
        id_vars=["School_ID", "Student_Attendance_Avg"],
        value_vars=cols_scores,
        var_name="Domain",
        value_name="Score",
    )
    .dropna(subset=["Score", "Student_Attendance_Avg"])
    .assign(Domain=lambda frame: frame["Domain"].map({
        "Family_Involvement_Score": "Family Involvement",
        "Environment Score": "Environment",
        "Instruction Score": "Instruction"}))
)

# color
domain = ["Family Involvement", "Environment", "Instruction"]
palette = ["#1b9e77", "#80b1d3", "#fb9a99"]
domain_scale = alt.Scale(domain=domain, range=palette)

# plot
scatter = (
    alt.Chart(scores_long)
    .mark_circle(size=35, opacity=0.3)
    .encode(
        x=alt.X("Score:Q", title="Evaluation Score", scale=alt.Scale(domain=[0, 100])),
        y=alt.Y(
            "Student_Attendance_Avg:Q",
            title="Student Attendance (%)",
            scale=alt.Scale(domain=[60, 100]),
        ),
        color=alt.Color("Domain:N", title="Score", scale=domain_scale),
    )
)

# fit lines: one regression of attendance on score per domain
trend = (
    alt.Chart(scores_long)
    .transform_regression("Score", "Student_Attendance_Avg", groupby=["Domain"])
    .mark_line(size=2.5)
    .encode(
        x="Score:Q",
        y="Student_Attendance_Avg:Q",
        color=alt.Color("Domain:N", scale=domain_scale),
    )
)

plot5 = alt.layer(scatter, trend).properties(
    width=560,
    height=380,
    title=alt.TitleParams(text="Influential Factors towards Students' Attendance Rate"),
)

plot5.save("plot5.svg")

plot5


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(

Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(


# Plot 6 & 7

In [33]:
# clean data & build dataframe
def prep_creative(df, year_label):
    """Extract usable Creative School Certification labels for one school year.

    Args:
        df: frame containing ``School_ID`` and ``Creative_School_Certification``.
        year_label: value written to the ``School_Year`` column of the result.

    Returns:
        A new frame with columns ``School_ID``, ``Creative_School_Certification``
        (whitespace-trimmed, upper-cased) and ``School_Year``. Rows whose label is
        missing or is one of the placeholders "NDA", "NOT ENOUGH DATA",
        "INCOMPLETE DATA" are excluded. ``df`` itself is not modified.
    """
    col = "Creative_School_Certification"
    placeholders = ["NDA", "NOT ENOUGH DATA", "INCOMPLETE DATA"]

    # Drop missing values BEFORE the string conversion: astype(str) would turn
    # NaN/None into the literal text "nan"/"None", which dropna can no longer see.
    d = df[["School_ID", col]].dropna(subset=[col]).copy()
    d[col] = d[col].astype(str).str.strip().str.upper()

    # assign() returns a fresh frame, so the filtered slice is never written to.
    return d[~d[col].isin(placeholders)].assign(School_Year=year_label)

# Constants for the certification chart: labels treated as missing, the
# certification levels from lowest to highest, and the x-axis year order.
missing = {"NDA", "NOT ENOUGH DATA", "INCOMPLETE DATA"}
level_order = ["EMERGING", "DEVELOPING", "STRONG", "EXCELLING"]
rank_map = {level: rank for rank, level in enumerate(level_order, start=1)}
year_order = ["2018-19", "2023-24"]

# Per-year certification tables, stacked into one frame.
df_1819_ldr = prep_creative(df_1819, "2018-19")
df_2324_ldr = prep_creative(df_2324, "2023-24")
combined = pd.concat([df_1819_ldr, df_2324_ldr], ignore_index=True)

# Number of schools per (year, certification label), computed before the
# level filter below is applied.
summary = (
    combined
    .groupby(["School_Year", "Creative_School_Certification"])
    .size()
    .rename("Count")
    .reset_index()
)

# Keep only the four recognised levels and attach a numeric rank, which the
# chart uses to order the stacked segments.
combined = combined.loc[combined["Creative_School_Certification"].isin(level_order)].copy()
combined["level_rank"] = combined["Creative_School_Certification"].map(rank_map)

In [34]:
# plot
# Normalised stacked bars: each year's bar is scaled to 100% of its schools.
year_x = alt.X("School_Year:N", sort=year_order, title=None, axis=alt.Axis(labelAngle=0))
share_y = alt.Y(
    "count():Q",
    stack="normalize",
    axis=alt.Axis(format="%", title="Share of Schools"),
)
certification_color = alt.Color(
    "Creative_School_Certification:N",
    title="Creativity Certification",
    sort=["EMERGING", "DEVELOPING", "STRONG", "EXCELLING"],
    scale=alt.Scale(range=["#b8e3b0", "#78c679", "#31a354", "#006d2c"]),
)
# Stack order follows level_rank, highest rank first.
stack_order = alt.Order("level_rank:Q", sort="descending")

plot6 = (
    alt.Chart(combined)
    .mark_bar()
    .encode(x=year_x, y=share_y, color=certification_color, order=stack_order)
    .properties(title="Share of Schools by Creativity Level", width=400, height=200)
    .configure_view(strokeWidth=0)
    .configure_axis(grid=False)
)

plot6.save("plot6.svg")

plot6


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(

Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(


In [35]:
# clean data & build dataframe
def prep_leadership(df, year_label):
    """Extract usable Effective Leaders survey ratings for one school year.

    Args:
        df: frame containing ``School_ID`` and ``School_Survey_Effective_Leaders``.
        year_label: value written to the ``School_Year`` column of the result.

    Returns:
        A new frame with columns ``School_ID``, ``School_Survey_Effective_Leaders``
        (whitespace-trimmed, upper-cased) and ``School_Year``. Rows whose rating is
        missing or is one of the placeholders "NDA", "NOT ENOUGH DATA",
        "INCOMPLETE DATA" are excluded. ``df`` itself is not modified.
    """
    col = "School_Survey_Effective_Leaders"
    placeholders = ["NDA", "NOT ENOUGH DATA", "INCOMPLETE DATA"]

    # Drop missing values BEFORE the string conversion: astype(str) would turn
    # NaN/None into the literal text "nan"/"None", which dropna can no longer see.
    d = df[["School_ID", col]].dropna(subset=[col]).copy()
    d[col] = d[col].astype(str).str.strip().str.upper()

    # assign() returns a fresh frame, so the filtered slice is never written to.
    return d[~d[col].isin(placeholders)].assign(School_Year=year_label)

# Constants for the leadership chart: labels treated as missing, the rating
# levels from lowest to highest, and the x-axis year order.
missing = {"NDA", "NOT ENOUGH DATA", "INCOMPLETE DATA"}
level_order = ["WEAK", "NEUTRAL", "STRONG", "VERY STRONG"]
rank_map = {level: rank for rank, level in enumerate(level_order, start=1)}
year_order = ["2018-19", "2023-24"]

# Per-year leadership tables, stacked into one frame.
df_1819_ldr = prep_leadership(df_1819, "2018-19")
df_2324_ldr = prep_leadership(df_2324, "2023-24")
combined = pd.concat([df_1819_ldr, df_2324_ldr], ignore_index=True)

# Keep only the four recognised ratings and attach a numeric rank, which the
# chart uses to order the stacked segments.
combined = combined.loc[combined["School_Survey_Effective_Leaders"].isin(level_order)].copy()
combined["level_rank"] = combined["School_Survey_Effective_Leaders"].map(rank_map)

In [36]:
# plot
# Normalised stacked bars: each year's bar is scaled to 100% of its schools.
year_x = alt.X("School_Year:N", sort=year_order, title=None, axis=alt.Axis(labelAngle=0))
share_y = alt.Y(
    "count():Q",
    stack="normalize",
    axis=alt.Axis(format="%", title="Share of Schools"),
)
leadership_color = alt.Color(
    "School_Survey_Effective_Leaders:N",
    title="Effective Leadership Education",
    sort=["WEAK", "NEUTRAL", "STRONG", "VERY STRONG"],
    scale=alt.Scale(range=["#afbede", "#7993c7", "#4869ac", "#314876"]),
)
# Stack order follows level_rank, highest rank first.
stack_order = alt.Order("level_rank:Q", sort="descending")

plot7 = (
    alt.Chart(combined)
    .mark_bar()
    .encode(x=year_x, y=share_y, color=leadership_color, order=stack_order)
    .properties(title="Share of Schools by Leadership Education", width=400, height=200)
    .configure_view(strokeWidth=0)
    .configure_axis(grid=False)
)

plot7.save("plot7.svg")

plot7


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(

Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(
