In [26]:
import os

import altair as alt
import pandas as pd
from vega_datasets import data

# Notebook-wide display settings.
# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None
# Lift Altair's row limit so the full arrests table can be charted.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Create Helper for Saving Outputs

In [27]:
def save_chart(chart, filename, fmt="svg", scale=2):
    """Write a chart to ``charts/<filename>.<fmt>``.

    Parameters
    ----------
    chart : object with a ``save(path, format=..., scale_factor=...)`` method
        The Altair chart to export.
    filename : str
        Output file name without extension.
    fmt : str, default "svg"
        Output format, also used as the file extension.
    scale : int or float, default 2
        Size/resolution multiplier forwarded to the chart as ``scale_factor``.
    """
    filepath = f"charts/{filename}.{fmt}"
    # Create the output folder on demand so a fresh checkout does not fail.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # Altair's keyword is ``scale_factor``; a plain ``scale=`` keyword is not a
    # parameter of ``Chart.save``, so the requested scale was never applied.
    chart.save(filepath, format=fmt, scale_factor=scale)

## Load & Prep Data

In [28]:
# Raw arrest records; arrests_dedupe (and every chart) is derived from this frame.
ARRESTS_XLSX = "data/arrests-latest.xlsx"
arrests_df = pd.read_excel(ARRESTS_XLSX)

In [29]:
# Roll up duplicates: keep only the first row seen for each unique_identifier.
# Asked AI: how to keep the first value of duplicate rows in pandas and it gave this:
# df.drop_duplicates(subset=["A"], keep="first")
dedupe_keys = ["unique_identifier"]
arrests_dedupe = arrests_df.drop_duplicates(subset=dedupe_keys, keep="first")
# Row counts when last run: len(arrests_df) = 291722, len(arrests_dedupe) = 274017

## Create Visualizations

In [None]:
# Hide the "..." actions button shown at the top right of rendered charts.
embed_options = {"actions": False}
alt.renderers.set_embed_options(**embed_options)


# Configure global look (set palette)
# Asked AI: what decorator can i use to set global theme with altair:
# @alt.theme.register("my_global_theme") and gave a sample theme function that I adjusted for my teal theme
@alt.theme.register("teal_theme", enable=True)
def teal_theme():
    """Return the notebook-wide theme: teal colour ranges and a teal default mark."""
    teal_ranges = {
        range_name: {"scheme": "teals"} for range_name in ("category", "heatmap")
    }
    theme_config = {
        "range": teal_ranges,
        "mark": {"color": "#008080"},
    }
    return {"config": theme_config}


In [32]:
# Monthly arrest totals as a line chart, with a marker for the January 2025 inauguration.
# Limit the frame to the date column to speed up run time, and exclude July 2025
# because it is an incomplete month.
is_july_2025 = (arrests_dedupe["apprehension_date"].dt.month == 7) & (
    arrests_dedupe["apprehension_date"].dt.year == 2025
)
arrests_date = arrests_dedupe.loc[~is_july_2025, ["apprehension_date"]].copy()

base = (
    alt.Chart(arrests_date)
    .mark_line()
    .encode(
        # The title is set once, on the axis: when a channel-level title and an
        # axis title are both given, Vega-Lite renders the axis title.
        alt.X(
            "yearmonth(apprehension_date):T",
            axis=alt.Axis(
                title="Date (Month & Year)",
                tickCount="month",  # ensures monthly tick marks
                format="%b %Y",  # e.g. Jan 2024, Feb 2024
                labelAngle=-45,  # slanted labels for readability
            ),
        ),
        alt.Y("count():Q").title("Total Monthly Arrests"),
    )
    .properties(width=400)
)

# One-row frame shared by the rule and its label. yearmonth() truncates the date
# to its month, so the marker lines up with the January 2025 point on the line.
inauguration = pd.DataFrame(
    {
        "apprehension_date": [pd.Timestamp("2025-01-20")],
        "label": ["Trump Inaugurated"],
    }
)

rule = (
    alt.Chart(inauguration)
    .mark_rule(color="red", strokeWidth=2)
    .encode(x="yearmonth(apprehension_date):T")
)

# y is a fixed pixel position (0 = top of the plot area), so the label needs no
# data-driven y value.
text = (
    alt.Chart(inauguration)
    .mark_text(align="left", dx=5, dy=-5, color="red")
    .encode(x="yearmonth(apprehension_date):T", y=alt.value(0), text="label")
)

chart = (base + rule + text).properties(title="ICE Arrests (Sept 2023 - June 2025)")
save_chart(chart, "arrests_over_time")

In [None]:
# TODO: look into a better source for state centroids than using capitals.

# --- Clean state names -------------------------------------------------------
# Drop rows with no state, then Title Case the rest to match vega_datasets'
# "state" names (eg 'SOUTH CAROLINA' -> 'South Carolina').
arrest_states = arrests_dedupe.dropna(subset=["apprehension_state"]).copy()
title_cased = arrest_states["apprehension_state"].astype(str).str.strip().str.title()
# Fix small title-case quirks (e.g. 'Of' in DC).
arrest_states["state"] = title_cased.replace(
    {"District Of Columbia": "District of Columbia"}
)

# --- Count arrests per state -------------------------------------------------
df = arrest_states.dropna(subset=["state"])
by_state = (
    df.groupby("state", as_index=False).size().rename(columns={"size": "arrests"})
)

# --- Map inputs from vega_datasets -------------------------------------------
# Base US states topology.
states = alt.topo_feature(data.us_10m.url, "states")
# State capital coordinates stand in for state centroids.
capitals = pd.DataFrame(data.us_state_capitals())
centroids = capitals[["state", "lat", "lon"]].copy()
# Inner join: only states present in both tables are plotted.
plot_df = centroids.merge(by_state, on="state", how="inner")

# --- Layers ------------------------------------------------------------------
# Light grey state outlines as the backdrop.
background = (
    alt.Chart(states)
    .mark_geoshape(fill="#eeeeee", stroke="#ffffffb0")
    .project("albersUsa")
)

# Proportional symbols: circle size encodes the arrest count.
bubble_size = alt.Size(
    "arrests:Q", title="Arrests", scale=alt.Scale(range=[0, 2500])
)
points = (
    alt.Chart(plot_df)
    .mark_circle(opacity=0.7)
    .encode(longitude="lon:Q", latitude="lat:Q", size=bubble_size)
    .project("albersUsa")
)

chart = (background + points).properties(
    title="ICE Arrests by State (Sept 2023 - July 2025)", width=500, height=400
)

save_chart(chart, "arrests_by_state")


### Demographics

In [126]:
# Monthly arrests, split by whether the departure country differs from the
# citizenship country.

# Only rows where both countries are known can be compared.
diff_departures = arrests_dedupe.dropna(
    subset=["citizenship_country", "departure_country"]
)
# Exclude July 2025 for the time series (incomplete month).
is_july_2025 = (diff_departures["apprehension_date"].dt.month == 7) & (
    diff_departures["apprehension_date"].dt.year == 2025
)
# .copy() makes the filtered rows an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
diff_departures = diff_departures[~is_july_2025].copy()

# Flag rows where departure and citizenship country differ.
diff_departures["different_country_flag"] = (
    diff_departures["citizenship_country"] != diff_departures["departure_country"]
)

# Share of compared arrests with differing countries (~2.16% when last run).
# The mean of a boolean column is the fraction of True values; unlike a manual
# count / len division it yields NaN rather than raising on an empty frame.
diff_countries_perc = diff_departures["different_country_flag"].mean() * 100

# Readable legend labels (True/False -> Yes/No).
diff_departures["different_country_label"] = diff_departures[
    "different_country_flag"
].map({True: "Yes", False: "No"})

# Pass only the two plotted columns to Altair: the whole frame would otherwise
# be embedded in the chart spec, which slows rendering and saving.
chart_data = diff_departures[["apprehension_date", "different_country_label"]]

chart = (
    alt.Chart(chart_data)
    .mark_area(opacity=0.8)
    .encode(
        x=alt.X("yearmonth(apprehension_date):T", title="Date"),
        y=alt.Y("count():Q", title="Arrest count"),
        color=alt.Color(
            "different_country_label:N",
            title="Different country?",
            scale=alt.Scale(
                domain=["No", "Yes"],
                range=["#cccccc", "#00796B"],
            ),
        ),
    )
    .properties(
        width=500,
        height=300,
        title="ICE Arrests by Month — Origin vs Departure Country",
    )
)

save_chart(chart, "time_diff_countries")

In [127]:
# Heatmap of citizenship -> departure country pairs, restricted to arrests where
# the two countries differ.

# Number of departure countries and of citizenship countries kept on each axis.
# The chart title is built from this value so the two cannot drift apart.
TOP_K = 7

diff_departures = arrests_dedupe.dropna(
    subset=["citizenship_country", "departure_country"]
)

# Keep only rows where citizenship and departure differ.
diff_departures = diff_departures[
    diff_departures["citizenship_country"] != diff_departures["departure_country"]
]

# Count pairs (citizenship -> departure).
pairs = (
    diff_departures.groupby(
        ["citizenship_country", "departure_country"], as_index=False
    )
    .size()
    .rename(columns={"size": "count"})
)

# Top departure countries by total count.
top_dests = pairs.groupby("departure_country")["count"].sum().nlargest(TOP_K).index
pairs_top_dest = pairs[pairs["departure_country"].isin(top_dests)]

# Within that subset, the top citizenship countries.
top_origins = (
    pairs_top_dest.groupby("citizenship_country")["count"].sum().nlargest(TOP_K).index
)
pairs_top = pairs_top_dest[
    pairs_top_dest["citizenship_country"].isin(top_origins)
].copy()

# Axis order: largest totals first.
origin_order = (
    pairs_top.groupby("citizenship_country")["count"]
    .sum()
    .sort_values(ascending=False)
    .index.tolist()
)
dest_order = (
    pairs_top.groupby("departure_country")["count"]
    .sum()
    .sort_values(ascending=False)
    .index.tolist()
)

# Normalize within each citizenship (row-wise) on the top-K x top-K subset, so
# each heatmap row sums to 1.
pairs_top["share_within_cit"] = pairs_top["count"] / pairs_top.groupby(
    "citizenship_country"
)["count"].transform("sum")

heatmap = (
    alt.Chart(pairs_top)
    .mark_rect()
    .encode(
        x=alt.X(
            "departure_country:N",
            sort=dest_order,
            title="Departure Country",
            axis=alt.Axis(labelAngle=-45),
        ),
        y=alt.Y(
            "citizenship_country:N", sort=origin_order, title="Citizenship Country"
        ),
        color=alt.Color(
            "share_within_cit:Q",
            title="Share within citizenship",
            # Alternative scheme considered: yellowgreenblue.
            scale=alt.Scale(domain=[0, 1], scheme="purplebluegreen"),
        ),
    )
    .properties(
        width=500,
        height=400,
        title=f"Deportations to Countries Differing from Person's Origin (Top {TOP_K})",
    )
)

save_chart(heatmap, "diff_countries_heatmap")

In [37]:
# Bar chart of the ten citizenship countries with the most arrests.

# Count arrests per citizenship country and keep the ten largest as a tidy
# two-column frame (citizenship_country, arrests).
arrests_citizen = arrests_dedupe[["citizenship_country"]].copy()
citizenship_counts = arrests_citizen["citizenship_country"].value_counts()
top_n = (
    citizenship_counts.nlargest(10)
    .rename_axis("citizenship_country")
    .reset_index(name="arrests")
)

# Bars are ordered by descending arrest count (sort="-y").
country_axis = alt.X(
    "citizenship_country:N", sort="-y", axis=alt.Axis(labelAngle=-45)
).title("Citizenship Country")
arrests_axis = alt.Y("arrests:Q").title("Arrests")

chart = (
    alt.Chart(top_n).mark_bar().encode(country_axis, arrests_axis)
).properties(title="Top 10 Countries of Origin")

save_chart(chart, "citizenship_country")

In [38]:
# Pie chart: share of all arrests made up by the top 5 citizenship countries.

# Number of leading countries grouped into the first slice; the labels and the
# title below are built from it.
PIE_TOP_K = 5

# prep data
total = arrests_dedupe.shape[0]
# top_n holds the ten largest countries, so take only the PIE_TOP_K largest of
# those; summing the whole frame would label a top-10 total as "Top 5".
top_n_sum = top_n["arrests"].nlargest(PIE_TOP_K).sum()
top_n_perc = (top_n_sum / total) * 100
remaining_perc = ((total - top_n_sum) / total) * 100

# Asked AI: whats the syntax for creating pandas df; it showed building a dict
# of column name -> list of values and passing it to pd.DataFrame(...).
pie_data = pd.DataFrame(
    {
        "category": [f"Top {PIE_TOP_K} Countries", "All Other Countries"],
        "arrest_perc": [top_n_perc, remaining_perc],
    }
)

# create chart
chart = (
    alt.Chart(pie_data)
    .mark_arc()
    .encode(
        alt.Theta("arrest_perc:Q", title="Percent of Total Arrests"),
        alt.Color("category:N", title="Citizenship Country"),
    )
    .properties(
        title=(
            "Share of Total Arrests by Citizenship Country: "
            f"Top {PIE_TOP_K} vs. All Other"
        )
    )
)

save_chart(chart, "pie_top5_countries")

In [39]:
# Working copy with only the columns the apprehension-method charts use.
method_cols = ["apprehension_method", "apprehension_date", "apprehension_criminality"]
methods = arrests_dedupe[method_cols].copy()


# bucket apprehension methods (include as a reference in the final graphic)
# Listed bucket-first for readability, then flattened into the
# method -> bucket lookup that the rest of the notebook uses.
_methods_by_bucket = {
    "Criminal Alien Program": [
        "CAP Local Incarceration",
        "CAP Federal Incarceration",
        "CAP State Incarceration",
        "Criminal Alien Program",
    ],
    "Enforcement and Removal Operations": [
        "Non-Custodial Arrest",
        "Custodial Arrest",
        "ERO Reprocessed Arrest",
    ],
    "Task Force / Other Agency": [
        "Other efforts",
        "Other Task Force",
        "Other Agency (turned over to INS)",
        "Law Enforcement Agency Response Unit",
        "Organized Crime Drug Enforcement Task Force",
    ],
    "287g / Probation": [
        "287(g) Program",
        "Probation and Parole",
    ],
    "Border / Patrol": [
        "Patrol Border",
        "Patrol Interior",
        "Located",
    ],
    "Worksite / Inspections": [
        "Worksite Enforcement",
        "Inspections",
        "Anti-Smuggling",
        "Traffic Check",
    ],
    "Transportation": [
        "Transportation Check Aircraft",
        "Transportation Check Bus",
        "Transportation Check Passenger Train",
        "Presented During Inspection",
        "Boat Patrol",
        "Crewman/Stowaway",
    ],
}
bucket_map = {
    method: bucket
    for bucket, bucket_methods in _methods_by_bucket.items()
    for method in bucket_methods
}

# asked AI: how to extract year from date column in pandas: df["year"] = df["date"].dt.year
# Add the arrest year and the method bucket; methods missing from bucket_map
# fall into "Miscellaneous".
methods = methods.assign(
    year=methods["apprehension_date"].dt.year,
    method_bucket=methods["apprehension_method"]
    .map(bucket_map)
    .fillna("Miscellaneous"),
)

# Arrest counts per (method bucket, year) ...
rolled = methods.groupby(["method_bucket", "year"]).size().reset_index(name="count")
# ... and each count as a percentage of that year's total.
yearly_totals = rolled.groupby("year")["count"].transform("sum")
rolled["perc_of_total_year"] = (rolled["count"] / yearly_totals) * 100

# Heatmap: one cell per bucket and year, coloured by share of the year's arrests.
year_axis = alt.X("year:N", axis=alt.Axis(labelAngle=-45)).title("Arrest Year")
bucket_axis = alt.Y("method_bucket:N", title="Apprehension Method")
share_colour = alt.Color("perc_of_total_year:Q", title="Annual Arrest Percentage")

chart = (
    alt.Chart(rolled)
    .mark_rect()
    .encode(year_axis, bucket_axis, share_colour)
    .properties(title="Apprehension Method by Year", width=200)
)

save_chart(chart, "heatmap_method")

In [129]:
# Exploratory heatmap: change in arrest count from the previous year, per
# apprehension method. The earliest year of each method has no previous year to
# compare against, so its change is NaN.
rolled_sorted = rolled.sort_values(["method_bucket", "year"])
# pct_change() returns a fraction (1.0 means +100%), not a percentage.
rolled_sorted["perc_change_count"] = rolled_sorted.groupby("method_bucket")[
    "count"
].pct_change()

(
    alt.Chart(rolled_sorted)
    .mark_rect()
    .encode(
        alt.X("year:N", title="Arrest Year"),
        alt.Y("method_bucket:N", title="Apprehension Method"),
        alt.Color(
            "perc_change_count:Q",
            title="Annual % Change",
            # Show the fractional values as percentages so the legend matches
            # its "% Change" title.
            legend=alt.Legend(format=".0%"),
            scale=alt.Scale(
                scheme="redblue",
                domain=[
                    -2,
                    rolled_sorted["perc_change_count"].max(),
                ],
                domainMid=0,  # force diverging at zero
            ),
        ),
    )
    .properties(title="", width=200)
)
# almost all apprehension methods increased from previous year;
# not a very helpful view (don't include in final output)

In [41]:
# Heatmap of arrest counts for each method bucket and criminality type.
criminality_keys = ["method_bucket", "apprehension_criminality"]
rolled_criminality = methods.groupby(criminality_keys).size().reset_index(name="count")

criminality_axis = alt.X(
    "apprehension_criminality:N", axis=alt.Axis(labelAngle=-30)
).title("Criminality Type")
bucket_axis = alt.Y("method_bucket:N", title="Apprehension Method")
count_colour = alt.Color("count:Q", title="Arrests Count")

(
    alt.Chart(rolled_criminality)
    .mark_rect()
    .encode(criminality_axis, bucket_axis, count_colour)
    .properties(title="Apprehension Method by Criminality Type", width=200)
)

In [50]:
# Age Distribution (histogram), stacked by gender, with the mean age marked.
age_df = arrests_dedupe.copy()
# Age at arrest, approximated as arrest year minus birth year. A fixed
# reference year would misstate the age for arrests made in other years and
# would go stale as the data file is refreshed. Rows without an apprehension
# date or birth year get a missing age.
age_df["age"] = age_df["apprehension_date"].dt.year - age_df["birth_year"]
age_df = age_df[["age", "gender"]]
# exclude unknown gender because so few that it doesn't show up on the chart
age_gender_df = age_df[age_df["gender"] != "Unknown"]

# Round (rather than truncate) so the label shows the nearest whole year.
mean_age = int(round(float(age_gender_df["age"].mean())))

base = alt.Chart(age_gender_df)

bar = base.mark_bar().encode(
    alt.X("age:Q", bin=True, axis=alt.Axis(title="Age (years)")),
    alt.Y("count()", axis=alt.Axis(title="Count of Arrests")),
    alt.Color(
        "gender:N",
        title="Gender",
        scale=alt.Scale(range=["#008080", "#C6C6C6", "#4682B4"]),
    ),
)

# Mean line (red), computed by the chart from the same data.
rule = base.mark_rule(color="red", strokeWidth=3).encode(x="mean(age):Q")

# Label for mean line
label = base.mark_text(
    align="left",  # position to the right of the line
    dx=5,  # horizontal offset
    dy=-5,  # vertical offset (move up)
    color="red",
).encode(
    x="mean(age):Q",
    y=alt.value(0),  # pixel position: top of the plot area
    text=alt.value(f"Mean Age = {mean_age}"),
)

chart = (bar + rule + label).properties(
    title="Distribution of Arrests by Age and Gender"
)

save_chart(chart, "age_histogram")


In [43]:
# Time from arrest to departure, for records that have both dates.
# Earlier check: final_order_date is null for 107250 of 274017 rows (roughly a
# third of values are missing) - exclude from analysis and note it on the chart.
date_cols = ["departed_date", "apprehension_date"]
final_date_df = arrests_dedupe.copy().dropna(subset=date_cols)

# Whole days between the two dates, stored as a nullable integer.
elapsed = final_date_df["departed_date"] - final_date_df["apprehension_date"]
final_date_df["days_to_departure"] = elapsed.dt.days.astype("Int64")

# Displayed as the cell output: number of rows with a positive day count.
positive_days = final_date_df["days_to_departure"] > 0
len(final_date_df[positive_days])


147268

In [48]:
# Pie chart of final_order_yes_no ("YES" vs "NO") with percentage labels.
# NOTE(review): counts are taken over final_date_df, i.e. only rows that have
# both a departed_date and an apprehension_date - confirm that this matches the
# "Total Arrests" wording used in the chart text.
yes_count = (final_date_df["final_order_yes_no"] == "YES").sum()
no_count = (final_date_df["final_order_yes_no"] == "NO").sum()
# count() ignores missing values, so they are not part of the denominator.
total = (final_date_df["final_order_yes_no"]).count()

pie_deportations = pd.DataFrame(
    {
        "deportation_status": ["Yes", "No"],
        "perc": [(yes_count / total) * 100, (no_count / total) * 100],
    }
)

# stack=True makes the text layer use the same stacked angles as the arcs;
# without it the labels are not positioned over their slices.
base = alt.Chart(pie_deportations).encode(
    alt.Theta("perc:Q", stack=True, title="Percent of Total Arrests"),
    alt.Color("deportation_status:N", title="Deportation Status"),
)

pie = base.mark_arc(outerRadius=120)

# Labels sit just outside the pie (radius 133 vs. outer radius 120).
text = (
    base.transform_calculate(label="format(datum.perc, '.0f') + '%'")  # 92 → "92%"
    .mark_text(radius=133, size=15, color="white")
    .encode(text="label:N")
)

chart = (pie + text).properties(title="Share of Arrests Resulting in Deportation")

save_chart(chart, "pie_deportations")

In [45]:
# Density: time taken between arrest and deportation
# limit to only deported arrests & "days_to_departure" > 0
deported_df = final_date_df[
    (final_date_df["final_order_yes_no"] == "YES")
    & (final_date_df["days_to_departure"] > 0)
]

mean_days = deported_df["days_to_departure"].mean()

# Only the plotted column is handed to Altair: the whole frame would otherwise
# be embedded in the chart spec, which slows rendering and saving.
density_data = deported_df[["days_to_departure"]]

# create chart
base = (
    alt.Chart(density_data)
    .transform_density(
        "days_to_departure",
        as_=["days_to_departure", "density"],
    )
    .mark_area()
    .encode(
        alt.X("days_to_departure:Q", title="Number of Days"),
        alt.Y("density:Q", title="Density"),
    )
)

# One-row frame shared by the mean line and its label.
mean_marker = pd.DataFrame(
    {
        "days_to_departure": [mean_days],
        "label": [f"Mean = {mean_days:.0f} days"],
    }
)

mean_line = (
    alt.Chart(mean_marker)
    .mark_rule(color="red", strokeWidth=2)
    .encode(x="days_to_departure:Q")
)

mean_label = (
    alt.Chart(mean_marker)
    .mark_text(
        align="left",
        angle=0,
        dx=5,  # move right of the line
        dy=5,  # move slightly down from the top edge
        color="red",
    )
    .encode(
        x="days_to_departure:Q",
        # alt.value on a position channel is a pixel offset, not a density
        # value: 0 pins the label to the top of the plot area.
        y=alt.value(0),
        text="label",
    )
)

chart = (base + mean_line + mean_label).properties(
    title="Density of Days Between Arrest and Departure", width=400
)

save_chart(chart, "density_days")