# In [3]:
# Student Housing Conditions Analysis

import pandas as pd

# === Load the Data ===
# Both inputs are plain file names, so they are looked up relative to the
# current working directory.
student_file = "StudentAddresses-2016-2024.xlsx - Sheet1.csv"
violation_file = "800a2663-1d6a-46e7-9356-bedb70f5332c.csv"

# Read the student export first, then the violations export.
df_students, df_violations = (
    pd.read_csv(csv_path) for csv_path in (student_file, violation_file)
)

# === Address Normalization Helpers ===
# Both datasets go through the same helpers so that the join key built for
# students and the one built for violations are produced by identical rules.

def _leading_digits(values):
    """Return the first run of digits in each value as text ("" when none)."""
    return values.astype(str).str.extract(r"(\d+)")[0].fillna("")


def _normalize_zip(values):
    """Return each ZIP as zero-padded digit text, or "" when it is missing.

    Plain ``astype(str).str.zfill(5)`` is not enough: if pandas inferred a
    float column, ``2134.0`` becomes ``"2134.0"`` (never padded, never equal
    to ``"02134"``) and a missing value becomes ``"00nan"``. Keeping only the
    first digit run also reduces ZIP+4 values such as ``"02134-1234"`` to
    ``"02134"``.
    """
    digits = values.astype(str).str.extract(r"(\d+)")[0]
    # zfill leaves missing entries missing; blank them afterwards so an
    # absent ZIP does not turn into "00000".
    return digits.str.zfill(5).fillna("")


def _normalize_street(values):
    """Return trimmed, upper-cased street names; missing names become ""."""
    # Blank out missing values *before* the string cast, otherwise they end
    # up in the key as the literal token "NAN".
    return values.fillna("").astype(str).str.strip().str.upper()


def _address_key(number, street, zip_code):
    """Join number, street and ZIP into one single-spaced, trimmed key."""
    joined = number + " " + street + " " + zip_code
    return joined.str.replace(r"\s+", " ", regex=True).str.strip()


# === Clean Student Data ===
# Headers are replaced positionally: the student file must contain exactly
# these eleven columns in this order, otherwise the assignment raises.
df_students.columns = [
    "street_number", "street_name", "street_suffix", "unit_number", "zip_code",
    "student_level", "enrollment_status", "at_home", "five_plus_students",
    "university", "year"
]

df_students["zip_code"] = _normalize_zip(df_students["zip_code"])
df_students["street_number"] = _leading_digits(df_students["street_number"])
df_students["street_name"] = _normalize_street(df_students["street_name"])
# NOTE(review): street_suffix and unit_number are not part of the key, so all
# units at one street number share a key — confirm the violation street
# column is suffix-free as well.
df_students["simple_address_key"] = _address_key(
    df_students["street_number"],
    df_students["street_name"],
    df_students["zip_code"],
)

# === Clean Violation Data ===
df_violations["violation_zip"] = _normalize_zip(df_violations["violation_zip"])
df_violations["violation_stno"] = _leading_digits(df_violations["violation_stno"])
df_violations["violation_street"] = _normalize_street(df_violations["violation_street"])
df_violations["simple_address_key"] = _address_key(
    df_violations["violation_stno"],
    df_violations["violation_street"],
    df_violations["violation_zip"],
)

# === Group and Merge ===
# Count how many rows each dataset has per normalized address key.
students_grouped = (
    df_students.groupby("simple_address_key")
    .size()
    .rename("student_count")
    .reset_index()
)
violations_grouped = (
    df_violations.groupby("simple_address_key")
    .size()
    .rename("violation_count")
    .reset_index()
)

# Left join: every student address is kept; addresses without any violation
# come back with a missing count, which is then stored as integer 0.
merged_df = students_grouped.merge(
    violations_grouped,
    how="left",
    on="simple_address_key",
)
merged_df = merged_df.fillna({"violation_count": 0}).astype({"violation_count": int})

# === Filter for Addresses with Both Students and Violations ===
# merged_df holds one row per student address, so a positive violation count
# means the address appears in both datasets.
matched = merged_df.loc[merged_df["violation_count"].gt(0)]
# Optional preview of the ten worst addresses (left disabled):
# matched.sort_values(by=["violation_count", "student_count"], ascending=False).head(10)
matched.to_csv(path_or_buf="Student_Housing_with_Violations.csv", index=False)

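# NOTE(review): stray notebook residue, left disabled — re-reading here would discard the cleaned columns:
#   df_students = pd.read_csv(student_file)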
