In [45]:
import pandas as pd
import numpy as np

# Load raw CSV
raw = pd.read_csv("brianna_dec2.csv")

# Clean race
# Drop missing races BEFORE normalising the text: `.astype(str)` turns NaN
# into the literal string "nan", after which a `notna()` filter can no longer
# detect it. `.copy()` gives an independent frame so the column assignments
# below do not operate on a slice of `raw`.
clean = raw.dropna(subset=["subject_race"]).copy()

clean["subject_race"] = (
    clean["subject_race"]
    .astype(str)
    .str.lower()
    .str.strip()
)

# Placeholder categories that are excluded from the cleaned dataset.
# "nan" is listed defensively in case a stringified missing value slips in.
excluded_races = ["other", "unknown", "none", "nan", ""]
clean = clean[~clean["subject_race"].isin(excluded_races)].copy()

# Clean Action Description
# Missing descriptions are filled with "unknown", which becomes "Unknown"
# after title-casing and is then filtered out together with empty strings.
clean["raw_action_description"] = (
    clean["raw_action_description"]
    .fillna("unknown")
    .astype(str)
    .str.strip()
    .str.title()
)

clean = clean[~clean["raw_action_description"].isin(["Unknown", ""])].copy()


# Clean age and ensure data type is correct
# Non-numeric ages are coerced to NaN, so a single dropna afterwards removes
# both originally-missing and unparseable values.
clean["subject_age"] = pd.to_numeric(clean["subject_age"], errors="coerce")
clean = clean.dropna(subset=["subject_age"]).copy()

# Convert datetime
# Unparseable timestamps become NaT and are dropped, which also guarantees
# the integer cast of the year below never sees a missing value.
clean["datetime"] = pd.to_datetime(clean["datetime"], errors="coerce")
clean = clean.dropna(subset=["datetime"]).copy()

clean["year"] = clean["datetime"].dt.year.astype(int)

# Clean Search flags + create `search_intrusiveness`
# A missing flag is treated as 0 before casting to int.
clean["search_person"] = clean["search_person"].fillna(0).astype(int)
clean["search_vehicle"] = clean["search_vehicle"].fillna(0).astype(int)

def classify_intrusiveness(row):
    """Return the search-intrusiveness label for a single row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the
            "search_person" and "search_vehicle" flags.

    Returns:
        "Both" when both flags equal 1, "Person Only" when only the person
        flag is 1 (and the vehicle flag is 0), "Vehicle Only" when only the
        vehicle flag is 1 (and the person flag is 0), otherwise "Neither".
        Any flag value other than 0 or 1 therefore ends up as "Neither".
    """
    person_flag = row["search_person"]
    vehicle_flag = row["search_vehicle"]

    if person_flag == 1:
        if vehicle_flag == 1:
            return "Both"
        if vehicle_flag == 0:
            return "Person Only"
    elif person_flag == 0 and vehicle_flag == 1:
        return "Vehicle Only"

    # Covers (0, 0) as well as any unexpected flag value.
    return "Neither"

# Label every row with its search intrusiveness category (row-wise apply).
clean["search_intrusiveness"] = clean.apply(classify_intrusiveness, axis=1)

# Clean Search Basis
# Rows without a search basis are removed; `.copy()` keeps the following
# column assignments on an independent frame rather than on a slice.
clean = clean.dropna(subset=["search_basis"]).copy()
clean["search_basis"] = clean["search_basis"].astype(str).str.strip()

# Clean labels for views (arrest and race label)
# Drop rows with a missing arrest flag first: `.astype(int)` raises on NaN,
# and such rows could never receive an arrest label anyway.
clean = clean.dropna(subset=["arrest_made"]).copy()
clean["arrest_made"] = clean["arrest_made"].astype(int)

clean["arrest_status_label"] = clean["arrest_made"].map({
    0: "Not Arrested",
    1: "Arrested"
})

clean["race_label"] = clean["subject_race"].map({
    "asian/pacific islander": "Asian / Pacific Islander",
    "black": "Black",
    "hispanic": "Hispanic",
    "white": "White"
})

# Values not present in the mappings above become NaN and are removed here.
# The trailing `.copy()` leaves `clean` as an independent frame for any code
# that runs after this cell.
clean = clean.dropna(subset=["race_label", "arrest_status_label"]).copy()

clean.to_csv("cleaned_master_dataset.csv", index=False)



search_intrusiveness
Both            9617
Person Only     1149
Vehicle Only     773
Neither           21
Name: count, dtype: int64