In [1]:
import pandas as pd
import numpy as np

# --- Cleaning Pipeline ---
def _fips_component(codes, width):
    """Return *codes* as zero-padded strings of *width* digits, keeping gaps as <NA>.

    Going through the nullable ``Int64`` dtype avoids the ``"1.0"`` / ``"nan"``
    strings that ``astype(str)`` produces once a column holding missing values
    has been coerced to float64.
    """
    numeric = pd.to_numeric(codes, errors="coerce")
    # A fractional value cannot be a valid code; treat it as missing rather
    # than letting the Int64 cast raise.
    whole = numeric.where(numeric % 1 == 0)
    return whole.astype("Int64").astype("string").str.zfill(width)


def clean_census_tracts(df):
    """Clean a census-tract demographics table and return the cleaned copy.

    Steps, in order:
      1. Coerce the FIPS, income and population columns to numeric
         (unparseable values become NaN).
      2. Split ``name`` ("Census Tract 201; Autauga County; Alabama") into
         ``tract_name``, ``county_name`` and ``state_name``.
      3. Build ``district_fips_id`` from the zero-padded state (2), county (3)
         and tract (6) codes; the ID is <NA> when any component is missing.
      4. Drop rows lacking an ID, any of the three name parts, or population.
      5. Fill missing income with the median income of the row's state.
      6. Drop rows with a duplicated ``district_fips_id``, keeping the first.

    Parameters:
        df: raw DataFrame. It must contain ``state_fips``, ``county_fips``,
            ``tract_fips`` and ``population``; the name columns must either
            already exist or be derivable from ``name``.

    Returns:
        A new DataFrame; the input is not modified.

    Raises:
        KeyError: if a required column is absent.
    """
    df = df.copy()

    # --- 1. Fix Data Types ---
    num_cols = ["state_fips", "county_fips", "tract_fips", "income", "population"]
    for col in num_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # --- 2. Split 'name' Column ---
    if "name" in df.columns:
        # reindex guarantees columns 0..2 exist even when no row has all three
        # parts, so short or missing names become <NA> instead of a KeyError.
        name_parts = (
            df["name"]
            .astype("string")
            .str.split("; ", expand=True)
            .reindex(columns=range(3))
        )
        df["tract_name"] = (
            name_parts[0].astype("string")
            .str.replace("Census Tract ", "", regex=False)
            .str.strip()
        )
        # Only the literal " County" text is removed; other suffixes
        # (e.g. "Parish", "Borough") are left in place.
        df["county_name"] = (
            name_parts[1].astype("string")
            .str.replace(" County", "", regex=False)
            .str.strip()
        )
        df["state_name"] = name_parts[2].astype("string").str.strip()

    # --- 3. Generate Unified FIPS ID ---
    # Concatenating nullable strings propagates <NA>, so an incomplete ID is
    # actually removed by the dropna below.
    df["district_fips_id"] = (
        _fips_component(df["state_fips"], 2)
        + _fips_component(df["county_fips"], 3)
        + _fips_component(df["tract_fips"], 6)
    )

    # --- 4. Handle Missing Values ---
    # Drop rows with missing essential geographic or population info.
    critical_cols = ["district_fips_id", "state_name", "county_name", "tract_name", "population"]
    # Explicit copy so the income assignment below never targets a view.
    df = df.dropna(subset=critical_cols).copy()

    # Fill missing income with the median for that state.
    if "income" in df.columns and "state_name" in df.columns:
        state_median = df.groupby("state_name")["income"].transform("median")
        df["income"] = df["income"].fillna(state_median)

    # --- 5. Remove Duplicates ---
    return df.drop_duplicates(subset=["district_fips_id"])


# The guard is true both in a notebook cell and when run as a script, so the
# load/clean/save steps still execute there and `df` / `output_path` remain
# module-level names; it only prevents disk I/O when this code is imported.
if __name__ == "__main__":
    # --- Load Dataset ---
    df = pd.read_csv(
        r"D:\DEPI_Project\Datasets\Raw\Population and Anuual Income\census_tracts_all_states_with_names.csv",
        encoding="utf-8",
        low_memory=False
    )

    df = clean_census_tracts(df)

    # --- 6. Save Cleaned Dataset ---
    output_path = r"D:\DEPI_Project\Datasets\Cleaned\Population and Anuual Income\cleaned_population_and_annual_income.csv"
    df.to_csv(output_path, index=False, encoding="utf-8")

    print(f"Cleaning complete. Saved cleaned dataset to: {output_path}")


Cleaning complete. Saved cleaned dataset to: D:\DEPI_Project\Datasets\Cleaned\Population and Anuual Income\cleaned_population_and_annual_income.csv
