In [6]:
# ================================================================
# Real Estate Intelligence Integration Pipeline
# ------------------------------------------------
# Goal: Merge real estate, census, crime, and congestion data
#       into one unified dataset for analysis & visualization.
# ================================================================

import pandas as pd
import geopandas as gpd
import numpy as np

# --- Load Cleaned Datasets ---
# All four inputs sit under the same cleaned-data folder, so the shared prefix
# is spelled once and each read appends only its own sub-path. Each frame is
# later grouped or merged on its `district_fips_id` column.
_cleaned_root = r"D:\DEPI_Project\Datasets\Cleaned Data with Tracts"

real_estate_df = pd.read_csv(_cleaned_root + r"\Real Estate\real_estate.csv")
census_df = pd.read_csv(
    _cleaned_root + r"\Population And Annual Income\population_and_annual_income.csv"
)
crime_df = pd.read_csv(_cleaned_root + r"\Crimes\crimes.csv")
congestion_df = pd.read_csv(_cleaned_root + r"\Congestion\congestion.csv")

# ================================================================
# 1 - AGGREGATE CRIME DATA (tract level)
# ------------------------------------------------
# Convert event-level data into tract-level summaries:
# - Average crime rate
# - Total number of incidents
# ================================================================

# One output row per `district_fips_id`:
#   avg_crime_rate - mean of the 'Crime Rate' column within the group
#   crime_count    - number of non-null 'ID' values within the group
# Named aggregation produces the final column names directly, so no separate
# rename step is needed.
crime_agg = (
    crime_df
    .groupby('district_fips_id')
    .agg(
        avg_crime_rate=('Crime Rate', 'mean'),
        crime_count=('ID', 'count'),
    )
    .reset_index()
)

# ================================================================
# 2 - AGGREGATE CONGESTION DATA (tract level)
# ------------------------------------------------
# Average delay, severity, and duration across events in same tract.
# ================================================================

# One output row per `district_fips_id`, each column being the group mean of
# the corresponding congestion measurement. Named aggregation assigns the
# short output names in the same step as the reduction.
traffic_agg = (
    congestion_df
    .groupby('district_fips_id')
    .agg(
        avg_delay=('DelayFromTypicalTraffic(mins)', 'mean'),
        avg_severity=('Severity', 'mean'),
        avg_duration=('Duration_min', 'mean'),
    )
    .reset_index()
)

# ================================================================
# 3 - MERGE ALL DATASETS
# ------------------------------------------------
# Start from property-level dataset and enrich it
# step-by-step with census, crime, and congestion info.
# ================================================================

# Enrich the property-level table with tract-level attributes. Every join is a
# left join on `district_fips_id`, so each property row is kept even when a
# tract has no match (the added columns are NaN in that case). `merge` returns
# a new frame, so `real_estate_df` itself is left untouched.
# NOTE(review): the census join assumes one census row per tract; duplicate
# tract ids in census_df would multiply property rows - confirm upstream.
_census_columns = ['district_fips_id', 'income', 'population']

merged = (
    real_estate_df
    # socioeconomic info from census
    .merge(census_df[_census_columns], on='district_fips_id', how='left')
    # aggregated crime data
    .merge(crime_agg, on='district_fips_id', how='left')
    # aggregated congestion data
    .merge(traffic_agg, on='district_fips_id', how='left')
)

# ================================================================
# 4 - DERIVED METRICS
# ------------------------------------------------
# Compute interpretable features for analysis.
# ================================================================

# Housing affordability: listing price relative to the tract's income figure.
# A zero or missing income yields inf/NaN here; those rows are removed by the
# range filter at the end of this section.
merged['price_to_income_ratio'] = merged['Price'] / merged['income']

# Normalized crime rate per 1,000 residents.
# Non-positive populations are masked to NaN before dividing, so a tract with
# population 0 produces NaN instead of +/-inf leaking into the exported files.
merged['crime_per_1000'] = (
    merged['crime_count'] / merged['population'].where(merged['population'] > 0)
) * 1000

# Simplified transport score = row-wise mean of delay, severity, and duration.
# mean(axis=1) skips NaN by default, so a row with only some of the three
# inputs still receives a score based on the values that are present.
# NOTE(review): the three inputs appear to be on different scales (minutes vs.
# a severity level) - confirm that an unweighted mean is intended.
merged['transport_score'] = merged[['avg_delay', 'avg_severity', 'avg_duration']].mean(axis=1)

# Drop unrealistic or missing price-to-income values: keep only rows strictly
# between 0 and 50 (NaN and inf fail the test and are dropped as well).
# .copy() detaches the result from the pre-filter frame so that later column
# assignments on `merged` do not raise SettingWithCopyWarning.
merged = merged[merged['price_to_income_ratio'].between(0, 50, inclusive='neither')].copy()

# ================================================================
# 5 - GEO-DATA CONVERSION (for mapping)
# ------------------------------------------------
# Create GeoDataFrame from latitude/longitude columns.
# ================================================================

# Build one point per row (x = Longitude, y = Latitude), then wrap the merged
# table in a GeoDataFrame declared as EPSG:4326.
_listing_points = gpd.points_from_xy(merged['Longitude'], merged['Latitude'])

gdf = gpd.GeoDataFrame(merged, geometry=_listing_points, crs="EPSG:4326")

# ================================================================
# 6 - EXPORT FINAL MASTER DATASET
# ------------------------------------------------
# Save both CSV and GeoJSON versions for web dashboard.
# ================================================================

# Destination paths for the two export formats.
output_csv = r"D:\DEPI_Project\Datasets\Test\final_real_estate_master.csv"
output_geojson = r"D:\DEPI_Project\Datasets\Test\final_real_estate_master.geojson"

# The tabular export is written from `merged`; the spatial export is written
# from `gdf`.
merged.to_csv(output_csv, encoding='utf-8', index=False)
gdf.to_file(output_geojson, driver='GeoJSON')

# Single print call; sep="\n" emits the same three lines as separate prints.
print(
    "Integration Complete!",
    f"Saved CSV: {output_csv}",
    f"Saved GeoJSON: {output_geojson}",
    sep="\n",
)


✅ Integration Complete!
   Saved CSV: D:\DEPI_Project\Datasets\Test\final_real_estate_master.csv
   Saved GeoJSON: D:\DEPI_Project\Datasets\Test\final_real_estate_master.geojson


  ogr_write(
