In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import geopandas as gpd

In [None]:

df = pd.read_csv("pittsburghArrest.csv")

df = df[["AGE", "INCIDENTNEIGHBORHOOD"]]
df = df.dropna(subset=["AGE", "INCIDENTNEIGHBORHOOD"])
df["AGE"] = pd.to_numeric(df["AGE"], errors="coerce")
df = df.dropna(subset=["AGE"])

df["AGE_GROUP"] = df["AGE"].apply(lambda x: "<18" if x < 18 else "18+")

grouped = df.groupby(["INCIDENTNEIGHBORHOOD", "AGE_GROUP"]).size().unstack(fill_value=0)

grouped["Total_Count"] = grouped["<18"] + grouped["18+"]
grouped["Under_18_Percent_Local"] = grouped["<18"] / grouped["Total_Count"]
grouped["Over_18_Percent_Local"] = grouped["18+"] / grouped["Total_Count"]

total_under_18 = grouped["<18"].sum()
total_over_18 = grouped["18+"].sum()

grouped["Under_18_Percent_Global"] = grouped["<18"] / total_under_18
grouped["Over_18_Percent_Global"] = grouped["18+"] / total_over_18

result = grouped.reset_index().rename(columns={
    "INCIDENTNEIGHBORHOOD": "NEIGHBORHOOD",
    "<18": "Under_18_Count",
    "18+": "Over_18_Count"
})

result = result[
    ["NEIGHBORHOOD", "Under_18_Count", "Over_18_Count", "Total_Count",
     "Under_18_Percent_Local", "Over_18_Percent_Local",
     "Under_18_Percent_Global", "Over_18_Percent_Global"]
]

result.to_csv("summaryPittArrest.csv", index=False)


In [None]:
df = pd.read_csv("pittsburghParks.csv")


df = df[["type", "neighborhood"]]
df = df.dropna(subset=["type", "neighborhood"])


df["IS_PARK"] = df["type"].apply(lambda x: "Park" if x.strip().lower() == "park" else "Other")

grouped = df.groupby(["neighborhood", "IS_PARK"]).size().unstack(fill_value=0)

if "Park" not in grouped.columns:
    grouped["Park"] = 0
if "Other" not in grouped.columns:
    grouped["Other"] = 0

total_park = grouped["Park"].sum()
total_other = grouped["Other"].sum()

grouped["Park_Global_Percent"] = grouped["Park"] / total_park
grouped["Other_Global_Percent"] = grouped["Other"] / total_other

result = grouped.reset_index().rename(columns={
    "neighborhood": "NEIGHBORHOOD",
    "Park": "Park_Count",
    "Other": "Non_Park_Count"
})

result = result[
    ["NEIGHBORHOOD", "Park_Count", "Non_Park_Count", "Park_Global_Percent", "Other_Global_Percent"]
]

result.to_csv("summaryPittPark.csv", index=False)


In [None]:

df = pd.read_csv("pittsburghFacility.csv")

df = df[["type", "neighborhood"]]
df = df.dropna(subset=["type", "neighborhood"])

all_types = df["type"].unique()

pivot_table = df.groupby(["neighborhood", "type"]).size().unstack(fill_value=0)

type_totals = pivot_table.sum()

type_percents = pivot_table.divide(type_totals)

final_df = pivot_table.copy()
for col in pivot_table.columns:
    final_df[f"{col}_Count"] = pivot_table[col]
    final_df[f"{col}_Percent"] = type_percents[col]
    final_df.drop(columns=[col], inplace=True)
    
final_df["Total_Facilities"] = final_df[[col for col in final_df.columns if col.endswith("_Count")]].sum(axis=1)
final_df = final_df.reset_index().rename(columns={"neighborhood": "NEIGHBORHOOD"})

count_cols = [col for col in final_df.columns if col.endswith("_Count")]
percent_cols = [col for col in final_df.columns if col.endswith("_Percent")]
final_df = final_df[["NEIGHBORHOOD", "Total_Facilities"] + count_cols + percent_cols]

final_df.to_csv("summaryPittFacilities.csv", index=False)


In [None]:

df = pd.read_csv("pittsburghSteps.csv")

df = df[["neighborhood", "length", "number_of_steps"]]
df = df.dropna(subset=["neighborhood", "length", "number_of_steps"])

df_with_steps = df[df["number_of_steps"] > 0]
df_no_steps = df[df["number_of_steps"] == 0]

grouped_steps = df_with_steps.groupby("neighborhood").agg({
    "length": "sum",
    "number_of_steps": "sum"
}).rename(columns={
    "length": "Total_Length_With_Steps",
    "number_of_steps": "Total_Steps"
})

grouped_steps["Length_per_Step"] = grouped_steps["Total_Length_With_Steps"] / grouped_steps["Total_Steps"]

grouped_no_steps = df_no_steps.groupby("neighborhood").agg({
    "length": "sum"
}).rename(columns={"length": "Zero_Steps_Total_Length"})

final_df = grouped_steps.join(grouped_no_steps, how="outer").fillna(0)

final_df = final_df.reset_index().rename(columns={"neighborhood": "NEIGHBORHOOD"})

final_df.to_csv("summaryPittSteps.csv", index=False)

In [None]:
df = pd.read_csv("pittsburghTrees.csv")

tree_count = df.groupby("neighborhood").size().reset_index(name="tree_count")

tree_count.to_csv("summaryPittTrees.csv", index=False)

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt

gdf = gpd.read_file("../GeopandasTest/hood.geojson")
# print(gdf["hood"].unique())
# print(gdf["Shape__Area"].head())
print(gdf.columns)

In [None]:
# import pandas as pd

# arrest_df = pd.read_csv("summaryPittArrest.csv")
# park_df = pd.read_csv("summaryPittPark.csv")
# fac_df = pd.read_csv("summaryPittFacilities.csv")
# step_df = pd.read_csv("summaryPittSteps.csv")

# arrest_neigh = set(arrest_df["NEIGHBORHOOD"])
# fac_neigh = set(fac_df["NEIGHBORHOOD"])
# step_neigh = set(step_df["NEIGHBORHOOD"])
# park_neigh = set(park_df["NEIGHBORHOOD"])

# core_common = arrest_neigh & fac_neigh & step_neigh

# all_core = arrest_neigh | fac_neigh | step_neigh
# missing_core = all_core - core_common
# if missing_core:
#     print("The following neighborhood is missing in the dataset(Arrest/Facilities/Steps), and will not be counted:")
#     for name in sorted(missing_core):
#         print("-", name)

# arrest_df = arrest_df[arrest_df["NEIGHBORHOOD"].isin(core_common)]
# fac_df = fac_df[fac_df["NEIGHBORHOOD"].isin(core_common)]
# step_df = step_df[step_df["NEIGHBORHOOD"].isin(core_common)]

# park_df = park_df[park_df["NEIGHBORHOOD"].isin(core_common)]

# merged = arrest_df.merge(fac_df, on="NEIGHBORHOOD")
# merged = merged.merge(step_df, on="NEIGHBORHOOD")
# merged = merged.merge(park_df, on="NEIGHBORHOOD", how="left")

# merged = merged.fillna(0)

# merged.to_csv("summaryPitt_ALL_MERGED.csv", index=False)


In [None]:
# --- 读取所有数据 ---
df_arrest = pd.read_csv("summaryPittArrest.csv")
df_park = pd.read_csv("summaryPittPark.csv")
df_facilities = pd.read_csv("summaryPittFacilities.csv")
df_steps = pd.read_csv("summaryPittSteps.csv")
df_trees = pd.read_csv("summaryPittTrees.csv")

# --- 合并前统一列名保持一致 ---
for df in [df_arrest, df_park, df_facilities, df_steps, df_trees]:
    df.columns = [col.strip() for col in df.columns]

# --- 合并 ---
dfs = [df_arrest, df_facilities, df_steps, df_trees]
merged = df_park.copy()

for df in dfs:
    merged = pd.merge(merged, df, on="NEIGHBORHOOD", how="outer")

# --- 缺失填 0（除了 park 之外的缺失打印出来） ---
park_neighborhoods = set(df_park["NEIGHBORHOOD"])
all_neighborhoods = set(merged["NEIGHBORHOOD"])

for df in [df_arrest, df_facilities, df_steps, df_trees]:
    current_hoods = set(df["NEIGHBORHOOD"])
    missing = all_neighborhoods - current_hoods
    if df is not df_park:
        print(f"⚠️ 缺失的街区（{df.columns[1]} 数据缺失）：", missing)

merged.fillna(0, inplace=True)

# --- 读取 geojson 面积 ---
with open("hood.geojson", "r", encoding="utf-8") as f:
    geo = geo.load(f)

area_data = []
for feat in geo["features"]:
    name = feat["properties"]["hood"]
    area = feat["properties"]["Shape__Area"]
    area_data.append({"NEIGHBORHOOD": name, "Shape__Area": area})

df_area = pd.DataFrame(area_data)

# --- 合并面积数据 ---
merged = pd.merge(merged, df_area, on="NEIGHBORHOOD", how="left")

# --- 面积归一化 ---
area_min = merged["Shape__Area"].min()
area_max = merged["Shape__Area"].max()
merged["normalized_area"] = (merged["Shape__Area"] - area_min) / (area_max - area_min)

# --- 找出需要做单位化的列（排除掉已有的_unit列和面积相关列） ---
exclude_keywords = ["_unit", "Shape__Area", "normalized_area", "NEIGHBORHOOD"]
original_cols = [col for col in merged.columns if all(key not in col for key in exclude_keywords)]

# --- 一次性生成所有单位列，避免性能警告 ---
unit_data = {}
for col in original_cols:
    unit_col = col + "_unit"
    # 避免除以 0
    unit_data[unit_col] = merged[col] / merged["normalized_area"].replace(0, 1e-6)

unit_df = pd.DataFrame(unit_data)

# --- 合并单位列到最前面 ---
merged = pd.concat([merged[["NEIGHBORHOOD"]], unit_df, merged.drop(columns=["NEIGHBORHOOD"])], axis=1)

# --- 导出 ---
merged.to_csv("summaryPitt_ALL_MERGED_UNIT.csv", index=False)
print("✅ 全部搞定！最终文件保存为：summaryPitt_ALL_MERGED_UNIT.csv")

⚠️ 缺失的街区（Under_18_Count 数据缺失）： {'Mt. Oliver', 'Mount Oliver Borough'}
⚠️ 缺失的街区（Total_Facilities 数据缺失）： {'Mt. Oliver', 'Troy Hill-Herrs Island', 'Northview Heights', 'Outside State', 'Spring Garden', 'Outside County', 'Esplen', 'Outside City', 'Summer Hill', 'Arlington', 'Central North Side', 'Golden Triangle/Civic Arena', 'Friendship', 'Chartiers City', 'North Shore', 'Mt. Oliver Boro', 'Mt. Oliver Neighborhood', 'Mount Oliver Borough', 'New Homestead', 'East Carnegie', 'South Shore', 'Ridgemont', 'St. Clair', 'Mount Oliver', 'Arlington Heights'}
⚠️ 缺失的街区（Total_Length_With_Steps 数据缺失）： {'Swisshelm Park', 'Troy Hill-Herrs Island', 'Northview Heights', 'Outside State', 'Outside County', 'Homewood South', 'Outside City', 'Summer Hill', 'Chateau', 'Central North Side', 'Golden Triangle/Civic Arena', 'Friendship', 'Mt. Oliver Boro', 'Mt. Oliver Neighborhood', 'Fairywood', 'Homewood West', 'New Homestead', 'South Shore', 'Allegheny West', 'Hays', 'Mount Oliver', 'Arlington Heights'}
⚠️ 缺失的街区

NameError: name 'json' is not defined

In [None]:
df = pd.read_csv("summaryPitt_ALL_MERGED_UNIT.csv")

columns_to_analyze = [
    "Under_18_Crime_Count_unit", "Over_18_Crime_Count_unit",
    "Total_Crime_Count_unit",
    "Park_Count_unit",
    # "Park", "Not_Park", "Park_Global_Percent", "Not_Park_Global_Percent",

    "Total_Facilities_unit",

    "Total_Length_With_Steps_unit", "Total_Steps_unit", "Length_per_Step_unit"
    # , "Zero_Steps_Total_Length",
]

correlation_matrix = df[columns_to_analyze].corr()

plt.figure(figsize=(16, 12))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True,
            linewidths=.5, cbar_kws={"shrink": .8})

plt.title("Correlation Matrix", fontsize=18)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()

plt.savefig("correlation_matrix.png", dpi=300)
plt.show()

In [None]:
data = df[columns_to_analyze].copy()

# --- 定义一个带 r 值的回归图函数 ---
def regplot_with_r(x, y, **kwargs):
    ax = kwargs.get("ax", plt.gca())
    sns.regplot(x=x, y=y, ax=ax,
                scatter_kws={"s": 20, "alpha": 0.6},
                line_kws={"color": "red"})
    # 计算 r 值
    r, _ = pearsonr(x, y)
    ax.annotate(f"r = {r:.2f}", xy=(0.05, 0.9), xycoords='axes fraction',
                fontsize=11, color='black')

# --- 创建 PairGrid ---
g = sns.PairGrid(data, height=2.5)

# 左下角和右上角都用 regplot_with_r
g.map_lower(regplot_with_r)
g.map_upper(regplot_with_r)

# 对角线画分布图
g.map_diag(sns.histplot, kde=True)

# 标题 & 美化
plt.suptitle("📊 所有变量两两线性拟合 + Pearson r 值", fontsize=16, y=1.02)
plt.tight_layout()
plt.subplots_adjust(top=0.95)  # 给标题留空间
plt.savefig("pairwise_full_regplot_r.png", dpi=300)
plt.show()