In [114]:
import geopandas as gpd
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import altair as alt
# Switch Altair to its 'json' data transformer. NOTE(review): per the Altair docs this
# stores chart data in a separate JSON file instead of embedding it in the chart spec,
# which avoids the default row limit -- confirm for the installed Altair version.
alt.data_transformers.enable('json')


DataTransformerRegistry.enable('json')

In [115]:
# Load the point layer from GeoJSON. Downstream cells read the per-point category
# columns ('wall', 'lives', 'building', ...) and the point geometry from this frame.
seg = gpd.read_file('../data/pointsWithSeg.geojson')

In [117]:
# Preview the freshly loaded frame (the notebook shows a truncated view of large frames).
seg

Unnamed: 0,fid,SEG_ID,pointId,wall,lives,building,infrastructure,road,sidewalk,sky,...,lives_q,building_q,infrastructure_q,road_q,sidewalk_q,sky_q,green_q,transportation_q,publicservice_q,geometry
0,1,420708,000001,0.030779,0.003141,0.163661,0.020548,0.118350,0.033204,0.291897,...,0.6,0.6,0.6,0.4,1.0,0.8,0.2,0.4,0.8,POINT (-75.16460 39.95994)
1,2,422065,000002,0.047397,0.008571,0.367944,0.009722,0.108147,0.035982,0.209341,...,1.0,1.0,0.2,0.4,1.0,0.6,0.0,0.0,0.4,POINT (-75.16357 39.96050)
2,3,420702,000003,0.034286,0.003337,0.178823,0.008427,0.237291,0.061395,0.024113,...,0.6,0.8,0.2,1.0,1.0,0.0,0.6,0.0,0.2,POINT (-75.16619 39.96013)
3,5,420696,000004,0.019361,0.001505,0.331703,0.004485,0.274849,0.073919,0.099125,...,0.4,1.0,0.0,1.0,1.0,0.2,0.0,0.6,0.6,POINT (-75.16776 39.96033)
4,8,422066,000005,0.051927,0.002919,0.139440,0.039439,0.035637,0.003948,0.092139,...,0.6,0.6,1.0,0.0,0.2,0.0,0.8,0.4,0.6,POINT (-75.16672 39.96089)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12312,24861,423467,012313,0.051546,0.011665,0.146067,0.052584,0.061044,0.014317,0.148602,...,1.0,0.6,1.0,0.2,0.4,0.4,0.6,0.8,0.2,POINT (-75.19385 39.95761)
12313,24863,522448,012314,0.036714,0.003632,0.197326,0.015623,0.019405,0.003073,0.189704,...,0.6,0.8,0.4,0.0,0.0,0.4,0.8,0.4,0.4,POINT (-75.15766 40.00221)
12314,24868,240821,012315,0.021332,0.000403,0.250894,0.013468,0.059427,0.008680,0.310452,...,0.2,1.0,0.4,0.2,0.4,1.0,0.2,0.2,1.0,POINT (-75.16788 39.90467)
12315,24869,240822,012316,0.018760,0.001339,0.290828,0.009906,0.066447,0.007145,0.309186,...,0.4,1.0,0.2,0.2,0.2,0.8,0.2,0.0,0.8,POINT (-75.16553 39.90436)


In [129]:
# The ten category columns of `seg` that are used as clustering features.
cols = ['wall', 'lives', 'building', 'infrastructure', 'road', 'sidewalk', 'sky', 'green', 'transportation', 'publicservice']

# Work on a copy so preprocessing never touches `seg` itself.
clusteringData = seg[cols].copy()

# Standardize every feature (zero mean, unit variance) so that columns with a larger
# numeric range do not dominate the Euclidean distances k-means relies on.
scaler = StandardScaler()
scaledClusteringData = scaler.fit_transform(clusteringData)

# Initialize the KMeans object.
# n_init is pinned explicitly: scikit-learn changed its default (10 -> 'auto') across
# releases, so leaving it implicit makes the clusters depend on the installed version
# and triggers a FutureWarning on intermediate releases. random_state fixes the seed.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

In [130]:
# Fit k-means on the standardized features and store each point's cluster id on `seg`.
# `fit` returns the estimator itself, so the labels can be read straight off the result.
seg['label'] = kmeans.fit(scaledClusteringData).labels_

In [273]:
# Cluster profile in long form: average every feature column within each cluster,
# then melt so each row is one (label, category) pair with its mean stored in `pct`.
segSummary = pd.DataFrame(
    seg.groupby('label')[cols]
    .mean()
    .reset_index()
    .melt(id_vars=['label'], var_name='category', value_name='pct')
)
segSummary.head()

Unnamed: 0,label,category,pct
0,0,wall,0.026113
1,1,wall,0.028116
2,2,wall,0.019327
3,3,wall,0.052916
4,0,lives,0.002462


In [275]:
# Bubble matrix of the cluster profiles: one circle per (category, cluster label);
# both the colour and the size of a circle encode the cluster mean `pct`.
alt.Chart(segSummary).mark_circle().encode(
    alt.X('category:N'),
    alt.Y('label:N'),
    alt.Color('pct:Q', scale=alt.Scale(scheme="viridis")),
    alt.Size('pct:Q'),
).properties(width=600, height=300)

In [227]:
# One small bar chart per cluster label (stacked as rows), with categories sorted by
# their mean share in descending order.
# The encodings must use the column names produced by the melt that builds `segSummary`
# ('category' / 'pct'); the pandas defaults 'variable' / 'value' do not exist in that
# frame. The y channel is built with alt.Y rather than alt.X.
alt.Chart(segSummary).mark_bar().encode(
    row='label:N',
    color=alt.Color(
        'category:N',
        scale=alt.Scale(scheme="tableau10")),
    y=alt.Y('category:N', sort='-x'),
    x=alt.X('pct:Q'),
).properties(
    width=500,
    height=100
).interactive()

In [276]:
# Normalized stacked bars: one bar per cluster label, split by category so the
# composition of the clusters can be compared side by side.
bar = alt.Chart(segSummary).mark_bar().encode(
    y=alt.Y('label:N'),
    color=alt.Color(
        'category:N',
        scale=alt.Scale(scheme="tableau20")),
    # stack="normalize" rescales every bar to the same total length.
    x=alt.X('pct:Q', stack="normalize"),
    tooltip=[
        alt.Tooltip('label:N', title='Clustering Label'),
        alt.Tooltip('category:N', title='Category'),
        # `pct` holds fractions (e.g. 0.026), so format it as a percentage; a plain
        # two-decimal float would render the small shares as 0.00.
        alt.Tooltip('pct:Q', title='Percentage', format='.2%'),
    ]
).properties(
    width=800,
    height=300
)

bar

In [200]:
# Inspect `seg` again: it now carries the k-means cluster id in the `label` column.
seg

Unnamed: 0,fid,SEG_ID,pointId,wall,lives,building,infrastructure,road,sidewalk,sky,...,building_q,infrastructure_q,road_q,sidewalk_q,sky_q,green_q,transportation_q,publicservice_q,geometry,label
0,1,420708,000001,0.030779,0.003141,0.163661,0.020548,0.118350,0.033204,0.291897,...,0.6,0.6,0.4,1.0,0.8,0.2,0.4,0.8,POINT (-75.16460 39.95994),3
1,2,422065,000002,0.047397,0.008571,0.367944,0.009722,0.108147,0.035982,0.209341,...,1.0,0.2,0.4,1.0,0.6,0.0,0.0,0.4,POINT (-75.16357 39.96050),1
2,3,420702,000003,0.034286,0.003337,0.178823,0.008427,0.237291,0.061395,0.024113,...,0.8,0.2,1.0,1.0,0.0,0.6,0.0,0.2,POINT (-75.16619 39.96013),1
3,5,420696,000004,0.019361,0.001505,0.331703,0.004485,0.274849,0.073919,0.099125,...,1.0,0.0,1.0,1.0,0.2,0.0,0.6,0.6,POINT (-75.16776 39.96033),1
4,8,422066,000005,0.051927,0.002919,0.139440,0.039439,0.035637,0.003948,0.092139,...,0.6,1.0,0.0,0.2,0.0,0.8,0.4,0.6,POINT (-75.16672 39.96089),0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12312,24861,423467,012313,0.051546,0.011665,0.146067,0.052584,0.061044,0.014317,0.148602,...,0.6,1.0,0.2,0.4,0.4,0.6,0.8,0.2,POINT (-75.19385 39.95761),0
12313,24863,522448,012314,0.036714,0.003632,0.197326,0.015623,0.019405,0.003073,0.189704,...,0.8,0.4,0.0,0.0,0.4,0.8,0.4,0.4,POINT (-75.15766 40.00221),0
12314,24868,240821,012315,0.021332,0.000403,0.250894,0.013468,0.059427,0.008680,0.310452,...,1.0,0.4,0.2,0.4,1.0,0.2,0.2,1.0,POINT (-75.16788 39.90467),0
12315,24869,240822,012316,0.018760,0.001339,0.290828,0.009906,0.066447,0.007145,0.309186,...,1.0,0.2,0.2,0.2,0.8,0.2,0.0,0.8,POINT (-75.16553 39.90436),0


In [278]:
# Map every point geometry in `seg`, coloured by its cluster label.
alt.Chart(seg).mark_geoshape().encode(
    alt.Color('label:N'),
).properties(width=800, height=800)

In [279]:
# Load the census block-group layer; later cells use its 'NAME' and 'geometry' columns.
blockGroup = gpd.read_file("../data/censusRaceWithGeometry.geojson")

In [309]:
# Spatially join the labelled points to the block-group polygons. how="left" keeps
# every point, including points that match no block group.
bg_cluster = gpd.sjoin(
    left_df=seg[["label", "geometry"]],
    right_df=blockGroup[["NAME", "geometry"]],
    how="left",
)

In [317]:
# Quick visual check of the block-group polygons, drawn half-transparent.
alt.Chart(blockGroup).mark_geoshape(
    fillOpacity=0.5,
).properties(
    width=800,
    height=800,
)

In [349]:
# pd.DataFrame.sort_values()
bg_cluster = bg_cluster.groupby(["NAME","label"]).count().reset_index()\
    .sort_values("index_right",ascending=False)\
    .groupby(["NAME"]).nth(0)[["label"]]

In [352]:
# Attach the per-block-group label to the polygons. The left merge keeps every block
# group; those without a match in `bg_cluster` end up with a missing label.
blockGroupLabel = blockGroup.merge(
    right=bg_cluster,
    how="left",
    on='NAME',
)

In [354]:
# Preview the block groups together with their assigned cluster label.
blockGroupLabel

Unnamed: 0,NAME,geometry,label
0,"Block Group 2, Census Tract 372, Philadelphia ...","POLYGON ((-75.17224 39.91247, -75.17214 39.912...",0.0
1,"Block Group 1, Census Tract 372, Philadelphia ...","POLYGON ((-75.17224 39.91247, -75.17214 39.912...",0.0
2,"Block Group 3, Census Tract 372, Philadelphia ...","POLYGON ((-75.17224 39.91247, -75.17214 39.912...",0.0
3,"Block Group 4, Census Tract 372, Philadelphia ...","POLYGON ((-75.17224 39.91247, -75.17214 39.912...",0.0
4,"Block Group 3, Census Tract 369, Philadelphia ...","POLYGON ((-75.20774 39.94137, -75.20751 39.941...",1.0
...,...,...,...
1331,"Block Group 1, Census Tract 365.02, Philadelph...","POLYGON ((-75.00915 40.12202, -75.00806 40.123...",2.0
1332,"Block Group 2, Census Tract 11.01, Philadelphi...","POLYGON ((-75.16558 39.94366, -75.16547 39.944...",1.0
1333,"Block Group 1, Census Tract 11.01, Philadelphi...","POLYGON ((-75.16558 39.94366, -75.16547 39.944...",1.0
1334,"Block Group 3, Census Tract 11.01, Philadelphi...","POLYGON ((-75.16558 39.94366, -75.16547 39.944...",1.0


In [357]:
# Choropleth of the block groups: opaque fill coloured by the assigned cluster label,
# with white outlines separating neighbouring polygons.
alt.Chart(blockGroupLabel).mark_geoshape(
    stroke='white',
    fillOpacity=1,
).encode(
    alt.Color('label:N'),
).properties(width=800, height=800)