In [114]:
import geopandas as gpd
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import altair as alt
# Switch Altair to its 'json' data transformer. With it enabled, the chart dict
# printed later in this notebook references its data through a `url` to an
# `altair-data-*.json` file.
alt.data_transformers.enable('json')


DataTransformerRegistry.enable('json')

In [115]:
# Read the point layer (one row per point, POINT geometry) into a GeoDataFrame.
seg_path = '../data/pointsWithSeg.geojson'
seg = gpd.read_file(seg_path)

In [117]:
# Display the loaded GeoDataFrame to inspect its columns and a few rows.
seg

Unnamed: 0,fid,SEG_ID,pointId,wall,lives,building,infrastructure,road,sidewalk,sky,...,lives_q,building_q,infrastructure_q,road_q,sidewalk_q,sky_q,green_q,transportation_q,publicservice_q,geometry
0,1,420708,000001,0.030779,0.003141,0.163661,0.020548,0.118350,0.033204,0.291897,...,0.6,0.6,0.6,0.4,1.0,0.8,0.2,0.4,0.8,POINT (-75.16460 39.95994)
1,2,422065,000002,0.047397,0.008571,0.367944,0.009722,0.108147,0.035982,0.209341,...,1.0,1.0,0.2,0.4,1.0,0.6,0.0,0.0,0.4,POINT (-75.16357 39.96050)
2,3,420702,000003,0.034286,0.003337,0.178823,0.008427,0.237291,0.061395,0.024113,...,0.6,0.8,0.2,1.0,1.0,0.0,0.6,0.0,0.2,POINT (-75.16619 39.96013)
3,5,420696,000004,0.019361,0.001505,0.331703,0.004485,0.274849,0.073919,0.099125,...,0.4,1.0,0.0,1.0,1.0,0.2,0.0,0.6,0.6,POINT (-75.16776 39.96033)
4,8,422066,000005,0.051927,0.002919,0.139440,0.039439,0.035637,0.003948,0.092139,...,0.6,0.6,1.0,0.0,0.2,0.0,0.8,0.4,0.6,POINT (-75.16672 39.96089)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12312,24861,423467,012313,0.051546,0.011665,0.146067,0.052584,0.061044,0.014317,0.148602,...,1.0,0.6,1.0,0.2,0.4,0.4,0.6,0.8,0.2,POINT (-75.19385 39.95761)
12313,24863,522448,012314,0.036714,0.003632,0.197326,0.015623,0.019405,0.003073,0.189704,...,0.6,0.8,0.4,0.0,0.0,0.4,0.8,0.4,0.4,POINT (-75.15766 40.00221)
12314,24868,240821,012315,0.021332,0.000403,0.250894,0.013468,0.059427,0.008680,0.310452,...,0.2,1.0,0.4,0.2,0.4,1.0,0.2,0.2,1.0,POINT (-75.16788 39.90467)
12315,24869,240822,012316,0.018760,0.001339,0.290828,0.009906,0.066447,0.007145,0.309186,...,0.4,1.0,0.2,0.2,0.2,0.8,0.2,0.0,0.8,POINT (-75.16553 39.90436)


In [129]:
# Feature columns used for clustering (the stacked bar chart further down labels
# them "Category Percentage in Street View").
cols = ['wall', 'lives', 'building', 'infrastructure', 'road', 'sidewalk', 'sky', 'green', 'transportation', 'publicservice']

# Initialize the KMeans estimator with 4 clusters and a fixed seed.
# n_init is pinned explicitly instead of relying on the library default, which
# differs between scikit-learn releases (10 historically, 'auto' in newer ones).
# The integer cluster ids are mapped to hand-written names in a later cell, so
# the fit has to be reproducible regardless of the installed version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# Work on a copy so the scaling step never touches the columns of `seg`.
clusteringData = seg[cols].copy()

# Standardize every feature (zero mean, unit variance) so that no single
# column dominates the distance computation.
scaler = StandardScaler()
scaledClusteringData = scaler.fit_transform(clusteringData)

In [392]:
# Fit k-means on the standardized features. fit() returns the estimator itself,
# so the per-row cluster ids can be read from the same expression and stored
# on `seg`.
seg['label'] = kmeans.fit(scaledClusteringData).labels_

In [393]:
# One row per cluster: (integer cluster id, display name, hex color).
_cluster_styles = [
    (1, 'High-density', '#E55756'),
    (2, 'Lush', '#55A24A'),
    (3, 'Spacious', '#72B7B3'),
    (0, 'Townhouse', '#F58518'),
]

# Lookup tables keyed by the integer cluster id.
names = {cluster_id: title for cluster_id, title, _ in _cluster_styles}
colors = {cluster_id: hex_code for cluster_id, _, hex_code in _cluster_styles}

In [460]:
# Hex color per cluster name, plus a grey fallback stored under the literal
# string 'nan' for rows whose label is missing.
color_map = dict([
    ('Townhouse', '#F58518'),
    ('High-density', '#E55756'),
    ('Lush', '#55A24A'),
    ('Spacious', '#72B7B3'),
    ('nan', "#cccccc"),
])

In [394]:
# Translate the integer cluster ids into a display color and a readable name.
# Both columns are derived from kmeans.labels_ rather than from seg['label']:
# this cell overwrites seg['label'] with names, so reading the ids back from
# that column would raise a KeyError the second time the cell is run.
seg['color'] = [colors[x] for x in kmeans.labels_]
seg['label'] = [names[x] for x in kmeans.labels_]

In [395]:
# Mean of every feature column per cluster label, reshaped to long form:
# one row per (label, category) pair with the mean stored in `pct`.
segSummary = pd.DataFrame(
    seg.groupby('label')[cols]
    .mean()
    .reset_index()
    .melt(id_vars=['label'], var_name='category', value_name='pct')
)
segSummary.head()

Unnamed: 0,label,category,pct
0,High-density,wall,0.028116
1,Lush,wall,0.019327
2,Spacious,wall,0.052916
3,Townhouse,wall,0.026113
4,High-density,lives,0.007247


In [396]:
# Bubble matrix: one circle per (category, cluster label) pair, with both the
# color and the size of the circle driven by the mean value `pct`.
pct_color = alt.Color('pct:Q', scale=alt.Scale(scheme="viridis"))

(
    alt.Chart(segSummary)
    .mark_circle()
    .encode(
        x=alt.X('category:N'),
        y=alt.Y('label:N'),
        color=pct_color,
        size=alt.Size('pct:Q'),
    )
    .properties(width=600, height=300)
)

In [397]:
# Small multiples: one row of horizontal bars per cluster label, one bar per
# category, bar length given by the mean value `pct`.
alt.Chart(segSummary).mark_bar().encode(
    row='label:N',
    color=alt.Color(
        'category:N',
        scale=alt.Scale(scheme="tableau10")),
    # The y channel is declared with its matching channel class, alt.Y.
    # sort='-x' orders the categories by descending x value.
    y=alt.Y('category:N', sort='-x'),
    x=alt.X('pct:Q'),
).properties(
    width=500,
    height=100
).interactive()

In [412]:
# Normalized stacked bar: one bar per cluster label, split by category, so the
# relative composition of each cluster can be compared at a glance.
bar = alt.Chart(segSummary).mark_bar().encode(
    y=alt.Y('label:N',title="Clustering Label"),
    color=alt.Color(
        'category:N',
        scale=alt.Scale(scheme="tableau20")),
    # stack="normalize" rescales every bar to the same total length.
    x=alt.X('pct:Q', stack="normalize",title="Category Percentage in Street View"),
    tooltip=[
        alt.Tooltip('label:N', title='Clustering Label'),
        # User-facing tooltip title (spelling corrected from 'Catagory').
        alt.Tooltip('category:N', title='Category'),
        alt.Tooltip('pct:Q', title='Percentage',format=',.2f'),
    ]
).properties(
    width=800,
    height=150
)

# Last expression of the cell: render the chart.
bar

In [None]:
# NOTE(review): bare attribute reference - the method is not called here, so
# this cell only echoes the method object. Looks like leftover exploration;
# consider removing the cell.
alt.Chart.to_dict

In [414]:
# Show the chart's Vega-Lite specification as a Python dict.
bar.to_dict()

{'config': {'view': {'continuousWidth': 400, 'continuousHeight': 300}},
 'data': {'url': 'altair-data-8c3e737e5bdc7a0a94ee42c46d741686.json',
  'format': {'type': 'json'}},
 'mark': 'bar',
 'encoding': {'color': {'type': 'nominal',
   'field': 'category',
   'scale': {'scheme': 'tableau20'}},
  'tooltip': [{'type': 'nominal',
    'field': 'label',
    'title': 'Clustering Label'},
   {'type': 'nominal', 'field': 'category', 'title': 'Catagory'},
   {'type': 'quantitative',
    'field': 'pct',
    'format': ',.2f',
    'title': 'Percentage'}],
  'x': {'type': 'quantitative',
   'field': 'pct',
   'stack': 'normalize',
   'title': 'Category Percentage in Street View'},
  'y': {'type': 'nominal', 'field': 'label', 'title': 'Clustering Label'}},
 'height': 150,
 'width': 800,
 '$schema': 'https://vega.github.io/schema/vega-lite/v4.8.1.json'}

In [402]:
# Map of every point, painted with the hex string stored in its 'color'
# column; scale=None tells Altair to use those values as-is instead of
# mapping them through a color scale.
point_color = alt.Color('color:N', scale=None)

(
    alt.Chart(seg)
    .mark_geoshape()
    .encode(color=point_color)
    .properties(width=800, height=800)
)

In [383]:
# Read the census block-group polygons into a GeoDataFrame.
block_group_path = "../data/censusRaceWithGeometry.geojson"
blockGroup = gpd.read_file(block_group_path)

In [384]:
# Display the block-group GeoDataFrame to inspect its columns and a few rows.
blockGroup

Unnamed: 0,NAME,totalPop,whitePop,state,county,tract,block group,whitePercep,nonwhitePercep,geometry
0,"Block Group 3, Census Tract 91, Philadelphia C...",1529,223,42,101,009100,3,0.145847,0.854153,"POLYGON ((-75.20223 39.96164, -75.20142 39.961..."
1,"Block Group 3, Census Tract 95, Philadelphia C...",561,0,42,101,009500,3,0.000000,1.000000,"POLYGON ((-75.24043 39.96354, -75.24011 39.965..."
2,"Block Group 1, Census Tract 282, Philadelphia ...",1118,21,42,101,028200,1,0.018784,0.981216,"POLYGON ((-75.14683 40.02960, -75.14671 40.030..."
3,"Block Group 2, Census Tract 337.02, Philadelph...",1598,1010,42,101,033702,2,0.632040,0.367960,"POLYGON ((-75.06981 40.07086, -75.06923 40.071..."
4,"Block Group 2, Census Tract 372, Philadelphia ...",1454,1196,42,101,037200,2,0.822558,0.177442,"POLYGON ((-75.16508 39.91337, -75.16486 39.914..."
...,...,...,...,...,...,...,...,...,...,...
1331,"Block Group 2, Census Tract 176.01, Philadelph...",780,322,42,101,017601,2,0.412821,0.587179,"POLYGON ((-75.13985 39.99367, -75.13952 39.995..."
1332,"Block Group 1, Census Tract 188, Philadelphia ...",2073,822,42,101,018800,1,0.396527,0.603473,"POLYGON ((-75.10289 40.00005, -75.10185 40.000..."
1333,"Block Group 1, Census Tract 353.01, Philadelph...",2734,2448,42,101,035301,1,0.895391,0.104609,"POLYGON ((-74.98847 40.06717, -74.98834 40.067..."
1334,"Block Group 1, Census Tract 387, Philadelphia ...",618,470,42,101,038700,1,0.760518,0.239482,"POLYGON ((-75.20643 40.07747, -75.20636 40.077..."


In [403]:
# Attach a block-group NAME to every labelled point with a spatial join.
# how="left" keeps every point, including those matching no block group.
labelled_points = seg[["label", "geometry"]]
block_group_shapes = blockGroup[['NAME', 'geometry']]
bg_cluster = gpd.sjoin(labelled_points, block_group_shapes, how="left")

In [404]:
# pd.DataFrame.sort_values()
bg_cluster = bg_cluster.groupby(["NAME","label"]).count().reset_index()\
    .sort_values("index_right",ascending=False)\
    .groupby(["NAME"]).nth(0)[["label"]]

In [405]:
# Left-join the per-block-group label onto the polygons (block groups with no
# matching entry end up with a missing label) and keep only the three columns
# used from here on.
blockGroupLabel = blockGroup.merge(bg_cluster, on='NAME', how="left")[
    ["NAME", "geometry", 'label']
]

In [406]:


# Choropleth of the block groups colored by their cluster label, with a hover
# tooltip showing the label and the block-group name.
block_group_tooltip = [
    alt.Tooltip('label:N', title='Clustering Label'),
    alt.Tooltip('NAME:N', title='Block Group Name'),
]

(
    alt.Chart(blockGroupLabel)
    .mark_geoshape(fillOpacity=1, stroke='white')
    .encode(color=alt.Color('label:N'), tooltip=block_group_tooltip)
    .properties(width=800, height=800)
)


In [423]:
# Build a string id for every point by concatenating its SEG_ID and pointId
# (both converted to text first, no separator in between).
seg_id_text = seg['SEG_ID'].astype("str")
point_id_text = seg['pointId'].astype("str")
seg["id"] = seg_id_text + point_id_text

In [466]:
# Export only the columns listed below as GeoJSON.
point_export_columns = ["id", 'geometry', 'label', 'color']
point_export_path = "../web/data/p3-clustering-point.geojson"
seg[point_export_columns].to_file(point_export_path, driver="GeoJSON")

In [461]:
# Replace missing labels with the literal string "nan" (the key color_map
# uses for its grey fallback), then look up the color of every label.
filled_labels = blockGroupLabel["label"].fillna("nan")
blockGroupLabel["label"] = filled_labels
blockGroupLabel["color"] = [color_map[label_name] for label_name in filled_labels]

In [467]:
# Export the labelled block-group polygons as GeoJSON.
polygon_export_path = "../web/data/p3-clustering-polygon.geojson"
blockGroupLabel.to_file(polygon_export_path, driver="GeoJSON")