In [1]:
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
import numpy as np
import operator
from heapq import nsmallest, nlargest
import pickle
import contextily as cx
import mapclassify

In [2]:
# Relative directory prefix prepended to the input file names read below.
path = "data/"

In [3]:
# Read primary.pq from the data directory with geopandas.
prim = gpd.read_parquet(f"{path}primary.pq")

In [4]:
# Three inputs: the contextual.pq table (plain pandas), the "tessellation"
# layer of geometry.gpkg, and the "AHC_k12" layer of the urban_types output.
cntx = pd.read_parquet(f"{path}contextual.pq")
tess = gpd.read_file(f"{path}geometry.gpkg", layer="tessellation")
ut = gpd.read_file("outputs/urban_types.gpkg", layer="AHC_k12")

In [5]:
# Join the tessellation and the contextual table row-by-row on their indexes.
# NOTE: this rebinds `cntx`, so re-running the cell without reloading `cntx`
# first merges the already-merged frame again.
cntx = tess.merge(
    cntx,
    how="inner",
    left_index=True,
    right_index=True,
)

In [6]:
# Combine the contextual values with the attributes of `ut`, matching rows on
# uID. The geometry column is dropped from both sides before merging.
ut_cntx = (
    cntx
    .drop(columns="geometry")
    .merge(ut.drop(columns="geometry"), on="uID", how="inner")
)

In [7]:
# Display the merged table (contextual values plus the `cluster` column).
ut_cntx

Unnamed: 0,uID,stcOri_25,stcOri_50,stcOri_75,sdcLAL_25,sdcLAL_50,sdcLAL_75,sdcAre_25,sdcAre_50,sdcAre_75,...,ldbPWL_25,ldbPWL_50,ldbPWL_75,ltbIBD_25,ltbIBD_50,ltbIBD_75,ltcBuA_25,ltcBuA_50,ltcBuA_75,cluster
0,4546,0.291498,0.148546,0.123542,3.355742,3.264933,2.413071,4.154503,4.241074,3.622287,...,-0.794571,-0.761956,-0.725411,1.936091,1.962353,1.776488,1.452102,1.240680,1.131014,2
1,4549,0.242241,-0.142400,0.123542,3.355742,3.351590,2.744006,4.154503,5.044205,4.315923,...,-0.774620,-0.863366,-0.725411,1.936091,1.964223,1.776488,1.452102,1.261616,1.115629,2
2,11120,-0.379114,-0.167905,0.041695,3.018468,2.379772,1.928518,1.699412,2.240903,2.001995,...,-0.854027,-0.902576,-1.121556,0.914700,1.116306,0.895483,1.721248,1.617534,1.438700,11
3,11123,-0.212446,0.194163,0.284353,1.913593,2.346410,1.928518,1.538855,1.533250,2.001995,...,-0.818918,-0.902814,-1.121556,0.981491,1.013327,0.895483,1.644392,1.566688,1.415031,11
4,10961,0.002465,0.496981,0.433197,1.834483,1.735268,1.855207,0.922521,1.222072,1.739291,...,-0.844816,-0.902695,-1.086226,0.741673,0.922507,0.872171,1.413054,1.501976,1.376861,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57600,11582,-1.093221,-0.860597,0.683667,3.508804,2.685265,3.476390,0.993182,2.873015,4.367498,...,-0.841762,-0.804253,-0.675306,2.418395,2.214506,1.928209,1.494097,1.434722,1.566902,5
57601,9,-1.206556,-1.088693,0.675987,3.509637,3.005364,3.476390,0.778322,3.508695,4.367498,...,-0.972579,-0.977837,-0.675306,2.368195,2.131044,1.776375,1.544855,1.759768,1.573805,5
57602,23,-0.847737,-0.583734,-0.993068,7.535405,6.612165,6.136282,13.488553,14.316540,14.490328,...,-0.991286,-0.338819,0.096140,1.995786,1.571115,1.177320,2.208001,1.973452,1.746385,5
57603,0,-0.396080,0.385806,0.040483,5.379813,4.925200,3.342037,5.621028,7.025074,5.726547,...,-0.894139,-0.941121,-0.860360,3.250146,3.036894,2.900127,1.616480,1.546350,1.401225,0


## Feature names and per-cluster means

In [8]:
# Every column except the identifier (uID) and the cluster label is a feature.
feat_names = ut_cntx.columns.drop(["uID", "cluster"]).tolist()

In [9]:
# Mean of each remaining column within every cluster; uID is dropped first so
# it is not averaged.
group = (
    ut_cntx
    .drop(columns="uID")
    .groupby("cluster")
    .mean()
)
group

Unnamed: 0_level_0,stcOri_25,stcOri_50,stcOri_75,sdcLAL_25,sdcLAL_50,sdcLAL_75,sdcAre_25,sdcAre_50,sdcAre_75,sscCCo_25,...,libNCo_75,ldbPWL_25,ldbPWL_50,ldbPWL_75,ltbIBD_25,ltbIBD_50,ltbIBD_75,ltcBuA_25,ltcBuA_50,ltcBuA_75
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.204611,0.318993,0.751677,1.25725,1.195831,1.039776,0.965419,1.048605,0.965011,-0.417229,...,-0.469335,-0.83596,-0.959843,-1.076636,1.769186,1.732039,1.683812,1.569418,1.494037,1.4047
1,1.467501,0.879611,0.334506,0.072754,0.092082,-0.002849,-0.017664,-0.018652,-0.112721,0.597749,...,5.040185,1.357764,0.767146,0.503107,-0.878708,-0.879413,-0.880831,-1.024977,-1.070887,-1.113377
2,-0.195327,-0.163684,-0.069622,0.061966,0.127995,0.207583,-0.006283,0.0402,0.09679,-0.245141,...,-0.301039,-0.314482,-0.229203,-0.120919,0.081036,0.126682,0.164647,0.220647,0.27134,0.322082
3,0.430328,0.250521,0.005863,-0.263043,-0.208791,-0.133235,-0.217243,-0.217284,-0.170224,0.112115,...,0.147461,0.495911,0.57829,0.562775,-0.318535,-0.290705,-0.25179,-0.536529,-0.496897,-0.44432
4,-0.573181,-0.743987,-0.801957,-0.632492,-0.718444,-0.769709,-0.426526,-0.538614,-0.593493,0.248127,...,-0.072957,0.378566,0.46845,0.455217,-0.644055,-0.682284,-0.719171,-0.73549,-0.753953,-0.767978
5,-0.221239,0.102487,0.40115,2.068575,2.169561,2.19465,1.814499,2.289271,2.492214,-0.526862,...,-0.475778,-0.731603,-0.758351,-0.67756,1.778407,1.704433,1.608644,1.692656,1.620183,1.519496
6,0.370159,0.525105,0.590636,-0.253008,-0.275417,-0.240151,-0.213548,-0.271796,-0.306452,0.192604,...,-0.272464,-0.549832,-0.45963,-0.357548,-0.133801,-0.073153,0.009939,0.411309,0.462771,0.503697
7,0.745604,0.433678,0.093623,-0.335425,-0.382794,-0.464434,-0.24279,-0.325005,-0.416047,0.923354,...,0.456284,0.812119,0.520318,0.304918,-0.56597,-0.580138,-0.588945,-0.840767,-0.853619,-0.854193
8,-0.223016,-0.112004,-0.07409,-0.453798,-0.49886,-0.502242,-0.343315,-0.422866,-0.455718,-0.087296,...,-0.356168,-0.329714,-0.209678,-0.103092,-0.403093,-0.42524,-0.439766,-0.086872,-0.083851,-0.078319
9,0.61794,0.535198,0.47562,1.814497,1.789577,1.820351,1.401025,1.501541,1.827194,-0.781224,...,-0.465683,-0.297866,-0.304182,-0.325513,0.398107,0.271352,0.154387,1.140919,1.111402,1.085873


Extract and plot the top descriptors of each cluster.

In [10]:
def top_descriptors(cluster_means, n=3):
    """Return the ``n`` largest and ``n`` smallest entries of a Series.

    Parameters
    ----------
    cluster_means : pandas.Series
        One row of ``group``: feature name -> mean value within a cluster.
    n : int, default 3
        Number of entries taken from each end of the ranking.

    Returns
    -------
    pandas.Series
        The selected signed values, indexed by feature name and ordered by
        increasing absolute value.
    """
    # nlargest/nsmallest always return n entries when n values are available.
    # The previous "strictly beyond the (n+1)-th value" threshold silently
    # returned fewer descriptors whenever values were tied at the threshold.
    extremes = pd.concat([cluster_means.nlargest(n), cluster_means.nsmallest(n)])
    # A feature can appear in both halves when fewer than 2 * n values exist;
    # keep it only once.
    extremes = extremes[~extremes.index.duplicated()]
    # Smallest magnitude first, so the strongest descriptor is drawn as the
    # top bar of a horizontal bar chart.
    return extremes.loc[extremes.abs().sort_values().index]


# One bar chart per cluster, saved to outputs/k12_topdescriptors_ut<cluster>.jpeg.
for ik in group.index:
    # Feature labels come from the row's own index (the columns of `group`),
    # so no positional alignment with a separate name list is needed.
    descriptors = top_descriptors(group.loc[ik], n=3)
    bar_colors = ['r' if value > 0 else 'b' for value in descriptors]

    fig, ax = plt.subplots(figsize=(8, 4))
    descriptors.abs().plot(kind='barh', ax=ax, color=bar_colors,
                           title='Top descriptors of UT ' + str(ik))
    # Labels are set on the axes directly: the meaning of the xlabel/ylabel
    # keywords for barh plots differs between pandas versions.
    ax.set_xlabel("Red = positive value, Blue = negative value")
    ax.set_ylabel('')
    fig.savefig('outputs/k12_topdescriptors_ut' + str(ik) + '.jpeg', dpi=300, bbox_inches='tight')
    # Close the figure so figures do not accumulate across iterations and no
    # empty figure is left behind as the cell output.
    plt.close(fig)

<Figure size 800x400 with 0 Axes>