# **100 Cities Demographic Analysis** 

## **Imports and Data**

In [17]:
import os, sys, requests, zipfile, pathlib
from pathlib import Path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Vineyard functions
from VineyardFinal import vineyard_from_pds, Get_Adjacency_Persistence, Homotopy, Get_W_Infinity, vdist, min_vc, fD, fL

# Matrix Operations
import numpy as np
import pandas as pd

# misc
from tqdm import tqdm

# tda
import gudhi
import gudhi.hera

# Geospatial and graphs
import geopandas as gpd
from gerrychain import Graph
import networkx as nx

import warnings
warnings.filterwarnings('ignore', message='Found islands', category=UserWarning) # Getting rid of pesky degree-0 graph warnings 

INFINITY = 1e6
n = 100

## **Data** 

In [18]:
# download helpers #
def download(url, out_file):

    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with out_file.open("wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

    print(f"Downloaded: {out_file}.")

def download_zip(url, out_zip, extract_to):

    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with out_zip.open("wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
    
    with zipfile.ZipFile(out_zip) as zf:
        zf.extractall(extract_to)
    
    ## cleanup ##
    if os.exists(out_zip): os.remove(out_zip) 

    print(f"Downloaded: {out_zip.resolve()}")
    print(f"Extracted to: {extract_to.resolve()}")

###### City Data ######
# src: https://github.com/thomasweighill/tda-cities/blob/main/cities2020data.zip
# src: https://github.com/thomasweighill/tda-cities/blob/main/City_Names_And_Populations.csv

# os.makedirs("../dataIn/cities/", exist_ok=True)
# download_zip('https://github.com/thomasweighill/tda-cities/raw/main/cities2020data.zip', Path("../dataIn/cities/cities2020data.zip"), Path("../dataIn/cities/")) 
# download('https://github.com/thomasweighill/tda-cities/raw/main/City_Names_And_Populations.csv', Path("../dataIn/cities/City_Names_And_Populations.csv"))

## **Metric Calculation**

In [19]:
###### Analysis Params ###### 

# Demographic Cols #
col1 = 'BLACK'
col2 = 'HISP'

# length of homotopy #
t = 100

# cities
list_of_cities_pd =pd.read_csv('../dataIn/cities/City_Names_And_Populations.csv')
city_names = [x + y for x,y in zip(list_of_cities_pd.NAME, list_of_cities_pd.ST)][:n]

###### Out Dir ###### 

outDir = '../dataOut/'
os.makedirs(outDir, exist_ok = True)


cols = [
    'w',
    'w1',
    'v',
    'mvc',
    'v/mvc',
    'l1',
    'l1w',
    'l2',
    'l2w',
    'linf',
    'linfw',
    'R',
    'R^2',
    f'{col1}%',
    f'{col2}%',
    'TotPop',
]

df_out = pd.DataFrame(columns = cols)

for city in tqdm(city_names, desc = "Processing Cities"):
    # read graph
    G = Graph.from_json(f"../dataIn/cities/cities2020data/{city}.json") # cities2020data
    # Tract population
    pop = np.array(list((nx.get_node_attributes(G, 'TOTPOP').values())))

    # f and g are both percentages of some statistic over a population
    f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
    g = np.array(list((nx.get_node_attributes(G,col2).values()))) / pop

    # Fill NAN 
    f = np.nan_to_num(f, 0)
    g = np.nan_to_num(g, 0)


###### Compute Vineyard ###### 

    # Compute persistence diagrams
    PD0 = []

    # Straight line homotopy
    Hs = Homotopy(f,g,t = t)
    for h in Hs:
        for i, v in enumerate(G.nodes()):
            G.nodes[v]['Homotopy'] = h[i]
            
        PD0.append(Get_Adjacency_Persistence(G,'Homotopy', dimension = 0, popCol = "TOTPOP", popMin = 10))

###### Calculate Metrics ###### 

    # Wasserstein Distance
    w = Get_W_Infinity(PD0[0], PD0[-1])
    # 1-Wasserstein Distance
    w1 = gudhi.hera.wasserstein_distance(PD0[0], PD0[-1])
    # Vineyard Distance (F(D) = D, F(L) = 1)
    v = vdist(vineyard_from_pds(PD0), fD, fL)
    # MVC 
    mvc = min_vc(PD0[0], PD0[-1])
    # v/mvc
    vmvc = v / mvc
    #l1 Norm
    l1 = np.linalg.norm(f - g, ord = 1)
    # l2 Norm
    l2 = np.linalg.norm(f - g, ord = 2)
    #Linf Norm
    linf = np.linalg.norm(f - g, ord = np.inf)
    # Weighted l1 Norm
    l1w = l1 / len(G)
    # Weighted L2 Norm
    l2w = l2 / len(G)
    # Weighted Linf Norm
    linfw = linf / len(G)
    # Pearson Correlation
    r = np.corrcoef(f,g)[0,1]
    # R^2
    r2 = r ** 2
    # Total Population
    totPop = np.sum(pop)
    # col1%
    col1P = np.sum(f * pop) / totPop * 100.0
    # col2%
    col2P = np.sum(g * pop) / totPop * 100.0

###### Store Data ###### 
    data = pd.Series([w, w1, v, mvc, vmvc, l1, l1w, l2, l2w, linf, linfw, r, r2, col1P, col2P, totPop])

    df_out.loc[city] = pd.Series({
        f'{col}' : d for col, d in zip(cols, data)
    })

    print(f'{city} done')
    
###### Save ###### 
df_out.index.name = 'Name'
df_out.to_csv(os.path.join(outDir, 'city_metrics.csv'), index = True)
print(f"Metric calculation complete. \nResults saved to {outDir}/city_metrics.csv")

  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:   1%|          | 1/100 [00:07<11:38,  7.06s/it]

New YorkNY done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:   2%|▏         | 2/100 [00:10<07:34,  4.64s/it]

Los AngelesCA done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:   3%|▎         | 3/100 [00:11<04:57,  3.06s/it]

ChicagoIL done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:   4%|▍         | 4/100 [00:13<04:08,  2.59s/it]

HoustonTX done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:   5%|▌         | 5/100 [00:13<02:53,  1.83s/it]

PhiladelphiaPA done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:   6%|▌         | 6/100 [00:14<02:15,  1.44s/it]

PhoenixAZ done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:   7%|▋         | 7/100 [00:14<01:40,  1.08s/it]

San AntonioTX done


Processing Cities:   8%|▊         | 8/100 [00:15<01:21,  1.13it/s]

San DiegoCA done


Processing Cities:   9%|▉         | 9/100 [00:15<01:12,  1.26it/s]

DallasTX done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  10%|█         | 10/100 [00:16<01:02,  1.45it/s]

HonoluluHI done


Processing Cities:  11%|█         | 11/100 [00:16<00:48,  1.82it/s]

San JoseCA done


Processing Cities:  12%|█▏        | 12/100 [00:16<00:39,  2.25it/s]

JacksonvilleFL done


Processing Cities:  13%|█▎        | 13/100 [00:16<00:33,  2.58it/s]

IndianapolisIN done


Processing Cities:  14%|█▍        | 14/100 [00:17<00:29,  2.93it/s]

San FranciscoCA done


Processing Cities:  15%|█▌        | 15/100 [00:17<00:28,  2.94it/s]

AustinTX done


Processing Cities:  16%|█▌        | 16/100 [00:17<00:26,  3.13it/s]

ColumbusOH done


Processing Cities:  17%|█▋        | 17/100 [00:17<00:24,  3.34it/s]

Fort WorthTX done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  18%|█▊        | 18/100 [00:18<00:25,  3.23it/s]

CharlotteNC done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  20%|██        | 20/100 [00:18<00:22,  3.54it/s]

DetroitMI done
El PasoTX done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  22%|██▏       | 22/100 [00:19<00:17,  4.39it/s]

MemphisTN done
BaltimoreMD done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  24%|██▍       | 24/100 [00:19<00:16,  4.70it/s]

BostonMA done
SeattleWA done


Processing Cities:  25%|██▌       | 25/100 [00:19<00:16,  4.58it/s]

WashingtonDC done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  27%|██▋       | 27/100 [00:20<00:15,  4.68it/s]

NashvilleTN done
DenverCO done


Processing Cities:  28%|██▊       | 28/100 [00:20<00:15,  4.59it/s]

LouisvilleKY done


Processing Cities:  29%|██▉       | 29/100 [00:20<00:15,  4.51it/s]

MilwaukeeWI done


Processing Cities:  30%|███       | 30/100 [00:20<00:16,  4.21it/s]

PortlandOR done


Processing Cities:  31%|███       | 31/100 [00:21<00:15,  4.40it/s]

Las VegasNV done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  33%|███▎      | 33/100 [00:21<00:13,  4.83it/s]

Oklahoma CityOK done
AlbuquerqueNM done


Processing Cities:  35%|███▌      | 35/100 [00:21<00:11,  5.62it/s]

TucsonAZ done
FresnoCA done


Processing Cities:  37%|███▋      | 37/100 [00:22<00:09,  6.63it/s]

SacramentoCA done
Long BeachCA done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  39%|███▉      | 39/100 [00:22<00:10,  6.00it/s]

Kansas CityMO done
MesaAZ done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop


Virginia BeachVA done


Processing Cities:  42%|████▏     | 42/100 [00:22<00:09,  5.99it/s]

AtlantaGA done
Colorado SpringsCO done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop


OmahaNE done
RaleighNC done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  46%|████▌     | 46/100 [00:23<00:08,  6.69it/s]

MiamiFL done
ClevelandOH done


Processing Cities:  48%|████▊     | 48/100 [00:23<00:07,  7.07it/s]

TulsaOK done
OaklandCA done


Processing Cities:  51%|█████     | 51/100 [00:24<00:05,  8.44it/s]

MinneapolisMN done
WichitaKS done
ArlingtonTX done


Processing Cities:  53%|█████▎    | 53/100 [00:24<00:05,  7.89it/s]

BakersfieldCA done
New OrleansLA done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  55%|█████▌    | 55/100 [00:24<00:05,  8.28it/s]

AnaheimCA done
TampaFL done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  58%|█████▊    | 58/100 [00:24<00:04,  9.70it/s]

AuroraCO done
Santa AnaCA done
St. LouisMO done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  61%|██████    | 61/100 [00:25<00:03, 11.13it/s]

PittsburghPA done
Corpus ChristiTX done
RiversideCA done


Processing Cities:  65%|██████▌   | 65/100 [00:25<00:03, 10.92it/s]

CincinnatiOH done
LexingtonKY done
AnchorageAK done
StocktonCA done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  67%|██████▋   | 67/100 [00:25<00:03,  9.87it/s]

ToledoOH done
St. PaulMN done
NewarkNJ done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  71%|███████   | 71/100 [00:26<00:02, 10.72it/s]

GreensboroNC done
BuffaloNY done
PlanoTX done


Processing Cities:  73%|███████▎  | 73/100 [00:26<00:02, 11.67it/s]

LincolnNE done
HendersonNV done
Fort WayneIN done
Jersey CityNJ done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  78%|███████▊  | 78/100 [00:26<00:01, 14.11it/s]

St. PetersburgFL done
Chula VistaCA done
NorfolkVA done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  82%|████████▏ | 82/100 [00:26<00:01, 14.23it/s]

OrlandoFL done
ChandlerAZ done
LaredoTX done
MadisonWI done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  86%|████████▌ | 86/100 [00:27<00:00, 14.16it/s]

Winston-SalemNC done
LubbockTX done
Baton RougeLA done
DurhamNC done


Processing Cities:  88%|████████▊ | 88/100 [00:27<00:00, 14.16it/s]

GarlandTX done
GlendaleAZ done
RenoNV done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  92%|█████████▏| 92/100 [00:27<00:00, 13.47it/s]

HialeahFL done
ChesapeakeVA done
ScottsdaleAZ done


Processing Cities:  94%|█████████▍| 94/100 [00:27<00:00, 13.49it/s]

North Las VegasNV done
IrvingTX done
FremontCA done


  f = np.array(list((nx.get_node_attributes(G,col1).values()))) / pop
Processing Cities:  98%|█████████▊| 98/100 [00:27<00:00, 13.35it/s]

IrvineCA done
BirminghamAL done
RochesterNY done


Processing Cities: 100%|██████████| 100/100 [00:28<00:00,  3.56it/s]

San BernardinoCA done
SpokaneWA done
Metric calculation complete. 
Results saved to ../dataOut//city_metrics.csv





In [20]:
df_1 = pd.read_csv('../../PaperCodes/DataOut/City_Metrics_P.csv', index_col = "Name")
df_2 = pd.read_csv('../dataOut/city_metrics.csv', index_col = "Name")
print(df_1['v'].head())
print(df_2['v'].head())
print(df_1.loc["DallasTX"]['v'])
print(df_2.loc["DallasTX"]['v'])

print(df_1.loc["DallasTX"]['w1'])
print(df_2.loc["DallasTX"]['w1'])

Name
New YorkNY        3.162003
Los AngelesCA     2.358659
ChicagoIL         1.479105
HoustonTX         2.211081
PhiladelphiaPA    0.835309
Name: v, dtype: float64
Name
New YorkNY        5.064414
Los AngelesCA     3.962156
ChicagoIL         2.433321
HoustonTX         3.644369
PhiladelphiaPA    1.293566
Name: v, dtype: float64
1.4905361117503764
2.2932280789814903
3.843986792849618
3.843986792849618
