In [1]:
import pandas as pd
import geopandas as gpd
import skmob
from skmob.tessellation import tilers
from shapely import wkt, MultiPolygon
from torch_geometric.data import Data
import torch

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


from sklearn.preprocessing import StandardScaler

In [2]:
# Create block group features
#
# Reads the processed census table, keeps the identifier plus the demographic
# and income columns, and shortens GEOID into an integer key.

# Load block group data; `usecols` avoids parsing columns that are discarded anyway,
# and the trailing selection fixes the column order to that of `feature_columns`.
feature_columns = ['GEOID', 'MedAge', 'MedHouIncome', 'PerCapIncome', 'TotPop', 'White', 'BlackAA', 'AIAN', 'Asian', 'NHOPI', 'OtherRace']
bg = pd.read_csv('../../data/processed/census_data.csv', usecols=feature_columns)[feature_columns]

# Trim the leading characters of GEOID and store the remainder as an integer.
# NOTE(review): the slice drops THREE characters, not just the two-digit state
# prefix (48 = Texas). The third character is presumably the first county digit,
# so confirm it is constant for every row in this dataset; otherwise distinct
# block groups could end up with the same key. The edge endpoints are trimmed
# with the same slice, so any change must be made in both places.
bg = bg.assign(GEOID=bg['GEOID'].astype(str).str[3:].astype(int))

# Cell output: number of NaN entries per column
bg.isna().sum()

GEOID           0
MedAge          0
MedHouIncome    0
PerCapIncome    0
TotPop          0
White           0
BlackAA         0
AIAN            0
Asian           0
NHOPI           0
OtherRace       0
dtype: int64

In [3]:
# Load the aggregated flow table; later cells read its origin, destination and flow columns
edges = pd.read_csv(filepath_or_buffer='../../data/processed/aggregated_fdf.csv')

In [4]:
# Trim the leading characters of both endpoint GEOIDs, using the same slice as bg.GEOID,
# and store them as integers.
# NOTE(review): the slice drops THREE characters, not two — confirm the third
# character is constant across the dataset (see the matching slice on bg.GEOID).
# This cell rewrites the edge columns from their current values, so running it
# twice trims the identifiers twice; reload `edges` before re-running.
for endpoint in ('origin', 'destination'):
    edges[endpoint] = edges[endpoint].astype(str).str[3:].astype(int)

# Get unique geoids from edges (an identifier counts if it appears as either endpoint)
concat_edges = pd.concat([edges['origin'], edges['destination']])
unique_geoids = concat_edges.unique()

# Filter bg to have only those geoids and renumber the rows from 0,
# producing a new frame instead of resetting the index in place
filtered_bg = bg.loc[bg['GEOID'].isin(unique_geoids)].reset_index(drop=True)

In [8]:
# Report every column that contains negative entries once its values are cast to int
for c in filtered_bg.columns:
    n_negative = int(filtered_bg[c].astype(int).lt(0).sum())
    if n_negative > 0:
        print(f"Column with errors: {c} | {n_negative} errors")

Column with errors: MedAge | 8 errors
Column with errors: MedHouIncome | 375 errors
Column with errors: PerCapIncome | 7 errors


In [16]:
# Preview the first five filtered block groups
filtered_bg.head(n=5)

Unnamed: 0,GEOID,MedAge,MedHouIncome,PerCapIncome,TotPop,White,BlackAA,AIAN,Asian,NHOPI,OtherRace
0,157601001,33.7,35066,19239,1308,731,257,7,0,0,185
1,157601002,38.7,51477,25688,1158,782,95,0,0,0,228
2,157602011,49.2,63879,31392,913,802,111,0,0,0,0
3,157602012,33.5,48452,23740,1145,586,369,0,0,0,78
4,157602021,41.7,90056,42052,2351,1837,120,0,0,0,352


In [17]:
# Create two dictionaries to map geoids to row positions in filtered_bg and vice versa.
# The column is walked positionally, so no per-row label lookup is needed and the
# numbering does not depend on the frame's index labels.
number2geoid = dict(enumerate(filtered_bg['GEOID'].tolist()))
geoid2number = {geoid: number for number, geoid in number2geoid.items()}


# Map edge endpoints to node indexes
origin_index = edges['origin'].map(geoid2number)
destination_index = edges['destination'].map(geoid2number)

# An endpoint without a row in filtered_bg maps to NaN. Such an edge cannot be
# expressed as a pair of node indexes (they are later converted to an integer
# tensor), so it is dropped here and the number of dropped edges is reported.
has_both_nodes = origin_index.notna() & destination_index.notna()
n_unmapped = int((~has_both_nodes).sum())
if n_unmapped:
    print(f"Dropping {n_unmapped} edges with an endpoint missing from filtered_bg")

# Rows are renumbered from 0 so later positional/aligned assignments stay consistent
edges = (
    edges.loc[has_both_nodes]
    .assign(
        origin=origin_index[has_both_nodes].astype('int64'),
        destination=destination_index[has_both_nodes].astype('int64'),
    )
    .reset_index(drop=True)
)


# Node attribute table: every column of filtered_bg except the identifier
bg_features = filtered_bg.drop(columns=['GEOID'])

In [25]:
# Value that the imputer treats as "missing" in the block group attributes
MISSING_SENTINEL = -666666666

# Replace sentinel entries with model-based estimates computed from the other columns
imputer = IterativeImputer(random_state=0, missing_values=MISSING_SENTINEL)
bg_imputed = pd.DataFrame(imputer.fit_transform(bg_features), columns=bg_features.columns)

# Restore each column's original dtype: integer columns are cast back to whole
# numbers, while float columns (e.g. MedAge) keep their fractional part rather
# than being truncated by a blanket integer cast.
bg_test = bg_imputed.astype(bg_features.dtypes.to_dict())
bg_features = bg_test

In [29]:
# Standardise the block group attributes with their own scaler, so the fitted
# statistics are not overwritten by the flow fit below
feature_scaler = StandardScaler()
bg_test = pd.DataFrame(feature_scaler.fit_transform(bg_features), columns=bg_features.columns)

# Standardise the edge flows with a separate scaler. The scaled values are written
# back positionally (1-D array) rather than through an index-aligned DataFrame.
flow_scaler = StandardScaler()
edge_flows = flow_scaler.fit_transform(edges[['flow']])
edges['flow'] = edge_flows.ravel()

# `scaler` previously ended this cell fitted on the flow column; keep the name
# bound to the same fitted state for any code that still refers to it
scaler = flow_scaler

In [34]:
# Columns used as node inputs; note this rebinds `feature_columns` to a list without GEOID
feature_columns = ['MedAge', 'TotPop', 'White', 'BlackAA', 'AIAN', 'Asian', 'NHOPI', 'OtherRace']
# Column that becomes the target `y`
predictor_columns = ['PerCapIncome']

# Split the scaled table into inputs and target (MedHouIncome is in neither list)
x, y = bg_test[feature_columns], bg_test[predictor_columns]

In [35]:
# Assemble the graph object:
#   x          - node inputs, one row per block group
#   y          - node targets
#   edge_index - 2 x num_edges tensor of (origin, destination) node indexes
#   edge_attr  - one flow value per edge
data = Data(
    x=torch.tensor(x.values, dtype=torch.float),
    y=torch.tensor(y.values, dtype=torch.float),
    # Transpose the (num_edges, 2) table and make the result contiguous in memory
    edge_index=torch.tensor(edges[['origin', 'destination']].values, dtype=torch.long).t().contiguous(),
    # Read the column as 1-D directly so the shape is [num_edges] even for a single edge
    edge_attr=torch.tensor(edges['flow'].values, dtype=torch.float),
    )

In [36]:
# Display the summary of the assembled graph object
data

Data(x=[4142, 8], edge_index=[2, 2927820], edge_attr=[2927820], y=[4142, 1])