In [94]:
import geopandas as gpd

# Read the territory polygons, estimate a UTM CRS from the raw layer and
# reproject to it. Row 1267 is excluded (the reason is not recorded here).
territories_raw = gpd.read_parquet('./data/gnn_data/territories.parquet')
crs = territories_raw.estimate_utm_crs()
territories = territories_raw.drop(index=1267).to_crs(crs)
territories.head()

Unnamed: 0_level_0,geometry,name,population
territory_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,"POLYGON ((580049.515 6617067.720, 580225.344 6...",Самойловское сельское поселение,2154.0
4,"POLYGON ((573514.444 6620389.851, 573579.919 6...",Большедворское сельское поселение,1698.0
5,"POLYGON ((564707.328 6594903.594, 564341.814 6...",Пикалевское городское поселение,20169.0
6,"POLYGON ((560629.509 6557332.762, 560642.857 6...",Борское сельское поселение,3393.0
7,"POLYGON ((560659.317 6585430.874, 559919.213 6...",Бокситогорское городское поселение,15960.0


In [95]:
# Load the block geometries, reproject them to the territories' CRS and split
# multi-part geometries into single parts.
blocks = (
    gpd.read_parquet('./data/split_gdf.parquet')[['geometry']]
    .reset_index(drop=True)
    .to_crs(crs)
    .explode('geometry', ignore_index=True)
)
# Keep only polygon parts; the index is intentionally not reset afterwards.
is_polygon = blocks.geom_type == 'Polygon'
blocks = blocks.loc[is_polygon].copy()
blocks.head()

Unnamed: 0,geometry
0,"POLYGON ((570883.045 6596529.946, 570883.081 6..."
1,"POLYGON ((570987.035 6596464.499, 570987.085 6..."
2,"POLYGON ((572676.730 6596606.982, 572676.701 6..."
3,"POLYGON ((583242.895 6595451.954, 583243.013 6..."
4,"POLYGON ((582405.498 6595459.455, 582405.614 6..."


In [96]:
from blocksnet.utils.spatial import sjoin_intersections

# Intersect blocks (left) with territories (right), geometry columns only.
blocks_geometry = blocks[['geometry']]
territories_geometry = territories[['geometry']]
sjoin = sjoin_intersections(blocks_geometry, territories_geometry)
sjoin.head()

Unnamed: 0,index_left,index_right,geometry,intersection_area,share_left,share_right
0,0,3,"POLYGON ((570883.081 6596530.057, 570886.131 6...",1373078.0,1.0,0.001433325
1,1,3,"POLYGON ((570987.085 6596464.433, 570991.354 6...",770.9275,1.0,8.047535e-07
2,2,3,"POLYGON ((572676.701 6596607.098, 572675.654 6...",604.073,1.0,6.305779e-07
3,3,3,"POLYGON ((583243.013 6595451.957, 583366.887 6...",45192.86,1.0,4.717578e-05
4,4,3,"POLYGON ((582405.614 6595459.459, 582434.970 6...",198.2999,1.0,2.070007e-07


## Data preparation

In [97]:
from blocksnet.enums import LandUse

# Mapping from functional-zone labels to blocksnet land-use classes; passed to
# `assign_land_use` when blocks are labelled.
rules = dict(
    residential=LandUse.RESIDENTIAL,
    business=LandUse.BUSINESS,
    recreation=LandUse.RECREATION,
    industrial=LandUse.INDUSTRIAL,
    transport=LandUse.TRANSPORT,
    special=LandUse.SPECIAL,
    agriculture=LandUse.AGRICULTURE,
)

In [100]:
# Block indices whose intersection rows belong to territory 3.
sjoin.loc[sjoin['index_right'] == 3, 'index_left']

0            0
1            1
2            2
3            3
4            4
         ...  
475        378
916        809
919        810
921        811
35400    31937
Name: index_left, Length: 81, dtype: int64

In [102]:
from blocksnet.blocks.assignment import assign_land_use
from blocksnet.config import log_config
from tqdm import tqdm

# Set the blocksnet logger level to 'ERROR' before the per-territory loop runs
# (presumably to keep lower-severity log messages out of the progress output).
log_config.set_logger_level('ERROR')

def one_hot_land_use(blocks_gdf : gpd.GeoDataFrame):
    """Expand the ``lu_shares`` column into one numeric column per land use.

    For every ``LandUse`` member a column named after its ``value`` is added.
    A row gets the entry stored under that key when its ``lu_shares`` is a
    dict containing the key, and ``0.0`` otherwise. An ``area`` column holding
    each geometry's area is added as well.

    The input frame is not modified; a copy restricted to ``geometry``,
    ``area`` and the land-use columns is returned.
    """

    def _share(shares, land_use):
        # Anything that is not a dict (e.g. a missing value) counts as "no share".
        if not isinstance(shares, dict):
            return 0.0
        if land_use not in shares:
            return 0.0
        return shares[land_use]

    result = blocks_gdf.copy()
    land_use_columns = [lu.value for lu in LandUse]
    for name in land_use_columns:
        result[name] = result['lu_shares'].apply(lambda shares: _share(shares, name))
    result['area'] = result.area
    return result[['geometry', 'area', *land_use_columns]]

def get_territory_blocks(territory_id : int, min_share : float = 0.5) -> gpd.GeoDataFrame:
    """Build the land-use feature table of the blocks belonging to a territory.

    Blocks are selected from the global ``sjoin`` table: rows whose
    ``index_right`` equals ``territory_id`` and whose ``share_left`` is at
    least ``min_share`` (presumably the fraction of the block covered by the
    territory -- confirm against ``sjoin_intersections``). The territory's
    functional zones are read from
    ``./data/prostor/{territory_id}_functional_zones.parquet``, reprojected to
    the blocks' CRS, and used with the global ``rules`` to assign land use.

    Returns the result of ``one_hot_land_use`` on the assigned blocks. Any
    error from reading the parquet file or from the assignment propagates to
    the caller.
    """
    in_territory = sjoin['index_right'] == territory_id
    covered_enough = sjoin['share_left'] >= min_share
    selected_ids = sjoin.loc[in_territory & covered_enough, 'index_left']
    territory_gdf = blocks.loc[selected_ids]

    zones_path = f'./data/prostor/{territory_id}_functional_zones.parquet'
    functional_zones = gpd.read_parquet(zones_path).to_crs(territory_gdf.crs)

    with_land_use = assign_land_use(territory_gdf[['geometry']], functional_zones, rules)
    return one_hot_land_use(with_land_use)

# Per-territory block feature tables, keyed by territory id. Territories whose
# processing fails are skipped (best effort) and reported with the reason.
territories_blocks = {}

for territory_id in tqdm(territories.index):
    try:
        territories_blocks[territory_id] = get_territory_blocks(territory_id)
    except Exception as exc:
        # Catch `Exception` rather than using a bare `except:` so that a kernel
        # interrupt (KeyboardInterrupt) still stops the loop, and show what
        # went wrong instead of hiding it.
        print(f'Trouble with {territory_id}: {type(exc).__name__}: {exc}')

 19%|█▉        | 36/187 [00:15<00:48,  3.14it/s]

Trouble with 42


 41%|████      | 76/187 [00:32<00:18,  5.96it/s]

Trouble with 84


 70%|███████   | 131/187 [00:46<00:04, 11.82it/s]

Trouble with 143


100%|██████████| 187/187 [01:10<00:00,  2.65it/s]


## Learning

In [None]:
# Inspect the per-block feature table that was built for territory 3.
territories_blocks[3]


Unnamed: 0,geometry,area,residential,business,recreation,industrial,transport,special,agriculture
0,"POLYGON ((570883.045 6596529.946, 570883.081 6...",1.373078e+06,0.387554,0.0,0.029253,0.092349,0.0,0.0,0.0
1,"POLYGON ((570987.035 6596464.499, 570987.085 6...",7.709275e+02,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
2,"POLYGON ((572676.730 6596606.982, 572676.701 6...",6.040730e+02,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
3,"POLYGON ((583242.895 6595451.954, 583243.013 6...",4.519286e+04,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
4,"POLYGON ((582405.498 6595459.455, 582405.614 6...",1.982999e+02,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
59,"POLYGON ((571304.120 6605720.084, 571304.118 6...",2.730813e+02,0.000000,0.0,1.000000,0.000000,0.0,0.0,0.0
60,"POLYGON ((570889.202 6605877.085, 570889.276 6...",7.257186e+02,0.000000,0.0,0.999779,0.000000,0.0,0.0,0.0
61,"POLYGON ((575243.396 6585767.008, 575243.385 6...",7.874966e+03,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
62,"POLYGON ((578341.885 6602161.266, 578341.942 6...",8.430962e+02,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0


In [263]:
import pandas as pd

# Build one feature row per territory: the total block area plus, for every
# land-use class, the area-weighted share of that class across the blocks.
series_list = {}
for territory_id, territory_blocks in territories_blocks.items():
    block_areas = territory_blocks['area']
    area = block_areas.sum()
    lus = {}
    for lu in LandUse:
        # Vectorised weighted sum; a row-wise `apply(axis=1)` is much slower
        # and does not return a scalar for an empty block table.
        lu_area = (territory_blocks[lu.value] * block_areas).sum()
        # A territory without any block area has no defined shares.
        lus[lu.value] = lu_area / area if area > 0 else float('nan')
    series_list[territory_id] = pd.Series({'area': area, **lus})

X = pd.DataFrame.from_dict(series_list, orient='index')
X.head()

Unnamed: 0,area,residential,business,recreation,industrial,transport,special,agriculture
3,941698400.0,0.00796,0.0,0.031091,0.004536,0.0,3.5e-05,0.010508
4,727896500.0,0.008568,0.0,0.100588,0.000313,0.0,9.7e-05,0.00431
5,43849710.0,0.114069,0.0,0.061521,0.043203,0.0,0.004793,0.001448
6,691196200.0,0.004723,0.0,0.092466,0.016979,0.0,2e-06,0.002691
7,255747600.0,0.02185,0.0,0.050808,0.001148,0.0,0.000549,0.005088


In [264]:
# Target: population of exactly those territories that have a feature row in X.
population = territories[['population']]
Y = population.loc[X.index]
Y.head()

Unnamed: 0,population
3,2154.0
4,1698.0
5,20169.0
6,3393.0
7,15960.0


In [304]:
# Features and target side by side, aligned on the territory id index.
XY = X.join(Y, how='left')
XY.head()

Unnamed: 0,area,residential,business,recreation,industrial,transport,special,agriculture,population
3,941698400.0,0.00796,0.0,0.031091,0.004536,0.0,3.5e-05,0.010508,2154.0
4,727896500.0,0.008568,0.0,0.100588,0.000313,0.0,9.7e-05,0.00431,1698.0
5,43849710.0,0.114069,0.0,0.061521,0.043203,0.0,0.004793,0.001448,20169.0
6,691196200.0,0.004723,0.0,0.092466,0.016979,0.0,2e-06,0.002691,3393.0
7,255747600.0,0.02185,0.0,0.050808,0.001148,0.0,0.000549,0.005088,15960.0


In [317]:
from sklearn.preprocessing import StandardScaler

# Use separate scalers for features and target. Re-fitting a single scaler on
# the target would discard the feature scaling parameters, so new feature rows
# could no longer be transformed consistently.
# NOTE(review): both scalers are fitted on the full data set, i.e. before the
# train/test split, so test-set statistics influence the scaling.
x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_scaled = x_scaler.fit_transform(XY.drop(columns=['population']))
Y_scaled = y_scaler.fit_transform(XY[['population']])

# Later cells call `scaler.inverse_transform(...)` on target values, so keep
# `scaler` bound to the target scaler.
scaler = y_scaler

In [320]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the territories for evaluation; the seed keeps the split fixed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled,
    Y_scaled,
    test_size=0.2,
    random_state=42,
)

In [347]:
from sklearn.ensemble import RandomForestRegressor

# Random forest baseline; the target is flattened to 1-D before fitting.
y_train_flat = Y_train.ravel()
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train_flat)

## Test

In [349]:
# Predict on the held-out split, then show the predictions mapped back through
# `scaler` (last fitted on the population column) as a single column.
Y_pred = model.predict(X_test)
scaler.inverse_transform(Y_pred[:, None])

array([[ 2096.75],
       [21098.11],
       [ 6209.5 ],
       [20276.81],
       [21060.48],
       [ 1531.65],
       [21974.35],
       [27096.65],
       [ 3285.91],
       [28164.86],
       [12580.39],
       [ 2409.28],
       [19915.47],
       [10957.11],
       [10037.33],
       [ 2186.36],
       [13217.07],
       [12300.15],
       [ 5884.95],
       [ 2570.12],
       [10550.17],
       [ 3740.75],
       [17714.11],
       [ 4051.21],
       [ 2965.88],
       [ 6210.72],
       [ 8763.08],
       [ 6402.32],
       [ 1342.81],
       [27164.56],
       [ 2780.89],
       [ 4119.83],
       [12870.91],
       [ 3612.01],
       [21314.81],
       [21948.5 ],
       [17438.23]])

In [350]:
from sklearn.metrics import mean_squared_error
import numpy as np

# RMSE of the current predictions. Both arrays come from the scaled target, so
# the value is in standardized units, not in people.
mse = mean_squared_error(y_true=Y_test, y_pred=Y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")

RMSE: 1.2343557887625838


### CatBoost

In [372]:
import catboost as cb

# Gradient-boosted alternative; note that this rebinds `model`, replacing the
# random forest fitted earlier.
catboost_params = {
    'iterations': 10_000,
    'learning_rate': 3e-4,
    'depth': 6,
    'loss_function': 'RMSE',
    'random_state': 42,
    'verbose': 0,
}
model = cb.CatBoostRegressor(**catboost_params)
model.fit(X_train, Y_train)

<catboost.core.CatBoostRegressor at 0x7fd33396b550>

In [373]:
# RMSE of the CatBoost model on the held-out split (standardized target units).
# Uses the notebook's own `Y_test` and fresh predictions from `model`; the
# previously referenced `y_test` / `predictions` are not defined by any cell,
# so the cell failed on a fresh kernel.
rmse = np.sqrt(mean_squared_error(Y_test, model.predict(X_test)))
print(f"RMSE: {rmse}")

RMSE: 1.2547568473635153


In [374]:
# Predict on the held-out split with the currently bound `model`. The model was
# trained on the scaled target, so these values are in standardized units.
Y_pred = model.predict(X_test)
Y_pred

array([-0.38824748,  0.33648091, -0.31678497,  0.38331246,  0.31746904,
       -0.46160493,  0.25778159,  0.09111776, -0.26212448,  0.85476212,
       -0.05135453, -0.36531381,  0.03044619,  0.0757662 , -0.14742569,
       -0.4369942 ,  0.12758316,  0.14251729, -0.22356824, -0.31052818,
        0.0508255 , -0.2488922 ,  0.28002604, -0.22675012, -0.36729836,
       -0.15623823, -0.03188415, -0.07596782, -0.43270438,  0.1457493 ,
       -0.39247102, -0.31993416,  0.18076454, -0.29477378,  0.17565201,
        0.03083624, -0.12719537])

In [375]:
# Held-out targets mapped back through `scaler` (last fitted on the population
# column), for comparison with the inverse-transformed predictions.
scaler.inverse_transform(Y_test)

array([[ 4560.],
       [15726.],
       [ 1525.],
       [ 4040.],
       [ 3916.],
       [ 1043.],
       [37955.],
       [25764.],
       [ 3006.],
       [28128.],
       [ 3754.],
       [  463.],
       [ 9767.],
       [22949.],
       [14260.],
       [ 2709.],
       [22155.],
       [ 8053.],
       [ 1250.],
       [ 9935.],
       [56597.],
       [ 4827.],
       [ 1788.],
       [79548.],
       [ 1676.],
       [ 1260.],
       [ 8936.],
       [  655.],
       [  612.],
       [ 8528.],
       [ 3965.],
       [14766.],
       [ 4849.],
       [ 2849.],
       [ 6410.],
       [87440.],
       [ 3060.]])

In [376]:
# Predictions mapped back through `scaler`; the 1-D prediction vector is turned
# into a single column first, as the scaler expects 2-D input.
scaler.inverse_transform(Y_pred[:, None])

array([[ 3923.05742908],
       [15654.50005929],
       [ 5079.84711866],
       [16412.57940071],
       [15346.74801025],
       [ 2735.5936513 ],
       [14380.56545264],
       [11682.71732821],
       [ 5964.65630135],
       [24044.10679886],
       [ 9376.466328  ],
       [ 4294.29306067],
       [10700.60450925],
       [11434.21598866],
       [ 7821.32747446],
       [ 3133.9764127 ],
       [12272.99601561],
       [12514.74021824],
       [ 6588.78020631],
       [ 5181.1279998 ],
       [11030.49168516],
       [ 6178.85216825],
       [14740.64448856],
       [ 6537.2739771 ],
       [ 4262.16837553],
       [ 7678.67567879],
       [ 9691.64046222],
       [ 8978.0421379 ],
       [ 3203.41740572],
       [12567.05784554],
       [ 3854.68945643],
       [ 5028.86997986],
       [13133.86231594],
       [ 5436.15020117],
       [13051.10392946],
       [10706.91835139],
       [ 8148.80303058]])