# required libraries

In [None]:
import json
import pickle
from pathlib import Path
import networkx as nx
import numpy as np
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import matplotlib.pyplot as plt
from geopy.distance import distance
import plotly.graph_objects as go
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Stage 1 - Education prediction


code to create graph

In [None]:
# Neighbourhood radius; compared against distances computed in EPSG:3857 units.
buffer_size = 300
# Upper bound of the nearest-neighbour slice used when no building lies inside the buffer.
n_closest = 3
# Input files of the stage-1 (education) graph.
path_root_city = Path('/content')
filename = 'data_high1.csv'
filepath = path_root_city / filename
filepath_subway = path_root_city / 'метро.geojson'
filepath_bus = path_root_city / 'остановки автобусов.geojson'
filepath_tram = path_root_city / 'остановки трамвай.geojson'
# Building areas; building_id is cast to int so it can serve as a lookup key in build_data.
df_squares = gpd.read_file(path_root_city / 'squares1.csv')
df_squares['building_id'] = df_squares['building_id'].astype(int)

def read_geojson(path: Path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def extract_stops(path: Path) -> list[tuple[int, float, float]]:
    """Return one ``(osm_id, coordinates[0], coordinates[1])`` tuple per feature.

    ``osm_id`` is read from the feature properties and cast to ``int``; the
    two coordinate values are taken from the feature geometry in stored order.
    """
    stops = []
    for feature in read_geojson(path)['features']:
        osm_id = int(feature['properties']['osm_id'])
        position = feature['geometry']['coordinates']
        stops.append((osm_id, position[0], position[1]))
    return stops

def draw_plotly(g_):
    """Show the nodes of ``g_`` on an interactive OpenStreetMap scatter plot.

    Colour rules, applied in this order so that later ones win: gray by
    default; blue when the sum of kindergarten, bus_buf, school, tram_stop
    and subway_buf is positive; green when ``education == 1``; black for
    transit stops (label 'tram', 'subway' or 'bus').

    Args:
        g_: graph whose nodes carry the attributes written by ``build_data``
            and ``add_stops`` (``coordinate``, ``label``, ``lat``, ``lon``, ...).
    """
    import plotly.express as px

    # Earlier revisions also filled edge/node trace lists here (with a
    # malformed "Meters: ..." f-string that produced two hover entries per
    # three edge coordinates) and created an empty go.Figure(); nothing ever
    # consumed them, so only the DataFrame-based scatter map is kept.
    node_records = [dict(attrs) for _, attrs in g_.nodes(data=True)]
    df_nodes = pd.DataFrame.from_records(node_records).drop(columns='coordinate')

    service_columns = ['kindergarten', 'bus_buf', 'school', 'tram_stop', 'subway_buf']
    df_nodes['s_cnt'] = df_nodes[service_columns].sum(axis=1)
    df_nodes['color'] = 'gray'
    df_nodes.loc[df_nodes['s_cnt'] > 0, 'color'] = 'blue'
    df_nodes.loc[df_nodes['education'] == 1, 'color'] = 'green'
    df_nodes.loc[df_nodes['label'].isin({'tram', 'subway', 'bus'}), 'color'] = 'black'

    # The graph stores x in the 'lat' attribute and y in 'lon' (see the
    # add_node calls), hence the swapped column mapping.
    fig = px.scatter_map(df_nodes, lat="lon", lon="lat", hover_name="education", hover_data=["label", "square"],
                         color=df_nodes['color'],
                         zoom=8,
                         height=1000)
    fig.update_layout(
        map_style="open-street-map",
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        title='Interactive Graph with NetworkX and Plotly',
        hovermode='closest',
    )
    fig.show()

def calculate_distances(geometry: gpd.GeoSeries, point: tuple[float, float]) -> pd.Series:
    """Distance from every geometry to ``point`` after reprojecting both to EPSG:3857.

    ``point`` is wrapped in a one-element EPSG:4326 GeoSeries before the
    reprojection.
    NOTE(review): callers compare the result with ``buffer_size`` as if it
    were metres; EPSG:3857 lengths are presumably scale-distorted away from
    the equator - confirm this is acceptable for the study area.
    """
    target = gpd.GeoSeries([Point(point)], crs='epsg:4326').to_crs(epsg=3857)[0]
    projected = geometry.to_crs(epsg=3857)
    return projected.distance(target)

def add_stops(
        g_: nx.Graph,
        positions: dict[int, tuple[float, float]],
        stops: list[tuple[int, float, float]],
        label: str
):
    """Attach transit stops to the buildings that lie within ``buffer_size``.

    A stop becomes a node only when at least one building is closer than
    ``buffer_size``; it is then linked to every such building with an edge
    whose ``meters`` attribute is 0.  Stop nodes get zeroed service
    attributes and the given ``label``.

    Raises:
        ValueError: the stop id is already a node of ``g_``.
    """
    building_ids = list(positions.keys())
    building_coords = list(positions.values())
    gdf_pos = gpd.GeoDataFrame(
        list(zip(building_ids, building_coords)),
        columns=['id', 'geometry'],
        crs='epsg:4326',
        geometry=[Point(xy) for xy in building_coords],
    )
    for stop_id, lat, lon in stops:
        within_buffer = calculate_distances(gdf_pos.geometry, (lat, lon)) < buffer_size
        if not within_buffer.any():
            continue
        if g_.has_node(stop_id):
            raise ValueError('The node already exists')
        g_.add_node(stop_id, coordinate=(lat, lon),
                    label=label, education=0, kindergarten=0, school=0, bus_buf=0, tram_stop=0, subway_buf=0,
                    lat=lat, lon=lon)
        for building_id in gdf_pos.loc[within_buffer, 'id']:
            g_.add_edge(building_id, stop_id, meters=0)

def build_data():
    """Build the stage-1 building/stop graph, or load it from the pickle cache.

    Buildings come from ``filepath`` (one CSV row per building); rows whose
    area in ``df_squares`` is below 200 are skipped.  Every remaining building
    is linked to all other buildings closer than ``buffer_size`` or, when
    there are none, to rows ``[1:n_closest]`` of the distance ranking.  Bus,
    tram and subway stops are then attached through ``add_stops`` and the
    graph is pickled to the cache path.

    Returns:
        The resulting ``networkx.Graph``.

    Raises:
        KeyError: a building id is missing from ``df_squares``.
        ValueError: a building ends up without any neighbour (``add_stops``
            raises it as well for a duplicate node id).
    """
    cache_file_path = Path('/content') / filename.replace('.csv', '.pkl')
    if cache_file_path.exists():
        # NOTE: unpickling executes arbitrary code - only load cache files
        # that were written by this notebook.
        with open(cache_file_path, 'rb') as f:
            return pickle.load(f)
    df = pd.read_csv(filepath)
    data_json = {'features': df.to_dict(orient='records')}
    stops_bus = extract_stops(filepath_bus)
    stops_tram = extract_stops(filepath_tram)
    stops_sub = extract_stops(filepath_subway)
    graph = nx.Graph()
    squares_map = df_squares.set_index('building_id')['building_area'].to_dict()
    n_count = len(data_json['features'])
    for i, building in enumerate(data_json['features']):
        if i % 100 == 0:
            print(f'Calc {i} / {n_count}')
        props = building['properties'] if 'properties' in building else building
        building_id = props['building_id']
        square = float(squares_map[building_id])
        education = int(props['education'] > 0)
        if square < 200:
            print(f'Skip building "{building_id}" with square {square} and education={education}')
            continue
        x_source, y_source = building['x'], building['y']
        graph.add_node(building_id, coordinate=(x_source, y_source),
                       square=square,
                       # Derived from the same 0/1 flag as ``education`` so the two
                       # attributes cannot disagree (e.g. for NaN or negative input).
                       label='Y' if education else 'N',
                       kindergarten=props['kindergarten'],
                       school=props['school'],
                       bus_buf=props['bus_buf'],
                       tram_stop=props['tram_stop'],
                       subway_buf=props['subway_buf'],
                       education=education,
                       lat=x_source,
                       lon=y_source
                       )
    print('The nodes was build')
    pos_: dict[int, tuple[float, float]] = nx.get_node_attributes(graph, 'coordinate')
    geometry = [Point(xy) for xy in pos_.values()]
    gdf_pos = gpd.GeoDataFrame(list(zip(pos_.keys(), pos_.values())), columns=['id', 'geometry'], crs='epsg:4326', geometry=geometry)
    # Reproject once: the building geometries do not change inside the loop,
    # so transforming all of them again for every building only repeated
    # identical work.
    projected = gdf_pos.geometry.to_crs(epsg=3857)
    n_count = len(pos_)
    for i, building_id in enumerate(pos_):
        if i % 100 == 0:
            print(f'Calc {i} / {n_count}')
        # Row i of gdf_pos was built from the i-th entry of pos_.
        gdf_pos['distances'] = projected.distance(projected.iloc[i])
        indices = (gdf_pos['distances'] > 0) & (gdf_pos['distances'] < buffer_size)
        if indices.any():
            gdf_close = gdf_pos[gdf_pos['distances'] < buffer_size]
        else:
            # NOTE(review): this slice yields n_closest - 1 rows (row 0 is
            # presumably the building itself) - confirm that is intended.
            gdf_close = gdf_pos.sort_values('distances')[1:n_closest]
        was_added_any = False
        for next_building_id, meters in zip(gdf_close['id'], gdf_close['distances']):
            if next_building_id == building_id:
                continue
            graph.add_edge(building_id, next_building_id, meters=meters)
            was_added_any = True
        if not was_added_any:
            raise ValueError(f'No neighbouring building found for building "{building_id}"')
    print('The graph was build')
    add_stops(graph, pos_, stops_bus, 'bus')
    print('The bus were added')
    add_stops(graph, pos_, stops_tram, 'tram')
    print('The tram were added')
    add_stops(graph, pos_, stops_sub, 'subway')
    print('The subway were added')
    with open(cache_file_path, 'wb') as f:
        pickle.dump(graph, f)
    return graph

# Build (or load the cached) stage-1 graph and display it when run directly.
if __name__ == '__main__':
    graph_data = build_data()
    draw_plotly(graph_data)


binary classification of graph using graph neural network GCN

In [None]:
filename = 'data_high1.pkl'


def read_data() -> nx.Graph:
    """Unpickle and return the graph stored in ``/content/<filename>``."""
    with open(Path('/content') / filename, 'rb') as stream:
        return pickle.load(stream)

G: nx.Graph = read_data()
N_FEATURES = 9

# Numeric code per transit label; any other label maps to 0.
_STOP_CODES = {'bus': 1, 'tram': 2, 'subway': 3}

features = []
labels = []
node_mapping = {}
for i, (node_id, node_data) in enumerate(G.nodes(data=True)):
    # Graph node ids are arbitrary; tensors need consecutive row indices.
    node_mapping[node_id] = i
    # NOTE(review): is_stop is computed but not part of the 9-value feature
    # row below - confirm whether it was meant to be a feature.
    is_stop = _STOP_CODES.get(node_data['label'], 0)
    row = [node_data.get(key, 1) for key in ('square', 'building_area', 'living_area', 'population_balanced')]
    row += [node_data[key] for key in ('school', 'kindergarten', 'bus_buf', 'tram_stop', 'subway_buf')]
    features.append(row)
    labels.append(int(node_data['education'] > 0))

features = torch.tensor(features, dtype=torch.float)
num_nodes = features.size(0)
labels = torch.tensor(labels, dtype=torch.long)
# 70/30 split of node indices, stratified on the label.
train_indices, test_indices = train_test_split(
    np.arange(num_nodes),
    test_size=0.3,
    stratify=labels.numpy(),
    random_state=21
)
edges = [(node_mapping[edge[0]], node_mapping[edge[1]]) for edge in G.edges]
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
# train_mask / test_mask hold index arrays (not boolean masks).
data = Data(x=features, edge_index=edge_index, y=labels,
            train_mask=train_indices, test_mask=test_indices)
assert edge_index.max() < num_nodes, "Edge index is out of bounds!"

class GCN(torch.nn.Module):
    """Seven stacked GCNConv layers producing 2 outputs per node.

    Layers 1-6 are GCNConv -> BatchNorm1d -> ReLU with output widths
    32, 64, 128, 256, 512 and 1024; layer 7 is a bare GCNConv down to 2.
    Submodules are registered as conv1..conv7 / bn1..bn6, in that order.
    """

    _HIDDEN_WIDTHS = (32, 64, 128, 256, 512, 1024)

    def __init__(self):
        super().__init__()
        width_in = N_FEATURES
        for idx, width_out in enumerate(self._HIDDEN_WIDTHS, start=1):
            setattr(self, f'conv{idx}', GCNConv(width_in, width_out))
            setattr(self, f'bn{idx}', torch.nn.BatchNorm1d(width_out))
            width_in = width_out
        self.conv7 = GCNConv(width_in, 2)

    def forward(self, x, edge_index):
        """Run the node features ``x`` through all layers over ``edge_index``."""
        for idx in range(1, len(self._HIDDEN_WIDTHS) + 1):
            x = getattr(self, f'conv{idx}')(x, edge_index)
            x = getattr(self, f'bn{idx}')(x)
            x = F.relu(x)
        return self.conv7(x, edge_index)

# Use the GPU when one is available; model, data and loss weights must all
# live on the same device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Class 1 is weighted 10x.  The weights are created on ``device`` because the
# loss is evaluated on model outputs that live there; a CPU weight tensor
# fails as soon as CUDA is used.
class_weights = torch.tensor([1., 10.], device=device)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

def calc_scores(out_, target_) -> tuple[float, float, float]:
    """Return ``(macro F1, overall accuracy, accuracy on class-1 targets)``.

    Args:
        out_: per-node class scores; the predicted class is the argmax over
            dimension 1.
        target_: integer class labels aligned with the rows of ``out_``.

    The class-1 accuracy is ``nan`` when ``target_`` contains no class-1
    entry (previously a ZeroDivisionError).
    """
    out_max = out_.max(1)[1]
    hits = target_.eq(out_max)
    acc = hits.sum().item() / target_.size()[0]
    positives = target_.eq(1)
    n_positive = positives.sum().item()
    acc1 = hits[positives].sum().item() / n_positive if n_positive else float('nan')
    # sklearn needs host arrays: move to CPU first so CUDA tensors work too.
    f1 = float(f1_score(target_.detach().cpu().numpy(), out_max.detach().cpu().numpy(), average='macro'))
    return f1, acc, acc1

# Full-batch training: one optimizer step per epoch on the training nodes,
# with metrics printed every 10 epochs.
model.train()
for epoch in range(1000):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        model.eval()
        # Metrics only: skip autograd bookkeeping for this forward pass.
        with torch.no_grad():
            out = model(data.x, data.edge_index)
        model.train()
        f1_train, acc_train, acc1_train = calc_scores(out[data.train_mask], data.y[data.train_mask])
        f1_test, acc_test, acc1_test = calc_scores(out[data.test_mask], data.y[data.test_mask])
        # ``loss`` is the training loss computed before this epoch's step.
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}, '
              f'f1/train: {f1_train:.2f}, acc/avg/train: {acc_train:.2f}, acc/1/train: {acc1_train:.2f}, '
              f'f1/test: {f1_test:.2f}, acc/avg/test: {acc_test:.2f}, acc/1/test: {acc1_test:.2f}')

def save_to_qgis(data_, pred_):
    """Write node attributes plus predictions to train/test CSV files.

    Args:
        data_: object whose ``train_mask`` / ``test_mask`` hold the row
            positions of the training and test nodes.
        pred_: predicted class per node, indexed in ``G.nodes`` order.

    The rows are built from the module-level graph ``G`` and written to
    ``/content/train2.csv`` and ``/content/test2.csv``.
    """
    columns = ['building_id', 'label', 'square', 'kindergarten', 'school', 'bus_buf', 'subway_buf', 'tram_stop',
               'label', 'education', 'lat', 'lon', 'predict']
    # Values are listed in exactly the column order above.  The previous
    # version emitted tram_stop before subway_buf, so those two columns were
    # written under each other's name.
    data_out = [
        [
            node_id,
            node_data['label'],
            node_data.get('square', 1),
            node_data['kindergarten'],
            node_data['school'],
            node_data['bus_buf'],
            node_data['subway_buf'],
            node_data['tram_stop'],
            node_data['label'],
            node_data['education'],
            node_data['coordinate'][1],
            node_data['coordinate'][0],
            pred_[i],
        ]
        for i, (node_id, node_data) in enumerate(G.nodes(data=True))
    ]
    df_out = pd.DataFrame(data_out, columns=columns)
    print('saving...')
    df_out.iloc[data_.train_mask].to_csv('/content/train2.csv', index=False)
    df_out.iloc[data_.test_mask].to_csv('/content/test2.csv', index=False)

# Persist the trained weights, then score the test nodes and export the
# per-node predictions.
model_filename = 'model.pth'
torch.save(model.state_dict(), model_filename)

model.eval()
with torch.no_grad():
    pred = model(data.x, data.edge_index)
    f1, acc_test, acc1_test = calc_scores(pred[data.test_mask], data.y[data.test_mask])
    # .cpu() first: Tensor.numpy() is not available for CUDA tensors.
    save_to_qgis(data, pred.max(1)[1].cpu().numpy())
    print(f'TEST: f1: {f1:.2f}, acc/avg/test: {acc_test:.2f}, acc/1/test: {acc1_test:.2f}')

application to new data

1 step

In [None]:
# Neighbourhood radius; compared against distances computed in EPSG:3857 units.
buffer_size = 300
# Upper bound of the nearest-neighbour slice used when no building lies inside the buffer.
n_closest = 3

# Input files of the graph built for the new (unlabelled) data set.
path_root_city = Path('/content')
filename = 'data_high2.csv'
filepath = path_root_city / filename
filepath_subway = path_root_city / 'метро.geojson'
filepath_bus = path_root_city / 'остановки автобусов.geojson'
filepath_tram = path_root_city / 'остановки трамвай.geojson'

# Building areas; building_id is cast to int so it can serve as a lookup key in build_data.
df_squares = gpd.read_file(path_root_city / 'squares2.csv')
df_squares['building_id'] = df_squares['building_id'].astype(int)

def read_geojson(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON file at ``path`` and return the decoded object.

    The return annotation uses the builtin ``dict``: ``typing.Dict`` is never
    imported in this file, so the previous annotation raised NameError when
    the function was defined.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def extract_stops(path: Path) -> list[tuple[int, float, float]]:
    """Return one ``(osm_id, coordinates[0], coordinates[1])`` tuple per feature.

    ``osm_id`` is read from the feature properties and cast to ``int``; the
    two coordinate values are taken from the feature geometry in stored order.
    Builtin generics replace ``List``/``Tuple``, which are never imported in
    this file and raised NameError when the function was defined.
    """
    data = read_geojson(path)
    return [
        (
            int(feature['properties']['osm_id']),
            feature['geometry']['coordinates'][0],
            feature['geometry']['coordinates'][1],
        )
        for feature in data['features']
    ]

def draw_plotly(g_: nx.Graph) -> None:
    """Show the nodes of ``g_`` on an interactive OpenStreetMap scatter plot.

    Transit stops (label 'subway', 'bus' or 'tram') are black, nodes with
    ``kindergarten > 0`` are green and all other nodes are red.

    Args:
        g_: graph whose nodes carry the attributes written by ``build_data``
            and ``add_stops`` (``coordinate``, ``label``, ``lat``, ``lon``, ...).
    """
    # plotly.express is not imported at module level in this file, so ``px``
    # was an undefined name here; import it where it is used.
    import plotly.express as px

    transit_labels = {'subway', 'bus', 'tram'}
    records = []
    node_colors = []
    # The edge coordinate/text lists and per-node hover strings built by the
    # previous version were never passed to the figure and are dropped.
    for _, attrs in g_.nodes(data=True):
        if attrs['label'] in transit_labels:
            node_colors.append('black')
        elif attrs['kindergarten'] > 0:
            node_colors.append('green')
        else:
            node_colors.append('red')
        records.append(dict(attrs))

    df_nodes = pd.DataFrame.from_records(records).drop(columns='coordinate')
    df_nodes['color'] = node_colors

    # The graph stores x in the 'lat' attribute and y in 'lon' (see the
    # add_node calls), hence the swapped column mapping.
    fig = px.scatter_map(df_nodes, lat="lon", lon="lat", hover_name="kindergarten", hover_data=["label", "square"],
                         color=df_nodes['color'], zoom=8, height=1000)
    fig.update_layout(map_style="open-street-map", margin={"r": 0, "t": 0, "l": 0, "b": 0},
                      title='Interactive Graph with NetworkX and Plotly', hovermode='closest')
    fig.show()

def calculate_distances(geometry: gpd.GeoSeries, point: tuple[float, float]) -> pd.Series:
    """Distance from every geometry to ``point`` after reprojecting both to EPSG:3857.

    ``point`` is wrapped in a one-element EPSG:4326 GeoSeries before the
    reprojection.  The annotation uses the builtin ``tuple``: ``typing.Tuple``
    is never imported in this file and raised NameError at definition time.
    NOTE(review): callers compare the result with ``buffer_size`` as if it
    were metres; EPSG:3857 lengths are presumably scale-distorted away from
    the equator - confirm this is acceptable for the study area.
    """
    dst = gpd.GeoSeries([Point(point)], crs='epsg:4326').to_crs(epsg=3857)
    return geometry.to_crs(epsg=3857).distance(dst[0])

def add_stops(g_: nx.Graph, positions: dict[int, tuple[float, float]], stops: list[tuple[int, float, float]], label: str) -> None:
    """Attach transit stops to the buildings that lie within ``buffer_size``.

    A stop becomes a node only when at least one building is closer than
    ``buffer_size``; it is then linked to every such building with an edge
    whose ``meters`` attribute is 0.

    Builtin generics replace ``Dict``/``List``/``Tuple``, which are never
    imported in this file and raised NameError at definition time.

    Raises:
        ValueError: the stop id is already a node of ``g_``.
    """
    geometry = [Point(xy) for xy in positions.values()]
    gdf_pos = gpd.GeoDataFrame(list(zip(positions.keys(), positions.values())), columns=['id', 'geometry'], crs='epsg:4326', geometry=geometry)

    for stop_id, lat, lon in stops:
        distances = calculate_distances(gdf_pos.geometry, (lat, lon))
        within_buffer = distances < buffer_size
        if not within_buffer.any():
            continue
        if g_.has_node(stop_id):
            raise ValueError('The node already exists')
        # subway_buf=0 gives stop nodes the same attribute set as building
        # nodes; consumers previously had to fall back to a default of 0.
        g_.add_node(stop_id, coordinate=(lat, lon), label=label, kindergarten=0, school=0, bus_buf=0, tram_stop=0,
                    subway_buf=0, lat=lat, lon=lon)
        for building_id in gdf_pos[within_buffer]['id']:
            g_.add_edge(building_id, stop_id, meters=0)

def build_data() -> nx.Graph:
    """Build the building/stop graph for the new data set, or load it from cache.

    Buildings come from ``filepath`` (one CSV row per building); rows whose
    area in ``df_squares`` is below 200 are skipped.  Every remaining building
    is linked to all other buildings at a distance in ``(0, buffer_size)`` or,
    when there are none, to rows ``[1:n_closest]`` of the distance ranking.
    Bus, tram and subway stops are attached through ``add_stops`` and the
    graph is pickled to the cache path.

    Raises:
        KeyError: a building id is missing from ``df_squares``.
    """
    cache_file_path = Path('/content') / filename.replace('.csv', '.pkl')
    if cache_file_path.exists():
        # NOTE: unpickling executes arbitrary code - only load cache files
        # that were written by this notebook.
        with open(cache_file_path, 'rb') as f:
            return pickle.load(f)

    df = pd.read_csv(filepath)
    data_json = {'features': df.to_dict(orient='records')}
    stops_bus = extract_stops(filepath_bus)
    stops_tram = extract_stops(filepath_tram)
    stops_sub = extract_stops(filepath_subway)
    graph = nx.Graph()
    squares_map = df_squares.set_index('building_id')['building_area'].to_dict()

    for building in data_json['features']:
        props = building['properties'] if 'properties' in building else building
        building_id = props['building_id']
        square = float(squares_map[building_id])
        if square < 200:
            continue
        x_source, y_source = building['x'], building['y']
        graph.add_node(building_id, coordinate=(x_source, y_source), square=square,
                       label='Y' if props['kindergarten'] else 'N', kindergarten=props['kindergarten'],
                       school=props['school'], bus_buf=props['bus_buf'], tram_stop=props['tram_stop'],
                       subway_buf=props['subway_buf'], lat=x_source, lon=y_source)

    pos_ = nx.get_node_attributes(graph, 'coordinate')
    geometry = [Point(xy) for xy in pos_.values()]
    gdf_pos = gpd.GeoDataFrame(list(zip(pos_.keys(), pos_.values())), columns=['id', 'geometry'], crs='epsg:4326', geometry=geometry)

    # Reproject once: the building geometries do not change inside the loop,
    # so transforming all of them again for every building only repeated
    # identical work.
    projected = gdf_pos.geometry.to_crs(epsg=3857)
    for i, building_id in enumerate(pos_):
        # Row i of gdf_pos was built from the i-th entry of pos_.
        gdf_pos['distances'] = projected.distance(projected.iloc[i])
        indices = (gdf_pos['distances'] > 0) & (gdf_pos['distances'] < buffer_size)
        if indices.any():
            gdf_close = gdf_pos[indices]
        else:
            # NOTE(review): this slice yields n_closest - 1 rows (row 0 is
            # presumably the building itself) - confirm that is intended.
            gdf_close = gdf_pos.sort_values('distances')[1:n_closest]

        for next_building_id, meters in zip(gdf_close['id'], gdf_close['distances']):
            if next_building_id != building_id:
                graph.add_edge(building_id, next_building_id, meters=meters)

    add_stops(graph, pos_, stops_bus, 'bus')
    add_stops(graph, pos_, stops_tram, 'tram')
    add_stops(graph, pos_, stops_sub, 'subway')

    with open(cache_file_path, 'wb') as f:
        pickle.dump(graph, f)
    return graph

# Build (or load the cached) graph of the new data set and display it when run directly.
if __name__ == '__main__':
    graph_data = build_data()
    draw_plotly(graph_data)

2 step

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import pandas as pd
import networkx as nx
from pathlib import Path
import pickle

class GCN(torch.nn.Module):
    """Seven stacked GCNConv layers producing 2 outputs per node.

    Layers 1-6 are GCNConv -> BatchNorm1d -> ReLU with output widths
    32, 64, 128, 256, 512 and 1024; layer 7 is a bare GCNConv down to 2.
    Submodules are registered as conv1..conv7 / bn1..bn6, in that order.

    Args:
        n_features: number of input features per node.
    """

    _HIDDEN_WIDTHS = (32, 64, 128, 256, 512, 1024)

    def __init__(self, n_features):
        super().__init__()
        width_in = n_features
        for idx, width_out in enumerate(self._HIDDEN_WIDTHS, start=1):
            setattr(self, f'conv{idx}', GCNConv(width_in, width_out))
            setattr(self, f'bn{idx}', torch.nn.BatchNorm1d(width_out))
            width_in = width_out
        self.conv7 = GCNConv(width_in, 2)

    def forward(self, x, edge_index):
        """Run the node features ``x`` through all layers over ``edge_index``."""
        for idx in range(1, len(self._HIDDEN_WIDTHS) + 1):
            x = getattr(self, f'conv{idx}')(x, edge_index)
            x = getattr(self, f'bn{idx}')(x)
            x = F.relu(x)
        return self.conv7(x, edge_index)

def read_data(filename):
    """Unpickle and return the object stored at ``Path('/content') / filename``.

    Because the path is joined with pathlib, an absolute ``filename``
    replaces the ``/content`` prefix.
    """
    source = Path('/content') / filename
    with open(source, 'rb') as stream:
        return pickle.load(stream)

def prepare_data(G):
    """Convert graph ``G`` into a torch_geometric ``Data`` object.

    Returns:
        ``(data, node_mapping)`` where ``data`` holds a 9-column float
        feature matrix and the edge index, and ``node_mapping`` maps each
        graph node id to its row in the feature matrix.
    """
    optional_keys = ('square', 'building_area', 'living_area', 'population_balanced')
    required_keys = ('school', 'kindergarten', 'bus_buf', 'tram_stop')

    node_mapping = {}
    rows = []
    for position, (node_id, node_data) in enumerate(G.nodes(data=True)):
        node_mapping[node_id] = position
        row = [node_data.get(key, 1) for key in optional_keys]
        row.extend(node_data[key] for key in required_keys)
        # subway_buf may be absent on some nodes, hence the default of 0.
        row.append(node_data.get('subway_buf', 0))
        rows.append(row)

    features = torch.tensor(rows, dtype=torch.float)
    edges = []
    for edge in G.edges:
        edges.append((node_mapping[edge[0]], node_mapping[edge[1]]))
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    return Data(x=features, edge_index=edge_index), node_mapping

def predict_and_save(model_path, graph_path, output_path):
    """Apply a trained GCN to a pickled graph and write per-node predictions to CSV.

    Args:
        model_path: path of the ``state_dict`` checkpoint to load.
        graph_path: graph file name, resolved by ``read_data``.
        output_path: destination CSV path.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    G = read_data(graph_path)
    data, node_mapping = prepare_data(G)

    model = GCN(data.num_features).to(device)
    # map_location lets a checkpoint that was saved from a GPU be restored on
    # a CPU-only host (and vice versa); without it torch.load tries to put
    # the tensors back on the device they were saved from.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    with torch.no_grad():
        pred = model(data.x.to(device), data.edge_index.to(device))
        pred_labels = pred.max(1)[1].cpu().numpy()

    results = []
    for node_id, node_data in G.nodes(data=True):
        results.append([
            node_id,
            node_data['label'],
            node_data.get('square', 1),
            node_data['kindergarten'],
            node_data['school'],
            node_data['bus_buf'],
            node_data.get('subway_buf', 0),
            node_data['tram_stop'],
            node_data.get('education', 0),
            # coordinate is stored as (x, y); the CSV lists y before x.
            node_data['coordinate'][1],
            node_data['coordinate'][0],
            pred_labels[node_mapping[node_id]]
        ])

    df = pd.DataFrame(results, columns=[
        'building_id', 'label', 'square', 'kindergarten', 'school',
        'bus_buf', 'subway_buf', 'tram_stop', 'education', 'y', 'x', 'predict'
    ])
    df.to_csv(output_path, index=False)

# Score the graph of the new data set with the saved model and export the
# predictions when run directly.
if __name__ == '__main__':
    predict_and_save(
        model_path='model.pth',
        graph_path='data_high2.pkl',
        output_path='/content/predictions_education.csv'
    )

# Stage 2 - Food services prediction

code to create graph

In [None]:
# Neighbourhood radius; compared against distances computed in EPSG:3857 units.
buffer_size = 300
# Upper bound of the nearest-neighbour slice used when no building lies inside the buffer.
n_closest = 3
# Input files of the stage-2 (food services) graph.
path_root_city = Path('/content')
filename = 'data_high3.csv'
filepath = path_root_city / filename
filepath_subway = path_root_city / 'метро.geojson'
filepath_bus = path_root_city / 'остановки автобусов.geojson'
filepath_tram = path_root_city / 'остановки трамвай.geojson'
# Building areas; building_id is cast to int so it can serve as a lookup key in build_data.
df_squares = gpd.read_file(path_root_city / 'squares1.csv')
df_squares['building_id'] = df_squares['building_id'].astype(int)

def read_geojson(path: Path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, 'r', encoding='utf-8') as source:
        parsed = json.load(source)
        return parsed

def extract_stops(path: Path) -> list[tuple[int, float, float]]:
    """Return one ``(osm_id, coordinates[0], coordinates[1])`` tuple per feature.

    ``osm_id`` is read from the feature properties and cast to ``int``; the
    two coordinate values are taken from the feature geometry in stored order.
    """
    return [
        (
            int(feature['properties']['osm_id']),
            feature['geometry']['coordinates'][0],
            feature['geometry']['coordinates'][1],
        )
        for feature in read_geojson(path)['features']
    ]

def draw_plotly(g_):
    """Show the nodes of ``g_`` on an interactive OpenStreetMap scatter plot.

    Colour rules, applied in this order so that later ones win: gray by
    default; blue when the sum of kindergarten, bus_buf, school, tram_stop,
    subway_buf, education and edu_buf is positive; green when ``food == 1``;
    black for transit stops (label 'tram', 'subway' or 'bus').

    Args:
        g_: graph whose nodes carry the attributes written by ``build_data``
            and ``add_stops`` (``coordinate``, ``label``, ``lat``, ``lon``, ...).
    """
    # plotly.express is not imported at module level in this file, so ``px``
    # was an undefined name here; import it where it is used.
    import plotly.express as px

    # Earlier revisions also filled edge/node trace lists here (with a
    # malformed "Meters: ..." f-string that produced two hover entries per
    # three edge coordinates); nothing ever consumed them, so only the
    # DataFrame-based scatter map is kept.
    node_records = [dict(attrs) for _, attrs in g_.nodes(data=True)]
    df_nodes = pd.DataFrame.from_records(node_records).drop(columns='coordinate')

    service_columns = ['kindergarten', 'bus_buf', 'school', 'tram_stop', 'subway_buf', 'education', 'edu_buf']
    df_nodes['s_cnt'] = df_nodes[service_columns].sum(axis=1)
    df_nodes['color'] = 'gray'
    df_nodes.loc[df_nodes['s_cnt'] > 0, 'color'] = 'blue'
    df_nodes.loc[df_nodes['food'] == 1, 'color'] = 'green'
    df_nodes.loc[df_nodes['label'].isin({'tram', 'subway', 'bus'}), 'color'] = 'black'

    # The graph stores x in the 'lat' attribute and y in 'lon' (see the
    # add_node calls), hence the swapped column mapping.
    fig = px.scatter_map(df_nodes, lat="lon", lon="lat", hover_name="food", hover_data=["label", "square"],
                         color=df_nodes['color'],
                         zoom=8,
                         height=1000)
    fig.update_layout(
        map_style="open-street-map",
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        title='Interactive Graph with NetworkX and Plotly',
        hovermode='closest',
    )
    fig.show()

def calculate_distances(geometry: gpd.GeoSeries, point: tuple[float, float]) -> pd.Series:
    """Distance from every geometry to ``point`` after reprojecting both to EPSG:3857.

    ``point`` is wrapped in a one-element EPSG:4326 GeoSeries before the
    reprojection.
    NOTE(review): callers compare the result with ``buffer_size`` as if it
    were metres; EPSG:3857 lengths are presumably scale-distorted away from
    the equator - confirm this is acceptable for the study area.
    """
    reference = gpd.GeoSeries([Point(point)], crs='epsg:4326').to_crs(epsg=3857)[0]
    in_metric_crs = geometry.to_crs(epsg=3857)
    return in_metric_crs.distance(reference)

def add_stops(
        g_: nx.Graph,
        positions: dict[int, tuple[float, float]],
        stops: list[tuple[int, float, float]],
        label: str
):
    """Attach transit stops to the buildings that lie within ``buffer_size``.

    A stop becomes a node only when at least one building is closer than
    ``buffer_size``; it is then linked to every such building with an edge
    whose ``meters`` attribute is 0.  Stop nodes get zeroed service
    attributes (including ``edu_buf`` and ``food``) and the given ``label``.

    Raises:
        ValueError: the stop id is already a node of ``g_``.
    """
    building_ids = list(positions.keys())
    building_coords = list(positions.values())
    gdf_pos = gpd.GeoDataFrame(
        list(zip(building_ids, building_coords)),
        columns=['id', 'geometry'],
        crs='epsg:4326',
        geometry=[Point(xy) for xy in building_coords],
    )
    for stop_id, lat, lon in stops:
        within_buffer = calculate_distances(gdf_pos.geometry, (lat, lon)) < buffer_size
        if not within_buffer.any():
            continue
        if g_.has_node(stop_id):
            raise ValueError('The node already exists')
        g_.add_node(stop_id, coordinate=(lat, lon),
                    label=label, education=0, kindergarten=0, school=0, bus_buf=0, tram_stop=0, subway_buf=0,
                    edu_buf=0, food=0,
                    lat=lat, lon=lon)
        for building_id in gdf_pos.loc[within_buffer, 'id']:
            g_.add_edge(building_id, stop_id, meters=0)

def build_data():
    """Build the stage-2 building/stop graph, or load it from the pickle cache.

    Buildings come from ``filepath`` (one CSV row per building); rows whose
    area in ``df_squares`` is below 200 are skipped.  Every remaining building
    is linked to all other buildings closer than ``buffer_size`` or, when
    there are none, to rows ``[1:n_closest]`` of the distance ranking.  Bus,
    tram and subway stops are then attached through ``add_stops`` and the
    graph is pickled to the cache path.

    Returns:
        The resulting ``networkx.Graph``.

    Raises:
        KeyError: a building id is missing from ``df_squares``.
        ValueError: a building ends up without any neighbour (``add_stops``
            raises it as well for a duplicate node id).
    """
    cache_file_path = Path('/content') / filename.replace('.csv', '.pkl')
    if cache_file_path.exists():
        # NOTE: unpickling executes arbitrary code - only load cache files
        # that were written by this notebook.
        with open(cache_file_path, 'rb') as f:
            return pickle.load(f)
    df = pd.read_csv(filepath)
    data_json = {'features': df.to_dict(orient='records')}
    stops_bus = extract_stops(filepath_bus)
    stops_tram = extract_stops(filepath_tram)
    stops_sub = extract_stops(filepath_subway)
    graph = nx.Graph()
    squares_map = df_squares.set_index('building_id')['building_area'].to_dict()
    n_count = len(data_json['features'])
    for i, building in enumerate(data_json['features']):
        if i % 100 == 0:
            print(f'Calc {i} / {n_count}')
        props = building['properties'] if 'properties' in building else building
        building_id = props['building_id']
        square = float(squares_map[building_id])
        food = int(props['food'] > 0)
        if square < 200:
            print(f'Skip building "{building_id}" with square {square} and food={food}')
            continue
        x_source, y_source = building['x'], building['y']
        graph.add_node(building_id, coordinate=(x_source, y_source),
                       square=square,
                       # Derived from the same 0/1 flag as ``food`` so the two
                       # attributes cannot disagree (e.g. for NaN or negative input).
                       label='Y' if food else 'N',
                       kindergarten=props['kindergarten'],
                       school=props['school'],
                       bus_buf=props['bus_buf'],
                       tram_stop=props['tram_stop'],
                       subway_buf=props['subway_buf'],
                       education=props['education'],
                       edu_buf=props['edu_buf'],
                       food=food,
                       lat=x_source,
                       lon=y_source)
    print('The nodes was build')
    pos_: dict[int, tuple[float, float]] = nx.get_node_attributes(graph, 'coordinate')
    geometry = [Point(xy) for xy in pos_.values()]
    gdf_pos = gpd.GeoDataFrame(list(zip(pos_.keys(), pos_.values())), columns=['id', 'geometry'], crs='epsg:4326', geometry=geometry)
    # Reproject once: the building geometries do not change inside the loop,
    # so transforming all of them again for every building only repeated
    # identical work.
    projected = gdf_pos.geometry.to_crs(epsg=3857)
    n_count = len(pos_)
    for i, building_id in enumerate(pos_):
        if i % 100 == 0:
            print(f'Calc {i} / {n_count}')
        # Row i of gdf_pos was built from the i-th entry of pos_.
        gdf_pos['distances'] = projected.distance(projected.iloc[i])
        indices = (gdf_pos['distances'] > 0) & (gdf_pos['distances'] < buffer_size)
        if indices.any():
            gdf_close = gdf_pos[gdf_pos['distances'] < buffer_size]
        else:
            # NOTE(review): this slice yields n_closest - 1 rows (row 0 is
            # presumably the building itself) - confirm that is intended.
            gdf_close = gdf_pos.sort_values('distances')[1:n_closest]
        was_added_any = False
        for next_building_id, meters in zip(gdf_close['id'], gdf_close['distances']):
            if next_building_id == building_id:
                continue
            graph.add_edge(building_id, next_building_id, meters=meters)
            was_added_any = True
        if not was_added_any:
            raise ValueError(f'No neighbouring building found for building "{building_id}"')
    print('The graph was build')
    add_stops(graph, pos_, stops_bus, 'bus')
    print('The bus were added')
    add_stops(graph, pos_, stops_tram, 'tram')
    print('The tram were added')
    add_stops(graph, pos_, stops_sub, 'subway')
    print('The subway were added')
    with open(cache_file_path, 'wb') as f:
        pickle.dump(graph, f)
    return graph

# Build (or load the cached) stage-2 graph and display it when run directly.
if __name__ == '__main__':
    graph_data = build_data()
    draw_plotly(graph_data)

binary classification of graph using graph neural network GCN

In [None]:
filename = 'data_high3.pkl'


def read_data() -> nx.Graph:
    """Unpickle and return the graph stored in ``/content/<filename>``."""
    with open(Path('/content') / filename, 'rb') as stream:
        return pickle.load(stream)

G: nx.Graph = read_data()
N_FEATURES = 12

# Numeric code per transit label; any other label maps to 0.
_STOP_CODES = {'bus': 1, 'tram': 2, 'subway': 3}

features = []
labels = []
node_mapping = {}
for i, (node_id, node_data) in enumerate(G.nodes(data=True)):
    # Graph node ids are arbitrary; tensors need consecutive row indices.
    node_mapping[node_id] = i
    is_stop = _STOP_CODES.get(node_data['label'], 0)
    row = [node_data.get(key, 1) for key in ('square', 'building_area', 'living_area', 'population_balanced')]
    row += [node_data[key] for key in ('school', 'kindergarten', 'bus_buf', 'tram_stop',
                                       'subway_buf', 'edu_buf', 'education')]
    row.append(is_stop)
    features.append(row)
    labels.append(int(node_data['food'] > 0))

features = torch.tensor(features, dtype=torch.float)
num_nodes = features.size(0)
labels = torch.tensor(labels, dtype=torch.long)
# 70/30 split of node indices, stratified on the label.
train_indices, test_indices = train_test_split(
    np.arange(num_nodes),
    test_size=0.3,
    stratify=labels.numpy(),
    random_state=21
)
edges = [(node_mapping[edge[0]], node_mapping[edge[1]]) for edge in G.edges]
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
# train_mask / test_mask hold index arrays (not boolean masks).
data = Data(x=features, edge_index=edge_index, y=labels,
            train_mask=train_indices, test_mask=test_indices)
assert edge_index.max() < num_nodes, "Edge index is out of bounds!"

class GCN(torch.nn.Module):
    """Seven stacked GCNConv layers producing two logits per node.

    The first six convolutions (N_FEATURES -> 32 -> 64 -> 128 -> 256 -> 512
    -> 1024) are each followed by BatchNorm1d and ReLU; the seventh maps the
    1024 channels to 2 output classes without normalisation or activation.
    """

    def __init__(self):
        super().__init__()
        widths = (N_FEATURES, 32, 64, 128, 256, 512, 1024)
        # Sub-modules are registered as conv1/bn1 ... conv6/bn6 so that the
        # state_dict keys stay 'convK.*' / 'bnK.*'.
        for depth, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f'conv{depth}', GCNConv(fan_in, fan_out))
            setattr(self, f'bn{depth}', torch.nn.BatchNorm1d(fan_out))
        self.conv7 = GCNConv(widths[-1], 2)

    def forward(self, x, edge_index):
        """Return the raw class scores for every node."""
        for depth in range(1, 7):
            x = getattr(self, f'conv{depth}')(x, edge_index)
            x = getattr(self, f'bn{depth}')(x)
            x = F.relu(x)
        return self.conv7(x, edge_index)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# Class 1 is weighted 10x in the loss. The weights must live on the same
# device as the logits, otherwise CrossEntropyLoss raises a device-mismatch
# error when training on CUDA.
class_weights = torch.tensor([1., 10.], device=device)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

def calc_scores(out_, target_) -> tuple[float, float, float]:
    """Compute macro F1, overall accuracy and accuracy on class 1.

    Args:
        out_: per-node class scores; the predicted class is the arg-max over
            dimension 1.
        target_: ground-truth class indices aligned with ``out_``.

    Returns:
        ``(f1, acc, acc1)`` where ``acc1`` is the share of class-1 targets
        predicted as class 1, or ``nan`` when the subset holds no class-1
        targets (previously a ZeroDivisionError).
    """
    out_max = out_.max(1)[1]
    acc = target_.eq(out_max).sum().item() / target_.size()[0]

    positives = target_.eq(1)
    n_positives = int(positives.sum().item())
    if n_positives:
        acc1 = out_max[positives].eq(1).sum().item() / n_positives
    else:
        acc1 = float('nan')

    # sklearn works on host arrays: move tensors off the GPU before .numpy(),
    # which raises for CUDA tensors.
    f1 = float(f1_score(target_.cpu().numpy(), out_max.cpu().numpy(), average='macro'))
    return f1, acc, acc1

# Full-batch training: every epoch runs the whole graph through the model and
# optimises the weighted loss on the training nodes only.
model.train()
for epoch in range(1000):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        # Periodic evaluation: eval mode switches BatchNorm to its running
        # statistics; no_grad avoids building an autograd graph that is
        # never back-propagated.
        model.eval()
        with torch.no_grad():
            out = model(data.x, data.edge_index)
        model.train()
        f1_train, acc_train, acc1_train = calc_scores(out[data.train_mask], data.y[data.train_mask])
        f1_test, acc_test, acc1_test = calc_scores(out[data.test_mask], data.y[data.test_mask])
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}, '
              f'f1/train: {f1_train:.2f}, acc/avg/train: {acc_train:.2f}, acc/1/train: {acc1_train:.2f}, '
              f'f1/test: {f1_test:.2f}, acc/avg/test: {acc_test:.2f}, acc/1/test: {acc1_test:.2f}')

def save_to_qgis(data_, pred_):
    """Export node attributes and predictions to train/test CSV files.

    Args:
        data_: object whose ``train_mask`` / ``test_mask`` hold the positional
            row indices of the train and test nodes.
        pred_: predicted class per node, indexed in the iteration order of
            ``G.nodes`` (the module-level graph).

    Writes ``/content/train3.csv`` and ``/content/test3.csv``.
    """
    data_out = []
    for i, (node_id, node_data) in enumerate(G.nodes(data=True)):
        # The value order must follow the column list below. Previously
        # tram_stop and subway_buf were emitted in the opposite order to the
        # header, so the two CSV columns were mislabeled.
        data_out.append([
            node_id,
            node_data['label'],
            node_data.get('square', 1),
            node_data['kindergarten'],
            node_data['school'],
            node_data['bus_buf'],
            node_data['subway_buf'],
            node_data['tram_stop'],
            node_data['label'],
            node_data['education'],
            node_data['edu_buf'],
            node_data['food'],
            node_data['coordinate'][1],
            node_data['coordinate'][0],
            pred_[i]
        ])
    # NOTE(review): 'label' appears twice; kept so the output schema does not
    # change for existing consumers.
    df_out = pd.DataFrame(data_out, columns=['building_id', 'label', 'square', 'kindergarten', 'school', 'bus_buf', 'subway_buf', 'tram_stop',
                                            'label', 'education', 'edu_buf', 'food', 'lat', 'lon', 'predict'])
    print('saving...')
    df_out.iloc[data_.train_mask].to_csv('/content/train3.csv', index=False)
    df_out.iloc[data_.test_mask].to_csv('/content/test3.csv', index=False)

# Persist the trained weights; the file is written relative to the current
# working directory.
model_filename = 'model2.pth'
torch.save(model.state_dict(), model_filename)

# Final evaluation on the held-out nodes and export of all predictions.
model.eval()
with torch.no_grad():
    pred = model(data.x, data.edge_index)
    f1, acc_test, acc1_test = calc_scores(pred[data.test_mask], data.y[data.test_mask])
    # .cpu() is required before .numpy() when the model ran on CUDA.
    save_to_qgis(data, pred.max(1)[1].cpu().numpy())
    print(f'TEST: f1: {f1:.2f}, acc/avg/test: {acc_test:.2f}, acc/1/test: {acc1_test:.2f}')

Applying the trained model to new data

Step 1: build the graph for the new data

In [None]:
import json
import pickle
from pathlib import Path
from typing import Tuple, List, Dict
import geopandas as gpd
import networkx as nx
import pandas as pd
import plotly.express as px
from shapely.geometry import Point

# Threshold compared against the EPSG:3857 distances returned by
# calculate_distances when linking buildings and stops.
# NOTE(review): nominally metres, but Web Mercator distances are stretched
# away from the equator - confirm the effective radius is the intended one.
buffer_size = 300
# Upper bound of the nearest-neighbour fallback slice in build_data.
n_closest = 3
# Folder that holds all input files.
path_root_city = Path('/content')
# Building table (CSV); build_data derives its pickle cache name from it.
filename = 'data_high4.csv'
filepath = path_root_city / filename
# GeoJSON layers with the subway, bus and tram stops.
filepath_subway = path_root_city / 'метро.geojson'
filepath_bus = path_root_city / 'остановки автобусов.geojson'
filepath_tram = path_root_city / 'остановки трамвай.geojson'
# Per-building area table; building_id is cast to int because build_data
# uses it as the lookup key for squares_map.
df_squares = gpd.read_file(path_root_city / 'squares2.csv')
df_squares['building_id'] = df_squares['building_id'].astype(int)

def read_geojson(path: Path) -> Dict:
    """Decode the UTF-8 JSON/GeoJSON document stored at *path*."""
    raw_text = Path(path).read_text(encoding='utf-8')
    return json.loads(raw_text)

def extract_stops(path: Path) -> List[Tuple[int, float, float]]:
    """Read a GeoJSON stop layer and return ``(osm_id, c0, c1)`` per feature.

    ``c0`` and ``c1`` are the first two entries of the feature's
    ``geometry.coordinates``; ``osm_id`` is converted to ``int``.
    """
    stops = []
    for feature in read_geojson(path)['features']:
        osm_id = int(feature['properties']['osm_id'])
        coordinates = feature['geometry']['coordinates']
        stops.append((osm_id, coordinates[0], coordinates[1]))
    return stops

def draw_plotly(g_: nx.Graph) -> None:
    """Show the graph's nodes as coloured points on an OpenStreetMap layer.

    Colours: ``black`` for transport stops (label subway/bus/tram), ``green``
    for nodes with ``kindergarten > 0``, ``red`` for all others. Edges are
    not drawn; the previous version assembled edge and hover-text lists that
    were never passed to the figure, so that dead work has been removed.
    """
    records, node_colors = [], []
    for _, attrs in g_.nodes(data=True):
        if attrs['label'] in {'subway', 'bus', 'tram'}:
            color = 'black'
        elif attrs['kindergarten'] > 0:
            color = 'green'
        else:
            color = 'red'
        node_colors.append(color)
        records.append(dict(attrs))

    # The coordinate tuple is redundant with the separate lat/lon attributes.
    df_nodes = pd.DataFrame.from_records(records).drop(columns='coordinate')
    df_nodes['color'] = node_colors

    # NOTE(review): the 'lat'/'lon' columns are passed crosswise; the nodes
    # are created with lat=x and lon=y, so presumably the attributes are
    # stored swapped - verify against the input data.
    fig = px.scatter_map(df_nodes, lat="lon", lon="lat", hover_name="kindergarten", hover_data=["label", "square"],
                         color=df_nodes['color'], zoom=8, height=1000)
    fig.update_layout(map_style="open-street-map", margin={"r": 0, "t": 0, "l": 0, "b": 0},
                      title='Interactive Graph with NetworkX and Plotly', hovermode='closest')
    fig.show()

def calculate_distances(geometry: gpd.GeoSeries, point: Tuple[float, float]) -> pd.Series:
    """Return the distance from every geometry to *point*, in EPSG:3857 units.

    *point* is interpreted as EPSG:4326 coordinates; both it and *geometry*
    are re-projected to EPSG:3857 before measuring.
    """
    projected = geometry.to_crs(epsg=3857)
    origin = gpd.GeoSeries([Point(point)], crs='epsg:4326').to_crs(epsg=3857)
    return projected.distance(origin[0])

def add_stops(g_: nx.Graph, positions: Dict[int, Tuple[float, float]], stops: List[Tuple[int, float, float]], label: str) -> None:
    """Add transport stops as nodes and link them to nearby buildings.

    A stop is added only if at least one building lies closer than
    ``buffer_size`` (distance measured in EPSG:3857); it is then connected to
    every such building with ``meters=0``.

    Args:
        g_: graph to extend in place.
        positions: building id -> coordinate tuple (EPSG:4326).
        stops: ``(stop_id, c0, c1)`` tuples as produced by extract_stops.
        label: value stored in the new nodes' ``label`` attribute.

    Raises:
        ValueError: if a stop id is already a node of ``g_``.
    """
    if not positions or not stops:
        return

    geometry = [Point(xy) for xy in positions.values()]
    gdf_pos = gpd.GeoDataFrame(list(zip(positions.keys(), positions.values())), columns=['id', 'geometry'], crs='epsg:4326', geometry=geometry)
    # Re-project the buildings once instead of once per stop.
    projected = gdf_pos.geometry.to_crs(epsg=3857)

    # The tuple fields keep the file's existing lat/lon naming.
    for stop_id, lat, lon in stops:
        stop_point = gpd.GeoSeries([Point((lat, lon))], crs='epsg:4326').to_crs(epsg=3857)[0]
        within_buffer = projected.distance(stop_point) < buffer_size
        if not within_buffer.any():
            continue
        if g_.has_node(stop_id):
            raise ValueError('Node already exists')
        g_.add_node(stop_id, coordinate=(lat, lon), label=label, kindergarten=0, school=0,
                   bus_buf=0, tram_stop=0, subway_buf=0, lat=lat, lon=lon, education=0, edu_buf=0)
        for building_id in gdf_pos[within_buffer]['id']:
            g_.add_edge(building_id, stop_id, meters=0)

def build_data() -> nx.Graph:
    """Build (or load from cache) the building/stop graph for ``filepath``.

    Steps:
      1. Return the pickled graph if ``/content/<filename>.pkl`` exists.
      2. Add one node per building whose area is at least 200.
      3. Connect each building to the others closer than ``buffer_size``
         (EPSG:3857 distance); if there are none, fall back to its nearest
         neighbours.
      4. Add bus, tram and subway stops via add_stops.
      5. Pickle the graph to the cache file and return it.

    Raises:
        KeyError: if a building id is missing from ``df_squares``.
    """
    cache_file_path = Path('/content') / filename.replace('.csv', '.pkl')
    if cache_file_path.exists():
        # Only trusted, self-produced caches should be unpickled here.
        with open(cache_file_path, 'rb') as f:
            return pickle.load(f)

    df = pd.read_csv(filepath)
    stops_bus = extract_stops(filepath_bus)
    stops_tram = extract_stops(filepath_tram)
    stops_sub = extract_stops(filepath_subway)
    graph = nx.Graph()
    squares_map = df_squares.set_index('building_id')['building_area'].to_dict()

    for building in df.to_dict(orient='records'):
        # Accept both flat rows and GeoJSON-like rows with a 'properties' dict.
        props = building.get('properties', building)
        building_id = props['building_id']
        square = float(squares_map[building_id])

        if square < 200:
            continue

        x_source, y_source = building['x'], building['y']
        graph.add_node(building_id, coordinate=(x_source, y_source), square=square,
                      label='Y' if props['kindergarten'] else 'N', kindergarten=props['kindergarten'],
                      school=props['school'], bus_buf=props['bus_buf'], tram_stop=props['tram_stop'],
                      subway_buf=props['subway_buf'], education=props['education'], edu_buf=props['edu_buf'],
                      lat=x_source, lon=y_source)

    pos_ = nx.get_node_attributes(graph, 'coordinate')
    geometry = [Point(xy) for xy in pos_.values()]
    gdf_pos = gpd.GeoDataFrame(list(zip(pos_.keys(), pos_.values())), columns=['id', 'geometry'], crs='epsg:4326', geometry=geometry)

    # Project all buildings once. Re-projecting the whole series for every
    # building (as calculate_distances would) repeats identical work n times.
    projected = gdf_pos.geometry.to_crs(epsg=3857) if len(gdf_pos) else gdf_pos.geometry

    # gdf_pos rows follow the iteration order of pos_, so the two zip cleanly.
    for building_id, origin in zip(pos_, projected):
        gdf_pos['distances'] = projected.distance(origin)
        indices = (gdf_pos['distances'] > 0) & (gdf_pos['distances'] < buffer_size)
        # NOTE(review): the fallback slice [1:n_closest] skips the building
        # itself and yields n_closest - 1 neighbours; confirm that is intended.
        gdf_close = gdf_pos[indices] if indices.any() else gdf_pos.sort_values('distances')[1:n_closest]

        for next_building_id, meters in zip(gdf_close['id'], gdf_close['distances']):
            if next_building_id != building_id:
                graph.add_edge(building_id, next_building_id, meters=meters)

    add_stops(graph, pos_, stops_bus, 'bus')
    add_stops(graph, pos_, stops_tram, 'tram')
    add_stops(graph, pos_, stops_sub, 'subway')

    with open(cache_file_path, 'wb') as f:
        pickle.dump(graph, f)
    return graph

if __name__ == '__main__':
    # Build the graph (or load it from the pickle cache) and plot its nodes.
    # graph_data stays a module-level name so it can be inspected afterwards.
    graph_data = build_data()
    draw_plotly(graph_data)

Step 2: run the trained model on the new graph

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import pandas as pd
import networkx as nx
from pathlib import Path
import pickle

class GCN(torch.nn.Module):
    """Seven-layer GCN used for inference; returns two logits per node.

    The first six GCNConv layers are each followed by BatchNorm1d and ReLU;
    the last one maps 1024 channels to the 2 output classes.

    Checkpoints written by the training cells name the same sub-modules
    ``conv1..conv7`` / ``bn1..bn6``, while this class stores them in
    ``layers`` / ``batch_norms``. ``load_state_dict`` therefore translates
    the legacy key names, so those checkpoints load without a key-mismatch
    error.
    """

    def __init__(self, n_features: int):
        super().__init__()
        self.layers = torch.nn.ModuleList([
            GCNConv(n_features, 32),
            GCNConv(32, 64),
            GCNConv(64, 128),
            GCNConv(128, 256),
            GCNConv(256, 512),
            GCNConv(512, 1024),
            GCNConv(1024, 2)
        ])
        self.batch_norms = torch.nn.ModuleList([
            torch.nn.BatchNorm1d(32),
            torch.nn.BatchNorm1d(64),
            torch.nn.BatchNorm1d(128),
            torch.nn.BatchNorm1d(256),
            torch.nn.BatchNorm1d(512),
            torch.nn.BatchNorm1d(1024)
        ])

    @staticmethod
    def _translate_legacy_key(key: str) -> str:
        """Map ``convK.*`` / ``bnK.*`` (1-based) to ``layers.K-1.*`` / ``batch_norms.K-1.*``.

        Keys that do not follow the legacy scheme are returned unchanged.
        """
        prefix, sep, rest = key.partition('.')
        for legacy, current in (('conv', 'layers'), ('bn', 'batch_norms')):
            number = prefix[len(legacy):]
            if prefix.startswith(legacy) and number.isdigit():
                return f'{current}.{int(number) - 1}{sep}{rest}'
        return key

    def load_state_dict(self, state_dict, *args, **kwargs):
        """Load weights, accepting both the current and the legacy key names."""
        translated = {self._translate_legacy_key(key): value for key, value in state_dict.items()}
        return super().load_state_dict(translated, *args, **kwargs)

    def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        """Return the raw class scores for every node."""
        for conv, bn in zip(self.layers[:-1], self.batch_norms):
            x = conv(x, edge_index)
            x = bn(x)
            x = F.relu(x)
        return self.layers[-1](x, edge_index)

def read_data(filename: str) -> nx.Graph:
    """Unpickle and return the graph stored under ``/content/<filename>``.

    Unpickling executes code from the file, so only load trusted caches.
    """
    graph_path = Path('/content') / filename
    with graph_path.open('rb') as cache:
        graph = pickle.load(cache)
    return graph

def prepare_graph_data(G: nx.Graph) -> tuple[Data, dict]:
    """Convert a graph into a torch_geometric ``Data`` object for inference.

    The feature columns must be in exactly the order used by the training
    cell that produced the checkpoint: square, building_area, living_area,
    population_balanced, school, kindergarten, bus_buf, tram_stop,
    subway_buf, edu_buf, education, stop code. ``edu_buf`` and ``education``
    were previously emitted the other way round, so the model received two
    swapped inputs.

    Returns:
        ``(data, node_mapping)`` where ``node_mapping`` maps each node id to
        its row index in ``data.x``.
    """
    stop_codes = {'bus': 1, 'tram': 2, 'subway': 3}
    features, node_mapping = [], {}
    for i, (node_id, node_data) in enumerate(G.nodes(data=True)):
        node_mapping[node_id] = i
        # 0 marks every non-stop node.
        stop_type = stop_codes.get(node_data['label'], 0)

        features.append([
            node_data.get('square', 1),
            node_data.get('building_area', 1),
            node_data.get('living_area', 1),
            node_data.get('population_balanced', 1),
            node_data['school'],
            node_data['kindergarten'],
            node_data['bus_buf'],
            node_data['tram_stop'],
            node_data['subway_buf'],
            node_data['edu_buf'],
            node_data['education'],
            stop_type
        ])

    features = torch.tensor(features, dtype=torch.float)
    edges = [(node_mapping[u], node_mapping[v]) for u, v in G.edges()]
    if edges:
        edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    else:
        # An empty edge list would otherwise yield a 1-D tensor instead of
        # the (2, num_edges) layout the convolutions expect.
        edge_index = torch.empty((2, 0), dtype=torch.long)
    return Data(x=features, edge_index=edge_index), node_mapping

def predict_and_save(model_path: str, graph_path: str, output_path: str) -> None:
    """Run a trained GCN on a cached graph and write per-node predictions.

    Args:
        model_path: file with the state_dict saved by the training cell.
        graph_path: pickle file name passed to read_data.
        output_path: destination CSV with node attributes and the predicted
            class in the ``predict`` column.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    G = read_data(graph_path)
    data, node_mapping = prepare_graph_data(G)

    model = GCN(data.num_features).to(device)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    with torch.no_grad():
        pred = model(data.x.to(device), data.edge_index.to(device))
        pred_labels = pred.argmax(dim=1).cpu().numpy()

    results = []
    for node_id, node_data in G.nodes(data=True):
        # The value order must follow the column list below. Previously
        # subway_buf and tram_stop were written under each other's header.
        results.append([
            node_id,
            node_data['label'],
            node_data.get('square', 1),
            node_data['kindergarten'],
            node_data['school'],
            node_data['bus_buf'],
            node_data['tram_stop'],
            node_data['subway_buf'],
            node_data['education'],
            node_data['edu_buf'],
            node_data.get('food', 0),
            node_data['coordinate'][1],
            node_data['coordinate'][0],
            pred_labels[node_mapping[node_id]]
        ])

    pd.DataFrame(results, columns=[
        'building_id', 'label', 'square', 'kindergarten', 'school',
        'bus_buf', 'tram_stop', 'subway_buf', 'education',
        'edu_buf', 'food', 'y', 'x', 'predict'
    ]).to_csv(output_path, index=False)

if __name__ == '__main__':
    # Apply the weights saved as 'model2.pth' to the cached 'data_high4.pkl'
    # graph and export the per-node predictions as CSV.
    predict_and_save(
        model_path='model2.pth',
        graph_path='data_high4.pkl',
        output_path='/content/predictions_food.csv'
    )

# Stage 3 - Commerce services prediction

code to create graph

In [None]:
# Threshold compared against the EPSG:3857 distances returned by
# calculate_distances when linking buildings and stops.
# NOTE(review): nominally metres, but Web Mercator distances are stretched
# away from the equator - confirm the effective radius is the intended one.
buffer_size = 300
# Upper bound of the nearest-neighbour fallback slice in build_data.
n_closest = 3
# Folder that holds all input files.
path_root_city = Path('/content')
# Building table (CSV) for the commerce stage; build_data derives its pickle
# cache name from it.
filename = 'data_high5.csv'
filepath = path_root_city / filename
# GeoJSON layers with the subway, bus and tram stops.
filepath_subway = path_root_city / 'метро.geojson'
filepath_bus = path_root_city / 'остановки автобусов.geojson'
filepath_tram = path_root_city / 'остановки трамвай.geojson'
# Per-building area table; building_id is cast to int because build_data
# uses it as the lookup key for squares_map.
df_squares = gpd.read_file(path_root_city / 'squares1.csv')
df_squares['building_id'] = df_squares['building_id'].astype(int)

def read_geojson(path: Path):
    """Return the decoded content of the UTF-8 JSON/GeoJSON file at *path*."""
    return json.loads(Path(path).read_text(encoding='utf-8'))

def extract_stops(path: Path) -> list[tuple[int, float, float]]:
    """Return ``(osm_id, c0, c1)`` for every feature of a GeoJSON stop layer.

    ``c0`` / ``c1`` are the first two entries of ``geometry.coordinates``;
    ``osm_id`` is converted to ``int``.
    """
    return [
        (
            int(feature['properties']['osm_id']),
            feature['geometry']['coordinates'][0],
            feature['geometry']['coordinates'][1],
        )
        for feature in read_geojson(path)['features']
    ]

def draw_plotly(g_):
    """Show the graph's nodes as coloured points on an OpenStreetMap layer.

    Colours, later rules overriding earlier ones: ``gray`` by default,
    ``blue`` when any service/buffer attribute is non-zero, ``green`` when
    ``commerce == 1``, ``black`` for transport stops. Edges are not drawn;
    the previous version built edge coordinate and text lists that were never
    passed to the figure, so that dead work has been removed.
    """
    # Imported locally, like the first draw_plotly in this file, so the cell
    # does not depend on an import made in a different cell.
    import plotly.express as px

    records = [dict(attrs) for _, attrs in g_.nodes(data=True)]
    # The coordinate tuple is redundant with the separate lat/lon attributes.
    df_nodes = pd.DataFrame.from_records(records).drop(columns='coordinate')
    df_nodes['s_cnt'] = df_nodes[['kindergarten', 'bus_buf', 'school', 'tram_stop', 'subway_buf', 'education', 'edu_buf']].sum(axis=1)
    df_nodes['color'] = 'gray'
    df_nodes.loc[df_nodes['s_cnt'] > 0, 'color'] = 'blue'
    df_nodes.loc[df_nodes['commerce'] == 1, 'color'] = 'green'
    df_nodes.loc[df_nodes['label'].isin({'tram', 'subway', 'bus'}), 'color'] = 'black'

    # NOTE(review): the 'lat'/'lon' columns are passed crosswise; the nodes
    # are created with lat=x and lon=y, so presumably the attributes are
    # stored swapped - verify against the input data.
    fig = px.scatter_map(df_nodes, lat="lon", lon="lat", hover_name="commerce",
                        hover_data=["label", "square"], color=df_nodes['color'],
                        zoom=8, height=1000)
    fig.update_layout(map_style="open-street-map")
    fig.update_layout(margin={"r":0, "t":0, "l":0, "b":0})
    fig.update_layout(title='Interactive Graph with NetworkX and Plotly', hovermode='closest')
    fig.show()

def calculate_distances(geometry: gpd.GeoSeries, point: tuple[float, float]) -> pd.Series:
    """Measure how far each geometry is from *point*, in EPSG:3857 units.

    *point* is taken as EPSG:4326 coordinates and re-projected, together
    with *geometry*, to EPSG:3857 before the distance is computed.
    """
    reference = gpd.GeoSeries([Point(point)], crs='epsg:4326').to_crs(epsg=3857)
    reprojected = geometry.to_crs(epsg=3857)
    return reprojected.distance(reference[0])

def add_stops(g_: nx.Graph, positions: dict[int, tuple[float, float]],
             stops: list[tuple[int, float, float]], label: str):
    """Add transport stops as nodes and link them to nearby buildings.

    A stop is added only if at least one building lies closer than
    ``buffer_size`` (distance measured in EPSG:3857); it is then connected to
    every such building with ``meters=0``. All service attributes of a stop
    node are set to 0.

    Args:
        g_: graph to extend in place.
        positions: building id -> coordinate tuple (EPSG:4326).
        stops: ``(stop_id, c0, c1)`` tuples as produced by extract_stops.
        label: value stored in the new nodes' ``label`` attribute.

    Raises:
        ValueError: if a stop id is already a node of ``g_``.
    """
    if not positions or not stops:
        return

    geometry = [Point(xy) for xy in positions.values()]
    gdf_pos = gpd.GeoDataFrame(list(zip(positions.keys(), positions.values())),
                              columns=['id', 'geometry'], crs='epsg:4326', geometry=geometry)
    # Re-project the buildings once instead of once per stop.
    projected = gdf_pos.geometry.to_crs(epsg=3857)

    # The tuple fields keep the file's existing lat/lon naming.
    for stop_id, lat, lon in stops:
        stop_point = gpd.GeoSeries([Point((lat, lon))], crs='epsg:4326').to_crs(epsg=3857)[0]
        within_buffer = projected.distance(stop_point) < buffer_size
        if not within_buffer.any():
            continue
        if g_.has_node(stop_id):
            raise ValueError('The node already exists')
        g_.add_node(stop_id, coordinate=(lat, lon),
                   label=label, education=0, kindergarten=0, school=0,
                   bus_buf=0, tram_stop=0, subway_buf=0, edu_buf=0,
                   food=0, commerce=0, food_buf=0, lat=lat, lon=lon)
        for building_id in gdf_pos[within_buffer]['id']:
            g_.add_edge(building_id, stop_id, meters=0)

def build_data():
    """Build (or load from cache) the building/stop graph for the commerce stage.

    Steps:
      1. Return the pickled graph if ``/content/<filename>.pkl`` exists.
      2. Add one node per building whose area is at least 200.
      3. Connect each building to the others closer than ``buffer_size``
         (EPSG:3857 distance); if there are none, fall back to its nearest
         neighbours.
      4. Add bus, tram and subway stops via add_stops.
      5. Pickle the graph to the cache file and return it.

    Raises:
        KeyError: if a building id is missing from ``df_squares``.
        ValueError: if a building ends up without any edge.
    """
    cache_file_path = Path('/content') / filename.replace('.csv', '.pkl')
    if cache_file_path.exists():
        # Only trusted, self-produced caches should be unpickled here.
        with open(cache_file_path, 'rb') as f:
            return pickle.load(f)

    df = pd.read_csv(filepath)
    records = df.to_dict(orient='records')
    stops_bus = extract_stops(filepath_bus)
    stops_tram = extract_stops(filepath_tram)
    stops_sub = extract_stops(filepath_subway)
    graph = nx.Graph()
    squares_map = df_squares.set_index('building_id')['building_area'].to_dict()

    n_count = len(records)
    for i, building in enumerate(records):
        if i % 100 == 0:
            print(f'Calc {i} / {n_count}')

        # Accept both flat rows and GeoJSON-like rows with a 'properties' dict.
        props = building.get('properties', building)
        building_id = props['building_id']
        square = float(squares_map[building_id])
        commerce = int(props['commerce'] > 0)

        if square < 200:
            print(f'Skip building "{building_id}" with square {square} and commerce={commerce}')
            continue

        x_source, y_source = building['x'], building['y']
        graph.add_node(props['building_id'], coordinate=(x_source, y_source),
                      square=square, label='Y' if props['commerce'] else 'N',
                      kindergarten=props['kindergarten'], school=props['school'],
                      bus_buf=props['bus_buf'], tram_stop=props['tram_stop'],
                      subway_buf=props['subway_buf'], education=props['education'],
                      edu_buf=props['edu_buf'], food_buf=props['food_buf'],
                      food=props['food'], commerce=commerce, lat=x_source, lon=y_source)

    print('The nodes was build')
    pos_ = nx.get_node_attributes(graph, 'coordinate')
    geometry = [Point(xy) for xy in pos_.values()]
    gdf_pos = gpd.GeoDataFrame(list(zip(pos_.keys(), pos_.values())),
                              columns=['id', 'geometry'], crs='epsg:4326', geometry=geometry)

    # Project all buildings once. Re-projecting the whole series for every
    # building (as calculate_distances would) repeats identical work n times.
    projected = gdf_pos.geometry.to_crs(epsg=3857) if len(gdf_pos) else gdf_pos.geometry

    n_count = len(pos_)
    # gdf_pos rows follow the iteration order of pos_, so the two zip cleanly.
    for i, (building_id, origin) in enumerate(zip(pos_, projected)):
        if i % 100 == 0:
            print(f'Calc {i} / {n_count}')

        gdf_pos['distances'] = projected.distance(origin)
        indices = (gdf_pos['distances'] > 0) & (gdf_pos['distances'] < buffer_size)
        # NOTE(review): the fallback slice [1:n_closest] skips the building
        # itself and yields n_closest - 1 neighbours; confirm that is intended.
        if indices.any():
            gdf_close = gdf_pos[gdf_pos['distances'] < buffer_size]
        else:
            gdf_close = gdf_pos.sort_values('distances')[1:n_closest]

        was_added_any = False
        for next_building_id, meters in zip(gdf_close['id'], gdf_close['distances']):
            if next_building_id != building_id:
                graph.add_edge(building_id, next_building_id, meters=meters)
                was_added_any = True

        if not was_added_any:
            raise ValueError('No edges added')

    print('The graph was build')
    add_stops(graph, pos_, stops_bus, 'bus')
    print('The bus were added')
    add_stops(graph, pos_, stops_tram, 'tram')
    print('The tram were added')
    add_stops(graph, pos_, stops_sub, 'subway')
    print('The subway were added')

    with open(cache_file_path, 'wb') as f:
        pickle.dump(graph, f)
    return graph

if __name__ == '__main__':
    # Build the graph (or load it from the pickle cache) and plot its nodes.
    # graph_data stays a module-level name so it can be inspected afterwards.
    graph_data = build_data()
    draw_plotly(graph_data)

Binary node classification on the graph using a graph convolutional network (GCN)

In [None]:
# Name of the pickled graph file that read_data() loads from /content.
filename = 'data_high5.pkl'

def read_data() -> nx.Graph:
    """Return the graph unpickled from ``/content/<filename>``.

    ``filename`` is looked up in module scope when the function is called.
    Unpickling executes code from the file, so only load trusted caches.
    """
    with (Path('/content') / filename).open('rb') as cache:
        return pickle.load(cache)

G: nx.Graph = read_data()
N_FEATURES = 14

# Build one feature row and one binary label per node; node_mapping turns the
# graph's node ids into the consecutive row indices that edge_index needs.
features = []
labels = []
node_mapping = {}
for i, (node_id, node_data) in enumerate(G.nodes(data=True)):
    node_mapping[node_id] = i
    # 0 for every non-stop node, 1/2/3 for bus/tram/subway stops.
    is_stop = {'bus': 1, 'tram': 2, 'subway': 3}.get(node_data['label'], 0)
    # Attributes that may be absent on a node default to 1 ...
    optional_part = [
        node_data.get(key, 1)
        for key in ('square', 'building_area', 'living_area', 'population_balanced')
    ]
    # ... the remaining ones are mandatory (KeyError if missing).
    required_part = [
        node_data[key]
        for key in ('school', 'kindergarten', 'bus_buf', 'tram_stop', 'subway_buf',
                    'edu_buf', 'education', 'food', 'food_buf')
    ]
    features.append(optional_part + required_part + [is_stop])
    # Target: does the node have any commerce service?
    labels.append(int(node_data['commerce'] > 0))

features = torch.tensor(features, dtype=torch.float)
num_nodes = features.shape[0]
labels = torch.tensor(labels, dtype=torch.long)
# Stratified 70/30 split over node indices; the "masks" stored on Data below
# are therefore integer index arrays, not boolean masks.
train_indices, test_indices = train_test_split(
    np.arange(num_nodes), test_size=0.3, stratify=labels.numpy(), random_state=21
)
edges = [(node_mapping[src], node_mapping[dst]) for src, dst in G.edges]
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
data = Data(
    x=features,
    edge_index=edge_index,
    y=labels,
    train_mask=train_indices,
    test_mask=test_indices,
)
assert edge_index.max() < num_nodes, "Edge index is out of bounds!"

class GCN(torch.nn.Module):
    """Seven stacked GCNConv layers producing two logits per node.

    The first six convolutions (N_FEATURES -> 32 -> 64 -> 128 -> 256 -> 512
    -> 1024) are each followed by BatchNorm1d and ReLU; the seventh maps the
    1024 channels to 2 output classes without normalisation or activation.
    """

    def __init__(self):
        super().__init__()
        channels = [N_FEATURES, 32, 64, 128, 256, 512, 1024]
        # Sub-modules are registered as conv1/bn1 ... conv6/bn6 so that the
        # state_dict keys stay 'convK.*' / 'bnK.*'.
        for stage in range(1, len(channels)):
            setattr(self, f'conv{stage}', GCNConv(channels[stage - 1], channels[stage]))
            setattr(self, f'bn{stage}', torch.nn.BatchNorm1d(channels[stage]))
        self.conv7 = GCNConv(channels[-1], 2)

    def forward(self, x, edge_index):
        """Return the raw class scores for every node."""
        hidden = x
        for stage in range(1, 7):
            conv = getattr(self, f'conv{stage}')
            norm = getattr(self, f'bn{stage}')
            hidden = F.relu(norm(conv(hidden, edge_index)))
        return self.conv7(hidden, edge_index)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# Class 1 is weighted 10x in the loss. The weights must live on the same
# device as the logits, otherwise CrossEntropyLoss raises a device-mismatch
# error when training on CUDA.
class_weights = torch.tensor([1., 10.], device=device)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

def calc_scores(out_, target_) -> tuple[float, float, float]:
    """Compute macro F1, overall accuracy and accuracy on class 1.

    Args:
        out_: per-node class scores; the predicted class is the arg-max over
            dimension 1.
        target_: ground-truth class indices aligned with ``out_``.

    Returns:
        ``(f1, acc, acc1)`` where ``acc1`` is the share of class-1 targets
        predicted as class 1, or ``nan`` when the subset holds no class-1
        targets (previously a ZeroDivisionError).
    """
    out_max = out_.max(1)[1]
    acc = target_.eq(out_max).sum().item() / target_.size()[0]

    positives = target_.eq(1)
    n_positives = int(positives.sum().item())
    if n_positives:
        acc1 = out_max[positives].eq(1).sum().item() / n_positives
    else:
        acc1 = float('nan')

    # sklearn works on host arrays: move tensors off the GPU before .numpy(),
    # which raises for CUDA tensors.
    f1 = float(f1_score(target_.cpu().numpy(), out_max.cpu().numpy(), average='macro'))
    return f1, acc, acc1

# Full-batch training: every epoch runs the whole graph through the model and
# optimises the weighted loss on the training nodes only.
model.train()
for epoch in range(1000):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        # Periodic evaluation: eval mode switches BatchNorm to its running
        # statistics; no_grad avoids building an autograd graph that is
        # never back-propagated.
        model.eval()
        with torch.no_grad():
            out = model(data.x, data.edge_index)
        model.train()
        f1_train, acc_train, acc1_train = calc_scores(out[data.train_mask], data.y[data.train_mask])
        f1_test, acc_test, acc1_test = calc_scores(out[data.test_mask], data.y[data.test_mask])
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}, '
              f'f1/train: {f1_train:.2f}, acc/avg/train: {acc_train:.2f}, acc/1/train: {acc1_train:.2f}, '
              f'f1/test: {f1_test:.2f}, acc/avg/test: {acc_test:.2f}, acc/1/test: {acc1_test:.2f}')

def save_to_qgis(data_, pred_):
    """Export node attributes and predictions to train/test CSV files.

    Args:
        data_: object whose ``train_mask`` / ``test_mask`` hold the positional
            row indices of the train and test nodes.
        pred_: predicted class per node, indexed in the iteration order of
            ``G.nodes`` (the module-level graph).

    Writes ``/content/train4.csv`` and ``/content/test4.csv``.
    """
    data_out = []
    for i, (node_id, node_data) in enumerate(G.nodes(data=True)):
        # The value order must follow the column list below. Previously
        # tram_stop and subway_buf were emitted in the opposite order to the
        # header, so the two CSV columns were mislabeled.
        data_out.append([
            node_id,
            node_data['label'],
            node_data.get('square', 1),
            node_data['kindergarten'],
            node_data['school'],
            node_data['bus_buf'],
            node_data['subway_buf'],
            node_data['tram_stop'],
            node_data['education'],
            node_data['edu_buf'],
            node_data['food'],
            node_data['food_buf'],
            node_data['commerce'],
            node_data['coordinate'][1],
            node_data['coordinate'][0],
            pred_[i]
        ])
    df_out = pd.DataFrame(data_out, columns=['building_id', 'label', 'square', 'kindergarten', 'school', 'bus_buf',
                                           'subway_buf', 'tram_stop', 'education', 'edu_buf', 'food', 'food_buf',
                                           'commerce', 'lat', 'lon', 'predict'])
    print('saving...')
    df_out.iloc[data_.train_mask].to_csv('/content/train4.csv', index=False)
    df_out.iloc[data_.test_mask].to_csv('/content/test4.csv', index=False)

# Persist the trained weights; the file is written relative to the current
# working directory.
model_filename = 'model3.pth'
torch.save(model.state_dict(), model_filename)

# Final evaluation on the held-out nodes and export of all predictions.
model.eval()
with torch.no_grad():
    pred = model(data.x, data.edge_index)
    f1, acc_test, acc1_test = calc_scores(pred[data.test_mask], data.y[data.test_mask])
    # .cpu() is required before .numpy() when the model ran on CUDA.
    save_to_qgis(data, pred.max(1)[1].cpu().numpy())
    print(f'TEST: f1: {f1:.2f}, acc/avg/test: {acc_test:.2f}, acc/1/test: {acc1_test:.2f}')

Code for any further stage follows the same pattern: only the target variable has to be defined and the additional input features specified.
The choice of target depends on the goals of the study.