In this notebook, MSOA street network graphs are assembled from DataFrames of nodes and edges imported from OpenStreetMap. The graphs are then analyzed, their key features are saved to a separate CSV, and a sample of them is plotted to check that the graphs are valid.

# Import Libraries

In [28]:
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
from torch_geometric.data import Data, Dataset
from torch_geometric.transforms import AddLaplacianEigenvectorPE
import pandas as pd
import geopandas as gpd
import networkx as nx
from torch_geometric.utils.convert import to_networkx
import folium
from shapely.wkt import loads
import numpy as np

2.1.2


# Functions

## Assemble graph 
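Inputs: MSOA code, node file, edge file, Y DataFrame, Y column name <br>
Outputs: graph (or `None` when the MSOA has no Y value)
* extract Y for the MSOA from the Y DataFrame
* save Y as float (the code always stores a float target; a classification target would need long instead)
* re-index nodes from 0 so that edges can refer to node positions
* save edge index as long
* create graph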

In [2]:
def read_data_to_graph(MSOA, node_file, edge_file, y_file, y_column):
    """Build a PyG ``Data`` graph for one MSOA from its node and edge CSVs.

    Parameters
    ----------
    MSOA : str
        MSOA code, matched against the ``MSOA11CD`` column of ``y_file``.
    node_file : str or path-like
        CSV of nodes; must contain an ``osmid`` column.
    edge_file : str or path-like
        CSV of edges; must contain ``u`` and ``v`` columns holding the
        ``osmid`` of each edge's endpoints.
    y_file : pandas.DataFrame
        Already-loaded target table (not a path, despite the name) with an
        ``MSOA11CD`` column and the ``y_column`` column.
    y_column : str
        Name of the target column in ``y_file``.

    Returns
    -------
    torch_geometric.data.Data or None
        Graph with ``x`` (a column of ones, one row per node), ``edge_index``
        (long, shape ``2 x n_edges``) and ``y`` (float). ``None`` when
        ``y_file`` has no row for ``MSOA``.
    """
    # Y features _________
    # Looked up first: without a target the graph is skipped, so there is no
    # point reading the node/edge CSVs for it.
    y_value = y_file.loc[y_file['MSOA11CD'] == MSOA, y_column].values
    if len(y_value) == 0:
        print(f"Skipping missing MSOA data for {MSOA}")
        return None
    y = torch.tensor(y_value, dtype=torch.float)

    # Read in data
    node_df = pd.read_csv(node_file)
    edge_df = pd.read_csv(edge_file)

    # Node index _________
    # Position of each node in the file becomes its 0-based graph index
    node_lookup = node_df[['osmid']].copy()
    node_lookup['new_index'] = range(len(node_lookup))

    # Edge index _________
    # Translate the osmid endpoints (u, v) into 0-based node positions
    endpoints = edge_df[['u', 'v']]
    endpoints = endpoints.merge(
        node_lookup.rename(columns={'osmid': 'u', 'new_index': 'new_source'}),
        how='left', on='u',
    )
    endpoints = endpoints.merge(
        node_lookup.rename(columns={'osmid': 'v', 'new_index': 'new_dest'}),
        how='left', on='v',
    )

    # An endpoint missing from the node list comes out of the left merge as
    # NaN; casting NaN to long yields a meaningless index, so drop those edges.
    mapped = endpoints.dropna(subset=['new_source', 'new_dest'])
    n_dropped = len(endpoints) - len(mapped)
    if n_dropped:
        print(f"{MSOA}: dropped {n_dropped} edge(s) with endpoints missing from the node list")

    # Save edge index as tensor edge_index (rows: source, destination)
    edge_index = torch.tensor(mapped[['new_source', 'new_dest']].values.T, dtype=torch.long)

    # Node attributes _________
    # Leave with array of ones, optionally adding positional encoding in the next step
    x = torch.ones(len(node_df), 1, dtype=torch.float)

    # Create graph (no edge attributes are attached)
    return Data(x=x, edge_index=edge_index, y=y)

## Save data 
Inputs: node/edge folder, destination, Y DataFrame, positional-encoding flag, Y column name <br>
Outputs: .pt files to destination folder
* iterate through the folder (one sub-folder per MSOA)
* call `read_data_to_graph` for each MSOA

In [3]:
# Codes of the non-rural MSOAs; save_graph skips any folder whose name is not in this set
nonrural_MSOAs = pd.read_csv('geographies/nonrural_MSOAs.csv')
nonrural_MSOAs_set = set(nonrural_MSOAs['MSOA11CD'].tolist())

def save_graph(node_edge_source, destination, y_file, node_attr_on, y_column):
    """Convert every eligible MSOA folder into a graph and save it as ``.pt``.

    Parameters
    ----------
    node_edge_source : str
        Folder containing one sub-folder per MSOA; each sub-folder is expected
        to hold ``node_list.csv`` and ``edge_list.csv``.
    destination : str
        Output folder (created if missing). Each graph is written to
        ``<destination>/<MSOA>.pt``.
    y_file : pandas.DataFrame
        Target table passed through to ``read_data_to_graph``.
    node_attr_on : bool
        When true, node features are replaced by a 5-dimensional Laplacian
        eigenvector positional encoding.
    y_column : str
        Target column name passed through to ``read_data_to_graph``.

    Folders are skipped when their name starts with ``W02`` (Welsh MSOAs), is
    not in the module-level ``nonrural_MSOAs_set``, lacks either CSV, or has
    no target value.
    """
    # create the .pt directory if it doesn't exist
    os.makedirs(destination, exist_ok=True)

    # The transform only carries configuration, so build it once, not per graph.
    # NOTE(review): is_undirected=True declares the edge list symmetric; the
    # source folder name suggests a drive network, which may be directed — confirm.
    pos_enc = AddLaplacianEigenvectorPE(k=5, is_undirected=True) if node_attr_on else None

    # iterate through the csvs and save to .pt directory
    for MSOA in os.listdir(node_edge_source):

        # Skip Welsh graphs and rural areas
        if MSOA.startswith("W02") or MSOA not in nonrural_MSOAs_set:
            continue

        # identify elements
        MSOA_path = os.path.join(node_edge_source, MSOA)
        node_file = os.path.join(MSOA_path, 'node_list.csv')
        edge_file = os.path.join(MSOA_path, 'edge_list.csv')
        if not (os.path.exists(node_file) and os.path.exists(edge_file)):
            continue

        # transform nodes and edges into a PyG data object
        graph = read_data_to_graph(MSOA, node_file, edge_file, y_file, y_column)
        if graph is None or not hasattr(graph, 'edge_index'):
            continue

        # Apply positional encoding and use it as the node feature matrix
        if pos_enc is not None:
            graph = pos_enc(graph)
            graph.x = graph.laplacian_eigenvector_pe

        # save data to .pt data object at root
        torch.save(graph, os.path.join(destination, f'{MSOA}.pt'))

# Create graphs

#### Number of Amenities within 15 minutes walk

#### Regular

In [4]:
# Baseline run: node features stay as a column of ones (no positional encoding)
node_edge_source = 'EW_msoa_node_edge_drive'
destination = 'graphs/graphs_n_amenities_15min'
y_column = 'n_amenities_15min'
node_attr_on = False
y_file = pd.read_csv("clean_y_values/y_n_amenities_15min.csv")

save_graph(
    node_edge_source=node_edge_source,
    destination=destination,
    y_file=y_file,
    node_attr_on=node_attr_on,
    y_column=y_column,
)

#### With node attributes / positional encoding

In [5]:
# Positional-encoding run: same source folders, node features replaced by the
# Laplacian eigenvector encoding inside save_graph.
# NOTE(review): the target CSV here is y_n_amenities_15min_pos_enc.csv, while
# the baseline run reads y_n_amenities_15min.csv for the same y_column —
# confirm that a separate target file is intended.
node_edge_source = 'EW_msoa_node_edge_drive'
destination = 'graphs/graphs_n_amenities_15min_pos_enc'
y_column = 'n_amenities_15min'
node_attr_on = True
y_file = pd.read_csv("clean_y_values/y_n_amenities_15min_pos_enc.csv")

save_graph(
    node_edge_source=node_edge_source,
    destination=destination,
    y_file=y_file,
    node_attr_on=node_attr_on,
    y_column=y_column,
)

# Call Graphs

In [6]:
class EW_msoa_graphs(Dataset):
    """Dataset over the ``.pt`` graph files stored directly inside ``root``.

    Graphs are indexed in sorted filename order, so the same index refers to
    the same MSOA on every run and machine. The directory is re-listed on
    every access, so files added later are picked up.
    """

    def __init__(self, root):
        super().__init__()
        # NOTE(review): root is assigned after the base initialiser instead of
        # being passed to it — presumably so the base class does not manage
        # the folder itself; confirm before changing.
        self.root = root

    @property
    def processed_file_names(self):
        """Sorted names of the ``.pt`` files (one per MSOA) found in ``root``."""
        # os.listdir returns entries in arbitrary order; sorting makes the
        # index -> graph mapping reproducible.
        return sorted(name for name in os.listdir(self.root) if name.endswith('.pt'))

    def len(self):
        """Return the number of ``.pt`` files currently in ``root``."""
        return len(self.processed_file_names)

    def get(self, idx):
        """Load the graph at position ``idx``; raises IndexError if out of range."""
        file_name = self.processed_file_names[idx]
        return self.load_graph(os.path.join(self.root, file_name))

    def load_graph(self, file_path):
        """Load one saved graph from ``file_path``.

        Raises
        ------
        FileNotFoundError
            If ``file_path`` does not exist.
        """
        try:
            # The files hold whole pickled objects rather than bare tensors,
            # so full unpickling is requested explicitly instead of relying on
            # the torch version's default.
            # SECURITY: unpickling can execute arbitrary code — only load .pt
            # files this project produced itself.
            return torch.load(file_path, weights_only=False)
        except FileNotFoundError as err:
            raise FileNotFoundError(f"File {file_path} not found.") from err

    def get_by_filename(self, filename):
        """Load a graph directly by its file name (e.g. ``'E02000001.pt'``).

        Raises
        ------
        FileNotFoundError
            If ``filename`` is not one of the ``.pt`` files in ``root``.
        """
        if filename in self.processed_file_names:
            return self.load_graph(os.path.join(self.root, filename))
        raise FileNotFoundError(f"File {filename} not found in dataset directory.")

In [7]:
# Dataset over the .pt graphs in the folder the baseline (no positional encoding) run writes to
dataset = EW_msoa_graphs('graphs/graphs_n_amenities_15min')

# Get Graph Data

In [24]:
# Names of the per-degree count columns: degree_1 ... degree_10
degree_columns = ['degree_' + str(degree) for degree in range(1, 11)]

# Empty results table: one row per MSOA is added by the next cell
msoa_attributes = pd.DataFrame(
    columns=['MSOA11CD', 'edge_length', 'edge_count', 'node_count', *degree_columns]
)

# Folder with one sub-directory of node/edge CSVs per MSOA
root_dir = 'EW_msoa_node_edge_drive'

In [25]:
# Open csv of non-rural MSOAs to filter out rural areas (read this outside the loop)
nonrural_MSOAs = pd.read_csv('geographies/nonrural_MSOAs.csv')
nonrural_MSOAs_set = set(nonrural_MSOAs['MSOA11CD'].values)

# Node degrees that get their own count column (degree_1 ... degree_10)
degree_range = range(1, 11)
attribute_columns = (
    ['MSOA11CD', 'edge_length', 'edge_count', 'node_count']
    + [f'degree_{i}' for i in degree_range]
)

# Rows are gathered in a list and turned into a DataFrame once at the end.
# Concatenating onto the table inside the loop re-copies it on every MSOA and
# starts from an empty frame, which leaves the columns with object dtype.
attribute_rows = []

# Iterate over each directory in the root directory
for msoa11cd in os.listdir(root_dir):
    # Skip Welsh MSOAs ('W02...') and anything not in the non-rural set
    # (this also excludes stray entries such as '.DS_Store')
    if msoa11cd.startswith('W02') or msoa11cd not in nonrural_MSOAs_set:
        continue

    # Construct file paths for edge_list.csv and node_list.csv
    edge_path = os.path.join(root_dir, msoa11cd, 'edge_list.csv')
    node_path = os.path.join(root_dir, msoa11cd, 'node_list.csv')

    # Only the reads can raise FileNotFoundError, so only they are guarded
    try:
        edge_df = pd.read_csv(edge_path)
        node_df = pd.read_csv(node_path)
    except FileNotFoundError as e:
        print(f"Skipping {msoa11cd}: File not found - {e}")
        continue

    # Total street length, number of streets and number of intersections
    row_data = {
        'MSOA11CD': msoa11cd,
        'edge_length': edge_df['length'].sum(),
        'edge_count': len(edge_df),
        'node_count': len(node_df),
    }

    # Number of nodes per street count; degrees outside 1-10 are ignored.
    # Looking up by integer also matches float keys (e.g. 3.0), so the column
    # name is always 'degree_3', never 'degree_3.0'.
    street_count_freq = node_df['street_count'].value_counts().to_dict()
    row_data.update({f'degree_{i}': street_count_freq.get(i, 0) for i in degree_range})

    attribute_rows.append(row_data)

msoa_attributes = pd.DataFrame(attribute_rows, columns=attribute_columns)

  msoa_attributes = pd.concat([msoa_attributes, temp_df], ignore_index=True)


Skipping E02001359: File not found - [Errno 2] No such file or directory: 'EW_msoa_node_edge_drive/E02001359/edge_list.csv'
Skipping E02003173: File not found - [Errno 2] No such file or directory: 'EW_msoa_node_edge_drive/E02003173/edge_list.csv'
Skipping E02000994: File not found - [Errno 2] No such file or directory: 'EW_msoa_node_edge_drive/E02000994/edge_list.csv'
Skipping E02001779: File not found - [Errno 2] No such file or directory: 'EW_msoa_node_edge_drive/E02001779/edge_list.csv'
Skipping E02001713: File not found - [Errno 2] No such file or directory: 'EW_msoa_node_edge_drive/E02001713/edge_list.csv'


In [26]:
# Display the per-MSOA table of raw counts
msoa_attributes

Unnamed: 0,MSOA11CD,edge_length,edge_count,node_count,degree_1,degree_2,degree_3,degree_4,degree_5,degree_6,degree_7,degree_8,degree_9,degree_10
0,E02001830,29944.871,298,146,46,0,95,3,2,0,0,0,0,0
1,E02003569,27895.929,297,138,25,4,93,14,2,0,0,0,0,0
2,E02004328,40262.901,549,268,106,0,161,1,0,0,0,0,0,0
3,E02000788,29802.922,334,141,30,4,103,4,0,0,0,0,0,0
4,E02001468,29015.290,370,156,40,0,108,8,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4157,E02006404,36105.992,470,211,68,0,137,6,0,0,0,0,0,0
4158,E02003176,30187.296,274,114,26,0,83,5,0,0,0,0,0,0
4159,E02001621,23490.337,215,92,16,0,71,5,0,0,0,0,0,0
4160,E02004353,36453.697,545,256,101,0,139,16,0,0,0,0,0,0


In [29]:
# Find nodal proportions, remove empty columns
# NOTE: this cell is not idempotent — re-running it divides the degree columns
# by node_count a second time.

# Replace 0 with NaN so that empty graphs do not divide by zero. The result is
# assigned back instead of calling replace(..., inplace=True) on the selected
# column: that chained in-place form is deprecated and has no effect on the
# DataFrame under pandas Copy-on-Write.
msoa_attributes['node_count'] = msoa_attributes['node_count'].replace(0, np.nan)

# Turn each degree count into a share of the MSOA's nodes
for col in degree_columns:
    msoa_attributes[col] = msoa_attributes[col] / msoa_attributes['node_count']

# NaNs (from the zero node counts above) go back to 0
msoa_attributes = msoa_attributes.fillna(0)

# Drop any columns that only contain 0s
msoa_attributes = msoa_attributes.loc[:, (msoa_attributes != 0).any(axis=0)]

In [31]:
# Load the MSOA boundary polygons and measure each one.
# The area is expressed in the units of the layer's CRS — presumably square
# metres for this boundary file; confirm against the shapefile's projection.
boundary_path = "geographies/MSOA_2011_EW_BFC_shp/MSOA_2011_EW_BFC.shp"
msoa = gpd.read_file(boundary_path)
msoa['area'] = msoa.geometry.area

# Keep only what is needed to join areas onto the network attributes
msoa_area = msoa[['MSOA11CD', 'area']]

In [33]:
# Attach each MSOA's area to its network attributes (default inner join, so
# MSOAs without a boundary record are left out)
graph_data = msoa_attributes.merge(msoa_area, on='MSOA11CD')

# degree_1 is the share of nodes with a street count of 1, so this is the number of dead ends
dead_ends = graph_data['node_count'] * graph_data['degree_1']

# Intersections are all nodes that are not dead ends; density is per unit area
graph_data['intersections'] = graph_data['node_count'] - dead_ends
graph_data['intersection_density'] = graph_data['intersections'] / graph_data['area']

# Dead ends per unit area
graph_data['dead_end_density'] = dead_ends / graph_data['area']

In [35]:
# Average nodal degree: each degree share is scaled back to a node count,
# weighted by its degree, summed, and divided by the number of nodes.
# Rows whose node_count is 0 come out as NaN (0 / 0).
degree_columns = [col for col in graph_data.columns if 'degree_' in col]

weighted_degree_total = 0
for col in degree_columns:
    degree = int(col.split('_')[1])  # 'degree_3' -> 3
    weighted_degree_total = weighted_degree_total + graph_data[col] * degree * graph_data['node_count']

graph_data['average_nodal_degree'] = weighted_degree_total / graph_data['node_count']

In [37]:
# Find node / edge ratio: number of nodes divided by number of edges, per MSOA
graph_data['node_edge_ratio'] = graph_data['node_count'] / graph_data['edge_count']

In [38]:
# Display the table with the derived density, degree and ratio columns
graph_data

Unnamed: 0,MSOA11CD,edge_length,edge_count,node_count,degree_1,degree_2,degree_3,degree_4,degree_5,degree_6,degree_7,degree_8,area,intersections,intersection_density,dead_end_density,average_nodal_degree,node_edge_ratio
0,E02001830,29944.871,298,146,0.315068,0.0,0.650685,0.020548,0.013699,0.0,0.0,0.0,1.311980e+07,100.0,0.000008,0.000004,2.417808,0.489933
1,E02003569,27895.929,297,138,0.181159,0.028986,0.673913,0.101449,0.014493,0.0,0.0,0.0,2.024317e+06,113.0,0.000056,0.000012,2.73913,0.464646
2,E02004328,40262.901,549,268,0.395522,0.0,0.600746,0.003731,0.0,0.0,0.0,0.0,2.159859e+06,162.0,0.000075,0.000049,2.212687,0.48816
3,E02000788,29802.922,334,141,0.212766,0.028369,0.730496,0.028369,0.0,0.0,0.0,0.0,1.084730e+06,111.0,0.000102,0.000028,2.574468,0.422156
4,E02001468,29015.290,370,156,0.25641,0.0,0.692308,0.051282,0.0,0.0,0.0,0.0,1.010312e+06,116.0,0.000115,0.00004,2.538462,0.421622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4157,E02006404,36105.992,470,211,0.322275,0.0,0.649289,0.028436,0.0,0.0,0.0,0.0,2.093421e+06,143.0,0.000068,0.000032,2.383886,0.448936
4158,E02003176,30187.296,274,114,0.22807,0.0,0.72807,0.04386,0.0,0.0,0.0,0.0,2.159483e+06,88.0,0.000041,0.000012,2.587719,0.416058
4159,E02001621,23490.337,215,92,0.173913,0.0,0.771739,0.054348,0.0,0.0,0.0,0.0,1.485892e+06,76.0,0.000051,0.000011,2.706522,0.427907
4160,E02004353,36453.697,545,256,0.394531,0.0,0.542969,0.0625,0.0,0.0,0.0,0.0,2.461158e+06,155.0,0.000063,0.000041,2.273438,0.469725


In [39]:
# Write the per-MSOA network attributes to CSV (row index omitted)
graph_data.to_csv('clean_y_values/graph_node_edge_data.csv', index=False)

In [15]:
# Reload the saved attributes — presumably so the cells below can run without recomputing them
graph_data = pd.read_csv('clean_y_values/graph_node_edge_data.csv')

In [41]:
# `graph_data_generalised` is not defined anywhere in this notebook, so this
# cell raised NameError on a clean run. It is rebuilt here from graph_data.
# NOTE(review): the column selection and the num_nodes / num_edges names are
# reconstructed from this cell's saved output — confirm they match the
# original definition.
graph_data_generalised = graph_data[[
    'MSOA11CD',
    'node_edge_ratio',
    'intersection_density',
    'dead_end_density',
    'average_nodal_degree',
    'area',
    'node_count',
    'edge_count',
]].rename(columns={'node_count': 'num_nodes', 'edge_count': 'num_edges'})

# Largest MSOAs first
graph_data_generalised_sorted = graph_data_generalised.sort_values(by='area', ascending=False)
#graph_data_generalised_sorted.iloc[1000:1021]
graph_data_generalised_sorted

Unnamed: 0,MSOA11CD,node_edge_ratio,intersection_density,dead_end_density,average_nodal_degree,area,num_nodes,num_edges
1341,E02002517,0.431416,0.000004,0.000001,2.574359,3.355687e+07,195,452
1088,E02002537,0.460022,0.000010,0.000005,2.357143,2.704201e+07,420,913
3643,E02001706,0.450108,0.000013,0.000004,2.563855,2.462237e+07,415,922
1084,E02003849,0.483376,0.000007,0.000003,2.455026,2.070579e+07,189,391
2958,E02000524,0.591623,0.000019,0.000003,2.758850,2.050881e+07,452,764
...,...,...,...,...,...,...,...,...
2614,E02000874,0.449367,0.000143,0.000064,2.478873,3.427440e+05,71,158
1846,E02000885,0.490566,0.000056,0.000021,2.500000,3.381236e+05,26,53
421,E02000869,0.477124,0.000171,0.000056,2.547945,3.222546e+05,73,153
987,E02000189,0.580645,0.000191,0.000050,2.583333,2.977969e+05,72,124


# Visualize graphs

## Define plotting function

In [17]:
def plot_folium_msoa(msoa, *, show_nodes=False):
    """Draw one MSOA's street network on an interactive Folium map.

    Parameters
    ----------
    msoa : str
        MSOA code; the mapping CSVs are read from
        ``graph_osmid_mappings/<msoa>/node_mapping.csv`` and ``edge_mapping.csv``.
        Both files need a ``geometry`` column of WKT strings.
    show_nodes : bool, keyword-only, default False
        Also draw every node as a small red circle. Off by default, which
        gives the same lines-only map as before.

    Returns
    -------
    folium.Map
        Map centred on the centroid of the nodes, with edges drawn in blue.

    Raises
    ------
    FileNotFoundError
        If either mapping CSV for ``msoa`` is missing.
    """
    # Construct file paths
    mapping_dir = os.path.join('graph_osmid_mappings', f'{msoa}')
    node_mapping_path = os.path.join(mapping_dir, 'node_mapping.csv')
    edge_mapping_path = os.path.join(mapping_dir, 'edge_mapping.csv')

    # Check if files exist
    if not (os.path.exists(node_mapping_path) and os.path.exists(edge_mapping_path)):
        raise FileNotFoundError(f"Mapping files for {msoa} not found")

    # Load the CSV files into DataFrames
    node_df = pd.read_csv(node_mapping_path)
    edge_df = pd.read_csv(edge_mapping_path)

    # Transform to gdfs: parse the WKT strings, then tag the coordinates as
    # EPSG:4326 (set_crs only labels the data, it does not reproject)
    node_df['geometry'] = node_df['geometry'].apply(loads)
    node_gdf = gpd.GeoDataFrame(node_df, geometry='geometry')
    node_gdf.set_crs(epsg=4326, inplace=True)
    edge_df['geometry'] = edge_df['geometry'].apply(loads)
    edge_gdf = gpd.GeoDataFrame(edge_df, geometry='geometry')
    edge_gdf.set_crs(epsg=4326, inplace=True)

    # Calculate the centroid of the entire dataset for map centering
    center = node_gdf.geometry.unary_union.centroid

    # Create a Folium map centered around the calculated centroid
    m = folium.Map(location=[center.y, center.x], zoom_start=15)

    # Plot lines on the map; Folium wants (lat, lon), the geometries store (x, y)
    for line in edge_gdf.geometry:
        line_coords = [[point[1], point[0]] for point in line.coords]
        folium.PolyLine(
            line_coords,
            color='blue',  # Default color for lines
            weight=2
        ).add_to(m)

    # Optionally plot points on the map
    if show_nodes:
        for point in node_gdf.geometry:
            folium.Circle(
                location=[point.y, point.x],
                radius=2,
                color='red',  # Default color for points
                fill=True,
                fill_color='red',
                fill_opacity=0.5
            ).add_to(m)

    # Display the map
    return m

## Plot graphs

In [44]:
# Map the street network of MSOA E02000378
plot_folium_msoa('E02000378')

In [31]:
# Map the street network of MSOA E02003062
plot_folium_msoa('E02003062')

In [33]:
# Map the street network of MSOA E02006809
plot_folium_msoa('E02006809')

In [34]:
# Map the street network of MSOA E02000493
plot_folium_msoa('E02000493')