In [1]:
import torch
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import networkx as nx
from scipy import stats


In [2]:


def analyze_weather_correlation(nodes_df, accidents_df):
    """
    Analyze correlation between weather and accident frequency
    """
    # Merge weather data with accident counts
    merged_df = pd.merge(nodes_df, accidents_df, 
                        left_on='node_id', 
                        right_on=['node_1', 'node_2'], 
                        how='left')
    
    correlations = {
        'temperature': stats.pearsonr(merged_df['tavg'], merged_df['acc_count'])[0],
        'precipitation': stats.pearsonr(merged_df['prcp'], merged_df['acc_count'])[0]
    }
    
    return correlations

def analyze_road_types(edge_features, accidents_df):
    """
    Analyze accident frequency by road type
    """
    road_type_accidents = {}
    for road_type in edge_features.keys():
        if road_type.startswith(('highway_', 'residential_')):
            # Convert sparse tensor to dense for analysis
            road_matrix = edge_features[road_type].to_dense()
            accident_count = accidents_df[accidents_df['node_1_idx'].isin(road_matrix.nonzero()[0])]['acc_count'].sum()
            road_type_accidents[road_type] = accident_count
    
    return road_type_accidents

def identify_high_risk_intersections(adj_matrix, accidents_df, threshold_percentile=90):
    """
    Identify high-risk intersections using betweenness centrality and accident frequency
    """
    # Convert to networkx graph for centrality calculation
    G = nx.from_scipy_sparse_matrix(adj_matrix.to_sparse())
    
    # Calculate betweenness centrality
    centrality = nx.betweenness_centrality(G)
    
    # Create risk score combining centrality and accident frequency
    risk_scores = {}
    for node in G.nodes():
        accident_count = accidents_df[
            (accidents_df['node_1_idx'] == node) | 
            (accidents_df['node_2_idx'] == node)
        ]['acc_count'].sum()
        
        risk_scores[node] = centrality[node] * accident_count
    
    # Identify high-risk nodes
    threshold = np.percentile(list(risk_scores.values()), threshold_percentile)
    high_risk_nodes = {k: v for k, v in risk_scores.items() if v > threshold}
    
    return high_risk_nodes

def detect_hotspots(nodes_df, accidents_df, eps=0.1, min_samples=5):
    """
    Detect accident hotspots using DBSCAN
    """
    # Prepare data for clustering
    accident_locations = nodes_df.merge(
        accidents_df, 
        left_on='node_id', 
        right_on=['node_1', 'node_2']
    )[['lat', 'lon', 'acc_count']]
    
    # Weight locations by accident count
    weighted_locations = np.repeat(
        accident_locations[['lat', 'lon']].values,
        accident_locations['acc_count'].astype(int),
        axis=0
    )
    
    # Perform DBSCAN clustering
    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(weighted_locations)
    
    # Analyze clusters
    cluster_stats = pd.DataFrame({
        'cluster': clustering.labels_,
        'lat': weighted_locations[:, 0],
        'lon': weighted_locations[:, 1]
    }).groupby('cluster').agg({
        'lat': 'mean',
        'lon': 'mean',
        'cluster': 'size'
    }).rename(columns={'cluster': 'accident_count'})
    
    return cluster_stats

In [3]:
def load_network_data(base_path, state, year, month):
    """
    Load all required data files for network analysis
    """
    # Load adjacency matrix
    adj_matrix = torch.load(f"{base_path}/{state}/adj_matrix.pt")
    
    # Load node features with weather data
    nodes_df = pd.read_csv(f"{base_path}/{state}/Nodes/node_features_{year}_{month}.csv")
    
    # Load edge features
    edge_features = torch.load(f"{base_path}/{state}/Edges/edge_features.pt")
    
    # Load traffic data
    # traffic_features = torch.load(f"{base_path}/{state}/Edges/edge_features_traffic_{year}.pt")
    
    # Load accident data
    accidents_df = pd.read_csv(f"{base_path}/{state}/Accidents_Nearest_Street_{state}_Monthly.csv")
    accidents_df = accidents_df[(accidents_df['year'] == year) & (accidents_df['month'] == month)]
    
    return adj_matrix, nodes_df, edge_features, accidents_df

adj_matrix, nodes_df, edge_features, accidents_df = load_network_data('/Users/beck/Documents/GitHub/ML4RoadSafety', 'CA', 2021, 11)

  adj_matrix = torch.load(f"{base_path}/{state}/adj_matrix.pt")
  edge_features = torch.load(f"{base_path}/{state}/Edges/edge_features.pt")


### convert edge features to df

In [6]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm

def edge_features_to_dataframe(edge_features, nodes_df):
    """
    Convert PyTorch sparse edge features into a pandas DataFrame.
    Memory-efficient version that avoids creating dense matrices.
    Includes progress bars for tracking conversion progress.
    
    Parameters:
        edge_features (dict): Dictionary of sparse tensors for each feature
        nodes_df (pd.DataFrame): DataFrame containing node IDs for mapping
    
    Returns:
        pd.DataFrame: DataFrame with all edge features
    """
    # Initialize empty list to store edge data
    edge_data = []
    
    # Get indices and values directly from sparse tensor
    first_feature = list(edge_features.keys())[0]
    indices = edge_features[first_feature]._indices()
    
    # Create mapping dictionary for node IDs
    node_id_mapping = nodes_df['node_id'].to_dict()
    
    # Process edges in batches to manage memory
    batch_size = 10000
    num_edges = indices.shape[1]
    
    # Create progress bar for overall progress
    pbar = tqdm(total=num_edges, desc="Converting edges", unit="edges")
    
    for batch_start in range(0, num_edges, batch_size):
        batch_end = min(batch_start + batch_size, num_edges)
        
        # Process each edge in the batch
        for i in range(batch_start, batch_end):
            source_idx = indices[0][i].item()
            target_idx = indices[1][i].item()
            
            # Get source and target node IDs from mapping
            source_id = node_id_mapping[source_idx]
            target_id = node_id_mapping[target_idx]
            
            # Create dictionary for this edge
            edge_dict = {
                'source_node': source_id,
                'target_node': target_id,
            }
            
            # Add values for each feature
            for feat_name, feat_tensor in edge_features.items():
                # Get value directly from sparse tensor
                edge_dict[feat_name] = feat_tensor.coalesce().values()[
                    (feat_tensor.coalesce().indices()[0] == source_idx) & 
                    (feat_tensor.coalesce().indices()[1] == target_idx)
                ].item()
            
            edge_data.append(edge_dict)
            pbar.update(1)
        
        # Optional: Convert batch to DataFrame and free memory
        if len(edge_data) >= 1000000:  # Process in chunks of 1 million edges
            temp_df = pd.DataFrame(edge_data)
            if 'final_df' not in locals():
                final_df = temp_df
            else:
                final_df = pd.concat([final_df, temp_df], ignore_index=True)
            edge_data = []  # Free memory
    
    pbar.close()
    
    # Convert remaining data to DataFrame
    if edge_data:
        temp_df = pd.DataFrame(edge_data)
        if 'final_df' not in locals():
            final_df = temp_df
        else:
            final_df = pd.concat([final_df, temp_df], ignore_index=True)
    
    return final_df

def print_memory_estimate(edge_features, nodes_df):
    """
    Print estimated memory usage before conversion
    """
    num_nodes = len(nodes_df)
    num_features = len(edge_features)
    
    # Get actual number of edges
    first_feature = list(edge_features.keys())[0]
    num_edges = edge_features[first_feature]._nnz()
    
    print(f"Network statistics:")
    print(f"Number of nodes: {num_nodes}")
    print(f"Number of edges: {num_edges}")
    print(f"Number of features: {num_features}")
    
    # Estimate memory for dense representation
    dense_memory_gb = (num_nodes * num_nodes * num_features * 4) / (1024**3)
    print(f"\nDense matrix would require approximately {dense_memory_gb:.2f} GB")
    
    # Estimate memory for sparse representation
    sparse_memory_gb = (num_edges * num_features * 4) / (1024**3)
    print(f"Sparse representation uses approximately {sparse_memory_gb:.2f} GB")

# Example usage:
# First check memory requirements
print_memory_estimate(edge_features, nodes_df)
# Then convert if manageable
edge_df = edge_features_to_dataframe(edge_features, nodes_df)

Network statistics:
Number of nodes: 1242784
Number of edges: 3061644
Number of features: 28

Dense matrix would require approximately 161105.16 GB
Sparse representation uses approximately 0.32 GB


Converting edges:   0%|          | 45/3061644 [06:37<7442:39:43,  8.75s/edges]

KeyboardInterrupt: 

In [9]:
weather_correlation = analyze_weather_correlation(nodes_df, accidents_df)
print(weather_correlation)

ValueError: len(right_on) must equal len(left_on)

In [11]:
def analyze_road_types(edge_features, accidents_df):
    """
    Analyze accident frequency by road type
    """
    road_type_accidents = {}
    for road_type in edge_features.keys():
        if road_type.startswith(('highway_', 'residential_')):
            # Convert sparse tensor to dense for analysis
            road_matrix = edge_features[road_type].to_dense()
            accident_count = accidents_df[accidents_df['node_1_idx'].isin(road_matrix.nonzero()[0])]['acc_count'].sum()
            road_type_accidents[road_type] = accident_count
    
    return road_type_accidents

road_type_accidents = analyze_road_types(edge_features, accidents_df)
print(road_type_accidents)

{}


In [10]:
high_risk_intersections = identify_high_risk_intersections(adj_matrix, accidents_df, threshold_percentile=90)
print(high_risk_intersections)

AttributeError: module 'networkx' has no attribute 'from_scipy_sparse_matrix'