In [1]:
import pandas as pd
import json
import geopandas as gpd
import os
import requests
def save_data(data, filename):
    """Write `data` to `filename` as JSON.

    The payload is serialised *before* the file is opened. If `data` is not
    JSON-serialisable the exception propagates and any existing file at
    `filename` is left untouched, instead of being truncated into an
    unreadable cache file.

    Args:
        data: Any JSON-serialisable object.
        filename: Path of the file to (over)write.

    Raises:
        TypeError / ValueError: If `data` cannot be serialised by `json.dumps`.
        OSError: If the file cannot be opened or written.
    """
    serialized = json.dumps(data)
    # json.dumps escapes non-ASCII by default, so the payload is pure ASCII;
    # the explicit encoding just keeps the write independent of the locale.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(serialized)

# Read a JSON document back from disk
def load_data(filename):
    """Return the object decoded from the JSON file at `filename`."""
    with open(filename, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
    
def infrastructure_to_gdf(data):
    """Convert Overpass-style elements into a point GeoDataFrame.

    Elements of type ``'node'`` use their own ``lon``/``lat``; any other
    element is placed at its ``center`` (present when the query ends with
    ``out center``). Elements that are neither a node nor carry a ``center``
    are skipped.

    Args:
        data: Iterable of element dicts with at least a ``'type'`` key.

    Returns:
        GeoDataFrame with columns ``geometry`` (points) and ``properties``
        (the element's ``tags`` dict, ``{}`` when absent). The geometry column
        and CRS (EPSG:4326, lon/lat degrees) are always set, including for
        empty input, so ``.geometry`` and spatial joins work downstream.

    Raises:
        KeyError: If a node (or a ``center``) lacks ``lon``/``lat``.
    """
    lons, lats, tags = [], [], []
    for element in data:
        if element['type'] == 'node':
            coords = element
        elif 'center' in element:
            coords = element['center']
        else:
            continue
        lons.append(coords['lon'])
        lats.append(coords['lat'])
        tags.append(element.get('tags', {}))

    # Build all points in one vectorised call instead of one array per element.
    gdf = gpd.GeoDataFrame(
        {'properties': tags},
        geometry=gpd.points_from_xy(lons, lats),
        crs="EPSG:4326",
    )
    # Keep the original column order (geometry first, then properties).
    return gdf[['geometry', 'properties']]

def load_all_features(sumsk_infrastructure_data, kiev_infrastructure_data, max_preview=None):
    """Print a preview of both regions' features and return them as one frame.

    Args:
        sumsk_infrastructure_data: Element dicts for the Sumsk region.
        kiev_infrastructure_data: Element dicts for the Kiev region.
        max_preview: Maximum number of features to print per region. ``None``
            (the default) prints every feature, as before; pass a small
            number to avoid flooding the notebook output.

    Returns:
        The concatenation of ``infrastructure_to_gdf`` applied to each region,
        with a fresh 0..n-1 index.
    """
    def print_feature_structure(infra_data, region):
        """Print the type and tags of each feature, up to `max_preview`."""
        for position, element in enumerate(infra_data):
            if max_preview is not None and position >= max_preview:
                print(f"Region: {region}, preview truncated after {max_preview} features")
                break
            feature_type = element.get('type')
            tags = element.get('tags', {})
            print(f"Region: {region}, Feature Type: {feature_type}")
            # ensure_ascii=False keeps non-ASCII tag values (e.g. Cyrillic
            # names) human-readable instead of \uXXXX escape sequences.
            print("Tags:", json.dumps(tags, indent=4, ensure_ascii=False))
            print("="*50)

    # Print structure for Sumsk infrastructure
    print("Sumsk Infrastructure Data Structure:")
    print_feature_structure(sumsk_infrastructure_data, "Sumsk")

    # Print structure for Kiev infrastructure
    print("\nKiev Infrastructure Data Structure:")
    print_feature_structure(kiev_infrastructure_data, "Kiev")

    # Convert both infrastructures into a unified GeoDataFrame
    sumsk_gdf = infrastructure_to_gdf(sumsk_infrastructure_data)
    kiev_gdf = infrastructure_to_gdf(kiev_infrastructure_data)

    unified_gdf = pd.concat([sumsk_gdf, kiev_gdf], ignore_index=True)

    return unified_gdf

def prepare_unified_dataframe(unified_gdf):
    """
    Flatten the infrastructure GeoDataFrame into a plain modelling table.

    Output columns, in order: ``latitude`` (geometry y), ``longitude``
    (geometry x), ``infrastructure_type`` and ``tags`` (the full tag dict
    serialised as a JSON string for later analysis).
    """
    priority_keys = ('military', 'railway', 'aeroway')

    def pick_infrastructure_type(tag_dict):
        # First truthy value among the priority keys wins; otherwise fall
        # back to the amenity tag, or 'unknown' when that key is absent.
        for key in priority_keys:
            candidate = tag_dict.get(key)
            if candidate:
                return candidate
        return tag_dict.get('amenity', 'unknown')

    points = unified_gdf.geometry
    latitudes = points.y
    longitudes = points.x
    tag_column = unified_gdf['properties']

    table = {
        'latitude': latitudes,
        'longitude': longitudes,
        'infrastructure_type': tag_column.apply(pick_infrastructure_type),
        'tags': tag_column.apply(json.dumps),
    }

    # Further engineered features (e.g. a distance to some key area) could be
    # added to the table at this point.
    return pd.DataFrame(table)

def fetch_infrastructure_data(bbox, filename, timeout=180):
    """Return infrastructure elements for `bbox`, using `filename` as a cache.

    If `filename` exists it is loaded and returned without any network call.
    Otherwise the Overpass API is queried for military features, railway
    stations, hospitals and aerodromes inside `bbox`, and the resulting
    ``elements`` list is saved to `filename` and returned.

    Args:
        bbox: Bounding-box filter spliced verbatim after each query
            statement, e.g. ``"(50.0,33.0,52.0,35.5)"``.
        filename: Path of the JSON cache file.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The list stored under ``'elements'`` in the Overpass response.

    Raises:
        requests.HTTPError: If Overpass answers with an error status (for
            example rate limiting); nothing is cached in that case.
        requests.Timeout: If the request exceeds `timeout`.
    """
    if os.path.exists(filename):
        return load_data(filename)

    overpass_url = "https://overpass-api.de/api/interpreter"
    overpass_query = f"""
    [out:json];
    (
      node["military"]{bbox};
      way["military"]{bbox};
      relation["military"]{bbox};
      node["railway"="station"]{bbox};
      way["railway"="station"]{bbox};
      node["amenity"="hospital"]{bbox};
      way["amenity"="hospital"]{bbox};
      node["aeroway"="aerodrome"]{bbox};
      way["aeroway"="aerodrome"]{bbox};
    );
    out center;
    """
    response = requests.get(overpass_url, params={'data': overpass_query}, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to decode an error page as JSON.
    response.raise_for_status()
    elements = response.json()['elements']

    save_data(elements, filename)
    return elements

# The two bounding boxes are stored in different layouts (assuming the
# Overpass (south, west, north, east) convention for the strings below):
#   Sumsk: [[south, west], [north, east]]
#   Kiev:  [south, north, west, east]
sumsk_bbox_coords = [[50.0, 33.0], [52.0, 35.5]]
kiev_bbox_coords = [50.4, 51.553167, 29.267549, 32.161026]

# Render each box as the "(a,b,c,d)" filter string spliced into the query.
sumsk_bbox = "({},{},{},{})".format(*sumsk_bbox_coords[0], *sumsk_bbox_coords[1])
kiev_bbox = "({0},{2},{1},{3})".format(*kiev_bbox_coords)

# Fetch (or read from the JSON cache) the raw elements for each region.
sumsk_infrastructure_data = fetch_infrastructure_data(sumsk_bbox, 'sumsk_infrastructure.json')
kiev_infrastructure_data = fetch_infrastructure_data(kiev_bbox, 'kiev_infrastructure.json')

# Merge both regions into one GeoDataFrame (this also prints each feature).
unified_gdf = load_all_features(sumsk_infrastructure_data, kiev_infrastructure_data)

# Flatten into the modelling table and show its first rows.
unified_df = prepare_unified_dataframe(unified_gdf)
print("Unified DataFrame for Model Building", unified_df.head(), sep="\n")

Sumsk Infrastructure Data Structure:
Region: Sumsk, Feature Type: node
Tags: {
    "esr:code": "327805",
    "esr:user": "327805",
    "express:code": "2200020",
    "name": "\u041a\u0440\u043e\u043b\u0435\u0432\u0435\u0446\u044c",
    "name:fr": "Krolevets",
    "name:ru": "\u041a\u0440\u043e\u043b\u0435\u0432\u0435\u0446",
    "name:uk": "\u041a\u0440\u043e\u043b\u0435\u0432\u0435\u0446\u044c",
    "network": "\u0423\u0417",
    "operator": "\u042e\u0433\u043e-\u0417\u0430\u043f\u0430\u0434\u043d\u0430\u044f \u0436\u0435\u043b\u0435\u0437\u043d\u0430\u044f \u0434\u043e\u0440\u043e\u0433\u0430",
    "public_transport": "stop_position",
    "railway": "station",
    "train": "yes",
    "uic_ref": "2200020",
    "wikidata": "Q12114968",
    "wikipedia": "uk:\u041a\u0440\u043e\u043b\u0435\u0432\u0435\u0446\u044c (\u0441\u0442\u0430\u043d\u0446\u0456\u044f)"
}
Region: Sumsk, Feature Type: node
Tags: {
    "esr:user": "328102",
    "express:code": "2200013",
    "name": "\u0422\u0435\u0440

In [18]:
def add_roads_and_convoy_data(unified_gdf, sumsk_roads_gdf, kiev_roads_gdf, convoy_gdf):
    """
    Adds roads and convoy data to the unified GeoDataFrame.

    Args:
        unified_gdf: Infrastructure points. If it has no CRS it is assumed to
            be EPSG:4326.
        sumsk_roads_gdf, kiev_roads_gdf: Road geometries, concatenated before
            the join.
        convoy_gdf: Convoy geometries.

    Returns:
        The left spatial join of infrastructure with roads (one row per
        infrastructure/road match, unmatched infrastructure kept), plus a
        boolean ``convoy_present`` column that is True when the
        infrastructure row intersects at least one convoy geometry.
    """
    # Step 1: Combine Sumsk and Kiev roads into one GeoDataFrame
    all_roads_gdf = pd.concat([sumsk_roads_gdf, kiev_roads_gdf], ignore_index=True)

    # Step 2: Ensure the infrastructure frame has a CRS (EPSG:4326)
    if unified_gdf.crs is None:
        unified_gdf = unified_gdf.set_crs("EPSG:4326", allow_override=True)

    # Step 3: Match roads to infrastructure points.
    # NOTE(review): 'intersects' only matches geometries that actually touch;
    # if "nearby" is intended, a distance-based join is probably needed - confirm.
    roads_near_infra = gpd.sjoin(unified_gdf, all_roads_gdf, how='left', predicate='intersects')

    # Step 4: Match convoys to infrastructure points
    convoy_near_infra = gpd.sjoin(unified_gdf, convoy_gdf, how='left', predicate='intersects')

    # Step 5: Attach one convoy flag per infrastructure row.
    # Both left joins repeat an infrastructure index once per match, so the two
    # results can have different lengths and duplicated labels. Collapse the
    # convoy matches to a single flag per index, then look it up for every row
    # of the road join instead of relying on index-aligned assignment.
    combined_df = roads_near_infra.copy()
    convoy_hit = convoy_near_infra['index_right'].notna().groupby(level=0).any()
    combined_df['convoy_present'] = convoy_hit.reindex(combined_df.index).to_numpy()

    return combined_df

# Function to create convoy GeoDataFrame
def create_convoy_gdf(convoy_data):
    """
    Converts convoy data into a GeoDataFrame for spatial operations.

    Only rows whose ``geometry`` value is a shapely ``Point`` are kept; every
    other row (missing or non-point geometry) is dropped. The caller's frame
    is not modified.

    Args:
        convoy_data: DataFrame/GeoDataFrame with a ``geometry`` column.

    Returns:
        GeoDataFrame of the retained rows with CRS EPSG:4326.

    Raises:
        KeyError: If `convoy_data` has no ``geometry`` column.
    """
    # Imported locally so the function does not depend on another cell having
    # imported Point into the notebook namespace first.
    from shapely.geometry import Point

    if 'geometry' not in convoy_data.columns:
        raise KeyError("Convoy data must contain a 'geometry' column with valid spatial points.")

    # Select the valid rows on a copy instead of overwriting the input column.
    is_point = [isinstance(geom, Point) for geom in convoy_data['geometry']]
    valid_rows = convoy_data.loc[is_point].copy()

    # Convert to GeoDataFrame
    convoy_gdf = gpd.GeoDataFrame(valid_rows, geometry='geometry', crs="EPSG:4326")

    return convoy_gdf
def prepare_final_dataframe(unified_gdf, sumsk_roads_gdf, kiev_roads_gdf, convoy_data):
    """
    Build the final frame: infrastructure joined with roads and a convoy flag.

    `convoy_data` is first turned into a GeoDataFrame via `create_convoy_gdf`;
    the result is handed, together with the infrastructure and the two road
    frames, to `add_roads_and_convoy_data`, whose output is returned as is.
    """
    return add_roads_and_convoy_data(
        unified_gdf,
        sumsk_roads_gdf,
        kiev_roads_gdf,
        create_convoy_gdf(convoy_data),
    )

In [19]:

# Load the data

def load_convoy_data():
    """Read the convoy records from ``fixed_convoy_data.geojson`` (relative to the working directory)."""
    convoy_path = "fixed_convoy_data.geojson"
    return gpd.read_file(convoy_path)

from shapely.geometry import box
import osmnx as ox
def get_road_network(bbox_coords, cache_file):
    """Return the major-road network for a bounding box, with caching and a fallback.

    Resolution order:
      1. If `cache_file` exists, load it as GraphML; a load failure is printed
         and the function continues to step 2.
      2. Download motorway/trunk/primary/secondary roads with OSMnx and save
         them to `cache_file`.
      3. If step 2 raises, print the error and return a one-row GeoDataFrame
         holding the outline of the bounding box, tagged ``highway='primary'``.

    `bbox_coords` is indexed as a flat 4-item sequence. The fallback passes
    items 0..3 to shapely's ``box(minx, miny, maxx, maxy)``, i.e. the sequence
    is consumed as ``[west, south, east, north]``. The OSMnx call receives
    items 3, 1, 2, 0 positionally - presumably (north, south, east, west) as
    in OSMnx 1.x; confirm against the installed version.

    Returns:
        Whatever ``ox.load_graphml`` / ``ox.graph_from_bbox`` return on
        success, or a GeoDataFrame in the fallback case, so callers must
        handle both.
    """
    if os.path.exists(cache_file):
        try:
            return ox.load_graphml(cache_file)
        except Exception as load_error:
            # A broken cache is not fatal: report it and fetch again below.
            print(f"Error loading cached road network: {str(load_error)}")

    try:
        graph = ox.graph_from_bbox(
            bbox_coords[3], bbox_coords[1], bbox_coords[2], bbox_coords[0],
            network_type='drive', simplify=True,
            custom_filter='["highway"~"motorway|trunk|primary|secondary"]'
        )
        # Persist the download so later runs can take the cached path.
        ox.save_graphml(graph, cache_file)
        return graph
    except Exception as fetch_error:
        print(f"Error fetching road network from OSM: {str(fetch_error)}")
        print("Falling back to simplified road network...")

        # Stand-in network: the rectangle outline of the requested box.
        outline = box(bbox_coords[0], bbox_coords[1], bbox_coords[2], bbox_coords[3]).boundary
        fallback_roads = gpd.GeoDataFrame(geometry=[outline], crs="EPSG:4326")
        fallback_roads['highway'] = 'primary'
        return fallback_roads

In [20]:
# Show which columns the convoy dataset provides.
# NOTE(review): in the visible cells `convoy_data` is first assigned in a later
# cell (from load_convoy_data()); confirm an earlier cell defines it, otherwise
# this line fails on a fresh kernel with Run All.
print(convoy_data.columns)

Index(['id', 'status', 'violenceLevel', 'url', 'type', 'geolocUrl', 'credit',
       'description', 'areaTypeAffected', 'civCas', 'verifiedDate', 'country',
       'province', 'district', 'city', 'isPublished', 'categories',
       'geometry'],
      dtype='object')


In [21]:
# Load the necessary data (the infrastructure calls read the JSON caches when present)
sumsk_infrastructure_data = fetch_infrastructure_data(sumsk_bbox, 'sumsk_infrastructure.json')
kiev_infrastructure_data = fetch_infrastructure_data(kiev_bbox, 'kiev_infrastructure.json')
convoy_data = load_convoy_data()  # Assuming this returns convoy data in a DataFrame

# Imported here because Point is used directly in this cell.
from shapely.geometry import Point

# Keep only convoy rows whose geometry is a shapely Point
convoy_data['geometry'] = convoy_data['geometry'].apply(lambda x: x if isinstance(x, Point) else None)

# Filter out rows without valid geometry
convoy_data = convoy_data[convoy_data['geometry'].notnull()]

# Create unified infrastructure GeoDataFrame
unified_gdf = load_all_features(sumsk_infrastructure_data, kiev_infrastructure_data)

# Get the roads data.
# get_road_network indexes its bbox as a flat list and feeds items 0..3 to
# shapely's box(minx, miny, maxx, maxy), i.e. [west, south, east, north].
# The stored boxes use other layouts (Sumsk: [[south, west], [north, east]],
# Kiev: [south, north, west, east]), so reorder them before the call; passing
# them unchanged raises IndexError for Sumsk and scrambles the Kiev bounds.
sumsk_road_bbox = [
    sumsk_bbox_coords[0][1],  # west
    sumsk_bbox_coords[0][0],  # south
    sumsk_bbox_coords[1][1],  # east
    sumsk_bbox_coords[1][0],  # north
]
kiev_road_bbox = [
    kiev_bbox_coords[2],  # west
    kiev_bbox_coords[0],  # south
    kiev_bbox_coords[3],  # east
    kiev_bbox_coords[1],  # north
]
sumsk_roads = get_road_network(sumsk_road_bbox, 'sumsk_roads.graphml')
kiev_roads = get_road_network(kiev_road_bbox, 'kiev_roads.graphml')

# get_road_network returns either a graph or (fallback) a GeoDataFrame;
# normalise both cases to an edge GeoDataFrame.
sumsk_roads_gdf = sumsk_roads if isinstance(sumsk_roads, gpd.GeoDataFrame) else ox.graph_to_gdfs(sumsk_roads, nodes=False)
kiev_roads_gdf = kiev_roads if isinstance(kiev_roads, gpd.GeoDataFrame) else ox.graph_to_gdfs(kiev_roads, nodes=False)

# Assuming `convoy_data` has been loaded with the correct structure
convoy_gdf = create_convoy_gdf(convoy_data)

# Continue with adding roads and convoy data to your unified DataFrame
final_df = prepare_final_dataframe(unified_gdf, sumsk_roads_gdf, kiev_roads_gdf, convoy_gdf)

# Display the final dataframe for model building
print("Final DataFrame with Roads and Convoy Information")
print(final_df.head())

Sumsk Infrastructure Data Structure:
Region: Sumsk, Feature Type: node
Tags: {
    "esr:code": "327805",
    "esr:user": "327805",
    "express:code": "2200020",
    "name": "\u041a\u0440\u043e\u043b\u0435\u0432\u0435\u0446\u044c",
    "name:fr": "Krolevets",
    "name:ru": "\u041a\u0440\u043e\u043b\u0435\u0432\u0435\u0446",
    "name:uk": "\u041a\u0440\u043e\u043b\u0435\u0432\u0435\u0446\u044c",
    "network": "\u0423\u0417",
    "operator": "\u042e\u0433\u043e-\u0417\u0430\u043f\u0430\u0434\u043d\u0430\u044f \u0436\u0435\u043b\u0435\u0437\u043d\u0430\u044f \u0434\u043e\u0440\u043e\u0433\u0430",
    "public_transport": "stop_position",
    "railway": "station",
    "train": "yes",
    "uic_ref": "2200020",
    "wikidata": "Q12114968",
    "wikipedia": "uk:\u041a\u0440\u043e\u043b\u0435\u0432\u0435\u0446\u044c (\u0441\u0442\u0430\u043d\u0446\u0456\u044f)"
}
Region: Sumsk, Feature Type: node
Tags: {
    "esr:user": "328102",
    "express:code": "2200013",
    "name": "\u0422\u0435\u0440