# Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import os

# Set working directory

In [None]:
# Root folder of the local project files; the paths built from `wd` further
# down in the notebook are relative to this location.
wd = (
    '/Users/carmen/Library/CloudStorage/'
    'OneDrive-TheUniversityofLiverpool/'
    'Research/RECAST/'
    'latin-mobility-covid-local-files'
)

# Define country and parameters

In [None]:
# Select target country
country = 'Colombia'

# Country-specific parameters, keyed by country name. Each entry holds:
#   - ISO 3-letter code  -> country_short
#   - ISO 2-letter code  -> country_code
#   - buffer size in meters for spatial processing (chosen empirically) -> buffer
COUNTRY_PARAMS = {
    'Argentina': ('ARG', 'AR', 8000),
    'Chile': ('CHL', 'CL', 8000),
    'Colombia': ('COL', 'CO', 2000),
    # Uncomment the following if Mexico is to be included in the analysis
    # 'Mexico': ('MEX', 'MX', 8000),
}

# Fail right here for an unsupported country instead of leaving the three
# parameters undefined and hitting a NameError in a later cell.
if country not in COUNTRY_PARAMS:
    raise ValueError(
        f"Unsupported country {country!r}; expected one of {sorted(COUNTRY_PARAMS)}"
    )

country_short, country_code, buffer = COUNTRY_PARAMS[country]

# Test: load movement grid and plot buffers for sample movement data

**Note:** Do the buffers around the movement data fall within the grid cells? If so, the grid works and has the right resolution for the data.

In [None]:
# Define the directory path for Facebook movement data for the selected country
directory = f"/Volumes/RECAST/data/movements/{country}"

# Keep, in sorted order, only the files whose name contains '_0800.csv'
# (likely representing 08:00 time data)
files = sorted(file for file in os.listdir(directory) if '_0800.csv' in file)

# Load the Facebook movement grid shapefile for the country and convert it to EPSG:4326 (WGS 84) coordinate system
grid_path = f"{wd}/data/inputs/grids/Grid_{country}_FB_mov/Grid_{country}.shp"
grid_mov = gpd.read_file(grid_path).to_crs("EPSG:4326")

# Pick one file for testing: the 101st match when there are that many,
# otherwise the last one available (avoids an IndexError on short listings)
if not files:
    raise FileNotFoundError(f"No '_0800.csv' movement files found in {directory}")
sample_file = files[min(100, len(files) - 1)]
df_movs = pd.read_csv(f"{directory}/{sample_file}")

# Filter the DataFrame to include only rows matching the selected country code,
# then reset the index
df_movs = df_movs[df_movs['country'] == country_code].reset_index(drop=True)

# The WKT strings live in 'geometry', or in 'GEOMETRY' when 'geometry' doesn't exist;
# resolve the column once instead of once per row
geom_col = 'geometry' if 'geometry' in df_movs.columns else 'GEOMETRY'

# Initialise lists to store extracted coordinates
start_lstr_lons = []
start_lstr_lats = []
end_lstr_lons = []
end_lstr_lats = []

# Example geometry format: "LINESTRING (lon1 lat1, lon2 lat2)"
for j, geom_str in enumerate(df_movs[geom_col]):
    try:
        coords_str = geom_str.split("(")[1].split(")")[0].strip()
        start_str, end_str = coords_str.split(", ")
        start_lon, start_lat = start_str.split(" ")
        end_lon, end_lat = end_str.split(" ")
        # Convert all four values before appending anything, so a row that
        # fails half-way cannot leave the four lists with different lengths
        parsed = (float(start_lon), float(start_lat), float(end_lon), float(end_lat))
    except (AttributeError, IndexError, ValueError) as e:
        # Non-string or malformed geometry: keep the row, with NaN coordinates
        parsed = (float('nan'),) * 4
        print(f"Parsing error at row {j}: {e}")

    start_lstr_lons.append(parsed[0])
    start_lstr_lats.append(parsed[1])
    end_lstr_lons.append(parsed[2])
    end_lstr_lats.append(parsed[3])

# Add extracted coordinates as new columns to the DataFrame
df_movs['start_lstr_lon'] = start_lstr_lons
df_movs['start_lstr_lat'] = start_lstr_lats
df_movs['end_lstr_lon'] = end_lstr_lons
df_movs['end_lstr_lat'] = end_lstr_lats

# Convert the DataFrame to a GeoDataFrame using start coordinates as geometry points
gdf_movs = gpd.GeoDataFrame(
    df_movs,
    geometry=gpd.points_from_xy(df_movs['start_lstr_lon'], df_movs['start_lstr_lat']),
    crs="EPSG:4326"
)

# Create a figure and axis with specified size
fig, ax = plt.subplots(figsize=(15, 15))

# Plot the movement grid polygons with thin white edges, base layer (zorder=1)
grid_mov.plot(ax=ax, linewidth=0.1, edgecolor='white', zorder=1)

# Buffer the points (movements) in meters (projected CRS EPSG:3857),
# then reproject back to geographic coordinates (EPSG:4326).
# Rows whose start coordinates could not be parsed (NaN) are left out, since
# they have no location to buffer or draw.
has_start = gdf_movs['start_lstr_lon'].notna() & gdf_movs['start_lstr_lat'].notna()
gdf_buff = gdf_movs[has_start].to_crs('EPSG:3857').buffer(buffer).to_crs('EPSG:4326')

# Plot the buffered points as red patches on top (zorder=2)
gdf_buff.plot(ax=ax, color='red', zorder=2)

# Optionally, you can set map bounds for zoom - change according to country
# (longitude -75..-74, latitude 4..5 as set here)
ax.set_xlim(-75, -74)
ax.set_ylim(4, 5)

# Display the plot
plt.show()


# Computation of movement grid cell size 
... to choose the size of buffers

In [None]:
# Estimate the side length (in km) of a grid cell, using the cell at index 1000 as a sample.
# Steps: reproject the grid to EPSG:3857 (meters), read that cell's area in m²,
# take the square root (side of the equivalent square) and convert meters to kilometers.

grid_projected = grid_mov.to_crs('epsg:3857')
cell_area_m2 = grid_projected.loc[1000, 'geometry'].area
area_sqrt_km = np.sqrt(cell_area_m2) / 1000
print(area_sqrt_km)

# Add Feature IDs (FID) to movement data and save processed files

**Note:** This process can be optimised by using the `quadkey2` package. Update in the future!

In [None]:
def _parse_linestring_endpoints(geom_str):
    """Return ``(start_lon, start_lat, end_lon, end_lat)`` parsed from a WKT string.

    Expects text shaped like ``"LINESTRING (lon1 lat1, lon2 lat2)"``.
    Raises AttributeError, IndexError or ValueError when the value is not a
    string in that shape.
    """
    start_tokens = geom_str.split("(")[1].split(",")[0].split(" ")
    end_tokens = geom_str.split(", ")[1].split(")")[0].split(" ")
    return (
        float(start_tokens[0]),
        float(start_tokens[1]),
        float(end_tokens[0]),
        float(end_tokens[1]),
    )


def _buffer_points(lons, lats, radius):
    """Buffer lon/lat points (EPSG:4326) by ``radius`` in EPSG:3857 units and return them in EPSG:4326."""
    points = gpd.GeoSeries(gpd.points_from_xy(lons, lats), crs='EPSG:4326')
    return points.to_crs('EPSG:3857').buffer(radius).to_crs('EPSG:4326')


def _first_intersecting_fid(shape, grid_geoms, grid_fids):
    """Return the FID of the first grid cell, in grid order, that intersects ``shape``.

    Raises IndexError when no grid cell intersects the shape.
    """
    hits = np.flatnonzero(grid_geoms.intersects(shape).to_numpy())
    return grid_fids[hits[0]]


# Define the directory containing movement CSV files for the selected country
directory = f'/Volumes/RECAST/data/movements/{country}'

# List all files in the directory and sort them alphabetically for consistent ordering
files = sorted(os.listdir(directory))

# Filter the list to include only files with '_0800.csv' in their filename (likely representing data for 08:00)
files = [file for file in files if '_0800.csv' in file]

# Grid geometries and their FIDs are the same for every file, so look them up once.
# A grid without an 'FID' column fails here, before any file is processed.
grid_geoms = grid_mov['geometry']
grid_fids = grid_mov['FID'].to_numpy()

# Loop through each file for processing
for i, file in enumerate(files):

    # Print progress every 20 files as a percentage
    if i % 20 == 0:
        print(f'Progress: {(i / len(files)) * 100:.2f}%')

    try:
        # Load movement data for the current file
        df_movs = pd.read_csv(f"{directory}/{file}")

        # Filter rows for the target country and reset index
        df_movs = df_movs[df_movs['country'] == country_code].reset_index(drop=True)

        # The WKT strings are in 'geometry', or in 'GEOMETRY' when 'geometry' is absent
        geom_col = 'geometry' if 'geometry' in df_movs.columns else 'GEOMETRY'

        # Parse each row completely before storing it, so the four coordinate
        # columns always stay aligned. An unparseable row aborts this file
        # (nothing is saved) and the cause is reported by the handler below.
        endpoints = []
        for j, geom_str in enumerate(df_movs[geom_col]):
            try:
                endpoints.append(_parse_linestring_endpoints(geom_str))
            except (AttributeError, IndexError, ValueError) as exc:
                raise ValueError(
                    f"cannot parse {geom_col!r} at row {j}: {geom_str!r}"
                ) from exc

        # Add extracted coordinate columns to the DataFrame
        df_movs['start_lstr_lat'] = [p[1] for p in endpoints]
        df_movs['start_lstr_lon'] = [p[0] for p in endpoints]
        df_movs['end_lstr_lat'] = [p[3] for p in endpoints]
        df_movs['end_lstr_lon'] = [p[2] for p in endpoints]

        # Initialize start and end Feature ID (FID) columns with -1 as default
        df_movs['start_FID'] = -1
        df_movs['end_FID'] = -1

        # Keep track of rows to drop if processing fails
        index_to_drop = []

        if len(df_movs):
            # Buffer all start and all end points of the file in one pass each,
            # instead of building and reprojecting a one-row GeoDataFrame per point
            start_buffers = _buffer_points(
                df_movs['start_lstr_lon'], df_movs['start_lstr_lat'], buffer
            )
            end_buffers = _buffer_points(
                df_movs['end_lstr_lon'], df_movs['end_lstr_lat'], buffer
            )

            # Assign FID for start and end locations by checking intersection with grid polygons
            for j in range(len(df_movs)):
                try:
                    df_movs.loc[j, 'start_FID'] = _first_intersecting_fid(
                        start_buffers.iloc[j], grid_geoms, grid_fids
                    )
                    df_movs.loc[j, 'end_FID'] = _first_intersecting_fid(
                        end_buffers.iloc[j], grid_geoms, grid_fids
                    )
                except Exception as exc:
                    # Best effort per row (e.g. no grid cell under the buffer):
                    # report the cause and mark the row for dropping
                    print(f"Error processing file index {i}, row {j}: {exc}")
                    index_to_drop.append(j)

        # Drop rows where either start or end FID assignment failed
        df_movs = df_movs.drop(index_to_drop).reset_index(drop=True)

        # Save processed DataFrame to output directory
        output_path = f"{wd}/data/outputs/{country_short}/mov/{file}"
        df_movs.to_csv(output_path, index=False)

    except Exception as exc:
        # Skip this file but say why; unlike a bare `except:`, this lets
        # KeyboardInterrupt stop the run
        print(f"Failed to process file index {i} ({file}): {exc}")
