# Imports

In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# Set working directory

In [None]:
# Root folder of the project's local files; the data input and output paths
# used elsewhere in the notebook are built relative to it.
wd = "/".join(
    (
        "/Users/carmen/Library/CloudStorage/OneDrive-TheUniversityofLiverpool",
        "Research/RECAST/latin-mobility-covid-local-files",
    )
)

# Define country and parameters

In [None]:
# Select the target country for analysis
country = "Colombia"

# Per-country settings, keyed by country name:
#   (ISO 3-letter country code, ISO 2-letter country code, buffer in meters).
# The buffer size is used for spatial processing and was chosen empirically.
COUNTRY_PARAMS = {
    "Argentina": ("ARG", "AR", 4000),
    "Chile": ("CHL", "CL", 4000),
    "Colombia": ("COL", "CO", 2000),
    # Uncomment the following if Mexico is to be included in the analysis
    # "Mexico": ("MEX", "MX", 4000),
}


def country_parameters(name):
    """Return ``(country_short, country_code, buffer)`` for country *name*.

    Raises:
        ValueError: if *name* has no entry in ``COUNTRY_PARAMS``. Failing here
            avoids continuing with undefined (or, in a notebook, stale)
            parameters from a previously selected country.
    """
    try:
        return COUNTRY_PARAMS[name]
    except KeyError:
        supported = ", ".join(sorted(COUNTRY_PARAMS))
        raise ValueError(
            f"Unsupported country {name!r}; expected one of: {supported}"
        ) from None


# Set country-specific parameters based on the selected country
country_short, country_code, buffer = country_parameters(country)

# Test: load population grid and plot buffers for sample population data

**Note:** Do the buffers around the population points fall within the grid cells? If so, the grid works and has the right resolution for the data.

In [None]:
# Visual sanity check: draw the population grid and, on top of it, the buffers
# around the points of one sample population file.
# Uses `country`, `country_code`, `buffer` and `wd`, which are defined outside
# this cell.

# Define the directory path for Facebook population data for the selected country
directory = f"/Volumes/RECAST/data/populations/facebook/{country}"

# Keep only files whose name contains '_0800.csv' (likely representing 08:00 time data).
# os.listdir returns entries in arbitrary order, so the list is sorted to make
# the sample file picked below reproducible.
files = sorted(file for file in os.listdir(directory) if '_0800.csv' in file)
if not files:
    raise FileNotFoundError(f"No '_0800.csv' files found in {directory}")

# Load the Facebook population grid shapefile for the country and convert it to
# the EPSG:4326 (WGS 84) coordinate system
grid_path = f"{wd}/data/inputs/grids/Grid_{country}_FB_pop/Grid_{country}.shp"
grid_pop = gpd.read_file(grid_path).to_crs("EPSG:4326")

# Load one file for testing: the 701st matching file when there are that many,
# otherwise the last one (avoids an IndexError on smaller datasets)
sample_index = min(700, len(files) - 1)
df_pops = pd.read_csv(f"{directory}/{files[sample_index]}")

# Filter the DataFrame to include only rows matching the selected country code,
# then reset the index
df_pops = df_pops[df_pops["country"] == country_code].reset_index(drop=True)

# Convert the population DataFrame into a GeoDataFrame by creating Point geometries
# from the 'lon' (longitude) and 'lat' (latitude) columns
gdf_pops = gpd.GeoDataFrame(
    df_pops,
    geometry=gpd.points_from_xy(df_pops["lon"], df_pops["lat"]),
    crs="EPSG:4326"
)

# Create a figure and axes object with a 15x15 inch size
fig, ax = plt.subplots(figsize=(15, 15))

# Plot the population grid polygons with thin white edges
grid_pop.plot(ax=ax, linewidth=0.1, edgecolor="white", zorder=1)

# Create buffer zones around each point after projecting to EPSG:3857,
# then reproject the result back to EPSG:4326.
# NOTE(review): EPSG:3857 (Web Mercator) distances are stretched by roughly
# 1/cos(latitude), so the buffer is smaller on the ground away from the
# equator - confirm this is acceptable for the chosen buffer sizes.
gdf_buff = (
    gdf_pops.to_crs("EPSG:3857")
    .buffer(buffer)
    .to_crs("EPSG:4326")
)

# Plot the buffer zones in red, above the grid
gdf_buff.plot(ax=ax, color="red", zorder=2)

# Set the visible map extent (longitude and latitude bounds).
# NOTE(review): this fixed window looks specific to Colombia; it presumably
# needs adjusting when another country is selected - confirm.
ax.set_xlim(-75, -74)
ax.set_ylim(4, 5)

# Computation of population grid cell size 
... to choose the size of the buffers

In [None]:
# Estimate the side length (in km) of a typical grid cell, using the cell
# stored under index label 1000 as the sample.
# Steps: reproject the grid to EPSG:3857 (units of meters), take the sample
# cell's area in m², take its square root and convert meters to kilometers.
metric_grid = grid_pop.to_crs('epsg:3857')
sample_cell = metric_grid.loc[1000, 'geometry']

area_sqrt_km = np.sqrt(sample_cell.area) / 1000
print(area_sqrt_km)

# Add Feature IDs (FID) to population data and save processed files

**Note:** This process can be optimised by using the `quadkey2` package. Update in the future!

In [None]:
# Attach the Feature ID (FID) of a population-grid cell to every 08:00
# population record and write one processed CSV per input file.
# Uses `country`, `country_code`, `country_short`, `buffer`, `wd` and
# `grid_pop` (with an "FID" column), which are defined outside this cell.

# Define the directory path for Facebook population data based on the selected country
directory = f"/Volumes/RECAST/data/populations/facebook/{country}"

# Keep only files containing '_0800.csv', sorted to have a consistent order
files = sorted(file for file in os.listdir(directory) if '_0800.csv' in file)

# Make sure the output folder exists; otherwise every save below would fail
output_dir = f"{wd}/data/outputs/{country_short}/pop"
os.makedirs(output_dir, exist_ok=True)

# Loop invariants: the grid's spatial index and its FID values by position
grid_sindex = grid_pop.sindex
grid_fids = grid_pop["FID"].to_numpy()

# Loop through each file for processing
for i, file in enumerate(files):

    # Print progress every 50 files as a percentage completed
    if i % 50 == 0:
        print(f"{(i / len(files)) * 100:.2f}% processed")

    try:
        # Load population data from CSV for the current file
        df_pops = pd.read_csv(f"{directory}/{file}")

        # Filter rows by country code and reset index
        df_pops = df_pops[df_pops["country"] == country_code].reset_index(drop=True)

        # Rows without usable numeric coordinates cannot be matched to a cell;
        # they are left out here and therefore dropped from the output
        lon = pd.to_numeric(df_pops["lon"], errors="coerce").to_numpy(dtype=float)
        lat = pd.to_numeric(df_pops["lat"], errors="coerce").to_numpy(dtype=float)
        usable_rows = np.flatnonzero(np.isfinite(lon) & np.isfinite(lat))

        # Buffer all points of the file in one pass (via EPSG:3857), then
        # back to EPSG:4326, instead of reprojecting row by row.
        # NOTE(review): EPSG:3857 distances are stretched by roughly
        # 1/cos(latitude), so the buffer is smaller on the ground away from
        # the equator - confirm this is acceptable.
        points = gpd.GeoSeries(
            gpd.points_from_xy(lon[usable_rows], lat[usable_rows]),
            crs="EPSG:4326",
        )
        buffers = points.to_crs("EPSG:3857").buffer(buffer).to_crs("EPSG:4326")

        matched_rows = []
        matched_fids = []
        failed_rows = 0
        last_row_error = None

        # Find, for each buffered point, the grid cells it intersects
        for row, geom in zip(usable_rows, buffers):
            if geom is None or geom.is_empty:
                continue
            try:
                # Positions (not labels) of the grid cells hit by the buffer
                hits = grid_sindex.query(geom, predicate="intersects")
            except Exception as row_error:
                # Best effort: skip the row, but keep track so it is reported
                failed_rows += 1
                last_row_error = row_error
                continue

            # Rows without any intersecting cell are dropped. When several
            # cells match, the one that comes first in the grid is used.
            if len(hits) > 0:
                matched_rows.append(row)
                matched_fids.append(grid_fids[hits.min()])

        if failed_rows:
            print(
                f"{file}: {failed_rows} rows skipped after errors "
                f"(last error: {last_row_error})"
            )

        # Keep only matched rows and add their FID as the last column
        df_pops = (
            df_pops.iloc[matched_rows]
            .reset_index(drop=True)
            .assign(FID=np.asarray(matched_fids, dtype=grid_fids.dtype))
        )

        # Save the processed DataFrame to output directory
        output_path = f"{output_dir}/{file}"
        df_pops.to_csv(output_path, index=False)

    except Exception as e:
        # Log the index of the file that caused an error and carry on
        print(f"Error processing file index {i}: {e}")
