# Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import summary_table
import scipy.stats
from matplotlib.ticker import AutoMinorLocator
from matplotlib.lines import Line2D
from mycolorpy import colorlist as mcp
from matplotlib_scalebar.scalebar import ScaleBar
import geopandas as gpd
import rioxarray as rx
from pysal.lib import weights
import shapely.geometry
import os
from datetime import datetime

# Define country and parameters

In [None]:
# Country to analyse. Must be one of the keys of COUNTRY_CODES below.
country = 'Chile'

# Supported countries mapped to (three-letter code, two-letter code).
# `country_short` is used further down to build input/output file paths.
COUNTRY_CODES = {
    'Argentina': ('ARG', 'AR'),
    'Chile': ('CHL', 'CL'),
    'Colombia': ('COL', 'CO'),
    'Mexico': ('MEX', 'MX'),
}

# Fail here with a clear message instead of leaving `country_short` and
# `country_code` undefined, which would only surface as a NameError in a
# later cell.
if country not in COUNTRY_CODES:
    raise ValueError(
        f"Unsupported country {country!r}; expected one of {sorted(COUNTRY_CODES)}"
    )

country_short, country_code = COUNTRY_CODES[country]

# Set working directory

In [None]:
# Root folder of the project's local files. This is a machine-specific
# absolute path; the data and plot paths used below are built by appending
# to it, so it deliberately carries no trailing slash.
wd = (
    '/Users/carmen/Library/CloudStorage/OneDrive-TheUniversityofLiverpool/'
    + 'Research/RECAST/latin-mobility-covid-local-files'
)

# Merge Facebook Population grid with baseline population data
and make a quick plot to see where the missing values are

In [None]:
# Read the Facebook population grid and reproject it to EPSG:4326.
grid_shapefile = wd + f'/data/inputs/grids/Grid_{country}_FB_pop/Grid_{country}.shp'
grid_pop = gpd.read_file(grid_shapefile).to_crs('EPSG:4326')

# Read the baseline population table. NaN and the -999 placeholder are both
# recoded to -1, so that -1 is the single "no data" marker in this table.
baseline_csv = wd + f'/data/outputs/{country_short}/baseline/baseline_pop.csv'
baseline_pop = pd.read_csv(baseline_csv).fillna(-1).replace(-999, -1)

# Left join on 'FID': every grid cell is kept, whether or not the baseline
# table has a row for it.
grid_pop = pd.merge(grid_pop, baseline_pop, on='FID', how='left')


# Quick-look map of the baseline values for Monday (column '0'), classified
# with Natural Breaks into 5 bins.
# The -1 marker is turned into NaN in a separate Series used for plotting only;
# `grid_pop` itself is not modified.
plot_col = grid_pop['0'].replace(-1, np.nan)

# Styling of the cells that have no value to classify
missing_style = {
    "color": "lightgrey",
    "hatch": "///",
    "label": "Missing values (-1 or NaN)",
}

# assign() returns a new frame carrying the temporary column
grid_pop.assign(plot_col=plot_col).plot(
    column='plot_col',
    scheme="NaturalBreaks",
    k=5,
    legend=True,
    figsize=(15, 10),
    missing_kwds=missing_style,
)

plt.show()


# Integrate WorldPop raster data with FB population grid 
...and handle anomalies for Mexico

In [None]:
# WorldPop -> Facebook grid. This part of the cell:
#   1. reads the WorldPop raster and turns every pixel into a point carrying
#      its population value;
#   2. spatially joins those points to the grid cells and sums them per cell;
#   3. attaches the per-cell sum to `grid_pop` as column 'pop_wp'.
# The Mexico-only smoothing of the largest value follows further down.

# Weekday column names ('0'..'6'); they are excluded from the aggregation
weekday_cols = [str(d) for d in range(7)]

# 1. Raster -> points ---------------------------------------------------------
# WorldPop raster for the selected country (1km file)
rds = rx.open_rasterio(
    wd + f'/data/inputs/population/worldpop/{country_short.lower()}_ppp_2020_1km_Aggregated.tif'
)
rds.name = "population"

# One row per pixel, with the pixel coordinates as ordinary columns
df = rds.squeeze().to_dataframe().reset_index()

# Point geometry at each pixel's x/y coordinate
geometry = gpd.points_from_xy(df.x, df.y)

# GeoDataFrame in the raster's CRS, without the helper columns
gdf_worldpop = gpd.GeoDataFrame(df, crs=rds.rio.crs, geometry=geometry).drop(
    columns=['x', 'y', 'band', 'spatial_ref']
)

# Discard pixels whose population is undefined (coded as -99999), then
# reproject the remaining points to EPSG:4326
has_value = gdf_worldpop['population'] != -99999
gdf_worldpop = gdf_worldpop[has_value].reset_index(drop=True).to_crs('EPSG:4326')

# 2. Points -> grid cells -----------------------------------------------------
# Default sjoin: grid cells without any matching point are dropped here; they
# come back (with NaN) through the left merge in step 3.
grid_wp = gpd.sjoin(grid_pop, gdf_worldpop)

# Sum the remaining numeric columns (including 'population') per FID
grid_wp_group = (
    grid_wp.drop(columns=['geometry', 'index_right'] + weekday_cols)
    .groupby(['FID'])
    .sum()
)

# Bring the per-cell sums back next to the joined rows, keep only the summed
# columns and collapse to one row per grid cell
gdf_merge = (
    pd.merge(grid_wp, grid_wp_group, on='FID')
    .drop(columns=['index_right', 'population_x', 'geometry'] + weekday_cols)
    .drop_duplicates()
    .reset_index(drop=True)
)

# 3. Attach to the full grid --------------------------------------------------
# Left merge so that every grid cell is kept; the summed population column is
# renamed to 'pop_wp'
gdf_mmerge = pd.merge(grid_pop, gdf_merge, how='left', on='FID').rename(
    columns={'population_y': 'pop_wp'}
)
gdf_mmerge = gpd.GeoDataFrame(gdf_mmerge, geometry='geometry')

# From here on `grid_pop` carries the WorldPop column as well
grid_pop = gdf_mmerge

# Special handling for Mexico: the largest 'pop_wp' value is replaced by the
# row-standardised spatial lag (mean over Queen-contiguity neighbours) of
# 'pop_wp' for that cell.
if country == 'Mexico':

    # Row label and FID of the cell with the largest WorldPop value.
    # idxmax() skips NaN; the builtin max() does not handle NaN reliably, and
    # 'pop_wp' is NaN for grid cells that received no raster pixel.
    id_max = grid_pop['pop_wp'].idxmax()
    fid_max = grid_pop.loc[id_max, 'FID']

    # Build Queen contiguity spatial weights keyed by FID
    w = weights.Queen.from_dataframe(grid_pop, idVariable="FID")

    # Remove islands (cells with no neighbours) and rebuild the weights.
    # The weights are keyed by FID, so w.islands holds FIDs: filter on the
    # 'FID' column rather than dropping by row label, which is only the same
    # thing when FID happens to equal the row label.
    grid_pop_w = grid_pop[~grid_pop['FID'].isin(w.islands)].copy()
    w = weights.Queen.from_dataframe(grid_pop_w, idVariable="FID")
    w.transform = 'R'  # Row-standardise weights

    # Spatial lag of the population on the island-free grid.
    # NOTE(review): neighbours with NaN 'pop_wp' presumably make the lag NaN
    # too - confirm this cannot happen around the Mexico outlier.
    grid_pop_w['pop_wp_lag'] = weights.lag_spatial(w, grid_pop_w['pop_wp'])

    # Look the lag up by FID (not by row label) and write it back by row label
    lag_of_max = grid_pop_w.loc[grid_pop_w['FID'] == fid_max, 'pop_wp_lag']
    if lag_of_max.empty:
        raise ValueError(
            f"Grid cell FID={fid_max} has no neighbours; cannot smooth its population"
        )
    grid_pop.loc[id_max, 'pop_wp'] = lag_of_max.iloc[0]


# Testing correlation of WorldPop and FB baseline population by weekday
This checks whether it makes sense to estimate missing baseline data from the WorldPop (WP) population.

In [None]:
# Facebook baseline population vs WorldPop population, one panel per weekday
# (columns '0'..'6'). Each panel shows the observed cells, an OLS fit through
# the origin and the Pearson correlation. The fitted values for every grid
# cell are stored in '<weekday>_imput' columns of `grid_pop`, which the
# imputation step reads.
nrows = 2
ncols = 4
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex=True, sharey=True, figsize=(55, 35))
fig.subplots_adjust(hspace=0.15)
fig.subplots_adjust(wspace=0.05)

# Panel titles, indexed by weekday column number
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

for wday, ax in enumerate(axes.flat):
    if wday >= len(weekdays):
        # The 2 x 4 layout has one panel more than there are weekdays
        fig.delaxes(ax)
        break

    col = str(wday)

    # Best-effort behaviour kept from the original: report the error and stop
    # drawing further panels. Note that the '<weekday>_imput' columns of the
    # remaining weekdays are then not created.
    try:
        ax.xaxis.set_minor_locator(AutoMinorLocator(2))
        ax.yaxis.set_minor_locator(AutoMinorLocator(2))

        for spine in ax.spines.values():
            spine.set_linewidth(3)

        # Observed cells: a WorldPop value is present and the Facebook count is
        # strictly positive (this drops NaN and the -1 missing-value marker).
        grid_pop_plot = grid_pop.dropna(subset=['pop_wp', col])
        grid_pop_plot = grid_pop_plot[grid_pop_plot[col] > 0]
        grid_pop_plot = grid_pop_plot.sort_values(by=['pop_wp']).reset_index(drop=True)

        ax.scatter(
            grid_pop_plot['pop_wp'], grid_pop_plot[col],
            color='blue', marker="o", label="Observed population counts",
            zorder=2, alpha=0.3, s=500
        )

        # x ticks every 200,000, labelled in thousands
        xticks = [0]
        upper = 0
        while upper < ax.get_xlim()[1] - 200000:
            upper += 200000
            xticks.append(upper)

        ax.set_xticks(xticks)
        ax.set_xticklabels([int(x / 1000) for x in xticks], rotation=45, color='k')
        # Pin the automatic y ticks so that they can be relabelled in thousands
        yticks = list(ax.get_yticks())
        ax.set_yticks(yticks)
        ax.set_yticklabels([int(y / 1000) for y in yticks], rotation=0, color='k')
        ax.tick_params(axis='both', which='both', width=0, length=0, color='k', labelsize=60, pad=20)

        # OLS regression through the origin ('- 1' removes the intercept).
        # It is fitted on the same observed cells that are plotted: fitting on
        # `grid_pop.dropna()` would treat the -1 missing-value marker as a real
        # population count, in both the fit and the correlation.
        df = pd.DataFrame({'pop_wp': grid_pop_plot['pop_wp'], 'pop_FB': grid_pop_plot[col]})
        res = smf.ols('pop_FB ~ pop_wp - 1', data=df).fit()

        # Predict for every grid cell, so the result lines up with `grid_pop`
        x_predict = grid_pop['pop_wp']
        y_predict = res.predict(x_predict)

        # Plot OLS regression line
        ax.plot(x_predict, y_predict, color="firebrick", linestyle="-", label="OLS regressor", lw=6, zorder=3)

        # Pearson correlation stats. The '< 0.05' test uses the unrounded
        # p-value: rounding first would print p = 0.049 as '0.05'.
        pearson = scipy.stats.pearsonr(df['pop_wp'], df['pop_FB'])
        R = f"{pearson.statistic:.3f}"
        pvalue = '< 0.05' if pearson.pvalue < 0.05 else f"{pearson.pvalue:.2f}"
        ax.text(0.04, 0.87, f'R = {R}\np-value = {pvalue}', transform=ax.transAxes, size=50)

        # Set subplot title by weekday
        ax.set_title(weekdays[wday], size=60, pad=16)

        # Store predictions for imputation if needed
        grid_pop[col + '_imput'] = y_predict

    except Exception as e:
        print(f"Plotting stopped due to error: {e}")
        break

legend = axes[0, 0].legend(loc=(0.03, 0.7), fontsize=40)
legend.get_frame().set_edgecolor('black')
legend.get_frame().set_linewidth(2)

# For matplotlib 2.1.1 compatibility: show x tick labels under the top-right
# panel, whose lower neighbour is the removed 8th panel
axes[0, 3].xaxis.set_tick_params(which='both', labelbottom=True, labeltop=False)

# Set common axis labels
fig.text(0.5, 0.01, 'WorldPop population counts (thousands)', ha='center', va='center', size=60)
fig.text(0.08, 0.5, 'FB Population counts for baseline (thousands)', ha='center', va='center', rotation='vertical', size=60)

# Uncomment to save the figure
# plt.savefig(wd + f'/plots/correlation-wp-fb-pop-baseline/correlation-wp-fb-pop-baseline-{country_short}.png', bbox_inches='tight')

plt.show()



# Impute missing and invalid population values using model predictions

In [None]:
# Fill the gaps in the weekday columns with the regression predictions stored
# in the matching '<weekday>_imput' columns. A value counts as missing when it
# is NaN or equal to the -1 marker. `grid_pop` itself is left untouched.
grid_pop_imput = grid_pop.copy()

# List of weekday columns as strings
wday_cols = [str(i) for i in range(7)]

# One vectorised pass per weekday column. This replaces a row-by-row
# `.loc[i, col]` loop, which was slow and only correct for a default
# 0..n-1 row index.
for col in wday_cols:
    is_missing = grid_pop_imput[col].isna() | grid_pop_imput[col].eq(-1)
    # mask() keeps the original value where the flag is False and takes the
    # prediction where it is True
    grid_pop_imput[col] = grid_pop_imput[col].mask(is_missing, grid_pop_imput[col + '_imput'])

# Drop the '_imput' columns as they are no longer needed
grid_pop_imput = grid_pop_imput.drop(columns=[col + '_imput' for col in wday_cols])

# Optional: save the imputed grid to file
# grid_pop_imput.to_file(wd + f'/data/outputs/{country_short}/grids-with-data/popcell-baseline-imput-pop/popcell-baseline-imput-pop.gpkg')

# Scatter plot of imputed Facebook Population vs. WorldPop data

In [None]:
# Log-log scatter of the imputed Facebook counts (all seven weekday columns
# overlaid) against the WorldPop counts, on a square 8 x 8 figure.
fig, ax = plt.subplots(figsize=(8, 8))

# Same minor-tick locator on both axes
for axis in (ax.xaxis, ax.yaxis):
    axis.set_minor_locator(AutoMinorLocator(2))

# Tick styling for major and minor ticks on both axes: zero-size tick marks,
# black colour, label size 20, 15 pt padding between ticks and labels
ax.tick_params(axis='both', which='both', width=0, length=0, color='k', labelsize=20, pad=15)

# Plotting source: the imputed grid ordered by WorldPop count
grid_pop_imput_sort = grid_pop_imput.sort_values(by=['pop_wp'])
wp_counts = grid_pop_imput_sort['pop_wp']

# One scatter call per weekday column ('0'..'6'); very transparent, large
# markers so that dense regions stand out
for day in range(7):
    ax.scatter(wp_counts, grid_pop_imput_sort[str(day)], alpha=0.05, s=250)

# Logarithmic scale on both axes to cope with the wide range of values
ax.set_xscale('log')
ax.set_yscale('log')


# Comparison of Facebook imputed vs. WorldPop population with mapping

In [None]:
# Side-by-side maps: imputed Facebook baseline for Monday (column '0') on the
# left (ax1) and WorldPop counts (column 'pop_wp') on the right (ax2). Each
# map gets a background layer, a custom class legend, a scale bar and a north
# arrow.
nrows = 1
ncols = 2
fig, (ax1, ax2) = plt.subplots(nrows=nrows, ncols=ncols, sharey=True, figsize=(15, 15))
fig.subplots_adjust(wspace=0.05)

# Prepare data for plotting: cells with a WorldPop value and a strictly
# positive Monday count
grid_pop_imput_map = grid_pop_imput.dropna(subset=['pop_wp']).reset_index(drop=True)
grid_pop_imput_map = grid_pop_imput_map[grid_pop_imput_map['0'] > 0].reset_index(drop=True)

# Plot FB imputed population counts for Monday
grid_pop_imput_map.plot(
    column='0',
    cmap='viridis',
    scheme='natural_breaks',
    k=8,
    legend=True,
    zorder=2,
    ax=ax1
)

# Plot WorldPop population counts
grid_pop.plot(
    column='pop_wp',
    cmap='viridis',
    scheme='natural_breaks',
    k=8,
    legend=True,
    zorder=2,
    ax=ax2
)

# Load background shapefile based on country
if country != 'Mexico':
    background = gpd.read_file(wd + '/data/inputs/boundaries/south-america/vc965bq8111.shp').to_crs('EPSG:4326')
else:
    background = gpd.read_file(wd + '/data/inputs/boundaries/central-america/bound_p.shp').to_crs('EPSG:4326')

# Per-country layout settings, defined once instead of on every loop pass.
# Map extent as ((x_min, x_max), (y_min, y_max)) in map coordinates:
map_extents = {
    'Argentina': ((-75, -49), (-60, -19.5)),
    'Chile': ((-80, -55), (-57, -16)),
    'Colombia': ((-80, -53), (-5.5, 17)),
    'Mexico': ((-119, -75), (12, 45)),
}
# Legend anchor, in axes coordinates:
legend_locs = {
    'Argentina': (0.51, 0.07),
    'Chile': (0.51, 0.17),
    'Colombia': (0.51, 0.07),
    'Mexico': (0.52, 0.33)
}
# North-arrow inset axes [left, bottom, width, height] in figure coordinates,
# one for each of the two maps:
north_arrow_locs = {
    'Argentina': ([0.455, 0.75, 0.04, 0.04], [0.855, 0.75, 0.04, 0.04]),
    'Chile': ([0.455, 0.77, 0.04, 0.04], [0.855, 0.77, 0.04, 0.04]),
    'Colombia': ([0.455, 0.61, 0.04, 0.04], [0.855, 0.61, 0.04, 0.04]),
    'Mexico': ([0.16, 0.58, 0.04, 0.04], [0.56, 0.58, 0.04, 0.04])
}

# Configure axes for both plots
for ax in (ax1, ax2):
    background.plot(ax=ax, facecolor='dimgray', alpha=0.5, zorder=1)
    ax.set_facecolor('lightskyblue')

    # Set axis limits based on country (left automatic for other countries)
    if country in map_extents:
        xlim, ylim = map_extents[country]
        ax.set_xlim(*xlim)
        ax.set_ylim(*ylim)

    # Read the class upper bounds back from the legend that the plot call
    # created: take the text after the first comma of each label and keep only
    # its digits and dots.
    labels = [t.get_text() for t in ax.get_legend().get_texts()]
    upper = []
    for label in labels:
        part = label.split(',')[1]
        digits = [ch for ch in part if ch.isdigit() or ch == '.']
        upper.append(float(''.join(digits)))
    upper[-1] += 0.005  # Slightly increase last upper bound

    # Integer-valued labels; the first class is shown as starting at 0
    custom_labels = ['[0, ' + str(int(upper[0])) + ']']
    custom_labels += [f'[{int(lo)}, {int(hi)}]' for lo, hi in zip(upper, upper[1:])]

    # One legend marker per class. zip() stops at the shorter sequence, so a
    # classification that ends up with fewer than 8 classes does not raise an
    # IndexError.
    colors = mcp.gen_color(cmap='viridis', n=8)
    legend_elements = [
        Line2D([0], [0], lw=0, color=color, marker='o', markersize=10, label=label)
        for color, label in zip(colors, custom_labels)
    ]

    # Set legend title based on subplot
    title = "FB population counts \nfor imput baseline \n(Monday)" if ax is ax1 else '\nWP population counts\n'

    legend = ax.legend(
        handles=legend_elements,
        handlelength=0,
        fontsize=14,
        shadow=False,
        fancybox=False,
        loc=legend_locs.get(country, (0.5, 0.1)),
        ncol=1,
        columnspacing=1.2,
        borderpad=1,
        title=title
    )

    # Align legend text to right
    for t in legend.get_texts():
        t.set_ha('right')

    # Style legend box
    legend.get_frame().set_edgecolor('black')
    legend.get_frame().set_linewidth(1)
    legend.get_frame().set_alpha(None)
    legend.get_frame().set_facecolor((1, 1, 1, 0.7))
    legend.get_title().set_fontsize('14')

    # Add scale bar.
    # NOTE(review): dx=1 with a formatter that multiplies by 100 presumably
    # approximates "1 degree ~ 100 km" for the EPSG:4326 map - confirm.
    ax.add_artist(
        ScaleBar(
            dx=1,
            units="km",
            dimension="si-length",
            length_fraction=.1,
            scale_formatter=lambda value, unit: f' {value * 100} km ',
            pad=0.7,
            sep=5,
            border_pad=1,
            scale_loc='top',
            box_color='w',
            box_alpha=0,
            font_properties={'size': 20},
            location='upper left'
        )
    )

# North arrows: read the image once and add exactly one inset axes per map.
# Doing this inside the loop above stacked every arrow twice.
im = plt.imread(wd + '/data/inputs/boundaries/north-arrow.png')
loc_arr1, loc_arr2 = north_arrow_locs.get(country, ([0.45, 0.75, 0.04, 0.04], [0.85, 0.75, 0.04, 0.04]))

for loc in (loc_arr1, loc_arr2):
    newax = fig.add_axes(loc, zorder=1)
    # Bare inset: no tick labels, tick marks, spines or background
    newax.tick_params(axis='both', which='both', labelbottom=False, labelleft=False, width=0, length=0)
    newax.set_facecolor('None')
    plt.setp(newax.spines.values(), linewidth=0)
    newax.imshow(im)

# Comparison of original vs. imputed FB population baseline (Monday)

In [None]:
# Side-by-side maps for Monday (column '0'): the original Facebook baseline
# with the missing cells greyed out on the left (ax1), and the imputed
# baseline on the right (ax2). The legend, scale bar and north arrow are drawn
# on the left map only.
nrows = 1
ncols = 2
fig, (ax1, ax2) = plt.subplots(nrows=nrows, ncols=ncols, sharey=True, figsize=(15, 15))
fig.subplots_adjust(wspace=0.05)

# Right map: imputed values, for cells that have a WorldPop value
grid_pop_imput_map = grid_pop_imput.dropna(subset=['pop_wp']).reset_index(drop=True)
# grid_pop_imput_map = grid_pop_imput_map[grid_pop_imput_map['0'] > 0].reset_index(drop=True)
grid_pop_imput_map.plot(
    column='0',
    cmap='viridis',
    scheme='natural_breaks',
    k=5,
    legend=False,
    zorder=2,
    ax=ax2
)

# Left map: original values; legend=True so that the class bounds can be read
# back from the generated legend below
grid_pop.plot(
    column='0',
    cmap='viridis',
    scheme='natural_breaks',
    k=5,
    legend=True,
    zorder=2,
    ax=ax1
)

# Cells whose Monday value is not strictly positive are treated as missing and
# painted over in light grey on the left map
grid_pop_map_notmissing = grid_pop[grid_pop['0'] > 0]
grid_pop_map_missing = grid_pop.drop(grid_pop_map_notmissing.index).reset_index(drop=True)
grid_pop_map_missing.plot(color='lightgray', zorder=3, ax=ax1)

if country != 'Mexico':
    background = gpd.read_file(wd + '/data/inputs/boundaries/south-america/vc965bq8111.shp').to_crs('EPSG:4326')
else:
    background = gpd.read_file(wd + '/data/inputs/boundaries/central-america/bound_p.shp').to_crs('EPSG:4326')

# Per-country layout settings (lookup tables instead of if/elif chains).
# Map extent as ((x_min, x_max), (y_min, y_max)) in map coordinates:
map_extents = {
    'Argentina': ((-75, -49), (-60, -19.5)),
    'Chile': ((-80, -55), (-57, -16)),
    'Colombia': ((-80, -65), (-12, 13)),
    'Mexico': ((-119, -75), (12, 45)),
}
# Legend anchor, in axes coordinates:
legend_locs = {
    'Argentina': (0.37, 0.01),
    'Chile': (0.37, 0.02),
    'Colombia': (0.35, 0.02),
    'Mexico': (0.52, 0.33),
}
# North-arrow inset axes [left, bottom, width, height] in figure coordinates:
# (position on the left map, position on the right map)
north_arrow_locs = {
    'Argentina': ([0.455, 0.75, 0.04, 0.04], [0.855, 0.75, 0.04, 0.04]),
    'Chile': ([0.455, 0.77, 0.04, 0.04], [0.855, 0.77, 0.04, 0.04]),
    'Colombia': ([0.455, 0.76, 0.04, 0.04], [0.855, 0.61, 0.04, 0.04]),
    'Mexico': ([0.16, 0.58, 0.04, 0.04], [0.56, 0.58, 0.04, 0.04]),
}

# Settings shared by both maps
for ax in (ax1, ax2):
    background.plot(ax=ax, facecolor='dimgray', alpha=0.5, zorder=1)
    ax.set_facecolor('lightskyblue')

    # Axis limits are left automatic for countries without an entry
    if country in map_extents:
        xlim, ylim = map_extents[country]
        ax.set_xlim(*xlim)
        ax.set_ylim(*ylim)

    ax.tick_params(axis='both', which='both', width=0, length=0, color='k', labelleft=False, labelbottom=False)

# --- Legend (left map only) ---------------------------------------------------
# Read the class upper bounds back from the legend that the plot call created:
# take the text after the first comma of each label and keep only its digits
# and dots.
labels = [t.get_text() for t in ax1.get_legend().get_texts()]

upper = []
for label in labels:
    part = label.split(',')[1]
    digits = [ch for ch in part if ch.isdigit() or ch == '.']
    upper.append(float(''.join(digits)))
upper[-1] += 0.005

# 'Missing' entry first, then integer-valued class labels; the first class is
# shown as starting at 0
custom_labels = ['Missing', '[0, ' + str(int(upper[0])) + ']']
custom_labels += [
    '[' + str(int(lo)) + ', ' + str(int(hi)) + ']' for lo, hi in zip(upper, upper[1:])
]

# NOTE(review): the 'Missing' marker is 'gray' while the missing cells on the
# map are drawn in 'lightgray' - confirm the difference is intended.
colors = ['gray'] + mcp.gen_color(cmap='viridis', n=5)
# zip() stops at the shorter sequence, so fewer than 5 classes does not raise
legend_elements = [
    Line2D([0], [0], lw=0, color=color, marker='o', markersize=10, label=label)
    for color, label in zip(colors, custom_labels)
]

title = 'FB population counts\nfor baseline \n(Monday)'

legend = ax1.legend(
    handles=legend_elements,
    handlelength=0,
    fontsize=19,
    shadow=False,
    fancybox=False,
    loc=legend_locs[country],
    ncol=1,
    columnspacing=0.1,
    borderpad=1,
    title=title,
)
for t in legend.get_texts():
    t.set_ha('right')
legend.get_frame().set_edgecolor('black')
legend.get_frame().set_linewidth(1)
legend.get_frame().set_alpha(None)
legend.get_frame().set_facecolor((1, 1, 1, 0.7))
legend.get_title().set_fontsize('20')

# --- Scale bar (left map only) ------------------------------------------------
# NOTE(review): dx=1 with a formatter that multiplies by 100 presumably
# approximates "1 degree ~ 100 km" for the EPSG:4326 map - confirm.
ax1.add_artist(
    ScaleBar(
        dx=1,
        units="km",
        dimension="si-length",
        length_fraction=0.1,
        scale_formatter=lambda value, unit: f' {value * 100} km ',
        pad=0.7,
        sep=5,
        border_pad=1,
        scale_loc='top',
        box_color='w',
        box_alpha=0,
        font_properties={'size': 20},
        location='upper left',
    )
)

# --- North arrow ----------------------------------------------------------------
# Read the image once and add a single inset axes on the left map. Adding it
# inside the per-axes loop stacked the same arrow twice at loc_arr1.
im = plt.imread(wd + '/data/inputs/boundaries/north-arrow.png')
loc_arr1, loc_arr2 = north_arrow_locs[country]

newax = fig.add_axes(loc_arr1, zorder=1)
# Bare inset: no tick labels, tick marks, spines or background
newax.tick_params(axis='both', which='both', labelbottom=False, labelleft=False, width=0, length=0)
newax.set_facecolor('None')
plt.setp(newax.spines.values(), linewidth=0)
newax.imshow(im)

# Uncomment if you want to add a second north-arrow
# newax = fig.add_axes(loc_arr2, zorder=1)
# newax.tick_params(axis='both', which='both', labelbottom=False, labelleft=False, width=0, length=0)
# newax.set_facecolor('None')
# plt.setp(newax.spines.values(), linewidth=0)
# newax.imshow(im)

# plt.savefig(wd + '/plots/map-missing-fb-pop-baseline/map-missing-fb-pop-baseline-' + country_short + '.pdf', bbox_inches='tight')

plt.show()
