# Imports

In [None]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
from datetime import datetime, timedelta

# Define country and parameters

In [None]:
# Select target country
country = 'Colombia'

# ISO 3166-1 codes for every supported country, as (alpha-3, alpha-2).
# The alpha-3 code is used below as the per-country output folder name.
COUNTRY_ISO_CODES = {
    'Argentina': ('ARG', 'AR'),
    'Chile': ('CHL', 'CL'),
    'Colombia': ('COL', 'CO'),
    # Uncomment the following if Mexico is to be included in the analysis
    # 'Mexico': ('MEX', 'MX'),
}

# Fail here, with a clear message, rather than with a NameError on
# `country_short` several cells later.
if country not in COUNTRY_ISO_CODES:
    raise ValueError(
        f"Unsupported country {country!r}; expected one of "
        f"{sorted(COUNTRY_ISO_CODES)}"
    )

# country_short: ISO 3-letter code, country_code: ISO 2-letter code
country_short, country_code = COUNTRY_ISO_CODES[country]

# Set working directory

In [None]:
# Root of the project tree; every input and output path below is built
# relative to this directory. Replace the placeholder with your own path.
wd = '/your/path/to/working/directory/'

# Load baseline imputed data and corresponding grid files

In [None]:
# Path prefixes shared by the files loaded in this cell
_outputs_dir = os.path.join(wd, 'data', 'outputs', country_short)
_grids_dir = os.path.join(wd, 'data', 'inputs', 'grids')

# --- Imputed baseline Facebook population, one geopackage per grid type ---
_popcell_baseline_path = os.path.join(
    _outputs_dir,
    'grids-with-data/popcell-baseline-imput-pop/popcell-baseline-imput-pop.gpkg'
)
popcell_baseline_imput_pop = gpd.read_file(_popcell_baseline_path)

_movcell_baseline_path = os.path.join(
    _outputs_dir,
    'grids-with-data/movcell-baseline-imput-pop/movcell-baseline-imput-pop.gpkg'
)
movcell_baseline_imput_pop = gpd.read_file(_movcell_baseline_path)

# --- Imputed baseline movements (CSV) ---
# The 'Unnamed: 0' column is the index written out by a previous to_csv call
# and carries no information, so it is dropped right after loading.
_baseline_mov_path = os.path.join(
    _outputs_dir,
    'baseline/movcell-baseline-imput-mov-dist-with-exo-var-flatten.csv'
)
baseline_mov_imput = pd.read_csv(_baseline_mov_path)
baseline_mov_imput = baseline_mov_imput.drop(columns=['Unnamed: 0'])

# --- Grid geometries, all reprojected to WGS84 (EPSG:4326) ---
# Population grid
_grid_popcell_path = os.path.join(
    _grids_dir, f'Grid_{country}_FB_pop', f'Grid_{country}.shp'
)
grid_popcell = gpd.read_file(_grid_popcell_path).to_crs('EPSG:4326')

# Movement grid
_grid_movcell_path = os.path.join(
    _grids_dir, f'Grid_{country}_FB_mov', f'Grid_{country}.shp'
)
grid_movcell = gpd.read_file(_grid_movcell_path).to_crs('EPSG:4326')

# Lookup table linking movement grid cells to population grid cells
_grid_lookup_path = os.path.join(
    _grids_dir, f'Grid_{country}_lookup_mov_to_pop.gpkg'
)
grid_lookup = gpd.read_file(_grid_lookup_path).to_crs('EPSG:4326')


# Create empty DataFrame for population data over time and spatial units

In [None]:
# Folder holding one population file per day for the selected country
directory = os.path.join(wd, 'data', 'outputs', country_short, 'pop')

# Alphabetically sorted file names, ignoring hidden entries such as .DS_Store
files = sorted(name for name in os.listdir(directory) if not name.startswith('.'))

# Each file name carries its date (YYYY-MM-DD) at characters [-19:-9];
# the first and last names in sorted order give the covered period.
start_date, end_date = (
    datetime.strptime(name[-19:-9], '%Y-%m-%d')
    for name in (files[0], files[-1])
)

# Length of the covered period
delta = end_date - start_date

# One 'YYYY-MM-DD' label per calendar day, both end points included
columns = [
    (start_date + timedelta(days=offset)).strftime('%Y-%m-%d')
    for offset in range(delta.days + 1)
]

# All-NaN table: one row per baseline population grid cell, one column per day
_n_popcells = len(popcell_baseline_imput_pop)
df_pop_evo_popcell = pd.DataFrame(
    {day: np.full(_n_popcells, np.nan) for day in columns}
)

# Leading 'FID' column holding the index values of the baseline GeoDataFrame
df_pop_evo_popcell.insert(0, 'FID', popcell_baseline_imput_pop.index)

# Populate DataFrame with crisis population data for each date and spatial unit
Two options:<br>
    - raw (reported) data only<br>
    - raw data, with missing values estimated from the imputed baseline and the reported percent change<br>

In [None]:
# Switch controlling how the population time series is filled and which
# output files are written/read by the cells below:
#   True  -> keep only the reported n_crisis values ('*_raw.csv' outputs)
#   False -> additionally estimate missing n_crisis values from the imputed
#            baseline and the file's percent_change column
raw = True

In [None]:
# Fill df_pop_evo_popcell one daily file at a time.
#
# For every file, the column named after the file's date receives, for each
# grid cell whose FID appears in the file:
#   - the reported n_crisis value, and
#   - when `raw` is False and n_crisis is missing, an estimate computed as
#     baseline * percent_change / 100 + baseline, where baseline is the
#     imputed baseline population of that cell for the file's weekday.
# Cells absent from a file keep their current value (NaN initially).
#
# The update is vectorised: FIDs are matched with a single map per file
# instead of scanning the whole FID column once per row.

# Make sure the output folder exists before spending time on the loop
evo_dir = os.path.join(wd, 'data', 'outputs', country_short, 'evo')
os.makedirs(evo_dir, exist_ok=True)

# FID of every row of the master table; constant across files
master_fids = df_pop_evo_popcell['FID']

for i, file in enumerate(files):

    # Print progress every 20 files
    if i % 20 == 0:
        progress = i / len(files) * 100
        print(progress if raw else f"{progress:.2f}% completed")

    # Daily population data, without the index column written by to_csv
    df_pops = pd.read_csv(os.path.join(directory, file)).drop('Unnamed: 0', axis=1)

    # The date (YYYY-MM-DD) is encoded in the file name
    date = file[-19:-9]

    # One row per FID; if a FID is repeated, the last occurrence wins
    daily = df_pops.drop_duplicates(subset='FID', keep='last').set_index('FID')
    values = daily['n_crisis']

    if not raw:
        # Baseline columns are named '0'..'6' after datetime.weekday()
        wday = datetime.strptime(date, "%Y-%m-%d").weekday()
        n_baseline = popcell_baseline_imput_pop[str(wday)].reindex(daily.index)
        estimate = n_baseline * daily['percent_change'] / 100 + n_baseline
        # Only fill the gaps; reported values are never replaced
        values = values.fillna(estimate)

    # Update only the cells present in today's file
    reported = master_fids.isin(values.index)
    df_pop_evo_popcell.loc[reported, date] = (
        master_fids[reported].map(values).to_numpy()
    )

# Save the populated table for later use; the file name records the option used
out_name = 'pop_evo_popcell_raw.csv' if raw else 'pop_evo_popcell.csv'
df_pop_evo_popcell.to_csv(os.path.join(evo_dir, out_name))

# Aggregate crisis population data from population cells to movement cells

In [None]:
# Aggregate the per-popcell time series to movement cells by summing, for
# every date, the values of all population cells linked to each movement cell.

# Input/output file names depend on the option selected with `raw`
evo_dir = os.path.join(wd, 'data', 'outputs', country_short, 'evo')
if raw:
    popcell_csv, movcell_csv = 'pop_evo_popcell_raw.csv', 'pop_evo_movcell_raw.csv'
else:
    popcell_csv, movcell_csv = 'pop_evo_popcell.csv', 'pop_evo_movcell.csv'

# Load previously saved crisis population data for popcells, dropping the
# index column written by to_csv
df_pop_evo_popcell = pd.read_csv(
    os.path.join(evo_dir, popcell_csv)
).drop('Unnamed: 0', axis=1)

# Load lookup GeoPackage linking movement grid cells to population grid cells
gdf_lookup = gpd.read_file(
    os.path.join(wd, 'data', 'inputs', 'grids', f'Grid_{country}_lookup_mov_to_pop.gpkg')
)

# All-NaN table: one row per baseline movement grid cell, one column per day
df_pop_evo_movcell = pd.DataFrame(
    {column: [np.nan] * len(movcell_baseline_imput_pop) for column in columns}
)

# Leading 'FID' column holding the index values of the baseline movement grid
df_pop_evo_movcell.insert(
    loc=0,
    column='FID',
    value=movcell_baseline_imput_pop.index
)

# One row of popcell values per lookup entry. FID_pop is used as a row
# position in df_pop_evo_popcell, and date columns are matched by position
# (column 0 is 'FID' in both tables).
n_cols = len(df_pop_evo_movcell.columns)
linked_pops = df_pop_evo_popcell.iloc[gdf_lookup['FID_pop'].to_numpy(), 1:n_cols]

# Sum per movement cell; min_count=1 ignores NaNs but yields NaN when a
# movement cell has no valid value at all for a date
movcell_sums = linked_pops.groupby(gdf_lookup['FID_mov'].to_numpy()).sum(min_count=1)

# FID_mov is matched against the row position of the movement cell; cells
# without any lookup entry stay NaN
movcell_sums = movcell_sums.reindex(range(len(df_pop_evo_movcell)))
df_pop_evo_movcell.iloc[:, 1:] = movcell_sums.to_numpy()

# Save the aggregated movement cell population data to CSV for further analysis
df_pop_evo_movcell.to_csv(os.path.join(evo_dir, movcell_csv))