# **Sampling to generate download locations and undersampling to avoid bias**

- Systematic and Stochastic Sampling

- Undersample areas with low/no nightlight (avoid class imbalance)

### Mount Google Drive

In [2]:
# Mount Google Drive at /content/drive/ so the files stored there can be
# reached from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


### Add absolute path to the project folder

In [3]:
import sys

# Make modules stored in the project folder importable.
# The membership check keeps this cell idempotent: re-running it does not
# append the same entry to sys.path a second time.
_project_path = "/content/drive/MyDrive/UNECA_MachineLearning_Project/"
if _project_path not in sys.path:
    sys.path.append(_project_path)

# See the full list of paths in sys.path
sys.path

['/content',
 '/env/python',
 '/usr/lib/python310.zip',
 '/usr/lib/python3.10',
 '/usr/lib/python3.10/lib-dynload',
 '',
 '/usr/local/lib/python3.10/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.10/dist-packages/IPython/extensions',
 '/root/.ipython',
 '/content/drive/MyDrive/UNECA_MachineLearning_Project/']

### Importing necessary python libraries and modules

In [4]:
# For interacting with the operating system, such as reading or writing files.
import os

# Importing the numpy library for array and matrix manipulation.
import numpy as np

# Importing the pandas library for DataFrame manipulation.
import pandas as pd

import math

import random

# Importing a custom utility function 'create_space' from the 'utils' module.
from utils import create_space

### Add Base Directory

In [5]:
# Base directory of the project on the mounted Google Drive; the data folder
# paths defined further down are built from it.
BASE_DIR = '/content/drive/MyDrive/UNECA_MachineLearning_Project/'

In [6]:
# Make the project folder the current working directory, so that relative
# paths resolve against it
os.chdir(BASE_DIR)

# Print the current working directory to verify the change
print("Current Working Directory:", os.getcwd())

Current Working Directory: /content/drive/MyDrive/UNECA_MachineLearning_Project


## Define the folder data paths

In [7]:
# Path of the 'countries' folder inside BASE_DIR

COUNTRIES_DIR = os.path.join(BASE_DIR, 'countries')

# Path of the 'processed' folder inside BASE_DIR (the input CSV is read from
# here and the resulting download-location CSV is written back to it)

PROCESSED_DIR = os.path.join(BASE_DIR, 'processed')

# Generate Download Locations

## Approach for Systematic and Stochastic Sampling

* Load the input dataframe containing consumption and nightlight data.

* Utilize the bounding box generated in the previous step to create a grid of latitudes and longitudes uniformly distributed within the bounding box.

* Initialize an empty dataframe to store downloaded image names and locations.

* Define the number of images per cluster (e.g., ipc=50) and compute edge_num as the floored square root of ipc. [sqrt(50) ≈ 7.07, so edge_num = 7]

* This creates 49 uniformly spaced grid points (7 * 7) within the bounding box.

* The remaining 1 point is generated through random sampling [adding some random points enhances diversity and fills gaps].

* Together, they ensure the collection of a robust sample of imagery across the landscape.

* For each latitude/longitude point (ipc=50), construct the image name, image latitude, and image longitude, and append it to the dataframe.

## Load input dataframe

In [9]:
# Load the Malawi cluster table from the processed folder; as the output shows,
# each row carries cluster_lat, cluster_lon, cons_pc and nightlights.
df_mw = pd.read_csv(os.path.join(PROCESSED_DIR, 'df_clusters_malawi_nl.csv'))
df_mw

Unnamed: 0,cluster_lat,cluster_lon,cons_pc,nightlights
0,-17.095150,35.217213,1.423239,0.025206
1,-17.092351,35.114643,1.266204,0.000000
2,-17.016698,35.079629,1.566870,0.000000
3,-16.977243,35.205706,1.669245,0.008266
4,-16.956385,35.168967,1.089891,0.002295
...,...,...,...,...
775,-9.591378,33.057450,1.409932,0.000000
776,-9.550397,33.291558,1.242801,0.000000
777,-9.519230,33.139193,1.804122,0.003557
778,-9.507538,33.259649,1.791725,0.000000


In [10]:
def generate_download_locations(df, ipc=50, seed=None):
    """Build the table of image download locations for every cluster in ``df``.

    For each row (cluster) of ``df`` a bounding box is obtained from
    ``create_space(cluster_lat, cluster_lon)``. ``floor(sqrt(ipc)) ** 2``
    points are placed on a regular grid spanning that box, and the remaining
    ``ipc - floor(sqrt(ipc)) ** 2`` points are drawn uniformly at random
    inside it, so each cluster contributes ``ipc`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``cluster_lat``, ``cluster_lon``, ``cons_pc``
        and ``nightlights``.
    ipc : int, default 50
        Number of images (download locations) per cluster.
    seed : int or None, default None
        Seed for the random fill points. When given, a private
        ``random.Random(seed)`` is used so repeated calls return identical
        locations. When ``None`` the shared state of the ``random`` module is
        used, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        One row per image with the columns ``image_name``, ``image_lat``,
        ``image_lon``, ``cluster_lat``, ``cluster_lon``, ``cons_pc`` and
        ``nightlights``. ``image_name`` has the form
        ``<image_lat>_<image_lon>_<cluster_lat>_<cluster_lon>.png``.
    """
    df_download = {'image_name': [], 'image_lat': [], 'image_lon': [], 'cluster_lat': [],
                   'cluster_lon': [], 'cons_pc': [], 'nightlights': []}

    # The points are drawn with the stdlib `random` module (not numpy), so
    # that is the generator that has to be seeded for reproducibility.
    rng = random if seed is None else random.Random(seed)

    # side length of square for uniform distribution
    edge_num = math.floor(math.sqrt(ipc))
    # number of points per cluster that do not fit on the square grid
    n_random = ipc - edge_num * edge_num

    for _, r in df.iterrows():
        min_lat, min_lon, max_lat, max_lon = create_space(r.cluster_lat, r.cluster_lon)
        lat_ticks = np.linspace(min_lat, max_lat, edge_num)
        lon_ticks = np.linspace(min_lon, max_lon, edge_num)

        # cartesian product of the ticks, latitude varying fastest
        lats = np.tile(lat_ticks, edge_num).tolist()
        lons = np.repeat(lon_ticks, edge_num).tolist()

        # fills the remainder with random points inside the bounding box
        for _ in range(n_random):
            lats.append(rng.uniform(min_lat, max_lat))
            lons.append(rng.uniform(min_lon, max_lon))

        # add to dict
        for lat, lon in zip(lats, lons):
            # image name is going to be image_lat_image_lon_cluster_lat_cluster_lon.png
            image_name = str(lat) + '_' + str(lon) + '_' + str(r.cluster_lat) + '_' + str(r.cluster_lon) + '.png'
            df_download['image_name'].append(image_name)
            df_download['image_lat'].append(lat)
            df_download['image_lon'].append(lon)
            df_download['cluster_lat'].append(r.cluster_lat)
            df_download['cluster_lon'].append(r.cluster_lon)
            df_download['cons_pc'].append(r.cons_pc)
            df_download['nightlights'].append(r.nightlights)

    return pd.DataFrame.from_dict(df_download)

In [11]:
# Expand every cluster into its image download locations, using the function's
# default of ipc=50 images per cluster.
df_mw_download = generate_download_locations(df_mw)
df_mw_download

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights
0,-17.140065764205975_35.17229723579403_-17.0951...,-17.140066,35.172297,-17.095150,35.217213,1.423239,0.025206
1,-17.125093842803985_35.17229723579403_-17.0951...,-17.125094,35.172297,-17.095150,35.217213,1.423239,0.025206
2,-17.11012192140199_35.17229723579403_-17.09515...,-17.110122,35.172297,-17.095150,35.217213,1.423239,0.025206
3,-17.09515_35.17229723579403_-17.09515_35.21721...,-17.095150,35.172297,-17.095150,35.217213,1.423239,0.025206
4,-17.08017807859801_35.17229723579403_-17.09515...,-17.080178,35.172297,-17.095150,35.217213,1.423239,0.025206
...,...,...,...,...,...,...,...
38995,-9.429667_33.06703376420597_-9.429667_33.02211...,-9.429667,33.067034,-9.429667,33.022118,1.534702,0.000448
38996,-9.414695078598008_33.06703376420597_-9.429667...,-9.414695,33.067034,-9.429667,33.022118,1.534702,0.000448
38997,-9.399723157196016_33.06703376420597_-9.429667...,-9.399723,33.067034,-9.429667,33.022118,1.534702,0.000448
38998,-9.384751235794024_33.06703376420597_-9.429667...,-9.384751,33.067034,-9.429667,33.022118,1.534702,0.000448


## Undersample areas with low/no nightlight

- The goal is to undersample/reduce areas with low/no nightlight data to balance the dataset.

- To avoid class imbalance: If there are significantly more samples with no/low nightlights compared to bright areas, it can skew model training.

- Focus on populated areas - The areas with zero nightlights are likely unpopulated. Dropping some allows concentrating more samples on populated regions which may be more useful for the model objectives.

- This induces variety in the training data by dropping rows with zero nightlights until the target fraction is reached.

The key steps are:

- Take a dataframe from previous step and set the desired target fraction.

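- Determine the number of zero-nightlight rows to drop (d) so that the fraction of zeros remaining equals the target fraction: (c_z - d) / (n - d) = fr.

- Solving for d:
        d = (c_z - n*fr) / (1 - fr)
        where
        d = number of rows to drop,
        c_z = number of rows with zero nightlights,
        n = total number of rows,
        fr = target fraction of zero-nightlight rows remaining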


- Define a minimum number of images per cluster to avoid wiping out all the images of a cluster.

- Return the final dataframe with rows dropped.

## Maximum and minimum nightlight values


In [14]:
# Minimum and maximum nightlight values (in that order)

df_mw_download['nightlights'].min(), df_mw_download['nightlights'].max()

(0.0, 9.763892)

## Nightlight values equal to zero

In [26]:
# Fraction of download locations whose nightlight value is exactly zero
df_mw_download['nightlights'].eq(0).mean()

0.21794871794871795

## Nightlight values between 0 and 1


In [18]:
# Fraction of rows whose nightlight value lies in [0, 1] (both ends inclusive)

df_mw_download['nightlights'].between(0, 1).mean()

0.8576923076923076

## Nightlight values between 1 and Maximum (9.763892)

In [17]:
# Fraction of rows whose nightlight value lies between 1 and the observed maximum.
# The upper bound is computed from the data rather than hard-coded, so the
# cell stays correct when the input data changes.

df_mw_download['nightlights'].between(1, df_mw_download['nightlights'].max()).mean()

0.1423076923076923

## Drop Zeros function

In [19]:
# each cluster must have AT LEAST this many images after doing nightlights processing
MIN_IMAGES_PER_CLUSTER = 10


def drop_0s(df, fr=0.1, seed=None):
  """Drop rows with 0 nightlights until they make up roughly ``fr`` of the data.

  The number of rows to drop, ``d``, solves ``(c_z - d) / (n - d) = fr`` where
  ``c_z`` is the number of zero-nightlight rows and ``n`` the total row count.
  The drop is spread evenly: ``int(d / k)`` rows are removed at random from
  each of the ``k`` clusters (unique ``cluster_lat``/``cluster_lon`` pairs)
  that contain zero-nightlight rows. Because of the integer truncation the
  final share of zeros can end up slightly above ``fr``.

  Parameters
  ----------
  df : pandas.DataFrame
      Must provide the columns ``nightlights``, ``cluster_lat`` and
      ``cluster_lon``. It is not modified.
  fr : float, default 0.1
      Target fraction of zero-nightlight rows in the result.
  seed : int or None, default None
      Seed for choosing the rows to drop. When given, a private
      ``numpy.random.default_rng(seed)`` is used so the result is
      reproducible. When ``None`` the global ``numpy.random`` state is used,
      exactly as before this parameter existed.

  Returns
  -------
  pandas.DataFrame
      Copy of ``df`` without the dropped rows and with a fresh 0..n-1 index.

  Raises
  ------
  AssertionError
      If ``df`` already has a zero fraction of at most ``fr`` (this includes
      an empty ``df``), or if the drop would leave a zero-nightlight cluster
      with fewer than ``MIN_IMAGES_PER_CLUSTER`` rows.
  """
  # Calculate relevant stats
  is_zero = df['nightlights'] == 0
  c_z = is_zero.sum()
  n = len(df)

  # Check target is valid. Raised explicitly (instead of `assert`) so the
  # check survives `python -O` and the exception carries a real message.
  if n == 0 or c_z / n <= fr:
    raise AssertionError(f'Dataframe already has under {fr} zeros')

  # Calculate number to drop
  d = int((c_z - n * fr) / (1 - fr))

  # Drop evenly across clusters
  print(f'dropping: {d}')
  zero_clusters = df[is_zero].groupby(['cluster_lat', 'cluster_lon'])
  per_cluster_drop = int(d / len(zero_clusters))

  # Drop specified amount per cluster
  print(f'Need to drop {per_cluster_drop} per cluster with 0 nightlights')
  rng = np.random if seed is None else np.random.default_rng(seed)
  drop_inds = []
  for (lat, lon), group in zero_clusters:
    # Validate BEFORE sampling: asking for more rows than the cluster holds
    # would otherwise fail inside numpy with an unrelated ValueError.
    if len(group) - per_cluster_drop < MIN_IMAGES_PER_CLUSTER:
      raise AssertionError(f'dropping too many in {lat}, {lon}')
    clust_drop = rng.choice(group.index.to_numpy(), per_cluster_drop, replace=False)
    drop_inds += clust_drop.tolist()

  return df.drop(drop_inds).reset_index(drop=True)

In [37]:
# Undersample zero-nightlight rows so that they make up roughly 10% of the
# download locations.
df_mw_download_loc = drop_0s(df_mw_download, fr=0.1)

dropping: 5111
Need to drop 30 per cluster with 0 nightlights


In [38]:
# Fraction of zero-nightlight rows remaining after undersampling
df_mw_download_loc['nightlights'].eq(0).mean()

0.10029498525073746

In [39]:
# Display the undersampled download-location table
df_mw_download_loc

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights
0,-17.140065764205975_35.17229723579403_-17.0951...,-17.140066,35.172297,-17.095150,35.217213,1.423239,0.025206
1,-17.125093842803985_35.17229723579403_-17.0951...,-17.125094,35.172297,-17.095150,35.217213,1.423239,0.025206
2,-17.11012192140199_35.17229723579403_-17.09515...,-17.110122,35.172297,-17.095150,35.217213,1.423239,0.025206
3,-17.09515_35.17229723579403_-17.09515_35.21721...,-17.095150,35.172297,-17.095150,35.217213,1.423239,0.025206
4,-17.08017807859801_35.17229723579403_-17.09515...,-17.080178,35.172297,-17.095150,35.217213,1.423239,0.025206
...,...,...,...,...,...,...,...
33895,-9.429667_33.06703376420597_-9.429667_33.02211...,-9.429667,33.067034,-9.429667,33.022118,1.534702,0.000448
33896,-9.414695078598008_33.06703376420597_-9.429667...,-9.414695,33.067034,-9.429667,33.022118,1.534702,0.000448
33897,-9.399723157196016_33.06703376420597_-9.429667...,-9.399723,33.067034,-9.429667,33.022118,1.534702,0.000448
33898,-9.384751235794024_33.06703376420597_-9.429667...,-9.384751,33.067034,-9.429667,33.022118,1.534702,0.000448


## Export the download location data

In [40]:
# Write the undersampled download locations to
# <PROCESSED_DIR>/df_malawi_download_loc.csv, omitting the index column.
df_mw_download_loc.to_csv(os.path.join(PROCESSED_DIR,  'df_malawi_download_loc.csv'), index=False)