# **Split data into  training and validation sets by cluster**


- A stratified train-validation split is used: where each cluster group has some samples assigned to train-validation set randomly.

-  This avoids potential sampling issues of ending up with clusters that have no training-validation data.

- This results a more consistent sampling distribution.

Steps:

- Create directories for training and validation [Train & Valid

- Create sub-directories for each nightlight bin [Low, Medium, High].

- Load the data frame that contains the image location.

- Exclude rows/images that failed to download from the data frame.

- Calculate the percentage of nightlight bin clusters [Low, Medium, High].

- Introduce a new column, 'Is_train,' to the existing data frame and set it to True.

- Determine the number of images (n_ims) within each cluster.

- Randomly allocate 80% for training (n_train) and 20% for validation (n_valid) in each cluster.

- Set the "Is_train" column to False for the validation data set using the image index.

- Ensure that 80% of the data is assigned to training and 20% to validation in each cluster.

- Save the updated data frame.

- Transfer training and validation images from a source directory into their respective sub-directories of the nightlight bin clusters [Low, Medium, High].

- Verify the percentage of train and validation images within each nightlight bin cluster.
   



### Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


### Add absolute path to the project folder

In [2]:
import sys

sys.path.append("/content/drive/MyDrive/UNECA_MachineLearning_Project/")

# See the full list of paths in sys.path
sys.path

['/content',
 '/env/python',
 '/usr/lib/python310.zip',
 '/usr/lib/python3.10',
 '/usr/lib/python3.10/lib-dynload',
 '',
 '/usr/local/lib/python3.10/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.10/dist-packages/IPython/extensions',
 '/root/.ipython',
 '/content/drive/MyDrive/UNECA_MachineLearning_Project/']

### Importing necessary python libraries and modules

In [3]:
# For interacting with the operating system, such as reading or writing files.
import os

# Provides functions to copy, move, rename, and delete files and directories.
import shutil

# Importing the pandas library for manipulation DataFrame.
import pandas as pd

# Importing the numpy library for array and matrix manipulation.
import numpy as np

# tqdm package  add a progress bar to your code.

from tqdm.notebook import tqdm

### Add Base Directory

In [5]:
# Sets the base directory variable
BASE_DIR = '/content/drive/MyDrive/UNECA_MachineLearning_Project/'

In [6]:
# Change the current working directory
os.chdir(BASE_DIR)

# Print the current working directory to verify the change
print("Current Working Directory:", os.getcwd())

Current Working Directory: /content/drive/MyDrive/UNECA_MachineLearning_Project


### Define the folder data paths

In [7]:
# Creates the BASE_DIR variable & join it to the BASE_DIR
COUNTRIES_DIR = os.path.join(BASE_DIR, 'countries')

# Creates the PROCESSED_DIR variable & join it to the BASE_DIR
PROCESSED_DIR = os.path.join(BASE_DIR, 'processed')


# Print the variables
print(COUNTRIES_DIR)
print(PROCESSED_DIR)

/content/drive/MyDrive/UNECA_MachineLearning_Project/countries
/content/drive/MyDrive/UNECA_MachineLearning_Project/processed


## Define the CNN Image Directory and Model Paths

In [8]:
CNN_IMAGE_DIR = os.path.join(BASE_DIR, 'cnn_images')
CNN_SAVE_DIR = os.path.join(BASE_DIR, 'models')

# Print the variables
print(CNN_IMAGE_DIR)
print(CNN_SAVE_DIR)

/content/drive/MyDrive/UNECA_MachineLearning_Project/cnn_images
/content/drive/MyDrive/UNECA_MachineLearning_Project/models


## Create CNN Images directory and train and valid sub-directory

In [9]:
os.makedirs(os.path.join(CNN_IMAGE_DIR, 'train'), exist_ok=False)
os.makedirs(os.path.join(CNN_IMAGE_DIR, 'valid'), exist_ok=False)

## Create the labels directory [0, 1, 2 ]

In [10]:
labels = ['0', '1', '2']
for l in labels:
    os.makedirs(os.path.join(CNN_IMAGE_DIR, 'train', l), exist_ok=False)
    os.makedirs(os.path.join(CNN_IMAGE_DIR, 'valid', l), exist_ok=False)

## Create the model directory



In [11]:
os.makedirs(os.path.join(CNN_SAVE_DIR), exist_ok=False)

## Load the image location dataframe

In [13]:
df_download = pd.read_csv(os.path.join(PROCESSED_DIR, 'df_malawi_loc_labed_2016.csv'))
df_download

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,nightlights_bin,country
0,-17.140065764205975_35.17229723579403_-17.0951...,-17.140066,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
1,-17.125093842803985_35.17229723579403_-17.0951...,-17.125094,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
2,-17.11012192140199_35.17229723579403_-17.09515...,-17.110122,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
3,-17.09515_35.17229723579403_-17.09515_35.21721...,-17.095150,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
4,-17.08017807859801_35.17229723579403_-17.09515...,-17.080178,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
...,...,...,...,...,...,...,...,...,...
33895,-9.429667_33.06703376420597_-9.429667_33.02211...,-9.429667,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw
33896,-9.414695078598008_33.06703376420597_-9.429667...,-9.414695,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw
33897,-9.399723157196016_33.06703376420597_-9.429667...,-9.399723,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw
33898,-9.384751235794024_33.06703376420597_-9.429667...,-9.384751,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw


## Check all the images are downloaded

In [16]:
downloaded = os.listdir(os.path.join(COUNTRIES_DIR, 'malawi_2016', 'images_14_5'))
print(f"actually downloaded: {len(downloaded)}, expected: {len(df_download)}")

actually downloaded: 33900, expected: 33900


##

In [17]:
df_download['row'] = np.arange(len(df_download))
df_download

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,nightlights_bin,country,row
0,-17.140065764205975_35.17229723579403_-17.0951...,-17.140066,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,0
1,-17.125093842803985_35.17229723579403_-17.0951...,-17.125094,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,1
2,-17.11012192140199_35.17229723579403_-17.09515...,-17.110122,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,2
3,-17.09515_35.17229723579403_-17.09515_35.21721...,-17.095150,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,3
4,-17.08017807859801_35.17229723579403_-17.09515...,-17.080178,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,4
...,...,...,...,...,...,...,...,...,...,...
33895,-9.429667_33.06703376420597_-9.429667_33.02211...,-9.429667,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw,33895
33896,-9.414695078598008_33.06703376420597_-9.429667...,-9.414695,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw,33896
33897,-9.399723157196016_33.06703376420597_-9.429667...,-9.399723,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw,33897
33898,-9.384751235794024_33.06703376420597_-9.429667...,-9.384751,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw,33898


## Filters the dataframe removing the ones that failed

In [18]:
idx_not_download = df_download.set_index('image_name').drop(downloaded)['row'].values.tolist()
df_download.drop(idx_not_download, inplace=True)
df_download

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,nightlights_bin,country,row
0,-17.140065764205975_35.17229723579403_-17.0951...,-17.140066,35.172297,-17.09515,35.217213,1.423239,0.025206,1,mw,0
1,-17.125093842803985_35.17229723579403_-17.0951...,-17.125094,35.172297,-17.09515,35.217213,1.423239,0.025206,1,mw,1
2,-17.11012192140199_35.17229723579403_-17.09515...,-17.110122,35.172297,-17.09515,35.217213,1.423239,0.025206,1,mw,2
3,-17.09515_35.17229723579403_-17.09515_35.21721...,-17.09515,35.172297,-17.09515,35.217213,1.423239,0.025206,1,mw,3
4,-17.08017807859801_35.17229723579403_-17.09515...,-17.080178,35.172297,-17.09515,35.217213,1.423239,0.025206,1,mw,4


In [19]:
df_download.drop('row', axis=1, inplace=True)
df_download

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,nightlights_bin,country
0,-17.140065764205975_35.17229723579403_-17.0951...,-17.140066,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
1,-17.125093842803985_35.17229723579403_-17.0951...,-17.125094,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
2,-17.11012192140199_35.17229723579403_-17.09515...,-17.110122,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
3,-17.09515_35.17229723579403_-17.09515_35.21721...,-17.095150,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
4,-17.08017807859801_35.17229723579403_-17.09515...,-17.080178,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
...,...,...,...,...,...,...,...,...,...
33895,-9.429667_33.06703376420597_-9.429667_33.02211...,-9.429667,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw
33896,-9.414695078598008_33.06703376420597_-9.429667...,-9.414695,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw
33897,-9.399723157196016_33.06703376420597_-9.429667...,-9.399723,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw
33898,-9.384751235794024_33.06703376420597_-9.429667...,-9.384751,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw


## Calculate the percentage of nightlights_bin

In [20]:
(df_download['nightlights_bin']==0).mean(), (df_download['nightlights_bin']==1).mean(), (df_download['nightlights_bin']==2).mean()

(0.49557522123893805, 0.3023598820058997, 0.20206489675516223)

## Resetting the index

In [21]:
df_download.reset_index(drop=True, inplace=True)
df_download

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,nightlights_bin,country
0,-17.140065764205975_35.17229723579403_-17.0951...,-17.140066,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
1,-17.125093842803985_35.17229723579403_-17.0951...,-17.125094,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
2,-17.11012192140199_35.17229723579403_-17.09515...,-17.110122,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
3,-17.09515_35.17229723579403_-17.09515_35.21721...,-17.095150,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
4,-17.08017807859801_35.17229723579403_-17.09515...,-17.080178,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw
...,...,...,...,...,...,...,...,...,...
33895,-9.429667_33.06703376420597_-9.429667_33.02211...,-9.429667,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw
33896,-9.414695078598008_33.06703376420597_-9.429667...,-9.414695,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw
33897,-9.399723157196016_33.06703376420597_-9.429667...,-9.399723,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw
33898,-9.384751235794024_33.06703376420597_-9.429667...,-9.384751,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw


## Add a new column called 'is_train' to the existing dataframe

In [22]:
df_download['is_train'] = True
df_download

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,nightlights_bin,country,is_train
0,-17.140065764205975_35.17229723579403_-17.0951...,-17.140066,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,True
1,-17.125093842803985_35.17229723579403_-17.0951...,-17.125094,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,True
2,-17.11012192140199_35.17229723579403_-17.09515...,-17.110122,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,True
3,-17.09515_35.17229723579403_-17.09515_35.21721...,-17.095150,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,True
4,-17.08017807859801_35.17229723579403_-17.09515...,-17.080178,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,True
...,...,...,...,...,...,...,...,...,...,...
33895,-9.429667_33.06703376420597_-9.429667_33.02211...,-9.429667,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw,True
33896,-9.414695078598008_33.06703376420597_-9.429667...,-9.414695,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw,True
33897,-9.399723157196016_33.06703376420597_-9.429667...,-9.399723,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw,True
33898,-9.384751235794024_33.06703376420597_-9.429667...,-9.384751,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw,True


In [24]:
groups = df_download.groupby(['cluster_lat', 'cluster_lon'])
for _, g in groups:
    n_ims = len(g)
    n_train = int(0.8 * n_ims)
    n_valid = n_ims - n_train
    valid_choices = np.random.choice(np.arange(n_ims), replace=False, size=n_valid).tolist()
    current_index = g.index
    idx_valid = current_index[valid_choices]
    df_download['is_train'].loc[idx_valid] = False

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_download['is_train'].loc[idx_valid] = False


## Verify 80% of the data is used as  training data

In [25]:
df_download['is_train'].mean()

0.8

## Save this new dataframe

In [26]:
# save this new dataframe
df_download.to_csv(os.path.join(PROCESSED_DIR, 'image_download_actual_malawi2016.csv'), index=False)

 ## Split the dataframe into training and validation sets

In [27]:
t = df_download[df_download['is_train']]
v = df_download[~df_download['is_train']]

In [28]:
t

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,nightlights_bin,country,is_train
0,-17.140065764205975_35.17229723579403_-17.0951...,-17.140066,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,True
2,-17.11012192140199_35.17229723579403_-17.09515...,-17.110122,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,True
3,-17.09515_35.17229723579403_-17.09515_35.21721...,-17.095150,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,True
4,-17.08017807859801_35.17229723579403_-17.09515...,-17.080178,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,True
5,-17.065206157196016_35.17229723579403_-17.0951...,-17.065206,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,True
...,...,...,...,...,...,...,...,...,...,...
33895,-9.429667_33.06703376420597_-9.429667_33.02211...,-9.429667,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw,True
33896,-9.414695078598008_33.06703376420597_-9.429667...,-9.414695,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw,True
33897,-9.399723157196016_33.06703376420597_-9.429667...,-9.399723,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw,True
33898,-9.384751235794024_33.06703376420597_-9.429667...,-9.384751,33.067034,-9.429667,33.022118,1.534702,0.000448,0,mw,True


In [29]:
v

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,nightlights_bin,country,is_train
1,-17.125093842803985_35.17229723579403_-17.0951...,-17.125094,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,False
6,-17.050234235794026_35.17229723579403_-17.0951...,-17.050234,35.172297,-17.095150,35.217213,1.423239,0.025206,1,mw,False
20,-17.050234235794026_35.20224107859801_-17.0951...,-17.050234,35.202241,-17.095150,35.217213,1.423239,0.025206,1,mw,False
23,-17.11012192140199_35.217213_-17.09515_35.2172...,-17.110122,35.217213,-17.095150,35.217213,1.423239,0.025206,1,mw,False
29,-17.125093842803985_35.232184921401995_-17.095...,-17.125094,35.232185,-17.095150,35.217213,1.423239,0.025206,1,mw,False
...,...,...,...,...,...,...,...,...,...,...
33880,-9.444638921401992_33.03708992140199_-9.429667...,-9.444639,33.037090,-9.429667,33.022118,1.534702,0.000448,0,mw,False
33885,-9.474582764205977_33.05206184280398_-9.429667...,-9.474583,33.052062,-9.429667,33.022118,1.534702,0.000448,0,mw,False
33886,-9.459610842803984_33.05206184280398_-9.429667...,-9.459611,33.052062,-9.429667,33.022118,1.534702,0.000448,0,mw,False
33888,-9.429667_33.05206184280398_-9.429667_33.02211...,-9.429667,33.052062,-9.429667,33.022118,1.534702,0.000448,0,mw,False


In [30]:
len(t), len(v)

(27120, 6780)

##  Copy training images from a source directory to a destination directory

In [31]:
print('copying train images')
for im_name, nl, country in tqdm(zip(t['image_name'], t['nightlights_bin'], t['country']), total=len(t)):
    country_dir = None
    if country == 'mw':
        country_dir = 'malawi_2016'
    else:
        print(f"no match for country {country}")
        raise ValueError()

    src = os.path.abspath(os.path.join(COUNTRIES_DIR, country_dir, 'images_14_5', im_name))
    dest = os.path.join(CNN_IMAGE_DIR, 'train', str(nl), im_name)

    # Use shutil.copy() to copy the file
    shutil.copy(src, dest)

copying train images


  0%|          | 0/27120 [00:00<?, ?it/s]

##  Copy Validation images from a source directory to a destination directory

In [32]:
print('copying valid images')
for im_name, nl, country in tqdm(zip(v['image_name'], v['nightlights_bin'], v['country']), total=len(v)):
    country_dir = None
    if country == 'mw':
        country_dir = 'malawi_2016'
    else:
        print(f"no match for country {country}")
        raise ValueError()
    src = os.path.abspath(os.path.join(COUNTRIES_DIR, country_dir, 'images_14_5', im_name))
    dest = os.path.join(CNN_IMAGE_DIR, 'valid', str(nl), im_name)

    # Use shutil.copy() to copy the file
    shutil.copy(src, dest)

copying valid images


  0%|          | 0/6780 [00:00<?, ?it/s]

## Count distribution in train folder

In [33]:
# shows count distribution in train folder, make sure this matches above
counts = []
for l in ['0', '1', '2']:
    counts.append(len(os.listdir(os.path.join(CNN_IMAGE_DIR, 'train', l))))
print(counts)
print([c/sum(counts) for c in counts])
print(sum(counts))

[13440, 8200, 5480]
[0.49557522123893805, 0.3023598820058997, 0.20206489675516223]
27120


## Count distribution in valid folder

In [34]:
# shows count distribution in valid folder
counts = []
for l in ['0', '1', '2']:
    counts.append(len(os.listdir(os.path.join(CNN_IMAGE_DIR, 'valid', l))))
print(counts)
print([c/sum(counts) for c in counts])
print(sum(counts))

[3360, 2050, 1370]
[0.49557522123893805, 0.3023598820058997, 0.20206489675516223]
6780
