# **Create nightlights bins or labels for the nightlight values**

The key steps are:

- Extract nightlights as a numpy array and reshape it 1D vector.  

- Fit a Gaussian Mixture Model (GMM) to predict cluster label:

  - Cluster 0: Low nightlights values
  - Cluster 1: Medium nightlights values
  - Cluster 2: High nightlights values

- Predict cluster labels or cutoff threshold for each cluster.

- Create nightlights bins based on GMM cluster cutoffs.

- Calculate percentage in each cluster based on the cutoff threshold.

### Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


### Add absolute path to the project folder

In [2]:
import sys

sys.path.append("/content/drive/MyDrive/UNECA_MachineLearning_Project/")

# See the full list of paths in sys.path
sys.path

['/content',
 '/env/python',
 '/usr/lib/python310.zip',
 '/usr/lib/python3.10',
 '/usr/lib/python3.10/lib-dynload',
 '',
 '/usr/local/lib/python3.10/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.10/dist-packages/IPython/extensions',
 '/root/.ipython',
 '/content/drive/MyDrive/UNECA_MachineLearning_Project/']

### Importing necessary python libraries and modules

In [3]:
# For interacting with the operating system, such as reading or writing files.
import os

# Importing the numpy library for array and matrix manipulation.
import numpy as np

# Importing the pandas library for manipulation DataFrame.
import pandas as pd

# Import Gaussian Mixture Model
from sklearn.mixture import GaussianMixture as GMM

### Add Base Directory

In [4]:
# Sets the base directory variable
BASE_DIR = '/content/drive/MyDrive/UNECA_MachineLearning_Project/'

In [5]:
# Change the current working directory
os.chdir(BASE_DIR)

# Print the current working directory to verify the change
print("Current Working Directory:", os.getcwd())

Current Working Directory: /content/drive/MyDrive/UNECA_MachineLearning_Project


## Define the folder data paths

In [6]:
# This line creates the PROCESSED_DIR variable & join it to the BASE_DIR and data

PROCESSED_DIR = os.path.join(BASE_DIR, 'processed')

## Load input dataframe

In [7]:
df_mod_download = pd.read_csv(os.path.join(PROCESSED_DIR, 'df_malawi_download_loc.csv'))
df_mod_download

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights
0,-17.140065764205975_35.17229723579403_-17.0951...,-17.140066,35.172297,-17.095150,35.217213,1.423239,0.025206
1,-17.125093842803985_35.17229723579403_-17.0951...,-17.125094,35.172297,-17.095150,35.217213,1.423239,0.025206
2,-17.11012192140199_35.17229723579403_-17.09515...,-17.110122,35.172297,-17.095150,35.217213,1.423239,0.025206
3,-17.09515_35.17229723579403_-17.09515_35.21721...,-17.095150,35.172297,-17.095150,35.217213,1.423239,0.025206
4,-17.08017807859801_35.17229723579403_-17.09515...,-17.080178,35.172297,-17.095150,35.217213,1.423239,0.025206
...,...,...,...,...,...,...,...
33895,-9.429667_33.06703376420597_-9.429667_33.02211...,-9.429667,33.067034,-9.429667,33.022118,1.534702,0.000448
33896,-9.414695078598008_33.06703376420597_-9.429667...,-9.414695,33.067034,-9.429667,33.022118,1.534702,0.000448
33897,-9.399723157196016_33.06703376420597_-9.429667...,-9.399723,33.067034,-9.429667,33.022118,1.534702,0.000448
33898,-9.384751235794024_33.06703376420597_-9.429667...,-9.384751,33.067034,-9.429667,33.022118,1.534702,0.000448


In [8]:
# Extract nightlights column as numpy array
X = df_mod_download['nightlights'].values.reshape(-1,1)

# Fit GMM model
gmm = GMM(n_components=3).fit(X)

# Predict cluster labels
labels = gmm.predict(df_mod_download['nightlights'].values.reshape(-1,1))

# Calculate percentage of points in each cluster
(labels==0).mean(), (labels==1).mean(), (labels==2).mean()

(0.49557522123893805, 0.20206489675516223, 0.3023598820058997)

In [9]:
# Create nightlights bins based on GMM cluster cutoffs

def create_nightlights_bin(df, cutoffs):

  # Validate at least 2 cutoffs
  assert len(cutoffs) >= 2, print('need at least 2 bins')

  # Sort cutoffs descending
  cutoffs = sorted(cutoffs, reverse=True)

  # Create labels
  labels = list(range(len(cutoffs)))[::-1]

  # Initialize bin column
  df['nightlights_bin'] = len(cutoffs)

  # Assign bins
  for cutoff, label in zip(cutoffs, labels):
    df['nightlights_bin'].loc[df['nightlights'] <= cutoff] = label

In [10]:
# Get the maximum cutoff identified by the Guassian Mixture Model
label0_max = df_mod_download['nightlights'][labels==0].max()
label1_max = df_mod_download['nightlights'][labels==1].max()
label2_max = df_mod_download['nightlights'][labels==2].max()

label0_max, label1_max, label2_max

(0.020256236, 9.763892, 0.3769323)

## Lable the nightlights values/ create the nightlight bins

In [11]:
df_download = df_mod_download.copy()

create_nightlights_bin(df_download, cutoffs=[label0_max, label1_max, label2_max])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['nightlights_bin'].loc[df['nightlights'] <= cutoff] = label


In [12]:
df_download

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,nightlights_bin
0,-17.140065764205975_35.17229723579403_-17.0951...,-17.140066,35.172297,-17.095150,35.217213,1.423239,0.025206,1
1,-17.125093842803985_35.17229723579403_-17.0951...,-17.125094,35.172297,-17.095150,35.217213,1.423239,0.025206,1
2,-17.11012192140199_35.17229723579403_-17.09515...,-17.110122,35.172297,-17.095150,35.217213,1.423239,0.025206,1
3,-17.09515_35.17229723579403_-17.09515_35.21721...,-17.095150,35.172297,-17.095150,35.217213,1.423239,0.025206,1
4,-17.08017807859801_35.17229723579403_-17.09515...,-17.080178,35.172297,-17.095150,35.217213,1.423239,0.025206,1
...,...,...,...,...,...,...,...,...
33895,-9.429667_33.06703376420597_-9.429667_33.02211...,-9.429667,33.067034,-9.429667,33.022118,1.534702,0.000448,0
33896,-9.414695078598008_33.06703376420597_-9.429667...,-9.414695,33.067034,-9.429667,33.022118,1.534702,0.000448,0
33897,-9.399723157196016_33.06703376420597_-9.429667...,-9.399723,33.067034,-9.429667,33.022118,1.534702,0.000448,0
33898,-9.384751235794024_33.06703376420597_-9.429667...,-9.384751,33.067034,-9.429667,33.022118,1.534702,0.000448,0


## Export the labled data

In [13]:
df_download.to_csv(os.path.join(PROCESSED_DIR, 'df_malawi_loc_labed.csv'), index=False)