In [22]:
import os
import math
import random
import numpy as np
import pandas as pd
from osgeo import gdal
from tqdm.notebook import tqdm

In [23]:
RANDOM_SEED = 42

In [120]:
path = os.path.join('..', 'data', 'processed', 'grw_scaled_encoded_df.csv')
df = pd.read_csv(path)

In [121]:
df.sample(20)

Unnamed: 0,district,income_per_cluster,dist_lat,dist_long
74,gujranwala,296.583774,32.168056,74.120556
155,gujranwala,272.314815,32.168056,74.120556
113,gujranwala,255.888889,32.168056,74.120556
87,gujranwala,289.631834,32.168056,74.120556
146,gujranwala,1289.783951,32.168056,74.120556
92,gujranwala,219.954171,32.168056,74.120556
104,gujranwala,215.715168,32.168056,74.120556
100,gujranwala,355.185185,32.168056,74.120556
162,gujranwala,451.185185,32.168056,74.120556
3,gujranwala,315.736626,32.168056,74.120556


# Generating Coordinates per cluster

In [122]:
import json

In [123]:
with open(os.path.join("..", "data", "geocode_info", "district_bbox.json")) as f:
    district_bbox = json.load(f)

In [124]:
def generate_download_locations(dframe):

    np.random.seed(RANDOM_SEED) # for reproducibility
    df_download = {'image_name': [], 'cluster_lat': [], 'cluster_long': [], 'dist_lat': [],
                   'dist_long': [], 'income_per_cluster': []}

    districts = dframe['district'].unique()
    
    for district in districts:
        
        filtered_df = dframe[dframe['district'] == district].reset_index().drop('index', axis=1)
        ipc = len(filtered_df)
        dist_lat, dist_long = filtered_df.dist_lat.iloc[0], filtered_df.dist_long.iloc[0]
        
        # side length of square for uniform distribution
        edge_num = math.floor(math.sqrt(ipc))

        min_lon, min_lat, max_lon, max_lat = district_bbox[district]
        lats = np.linspace(min_lat, max_lat, edge_num).tolist()
        longs = np.linspace(min_lon, max_lon, edge_num).tolist()

        # performs cartesian product
        uniform_points = np.transpose([np.tile(lats, len(longs)), np.repeat(longs, len(lats))])

        lats = uniform_points[:,0].tolist()
        longs = uniform_points[:,1].tolist()

        # fills the remainder with random points
        for _ in range(ipc - edge_num * edge_num):
            lat = random.uniform(min_lat, max_lat)
            lon = random.uniform(min_lon, max_lon)
            lats.append(lat)
            longs.append(lon)

        
        # add to dict
        for lat, lon in zip(lats, longs):
            # image name is going to be cluster_lat_cluster_long_dist_lat_dist_long.png
            image_name = str(lat) + '_' + str(lon) + '_' + str(dist_lat) + '_' + str(dist_long) + '.png'
            df_download['image_name'].append(image_name)
            df_download['cluster_lat'].append(lat)
            df_download['cluster_long'].append(lon)
            df_download['dist_lat'].append(dist_lat)
            df_download['dist_long'].append(dist_long)
            # df_download['income_per_cluster'].append(r.income_per_cluster)

        df_download['income_per_cluster'] = filtered_df['income_per_cluster']
        df_download['district'] = district
        
    return pd.DataFrame.from_dict(df_download)

In [125]:
cluster_encoded_df = generate_download_locations(df)

In [126]:
cluster_encoded_df

Unnamed: 0,image_name,cluster_lat,cluster_long,dist_lat,dist_long,income_per_cluster,district
0,31.817606_73.7963557_32.16805556_74.12055556.png,31.817606,73.796356,32.168056,74.120556,326.523810,gujranwala
1,31.856990392307694_73.7963557_32.16805556_74.1...,31.856990,73.796356,32.168056,74.120556,273.312169,gujranwala
2,31.896374784615386_73.7963557_32.16805556_74.1...,31.896375,73.796356,32.168056,74.120556,244.464903,gujranwala
3,31.935759176923078_73.7963557_32.16805556_74.1...,31.935759,73.796356,32.168056,74.120556,315.736626,gujranwala
4,31.97514356923077_73.7963557_32.16805556_74.12...,31.975144,73.796356,32.168056,74.120556,236.365079,gujranwala
...,...,...,...,...,...,...,...
196,31.940611679461487_74.38018679065134_32.168055...,31.940612,74.380187,32.168056,74.120556,324.813933,gujranwala
197,32.13107039035056_73.81401917126567_32.1680555...,32.131070,73.814019,32.168056,74.120556,355.189300,gujranwala
198,31.832099682525712_73.97807705923267_32.168055...,31.832100,73.978077,32.168056,74.120556,292.169312,gujranwala
199,32.313793047056855_74.23464025341895_32.168055...,32.313793,74.234640,32.168056,74.120556,267.072310,gujranwala


# Extracting Nightlights - Per Cluster

In [127]:
ds = gdal.Open(os.path.join('..', 'data', 'nightlights', 'BlackMarble_2016_C1_geo_gray.tif'))

In [128]:
# Read the image data as a NumPy array
data = ds.ReadAsArray()

In [129]:
geo_transform = ds.GetGeoTransform(1)

In [130]:
avg_intensities = []

for _, row in tqdm(cluster_encoded_df.iterrows()):
    latitudes = row.cluster_lat
    longitudes = row.cluster_long

    # Get geo_transform information
    geo_transform = ds.GetGeoTransform(1)

    # Convert coordinates to pixel indices
    x_indices = np.clip(
        np.round((longitudes - geo_transform[0]) / geo_transform[1], 0), 0, data.shape[1] - 1
    )
    y_indices = np.clip(
        np.round((latitudes - geo_transform[3]) / geo_transform[5], 0), 0, data.shape[0] - 1
    )
    
    x_indices = x_indices.astype(int)
    y_indices = y_indices.astype(int)
    
    # Extract pixel values
    pixel_values = data[y_indices, x_indices]
    mean_pixel = np.mean(pixel_values)
    avg_intensities.append(mean_pixel)
    

0it [00:00, ?it/s]

In [131]:
cluster_encoded_df['nightlights'] = avg_intensities

In [132]:
cluster_encoded_df.sample(10)

Unnamed: 0,image_name,cluster_lat,cluster_long,dist_lat,dist_long,income_per_cluster,district,nightlights
95,32.250834315384616_74.15657820769232_32.168055...,32.250834,74.156578,32.168056,74.120556,243.194885,gujranwala,1.844676
15,31.856990392307694_73.8563927846154_32.1680555...,31.85699,73.856393,32.168056,74.120556,484.814815,gujranwala,1.856759
30,31.896374784615386_73.91642986923078_32.168055...,31.896375,73.91643,32.168056,74.120556,474.057319,gujranwala,1.434259
158,31.97514356923077_74.45676363076923_32.1680555...,31.975144,74.456764,32.168056,74.120556,306.656085,gujranwala,2.371574
128,31.896374784615386_74.33668946153847_32.168055...,31.896375,74.336689,32.168056,74.120556,310.308642,gujranwala,2.049167
115,31.935759176923078_74.27665237692308_32.168055...,31.935759,74.276652,32.168056,74.120556,317.698413,gujranwala,1.951389
69,32.3296031_74.03650403846154_32.16805556_74.12...,32.329603,74.036504,32.168056,74.120556,271.556437,gujranwala,1.895
171,31.935759176923078_74.51680071538462_32.168055...,31.935759,74.516801,32.168056,74.120556,387.805899,gujranwala,2.678102
175,32.09329674615385_74.51680071538462_32.1680555...,32.093297,74.516801,32.168056,74.120556,481.979718,gujranwala,2.678102
45,31.935759176923078_73.97646695384616_32.168055...,31.935759,73.976467,32.168056,74.120556,888.290123,gujranwala,1.383935


In [133]:
file_save_path = os.path.join('..', 'data', 'processed')
cluster_encoded_df.to_csv(os.path.join(file_save_path, 'cluster_encoded_nightlights_df.csv'), index=False)
print('Data Saved successfully !!')

Data Saved successfully !!


## Defining & Applying Income thresholds
Instead of going for a difficult approach of defining income thresholds and assigning nighttime labels accordingly which is in itself a headache to distribute the already present ratio of the labels to the incomes, what we do is sort the income column "separately" and then sort the nightlights or labels columns with image_name column separately. After this assign the labels. Simple as that.
This solves the problem of defining the income thresholds as well as the arrangement of the label and image_name columns with respect to income levels

In [167]:
path = os.path.join('..', 'data', 'processed', 'cluster_encoded_nightlights_df.csv')
df = pd.read_csv(path)
df.sample(10)

Unnamed: 0,image_name,cluster_lat,cluster_long,dist_lat,dist_long,income_per_cluster,district,nightlights
161,32.09329674615385_74.45676363076923_32.1680555...,32.093297,74.456764,32.168056,74.120556,339.492063,gujranwala,2.371574
14,31.817606_73.8563927846154_32.16805556_74.1205...,31.817606,73.856393,32.168056,74.120556,710.462963,gujranwala,1.856759
56,31.817606_74.03650403846154_32.16805556_74.120...,31.817606,74.036504,32.168056,74.120556,322.814815,gujranwala,1.895
19,32.01452796153846_73.8563927846154_32.16805556...,32.014528,73.856393,32.168056,74.120556,374.595238,gujranwala,1.856759
176,32.132681138461535_74.51680071538462_32.168055...,32.132681,74.516801,32.168056,74.120556,223.657848,gujranwala,2.678102
140,31.817606_74.39672654615386_32.16805556_74.120...,31.817606,74.396727,32.168056,74.120556,355.846561,gujranwala,2.772407
49,32.09329674615385_73.97646695384616_32.1680555...,32.093297,73.976467,32.168056,74.120556,390.518519,gujranwala,1.383935
37,32.17206553076923_73.91642986923078_32.1680555...,32.172066,73.91643,32.168056,74.120556,297.820106,gujranwala,1.434259
26,32.290218707692304_73.8563927846154_32.1680555...,32.290219,73.856393,32.168056,74.120556,235.417989,gujranwala,1.856759
40,32.290218707692304_73.91642986923078_32.168055...,32.290219,73.91643,32.168056,74.120556,352.847222,gujranwala,1.434259


In [168]:
df.columns

Index(['image_name', 'cluster_lat', 'cluster_long', 'dist_lat', 'dist_long',
       'income_per_cluster', 'district', 'nightlights'],
      dtype='object')

In [169]:
dist_info = df[['district', 'dist_lat', 'dist_long']]

In [170]:
dist_info.columns

Index(['district', 'dist_lat', 'dist_long'], dtype='object')

In [171]:
income_col = df['income_per_cluster']

In [172]:
sorted_income_col = income_col.sort_values(ascending=True).reset_index(drop=True)

In [173]:
labels = df[['cluster_lat', 'cluster_long', 'image_name', 'nightlights']]

In [174]:
labels.columns

Index(['cluster_lat', 'cluster_long', 'image_name', 'nightlights'], dtype='object')

In [175]:
sorted_labels = labels.sort_values(by='nightlights', ascending=True).reset_index(drop=True)

In [176]:
income_col.shape, labels.shape

((201,), (201, 4))

In [177]:
final_df = pd.concat([sorted_income_col, sorted_labels, dist_info], axis=1)

In [178]:
final_df.head(10)

Unnamed: 0,income_per_cluster,cluster_lat,cluster_long,image_name,nightlights,district,dist_lat,dist_long
0,161.183862,31.817606,73.976467,31.817606_73.97646695384616_32.16805556_74.120...,1.383935,gujranwala,32.168056,74.120556
1,178.756614,31.85699,73.976467,31.856990392307694_73.97646695384616_32.168055...,1.383935,gujranwala,32.168056,74.120556
2,179.944444,31.935759,73.976467,31.935759176923078_73.97646695384616_32.168055...,1.383935,gujranwala,32.168056,74.120556
3,181.123457,31.975144,73.976467,31.97514356923077_73.97646695384616_32.1680555...,1.383935,gujranwala,32.168056,74.120556
4,189.73545,32.014528,73.976467,32.01452796153846_73.97646695384616_32.1680555...,1.383935,gujranwala,32.168056,74.120556
5,205.249118,32.053912,73.976467,32.05391235384616_73.97646695384616_32.1680555...,1.383935,gujranwala,32.168056,74.120556
6,213.174603,32.093297,73.976467,32.09329674615385_73.97646695384616_32.1680555...,1.383935,gujranwala,32.168056,74.120556
7,213.507643,32.132681,73.976467,32.132681138461535_73.97646695384616_32.168055...,1.383935,gujranwala,32.168056,74.120556
8,215.715168,31.896375,73.976467,31.896374784615386_73.97646695384616_32.168055...,1.383935,gujranwala,32.168056,74.120556
9,219.954171,32.21145,73.976467,32.21144992307693_73.97646695384616_32.1680555...,1.383935,gujranwala,32.168056,74.120556


In [179]:
final_df.tail(10)

Unnamed: 0,income_per_cluster,cluster_lat,cluster_long,image_name,nightlights,district,dist_lat,dist_long
191,793.58634,32.014528,74.396727,32.01452796153846_74.39672654615386_32.1680555...,2.772407,gujranwala,32.168056,74.120556
192,804.925926,31.975144,74.396727,31.97514356923077_74.39672654615386_32.1680555...,2.772407,gujranwala,32.168056,74.120556
193,813.888889,31.935759,74.396727,31.935759176923078_74.39672654615386_32.168055...,2.772407,gujranwala,32.168056,74.120556
194,833.734568,31.896375,74.396727,31.896374784615386_74.39672654615386_32.168055...,2.772407,gujranwala,32.168056,74.120556
195,888.290123,32.053912,74.396727,32.05391235384616_74.39672654615386_32.1680555...,2.772407,gujranwala,32.168056,74.120556
196,923.936508,31.85699,74.396727,31.856990392307694_74.39672654615386_32.168055...,2.772407,gujranwala,32.168056,74.120556
197,1101.93739,31.817606,74.396727,31.817606_74.39672654615386_32.16805556_74.120...,2.772407,gujranwala,32.168056,74.120556
198,1289.783951,32.172066,74.396727,32.17206553076923_74.39672654615386_32.1680555...,2.772407,gujranwala,32.168056,74.120556
199,1462.310406,32.132681,74.396727,32.132681138461535_74.39672654615386_32.168055...,2.772407,gujranwala,32.168056,74.120556
200,1532.205802,32.093297,74.396727,32.09329674615385_74.39672654615386_32.1680555...,2.772407,gujranwala,32.168056,74.120556


Renaming the image names in `image_name` column for efficient data scraping and hence efficient model training (later in the code)

In [180]:
# Function to combine values
def combine_values(row_):
    col1_value = row_['image_name']
    col2_value = str(row_['nightlights'])
    combined_value = col1_value.replace('.png', f'_{col2_value}.png')
    return combined_value

In [181]:
# Apply the function to each row
final_df['cluster_name'] = final_df.apply(combine_values, axis=1)
final_df.drop('image_name', axis=1, inplace=True)
final_df

Unnamed: 0,income_per_cluster,cluster_lat,cluster_long,nightlights,district,dist_lat,dist_long,cluster_name
0,161.183862,31.817606,73.976467,1.383935,gujranwala,32.168056,74.120556,31.817606_73.97646695384616_32.16805556_74.120...
1,178.756614,31.856990,73.976467,1.383935,gujranwala,32.168056,74.120556,31.856990392307694_73.97646695384616_32.168055...
2,179.944444,31.935759,73.976467,1.383935,gujranwala,32.168056,74.120556,31.935759176923078_73.97646695384616_32.168055...
3,181.123457,31.975144,73.976467,1.383935,gujranwala,32.168056,74.120556,31.97514356923077_73.97646695384616_32.1680555...
4,189.735450,32.014528,73.976467,1.383935,gujranwala,32.168056,74.120556,32.01452796153846_73.97646695384616_32.1680555...
...,...,...,...,...,...,...,...,...
196,923.936508,31.856990,74.396727,2.772407,gujranwala,32.168056,74.120556,31.856990392307694_74.39672654615386_32.168055...
197,1101.937390,31.817606,74.396727,2.772407,gujranwala,32.168056,74.120556,31.817606_74.39672654615386_32.16805556_74.120...
198,1289.783951,32.172066,74.396727,2.772407,gujranwala,32.168056,74.120556,32.17206553076923_74.39672654615386_32.1680555...
199,1462.310406,32.132681,74.396727,2.772407,gujranwala,32.168056,74.120556,32.132681138461535_74.39672654615386_32.168055...


In [182]:
final_df.to_csv(os.path.join(file_save_path, 'finalized_df.csv'), index=False)
print('Data saved successfully !!')

Data saved successfully !!


# THE END !