In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from concurrent.futures import ThreadPoolExecutor
import os

import numpy as np
import pandas as pd
import sklearn
from tqdm.auto import tqdm

In [3]:
# Root directory of the dataset files, given relative to the notebook's working
# directory. The label CSV and the per-cluster .npz paths are built from it.
dataset_root_dir = '../../dataset_preprocessing/dhs_lsms'

In [4]:
# Read the label table and derive, from each DHSID_EA string:
#   survey - its first 10 characters
#   cc     - its first 2 characters
#   path   - location of that cluster's .npz file under dataset_root_dir
df = pd.read_csv(
    os.path.join(dataset_root_dir, 'output_labels/dhs_final_labels.csv')
).assign(
    survey=lambda d: d['DHSID_EA'].str[:10],
    cc=lambda d: d['DHSID_EA'].str[:2],
    path=lambda d: dataset_root_dir + '/dhs_npzs/' + d['survey'] + '/' + d['DHSID_EA'] + '.npz',
)

# One (DHSID_EA, path, year) tuple per row; built before DHSID_EA becomes the index.
path_years = df[['DHSID_EA', 'path', 'year']].apply(tuple, axis=1)

# verify_integrity=True raises if any DHSID_EA value is duplicated.
df = df.set_index('DHSID_EA', verify_integrity=True)
print(df['path'].iloc[0])
display(df.head())

../../dataset_preprocessing/dhs_lsms/dhs_npzs/AL-2008-5#/AL-2008-5#-00000001.npz


Unnamed: 0_level_0,cname,year,lat,lon,n_asset,asset_index,n_water,water_index,n_sanitation,sanitation_index,...,women_bmi,n_women_edu,n_women_bmi,cluster_id,adm1fips,adm1dhs,urban,survey,cc,path
DHSID_EA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AL-2008-5#-00000001,AL,2008,40.822652,19.838321,18.0,2.430596,18.0,3.444444,18.0,4.833333,...,24.365,18.0,18.0,1,,9999,R,AL-2008-5#,AL,../../dataset_preprocessing/dhs_lsms/dhs_npzs/...
AL-2008-5#-00000002,AL,2008,40.696846,20.007555,20.0,2.867678,20.0,4.7,20.0,4.95,...,23.104,20.0,20.0,2,,9999,R,AL-2008-5#,AL,../../dataset_preprocessing/dhs_lsms/dhs_npzs/...
AL-2008-5#-00000003,AL,2008,40.750037,19.974262,18.0,2.909049,18.0,4.5,18.0,4.777778,...,22.387778,18.0,18.0,3,,9999,R,AL-2008-5#,AL,../../dataset_preprocessing/dhs_lsms/dhs_npzs/...
AL-2008-5#-00000004,AL,2008,40.798931,19.863338,19.0,2.881122,19.0,4.947368,19.0,4.789474,...,27.0845,21.0,20.0,4,,9999,R,AL-2008-5#,AL,../../dataset_preprocessing/dhs_lsms/dhs_npzs/...
AL-2008-5#-00000005,AL,2008,40.746123,19.843885,19.0,2.54683,19.0,4.684211,19.0,4.526316,...,24.523125,16.0,16.0,5,,9999,R,AL-2008-5#,AL,../../dataset_preprocessing/dhs_lsms/dhs_npzs/...


In [5]:
# Columns of `df` used as regression targets; the k-selection loop at the end of
# the notebook iterates over this list.
label_cols = ['asset_index', 'under5_mort', 'women_bmi', 'women_edu', 'water_index', 'sanitation_index']

In [6]:
def calculate_nl_mean(path_and_year) -> tuple[str, float, int]:
    '''Computes the mean of the last channel of one npz image.

    Args
    - path_and_year: tuple (dhsid_ea, npz_path, year)
      - dhsid_ea: identifier of the row this image belongs to (presumably a
        str DHSID_EA value); returned unchanged so the caller can match the
        result back to its row
      - npz_path: str, path to npz file containing an entry 'x'
        representing a (C, H, W) image
      - year: int, returned unchanged

    Returns: tuple (dhsid_ea, nl_mean, year)
    - nl_mean: np.float64, mean over all pixels of the last channel (img[-1])

    Raises: FileNotFoundError if npz_path does not exist.
    '''
    dhsid_ea, npz_path, year = path_and_year
    # np.load on an .npz returns an NpzFile that keeps the archive open; the
    # context manager closes it as soon as the array has been read, so file
    # handles are not left open when this is called many times from threads.
    with np.load(npz_path) as npz:
        img = npz['x']  # shape (C, H, W)
    # Accumulate in float64 regardless of the stored dtype.
    nl_mean = img[-1].mean(dtype=np.float64)
    return dhsid_ea, nl_mean, year

In [7]:
# Pre-allocate the output table: one row per DHSID_EA (same index as `df`),
# both columns NaN-filled (float) until the per-file means are computed.
results_df = pd.DataFrame(np.nan, index=df.index, columns=['nl_mean', 'year'])
results_df.head()

Unnamed: 0_level_0,nl_mean,year
DHSID_EA,Unnamed: 1_level_1,Unnamed: 2_level_1
AL-2008-5#-00000001,,
AL-2008-5#-00000002,,
AL-2008-5#-00000003,,
AL-2008-5#-00000004,,
AL-2008-5#-00000005,,


In [8]:
# Compute the per-file means on a thread pool, then write them into results_df
# in one index-aligned step instead of one `.loc` row write per file.
with ThreadPoolExecutor(max_workers=30) as pool:
    inputs = path_years
    # pool.map yields results lazily, in input order; tqdm tracks completion.
    futures = pool.map(calculate_nl_mean, inputs)
    records = list(tqdm(futures, total=len(inputs)))

computed = pd.DataFrame.from_records(
    records, columns=['DHSID_EA', 'nl_mean', 'year']
).set_index('DHSID_EA')
for col in ['nl_mean', 'year']:
    # Assigning a Series aligns on the DHSID_EA index. Cast to float64 so the
    # columns keep the dtype of the NaN-initialized frame they replace.
    results_df[col] = computed[col].astype(np.float64)

  0%|          | 0/117644 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: '../../dataset_preprocessing/dhs_lsms/dhs_npzs/AL-2008-5#/AL-2008-5#-00000001.npz'

In [None]:
# Write the results (index included) to mean_nl.csv in the current working directory.
# NOTE(review): this runs before the cell that casts 'year' to int, so the CSV
# holds 'year' with whatever dtype results_df has at this point.
results_df.to_csv('mean_nl.csv')

In [None]:
# results_df was created NaN-filled, so 'year' is float; convert it to int.
# astype(int) raises if any row is still NaN (i.e. was never filled in).
results_df['year'] = results_df['year'].astype(int)

In [None]:
# Two-letter country codes assigned to each data split. 'trainval' is the
# 'train' list followed by the 'val' list.
SPLITS = dict(
    train=[
        'AL', 'BD', 'CD', 'CM', 'GH', 'GU', 'HN', 'IA', 'ID', 'JO', 'KE', 'KM',
        'LB', 'LS', 'MA', 'MB', 'MD', 'MM', 'MW', 'MZ', 'NG', 'NI', 'PE', 'PH',
        'SN', 'TG', 'TJ', 'UG', 'ZM', 'ZW',
    ],
    val=[
        'BF', 'BJ', 'BO', 'CO', 'DR', 'GA', 'GN', 'GY', 'HT', 'NM', 'SL', 'TD',
        'TZ',
    ],
    test=[
        'AM', 'AO', 'BU', 'CI', 'EG', 'ET', 'KH', 'KY', 'ML', 'NP', 'PK', 'RW',
        'SZ',
    ],
)
SPLITS['trainval'] = [*SPLITS['train'], *SPLITS['val']]

In [None]:
import scipy.stats
import sklearn.neighbors

In [None]:
# First two characters of each DHSID_EA index value, the same derivation used
# for df['cc'].
results_df['cc'] = results_df.index.str[:2]

In [None]:
def run(knn, label, dmsp, trainsplit='train', testsplit='test'):
    '''Fits `knn` on one split and predicts `label` on another, using the
    single feature results_df['nl_mean'].

    Reads the notebook-level `df`, `results_df` and `SPLITS`.

    Args
    - knn: estimator exposing fit(X, Y) and predict(X); it is refit in place
    - label: str, column of `df` to use as the regression target
    - dmsp: bool, if True keep only rows with year <= 2011, otherwise only
      rows with year > 2011
    - trainsplit: str, key of SPLITS selecting the countries to fit on
    - testsplit: str, key of SPLITS selecting the countries to predict on

    Returns: (preds, test_Y)
    - preds: predictions of `knn` for the selected test rows
    - test_Y: the corresponding values of df[label]
    '''
    year_mask = (df['year'] <= 2011) if dmsp else (df['year'] > 2011)

    def select(split):
        # Rows in the year range, in the split's countries, with a non-missing label.
        ids = df.index[year_mask & df['cc'].isin(SPLITS[split]) & df[label].notna()]
        features = results_df.loc[ids, 'nl_mean'].values.reshape(-1, 1)
        targets = df.loc[ids, label].values
        return features, targets

    train_X, train_Y = select(trainsplit)
    test_X, test_Y = select(testsplit)

    knn.fit(train_X, train_Y)
    return knn.predict(test_X), test_Y

In [None]:
def _pooled_r2(k, label, trainsplit, testsplit):
    '''Returns the squared Pearson r between predictions and labels for a
    uniform-weight k-NN regressor with `k` neighbors.

    The regressor is fit and evaluated via run() once with dmsp=True and once
    with dmsp=False; the two sets of predictions and labels are concatenated
    before the correlation is computed.
    '''
    knn = sklearn.neighbors.KNeighborsRegressor(
        n_neighbors=k, weights='uniform', algorithm='auto')
    preds_dmsp, labels_dmsp = run(knn, label, True, trainsplit, testsplit)
    preds_viirs, labels_viirs = run(knn, label, False, trainsplit, testsplit)
    return scipy.stats.pearsonr(
        np.concatenate([preds_dmsp, preds_viirs]),
        np.concatenate([labels_dmsp, labels_viirs])
    )[0]**2


# For each label: pick k in 1..20 by r^2 on 'val' (fit on 'train'), then report
# r^2 on 'test' after refitting on 'trainval' with the chosen k.
for label in label_cols:
    print(f'=========== {label:15s} ============')
    # Start below any attainable r^2 so that a score of exactly 0 can still
    # select a k; only NaN scores are never selected.
    best_r2 = -np.inf
    best_k = None
    for k in range(1, 21):
        r2 = _pooled_r2(k, label, 'train', 'val')
        print(f'k={k:2d}, {label:15s} r^2 = {r2:.3f}')
        if r2 > best_r2:
            best_r2 = r2
            best_k = k
    if best_k is None:
        # Every validation score was NaN; fail here with a clear message rather
        # than later when fitting or formatting with best_k=None.
        raise ValueError(
            f'no valid validation r^2 for label {label!r}; cannot select k')
    r2 = _pooled_r2(best_k, label, 'trainval', 'test')
    print(f'FINAL: k={best_k:2d}, {label:15s} r^2 = {r2:.2f}')