Using the images marked as valid per cluster, we pass them through the CNN and extract their feature vectors. The results are stored on a per-country basis. For example, all Malawi feature extractions will go into results/malawi_2016/cnn.

In [None]:
!pip install hub

In [None]:
# If you have not registered on hub before, run this cell.
!hub register

In [None]:
!hub login

In [1]:
import os
import shutil
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import pickle

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import copy

In [3]:
# Project root; every path below is derived from it.
BASE_DIR = '.'

# Inputs live under <BASE_DIR>/data.
_data_root = os.path.join(BASE_DIR, 'data')
COUNTRIES_DIR = os.path.join(_data_root, 'countries')
PROCESSED_DIR = os.path.join(_data_root, 'processed')  # holds image_download_actual.csv
CNN_TRAIN_IMAGE_DIR = os.path.join(_data_root, 'cnn_images')

# Per-country outputs are written under <BASE_DIR>/results.
RESULTS_DIR = os.path.join(BASE_DIR, 'results')

# Despite the `_DIR` suffix this is the path of the serialized model *file*
# that is handed to torch.load further down.
CNN_DIR = os.path.join(BASE_DIR, 'models', 'trained_model.pt')

In [4]:
# Create the results root plus one sub-folder per surveyed country.
# `exist_ok=True` makes the cell safe to re-run.
os.makedirs(RESULTS_DIR, exist_ok=True)
for country in ('malawi_2016', 'ethiopia_2015', 'nigeria_2015'):
    country_results_dir = os.path.join(RESULTS_DIR, country)
    os.makedirs(country_results_dir, exist_ok=True)

# Feature extract with CNN
If you have run this step before, you can skip it and run the commented out code in the next section to quick-start.

In [5]:
# Load the per-image metadata table. The cells below rely on its `is_train`,
# `country`, `cluster_lat` and `cluster_lon` columns.
df_images = pd.read_csv(os.path.join(PROCESSED_DIR, 'image_download_actual.csv'))

In [6]:
# Preview the first rows of the image metadata.
df_images.head()

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,country,nightlights_bin,is_train
0,13.759163226605976_5.602468666465976_13.714247...,13.759163,5.602469,13.714247,5.557553,1.667379,0.0,ng,0,False
1,6.485523598426016_3.288079612774024_6.51546744...,6.485524,3.28808,6.515467,3.332995,14.054602,12.430817,ng,2,False
2,-13.222906_33.877838764205976_-13.222906_33.83...,-13.222906,33.877839,-13.222906,33.832923,1.809202,0.0,mw,0,False
3,-13.202325078598008_33.93874615719602_-13.2172...,-13.202325,33.938746,-13.217297,33.96869,2.155243,0.0,mw,0,False
4,6.552117002565977_3.2219584555639833_6.5072012...,6.552117,3.221958,6.507201,3.192015,3.832372,3.141591,ng,1,False


In [7]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
backend_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(backend_name)
print(f'Using {device} as backend')

# `map_location` places the stored tensors on the selected device.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
# NOTE(review): recent PyTorch releases default to `weights_only=True`, which
# rejects fully pickled modules — confirm against the installed version.
model = torch.load(CNN_DIR, map_location=device)

Using cpu as backend


In [8]:
# Inspect the classifier head as loaded, before it is truncated below.
model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=4096, out_features=3, bias=True)
)

In [9]:
# Keep only the first four classifier layers (Linear, ReLU, Dropout, Linear).
# This removes the trailing ReLU, Dropout and 3-way output Linear, so the model
# now emits the 4096-dimensional activations used as image features.
model.classifier = model.classifier[:4]

In [10]:
# Confirm the truncated head now ends at the second Linear layer.
model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
)

In [13]:
from hub import Transform, dataset

In [14]:
# Per-channel mean / std used to normalize the image after tensor conversion
# (presumably the standard ImageNet statistics — confirm against the training notebook).
_norm_mean = [0.485, 0.456, 0.406]
_norm_std = [0.229, 0.224, 0.225]

# Image preprocessing: convert to a tensor, then normalize each channel.
transformer = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
])

class ValTransformer(Transform):
    """hub transform that preprocesses one sample for the forward pass.

    Each output sample carries the `nightlights_bin` label and the image after
    it has been run through the module-level `transformer`.
    """

    def meta(self):
        """Return the schema (shape / dtype / tag) of the generated fields."""
        label_spec = {"shape": (1,), "dtype": "int", "dtag" : "text"}
        image_spec = {"shape": (1,), "dtype": "object", "chunksize": 100, "dtag" : "image"}
        return {
            "nightlights_bin": label_spec,
            "image": image_spec,
        }

    def forward(self, input):
        """Build the output sample for a single input sample.

        `input` is indexed by the keys "nightlights_bin" and "image"; both
        values are wrapped in length-1 arrays to match the schema in `meta`.
        """
        label = np.empty(1, dtype="int")
        label[0] = input["nightlights_bin"]

        # An object array lets the transformed image be stored as a single element.
        image = np.empty(1, object)
        image[0] = transformer(input["image"])

        return {"nightlights_bin": label, "image": image}
    
def to_pair(input):
    """Turn a sample mapping into the `(image, nightlights_bin)` tuple."""
    return input["image"], input["nightlights_bin"]

In [None]:
# Load data
ds = dataset.load("omdena/predicting-poverty-replication-full")

# Taking the initial validation subset (the first 7293 samples of the hub dataset),
# preprocess each sample with ValTransformer and expose (image, label) pairs.
val_ds = ds[0:7293]
val_ds = dataset.generate(ValTransformer(), val_ds)
val_ds = val_ds.to_pytorch(to_pair)

# shuffle=False keeps the batches in dataset order, so row k of `feats`
# corresponds to the k-th sample of the validation subset.
dataloader = torch.utils.data.DataLoader(val_ds, batch_size=8, shuffle=False, num_workers=4)

# Evaluation mode turns the remaining Dropout layer into a no-op.
model.eval()
classes = [0, 1, 2]
# shape of final array will be (num_validation_images, 4096)
feats = np.zeros(((~df_images['is_train']).sum(), 4096))
image_order = []
i = 0

# No gradients are needed for feature extraction; disabling autograd avoids
# building a graph for every batch and lowers memory use.
with torch.no_grad():
    for inputs, _ in tqdm(dataloader):
        inputs = inputs.to(device)
        outputs = model(inputs)
        feats[i:i + len(inputs), :] = outputs.cpu().numpy()
        i += len(inputs)

# Fail loudly if the loader produced fewer samples than `feats` has rows;
# otherwise the unfilled rows would silently stay all-zero.
assert i == len(feats), (
    f'extracted features for {i} images but expected {len(feats)} validation images'
)

In [None]:
# Display the extracted feature matrix (one row per validation image).
feats

In [None]:
# Keep only the validation rows. `.copy()` yields an independent frame, so the
# column assignment below does not act on a slice of `df_images`
# (which would raise SettingWithCopyWarning).
forward_pass_df = df_images.loc[~df_images["is_train"]].copy()

# `feats` has one row per validation image, filled in loader order, so the
# matching row is the *position* within the validation subset — not the
# original DataFrame index, which only coincides when the validation rows are
# the first rows of the CSV.
# NOTE(review): assumes the hub validation subset is stored in the same order
# as these rows — TODO confirm.
forward_pass_df['feat_index'] = np.arange(len(forward_pass_df))

forward_pass_df.head()

In [18]:
# Alias only: `df_consumption` refers to the same DataFrame object as
# `forward_pass_df` (no copy is made).
df_consumption = forward_pass_df

In [19]:
# Have we maintained all validation images? The frame used downstream must have
# one row per non-training image in `df_images`.
# NOTE(review): both sides derive from the same `is_train` column, so this check
# cannot detect a mismatch with the number of extracted feature rows.
assert len(df_consumption) == (~df_images['is_train']).sum()

In [None]:
# Preview the validation rows together with their `feat_index` column.
df_consumption.head()

## Aggregate Features
For each country, we aggregate the image features per cluster and save them to results/country/cnn

In [None]:
# Country codes as they appear in the `country` column, paired position-wise
# with the results sub-directory each country is written to.
country_abbrv = ['mw', 'eth', 'ng']
country_dir = ['malawi_2016', 'ethiopia_2015', 'nigeria_2015']

# Width of one feature vector, taken from the extracted features themselves.
n_features = feats.shape[1]

for ca, cd in zip(country_abbrv, country_dir):
    df_c = df_consumption[df_consumption['country'] == ca]
    group = df_c.groupby(['cluster_lat', 'cluster_lon'])

    x = np.zeros((len(group), n_features))
    cluster_list = []  # the corresponding clusters (lat, lon) to the x aggregate feature array

    # groupby already yields the rows of each cluster, restricted to this
    # country, so there is no need to re-filter the full frame per cluster.
    for i, ((lat, lon), im_sub) in enumerate(group):
        # average the features across all images in the cluster
        x[i, :] = feats[im_sub['feat_index'].to_numpy()].mean(axis=0)
        cluster_list.append([lat, lon])

    # save to the correct directory
    save_dir = os.path.join(RESULTS_DIR, cd, 'cnn')
    os.makedirs(save_dir, exist_ok=True)
    np.save(os.path.join(save_dir, 'cluster_feats.npy'), x)
    # The context manager closes the file handle even if pickling fails.
    with open(os.path.join(save_dir, 'cluster_order.pkl'), 'wb') as f:
        pickle.dump(cluster_list, f)
    