# **Extract and Aggregate Feature Vectors**

**Feature vectors** contains a lot of information about the image, such as which features were detected by the model [edges, textures, and other patterns].

- It is a numerical representation of an object in an image.

- Load the pre-trained VGG model

- For feature extraction only one layer in the classfier network is enough, remove the extra dense layers.

- Extract the feature vectors for each image in a cluster bypassing the image through the pre trainned VGG model.

- Store the output feature vector [Size = 4096] in a data frame.

- Merge the extracted feature with the image data frame by the image name.

- Aggregate features to get a single feature vector per cluster.

- Saves the feature vectors as numpy array.

### Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


### Add absolute path to the project folder

In [2]:
import sys

sys.path.append("/content/drive/MyDrive/UNECA_MachineLearning_Project/")

# See the full list of paths in sys.path
sys.path

['/content',
 '/env/python',
 '/usr/lib/python310.zip',
 '/usr/lib/python3.10',
 '/usr/lib/python3.10/lib-dynload',
 '',
 '/usr/local/lib/python3.10/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.10/dist-packages/IPython/extensions',
 '/root/.ipython',
 '/content/drive/MyDrive/UNECA_MachineLearning_Project/']

## Import libraries and modules

In [3]:
# Provides a way to interact with the operating system
import os

# Offers high-level file operations, such as copying, moving, and deleting files and directories.
import shutil

# A library for numerical computations in Python.
import numpy as np

# A library for data manipulation and analysis
import pandas as pd

#  Provides a progress bar for iterating over loops in Jupyter notebooks.
from tqdm.notebook import tqdm

# useful for saving and loading Python objects.
import pickle

# main library for deep learning in PyTorch
import torch

#  Contains various neural network modules and loss functions.
import torch.nn as nn

# Implements various optimization algorithms, such as SGD, Adam, etc.
import torch.optim as optim

# Import Numpy for numeric operations
import numpy as np

# It provides access to popular datasets, model architectures, and image transformations for computer vision tasks.
import torchvision
from torchvision import datasets, models, transforms

# A plotting library for creating visualizations.
import matplotlib.pyplot as plt

# Provides various time-related functions, such as measuring execution time.
import time

# Offers functions for creating shallow or deep copies of objects.
import copy

### Add Base Directory

In [4]:
# Sets the base directory variable
BASE_DIR = '/content/drive/MyDrive/UNECA_MachineLearning_Project/'

In [5]:
# Change the current working directory
os.chdir(BASE_DIR)

# Print the current working directory to verify the change
print("Current Working Directory:", os.getcwd())

Current Working Directory: /content/drive/MyDrive/UNECA_MachineLearning_Project


## Set other directory paths

In [6]:
COUNTRIES_DIR = os.path.join(BASE_DIR,  'countries')

PROCESSED_DIR = os.path.join(BASE_DIR, 'processed')

RESULTS_DIR = os.path.join(BASE_DIR, 'results')

CNN_IMAGE_DIR = os.path.join(BASE_DIR, 'cnn_images')

## Create the result sub-directory

In [8]:
# os.makedirs(RESULTS_DIR, exist_ok=True)
# for country in ['malawi_2016']:
#    os.makedirs(os.path.join(RESULTS_DIR, country), exist_ok=True)

## Locate the trained models

In [7]:
CNN_MODEL_DIR = os.path.join(BASE_DIR, 'models', 'trained_model_VGG11_145.pt')
CNN_MODEL_DIR

'/content/drive/MyDrive/UNECA_MachineLearning_Project/models/trained_model_VGG11_145.pt'

## Sets the device [cpu or gpu]

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using {device} as backend')

Using cuda as backend


## Load pre-trained model

In [10]:
model_VGG11_145 = torch.load(CNN_MODEL_DIR, map_location=device)

## View the model

In [21]:
print(model_VGG11_145)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU(inplace=True)
    (11): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU(inplace=True)
    (14): MaxPool2d(ke

In [16]:
# Access the parameters of the loaded model
model_parameters = model_VGG11_145.parameters()
#print(model_parameters)

<generator object Module.parameters at 0x78b06aae8cf0>


In [17]:
# Access the state dictionary of the loaded model
model_state_dict = model_VGG11_145.state_dict()
#print(model_state_dict)

## Load the image dataframe

In [19]:
df_images = pd.read_csv(os.path.join(PROCESSED_DIR, 'image_download_actual_malawi2016.csv'))
df_images.shape

(33900, 10)

## View the classifier

In [None]:
model_VGG11_145.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=4096, out_features=3, bias=True)
)

## Get the final layers of the classifier

In [26]:
model_VGG11_145.classifier = model_VGG11_145.classifier[:4]
model_VGG11_145.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
)

##  Extract Feature

In [29]:
transformer = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

class ForwardPassDataset(torch.utils.data.Dataset):
    def __init__(self, image_dir, transformer):
        self.image_dir = image_dir
        self.image_list = os.listdir(self.image_dir)
        self.transformer = transformer

    def __len__(self):
        return len(self.image_list)

    def __getitem__(self, index):
        image_name = self.image_list[index]

        # Load image
        X = self.filename_to_im_tensor(self.image_dir + '/' + image_name)

        # dataloaders need to return a label, but for the forward pass we don't really care
        return X, -1

    def filename_to_im_tensor(self, file):
        im = plt.imread(file)[:,:,:3]
        im = self.transformer(im)
        return im

model_VGG11_145.eval()
classes = [0, 1, 2]
# shape of final array will be (num_validation_images, 4096)
# we also want to record the image each index represents
feats = np.zeros(((~df_images['is_train']).sum(), 4096))
image_order = []
i = 0
for c in classes:
    # use the validation images to do the forward pass
    dataset = ForwardPassDataset(os.path.join(CNN_IMAGE_DIR, 'valid', str(c)), transformer)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=4)
    image_order += dataset.image_list
    # forward pass for this class
    for inputs, _ in tqdm(dataloader):
        inputs = inputs.to(device)
        outputs = model_VGG11_145(inputs)
        feats[i:i+len(inputs),:] = outputs.cpu().detach().numpy()
        i += len(inputs)

  0%|          | 0/420 [00:00<?, ?it/s]

  0%|          | 0/257 [00:00<?, ?it/s]

  0%|          | 0/172 [00:00<?, ?it/s]

In [30]:
feats.shape

(6780, 4096)

##  Creates a dataframe from the image order

In [32]:
forward_pass_df = pd.DataFrame.from_dict({'image_name': image_order, 'feat_index': np.arange(len(image_order))})
forward_pass_df.shape

(6780, 2)

In [33]:
forward_pass_df.head()

Unnamed: 0,image_name,feat_index
0,-13.260499235794024_33.76805515719602_-13.3054...,0
1,-13.275471157196016_33.783027078598_-13.305415...,1
2,-13.260499235794024_33.783027078598_-13.305415...,2
3,-13.335358842803984_33.797999_-13.305415_33.79...,3
4,-13.260499235794024_33.797999_-13.305415_33.79...,4


##  Merge the validation image and feature data frame

In [34]:
df_feature = pd.merge(left=df_images, right=forward_pass_df, on='image_name')
df_feature.shape

(6780, 11)

## Checks the length of the two data frame is equal

In [35]:
assert len(df_feature) == (~df_images['is_train']).sum()

In [38]:
df_feature.head()

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,nightlights_bin,country,is_train,feat_index
0,-17.125093842803985_35.17229723579403_-17.0951...,-17.125094,35.172297,-17.09515,35.217213,1.423239,0.025206,1,mw,False,5360
1,-17.050234235794026_35.17229723579403_-17.0951...,-17.050234,35.172297,-17.09515,35.217213,1.423239,0.025206,1,mw,False,5361
2,-17.050234235794026_35.20224107859801_-17.0951...,-17.050234,35.202241,-17.09515,35.217213,1.423239,0.025206,1,mw,False,5362
3,-17.11012192140199_35.217213_-17.09515_35.2172...,-17.110122,35.217213,-17.09515,35.217213,1.423239,0.025206,1,mw,False,5363
4,-17.125093842803985_35.232184921401995_-17.095...,-17.125094,35.232185,-17.09515,35.217213,1.423239,0.025206,1,mw,False,5364


## Aggregate Features


In [52]:
country_abbrv = ['mw']
country_dir = ['malawi_2016']

for ca, cd in zip(country_abbrv, country_dir):
    df_c = df_feature[df_feature['country'] == ca]
    group = df_c.groupby(['cluster_lat', 'cluster_lon'])
    x = np.zeros((len(group), 4096))
    cluster_list = [] # the corresponding clusters (lat, lon) to the x aggregate feature array
    for i, g in enumerate(group):
        lat, lon = g[0]
        im_sub = df_feature[(df_feature['cluster_lat'] == lat) & (df_feature['cluster_lon'] == lon)].reset_index(drop=True)
        agg_feats = np.zeros((len(im_sub), 4096))
        for j, d in im_sub.iterrows():
            agg_feats[j,:] = feats[d.feat_index]
        agg_feats = agg_feats.mean(axis=0) # averages the features across all images in the cluster

        x[i,:] = agg_feats
        cluster_list.append([lat, lon])
    # save to the correct directory
    save_dir = os.path.join(RESULTS_DIR, cd, 'vgg145')
    os.makedirs(save_dir, exist_ok=True)
    np.save(os.path.join(save_dir, 'cluster_feats_vgg145.npy'), x)
    pickle.dump(cluster_list, open(os.path.join(save_dir, 'cluster_order_vgg145.pkl'), 'wb'))

## Load the aggregate features

In [53]:
feats_path = 'results/malawi_2016/vgg145/cluster_feats_vgg145.npy'

# Load the .npy file
cluster_feats = np.load(feats_path)

In [54]:
cluster_feats.shape
# cluster_feats[0].shape

(780, 4096)

## Load the cluster order

In [56]:
# Specify the path to your .pkl file
cluster_path = 'results/malawi_2016/vgg145/cluster_order_vgg145.pkl'

# Load the .pkl file
with open(cluster_path, 'rb') as file:
    cluster_order= pickle.load(file)

# Assuming loaded_data is a list
# co1 = cluster_order[0]