## Steps
1. Upload a self-storage file
2. Parse out a set of the lat/lon
3. Download a dataset of the images for all lat/lons
4. Extract embeddings for all feature embedding methods

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import io as b_io
import os
from pathlib import Path

import dill

# Import necessary packages
from mosaiks import config as c
from mosaiks import transforms
from mosaiks.utils.imports import *

env variable MOSAIKS_HOME not defined; setting to: "/home/ubuntu/cs230/mosaiks-paper"
If not desired, please reset os.environ["MOSAIKS_NAME"]


In [2]:
import requests
import pandas as pd
from tqdm import tqdm
import torch
from torchvision import models
from torchvision import transforms
import dill
import time
import skimage.io
import skimage.transform
from mosaiks.featurization import chunks
from mosaiks.featurization import featurize, featurize_and_save
import io as python_io
from torch import nn
import pickle
import torchvision.transforms
from PIL import Image

In [3]:
filename = "train_df.tsv"
filename = c.data_dir + "/raw/" + filename

data = pd.read_csv(filename, sep='\t', header=0, index_col=False)
data

Unnamed: 0,price,domain,website,places_id,address,city,state,country,dimensions,dim1,...,oversized_doors,power_outlet,premium_location,rv_parking,parking,rv_parking_covered,rv_parking_uncovered,stair_access,unusual_size,wine_storage
0,72.0,storagesense.com,https://www.storagesense.com/location/usa/tx/h...,ChIJkxtdA8mzQIYROlwVeGwwh6I,18006 US-59,Humble,TX,United States,5' x 5',5.0,...,0,0,0,0,0,0,0,1,0,0
1,77.0,storagesense.com,https://www.storagesense.com/location/usa/tx/h...,ChIJkxtdA8mzQIYROlwVeGwwh6I,18006 US-59,Humble,TX,United States,5' x 5',5.0,...,0,0,0,0,0,0,0,0,0,0
2,105.0,storagesense.com,https://www.storagesense.com/location/usa/tx/h...,ChIJkxtdA8mzQIYROlwVeGwwh6I,18006 US-59,Humble,TX,United States,5' x 10',5.0,...,0,0,0,0,0,0,0,1,0,0
3,119.0,storagesense.com,https://www.storagesense.com/location/usa/tx/h...,ChIJkxtdA8mzQIYROlwVeGwwh6I,18006 US-59,Humble,TX,United States,5' x 10',5.0,...,0,0,0,0,0,0,0,0,0,0
4,118.0,storagesense.com,https://www.storagesense.com/location/usa/tx/h...,ChIJkxtdA8mzQIYROlwVeGwwh6I,18006 US-59,Humble,TX,United States,5' x 15',5.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22096,19.0,extraspace.com,https://www.extraspace.com/storage/facilities/...,ChIJEdlQ-KvV2IcRhNRyDF9stDc,1289 Vero Ln,Ellisville,MO,United States,5' x 5',5.0,...,0,0,0,0,0,0,0,0,1,0
22097,37.0,extraspace.com,https://www.extraspace.com/storage/facilities/...,ChIJEdlQ-KvV2IcRhNRyDF9stDc,1289 Vero Ln,Ellisville,MO,United States,4' x 5',4.0,...,0,0,0,0,0,0,0,0,0,0
22098,35.0,extraspace.com,https://www.extraspace.com/storage/facilities/...,ChIJEdlQ-KvV2IcRhNRyDF9stDc,1289 Vero Ln,Ellisville,MO,United States,5' x 5',5.0,...,0,0,0,0,0,0,0,0,0,0
22099,193.0,extraspace.com,https://www.extraspace.com/storage/facilities/...,ChIJEdlQ-KvV2IcRhNRyDF9stDc,1289 Vero Ln,Ellisville,MO,United States,10' x 11',10.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
latlons = data[['latitude', 'longitude', 'places_id']].drop_duplicates()
latlons = latlons.to_numpy()
print(latlons)

[[29.9796496 -95.2753524 'ChIJkxtdA8mzQIYROlwVeGwwh6I']
 [42.6765096 -84.5239802 'ChIJBXG-W2DBIogRs552_nWUOws']
 [30.6371446 -96.3810666 'ChIJJ6ZCoiSCRoYRjFNyzETgjMc']
 ...
 [43.6897019 -116.6167439 'ChIJYUFELt6zr1QRj4ciTNbBr1c']
 [38.3391359 -76.561331 'ChIJPeCXddVlt4kRaAvPcjj2HwU']
 [38.5971049 -90.585272 'ChIJEdlQ-KvV2IcRhNRyDF9stDc']]


In [8]:
api_key = "<insert api key here>"
url = "https://maps.googleapis.com/maps/api/staticmap?"
output_path = "../../data/raw/imagery/self-storage/"

In [7]:
# Helper function to generate the end file name 
def generate_image_file_name(lat, lon, zoom, pix):
  return f"{lat}_{lon}_{zoom}_{pix}_{pix}.png"

# test function to verify file name output
def print_output_file_names(latlons, limit):
  for latlon in latlons[:limit]:
    lat, lon = latlon[0], latlon[1]
    print(generate_image_file_name(lat, lon, 16, 640))
    
# function to actually download the images
def download_images(latlons, start=0, end=0):
  # get and save the images
  for idx in tqdm(range(start, end)):
    latlon = latlons[idx]
    lat, lon = latlon[0], latlon[1]
    request_url = url + f"center={lat},{lon}&zoom={16}&size={640}x{640}&maptype=satellite&key={api_key}"
    r = requests.get(url + f"center={lat},{lon}&zoom={16}&size={640}x{640}&maptype=satellite&key={api_key}")
    if (r.status_code != 200):
        print(f"Could not download idx {idx}, http code {r.status_code}")
        break
    filename = output_path + generate_image_file_name(lat, lon, 16, 640)
    with open(filename, 'wb') as f:
      f.write(r.content)

In [8]:
print_output_file_names(latlons, 10)
print(latlons.shape)
# download_images(latlons, 0, latlons.shape[0])

29.9796496_-95.2753524_16_640_640.png
42.6765096_-84.5239802_16_640_640.png
30.6371446_-96.3810666_16_640_640.png
35.9843818_-78.5389999_16_640_640.png
39.9787487_-76.7331381_16_640_640.png
34.7439492_-86.5976824_16_640_640.png
33.550876_-84.3705764_16_640_640.png
42.455787_-82.962877_16_640_640.png
35.767771_-80.8561524_16_640_640.png
36.0670826_-79.4254095_16_640_640.png
(1707, 3)


In [5]:
filename = "test_id_df.tsv"
filename = c.data_dir + "/raw/" + filename

data_test_id = pd.read_csv(filename, sep='\t', header=0, index_col=False)

latlons_test_id = data_test_id[['latitude', 'longitude', 'places_id']].drop_duplicates()
latlons_test_id = latlons_test_id.to_numpy()
print(latlons_test_id)

[[42.9750012 -73.8608553 'ChIJHz19G2ZA3okR7MeMy1ZyHqg']
 [30.3587044 -89.1606283 'ChIJ6wcat7w8nIgR055G7AZlRUs']
 [32.7674327 -97.4071377 'ChIJG87HxGJzToYRj1wV2KeAF2w']
 ...
 [33.6976813 -117.8172083 'ChIJT5IbDwPd3IAR06KtP9VK1HI']
 [38.456701 -122.6774424 'ChIJm2syBz1HhIARPITVa6_pkXs']
 [42.1509105 -72.4577793 'ChIJi_oB-UPD5okR_C1mlcYO-hk']]


In [9]:
print_output_file_names(latlons_test_id, 10)
print(latlons_test_id.shape)

42.9750012_-73.8608553_16_640_640.png
30.3587044_-89.1606283_16_640_640.png
32.7674327_-97.4071377_16_640_640.png
42.7469932_-84.5420907_16_640_640.png
30.0518011_-95.4375312_16_640_640.png
41.6381235_-88.0788409_16_640_640.png
40.3762789_-105.0789257_16_640_640.png
33.4603839_-82.183918_16_640_640.png
40.2345049_-75.6230819_16_640_640.png
42.3798917_-83.3148246_16_640_640.png
(427, 3)


In [77]:
# download_images(latlons_test_id, 0, latlons_test_id.shape[0])

100%|███████████████████████████████████████████████████████████████████████████████| 427/427 [01:58<00:00,  3.62it/s]


In [10]:
filename = "test_ood_df.tsv"
filename = c.data_dir + "/raw/" + filename

data_test_ood = pd.read_csv(filename, sep='\t', header=0, index_col=False)

latlons_test_ood = data_test_ood[['latitude', 'longitude', 'places_id']].drop_duplicates()
latlons_test_ood = latlons_test_ood.to_numpy()
print_output_file_names(latlons_test_ood, 10)
print(latlons_test_ood.shape)

40.8229805_-72.9790197_16_640_640.png
43.124286_-73.768612_16_640_640.png
43.1653212_-73.7051917_16_640_640.png
43.1239665_-70.7648303_16_640_640.png
36.7880659_-76.0656629_16_640_640.png
36.8018954_-76.3811516_16_640_640.png
43.3165168_-73.5950938_16_640_640.png
42.4973221_-70.9246327_16_640_640.png
38.0174119_-78.4176878_16_640_640.png
41.346803_-73.2568649_16_640_640.png
(162, 3)


In [80]:
# download_images(latlons_test_ood, 0, latlons_test_ood.shape[0])

100%|███████████████████████████████████████████████████████████████████████████████| 162/162 [00:46<00:00,  3.48it/s]


## Now we featurize
Note we have two Dataset implementations, the first one is more general. The second one is what is needed to work with the MOSAIKs feature generation.

In [26]:
class SelfStorageImageDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        data_home,
        latlons,
        transform=None
    ):
        self.latlons = latlons
        self.data_home = data_home
        self.transform=transform

    def __len__(self):
        return len(self.latlons)

    def __getitem__(self, i):
        latlon = self.latlons[i]
        im = io.load_img_from_local(latlon, self.data_home, 16, 640)
        if len(im.shape) < 3:
            im = np.stack((im, im, im), axis=2)
        if im.shape[-1] > 3:
            im = im[:, :, :3]
        if im.shape[-1] == 1:
            im = np.concatenate((im, im, im), axis=2)
        x_i = im.transpose(2, 0, 1)
        if self.transform is not None:
#             x_i = torch.from_numpy(x_i)
            x_i = self.transform(x_i)
        return x_i

In [54]:
def generate_key_name(latlon, image_dir, zoom, pix):
    lat, lon = latlon[0], latlon[1]
    outkey = f"{image_dir}/{lat}_{lon}_{zoom}_{pix}_{pix}.png"
    return outkey

class SelfStorageImageDatasetTwo(torch.utils.data.Dataset):
    def __init__(
        self,
        data_home,
        latlons,
        transform=None
    ):
        self.latlons = latlons
        self.data_home = data_home
        self.transform=transform

    def __len__(self):
        return len(self.latlons)

    def __getitem__(self, i):
        latlon = self.latlons[i]
        fpath = generate_key_name(latlon, self.data_home, 16, 640)
        im = Image.open(fpath)
        if self.transform is not None:
#             x_i = torch.from_numpy(x_i)
            im = self.transform(im)
        return im

In [39]:
print(latlons.shape)
print(latlons_test_id.shape)
print(latlons_test_ood.shape)

# We create a combined dataset
all_latlons = np.concatenate((latlons, latlons_test_id, latlons_test_ood), axis=0)
print(all_latlons.shape)
print(all_latlons[:5])

image_dir = c.data_dir + "/raw/imagery/self-storage/"

places_ids = all_latlons[:,2]
print(places_ids.shape)
print(places_ids[:5])

ll_formatted = all_latlons[:,:2]
print(ll_formatted.shape)
print(ll_formatted[:5])

(1707, 3)
(427, 3)
(162, 3)
(2296, 3)
[[29.9796496 -95.2753524 'ChIJkxtdA8mzQIYROlwVeGwwh6I']
 [42.6765096 -84.5239802 'ChIJBXG-W2DBIogRs552_nWUOws']
 [30.6371446 -96.3810666 'ChIJJ6ZCoiSCRoYRjFNyzETgjMc']
 [35.9843818 -78.5389999 'ChIJGxPXPm9TrIkR8DTsN0sjT24']
 [39.9787487 -76.7331381 'ChIJQe7mgp2OyIkRyP-yL8dVN00']]
(2296,)
['ChIJkxtdA8mzQIYROlwVeGwwh6I' 'ChIJBXG-W2DBIogRs552_nWUOws'
 'ChIJJ6ZCoiSCRoYRjFNyzETgjMc' 'ChIJGxPXPm9TrIkR8DTsN0sjT24'
 'ChIJQe7mgp2OyIkRyP-yL8dVN00']
(2296, 2)
[[29.9796496 -95.2753524]
 [42.6765096 -84.5239802]
 [30.6371446 -96.3810666]
 [35.9843818 -78.5389999]
 [39.9787487 -76.7331381]]


In [75]:
# This is creating mosaiks embeddings

def create_mosaiks_features():
    resize = torchvision.transforms.Resize((256, 256))
    to_tensor = torchvision.transforms.ToTensor()
    transform = torchvision.transforms.Compose([resize, to_tensor])
    dataset = SelfStorageImageDatasetTwo(image_dir, ll_formatted, transform=transform)

    base_image_dir = Path(c.data_dir) / "raw" / "imagery"
    image_folder = base_image_dir / f"CONTUS_UAR"
    out_fpath = Path(c.features_dir) / f"CONTUS_UAR_self_storage.pkl"

    X_lift, _, _ = featurize(image_folder, c, hijack=True, alt_dataset=dataset)

    data = {}
    
    bio_features = python_io.BytesIO()
    np.save(bio_features, np.vstack(X_lift))
    data["X"] = bio_features.getvalue()
    data["latlon"] = ll_formatted
    data["ids_X"] = places_ids
    with open(out_fpath, "wb") as f:
        dill.dump(data, f, protocol=4)

In [76]:
create_mosaiks_features()
    

dataset size 100000


  0%|                                                                                                                               | 0/2296 [00:00<?, ?it/s]

zca bias 0.001
filters shape (4096, 1, 3, 3)
BasicCoatesNgNet()
batch size: 8
generating features 0 to 1024


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2296/2296 [03:06<00:00, 12.29it/s]
  0%|                                                                                                                               | 0/2296 [00:00<?, ?it/s]

batch size: 8
generating features 1024 to 2048


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2296/2296 [03:07<00:00, 12.27it/s]
  0%|                                                                                                                               | 0/2296 [00:00<?, ?it/s]

batch size: 8
generating features 2048 to 3072


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2296/2296 [03:07<00:00, 12.23it/s]
  0%|                                                                                                                               | 0/2296 [00:00<?, ?it/s]

batch size: 8
generating features 3072 to 4096


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2296/2296 [03:07<00:00, 12.23it/s]


(2296, 8192)
featurization complete, featurized 2296 training points 8192 output features, took 749.4645299911499 seconds


In [77]:
with open(out_fpath, "rb") as f:
    mosaiks_data = np.load(f, allow_pickle=True)
print(len(mosaiks_data["X"]))
print(type(mosaiks_data["X"]))
print(len(mosaiks_data["latlon"]))
temp = b_io.BytesIO(mosaiks_data["X"])
features = np.load(temp, allow_pickle=True)
print(features.shape)
latlons_samp = mosaiks_data["latlon"]
ids_x = mosaiks_data["ids_X"]

X = pd.DataFrame(features, index=ids_x)
lls = pd.DataFrame(latlons_samp, index=ids_x, columns=["lat", "lon"])

lls
X

75235456
<class 'bytes'>
2296
(2296, 8192)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8182,8183,8184,8185,8186,8187,8188,8189,8190,8191
ChIJkxtdA8mzQIYROlwVeGwwh6I,0.384043,0.378428,0.357258,0.375894,0.366386,0.374853,0.364184,0.389692,0.365127,0.382412,...,0.368015,0.359384,0.375073,0.361658,0.330489,0.385581,0.392920,0.355400,0.376134,0.385526
ChIJBXG-W2DBIogRs552_nWUOws,0.382786,0.369494,0.366167,0.378435,0.393333,0.350969,0.369254,0.385442,0.383503,0.379393,...,0.372331,0.375535,0.368890,0.374361,0.346412,0.371554,0.384286,0.348090,0.381885,0.376672
ChIJJ6ZCoiSCRoYRjFNyzETgjMc,0.531949,0.474286,0.484620,0.463504,0.491835,0.466955,0.467441,0.515079,0.469910,0.495325,...,0.446749,0.430863,0.476550,0.454680,0.409660,0.454743,0.547895,0.426807,0.510109,0.472175
ChIJGxPXPm9TrIkR8DTsN0sjT24,0.469900,0.426188,0.429943,0.426913,0.442456,0.422287,0.423612,0.461889,0.429026,0.445179,...,0.411375,0.402877,0.426170,0.421360,0.376102,0.418415,0.481126,0.396052,0.455610,0.429907
ChIJQe7mgp2OyIkRyP-yL8dVN00,0.594074,0.567072,0.563303,0.553790,0.586708,0.547097,0.555695,0.568327,0.560192,0.573928,...,0.536736,0.532844,0.562912,0.541674,0.510243,0.557990,0.587526,0.519417,0.583221,0.561136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ChIJRR3QlPfX54kRk_-jdB-UnJk,0.481023,0.443626,0.444473,0.469252,0.466068,0.470426,0.441204,0.487363,0.462942,0.470110,...,0.453040,0.439552,0.465897,0.445648,0.414380,0.446862,0.498718,0.434712,0.478542,0.463243
ChIJ176SKpF744kRqd4b-EaZuW4,0.528737,0.496978,0.485690,0.522888,0.494507,0.489379,0.499818,0.532606,0.504708,0.514994,...,0.497608,0.482142,0.501553,0.485365,0.432071,0.507652,0.541611,0.474635,0.516905,0.503688
ChIJQa049JwR3okRPMVp5oeCFns,0.272330,0.266939,0.256600,0.296160,0.276993,0.248945,0.282464,0.276742,0.290259,0.273600,...,0.266632,0.266409,0.256605,0.271981,0.229940,0.274591,0.277199,0.264117,0.271437,0.275412
ChIJr_-EegO354kRitPgb5lyFmo,0.434869,0.417079,0.414522,0.402296,0.414108,0.417585,0.405776,0.430738,0.399981,0.421437,...,0.403362,0.389197,0.418295,0.394851,0.391658,0.414348,0.440817,0.399178,0.423498,0.419227


### Note
The following code also exists in multitask.ipynb. I was too lazy to extract it out into an actual python module.

In [49]:
def resize_images(images):
    images_resized = []
    for im in images:
        im = im[:, :, :3]
        images_resized.append(
            skimage.transform.resize(
                im, (224, 224), mode="constant", anti_aliasing=True
            )
        )
    images = np.stack(images_resized, axis=0)
    return images

def resnet152_features(images, model, batch_size=60, gpu=True):
    results = []
    if gpu:
        model = model.cuda()
    for images_chunk in chunks(images, batch_size):
        if len(images_chunk.shape) < 4:
            images_chunk = images[np.newaxis, :, :, :]
        images_chunk = images_chunk.astype("float32").transpose(0, 3, 1, 2)
        images_torch = torch.from_numpy(images_chunk)
        if gpu:
            images_torch = images_torch.cuda()
        x = model.conv1(images_torch)
        x = model.bn1(x)
        x = model.relu(x)
        x = model.maxpool(x)
        x = model.layer1(x)
        x = model.layer2(x)
        x = model.layer3(x)
        x = model.layer4(x)
        x = model.avgpool(x)
        x = x.view(x.size(0), -1)
        x = x.cpu().data.numpy()
        results.append(x)
    torch.cuda.empty_cache()
    return np.concatenate(results, axis=0)

def resnet18multitask_features(images, model, batch_size=60, gpu=True):
    results = []
    if gpu:
        model = model.cuda()
    for images_chunk in chunks(images, batch_size):
        if len(images_chunk.shape) < 4:
            images_chunk = images[np.newaxis, :, :, :]
        images_chunk = images_chunk.astype("float32").transpose(0, 3, 1, 2)
        images_torch = torch.from_numpy(images_chunk)
        if gpu:
            images_torch = images_torch.cuda()
        x = model.resnet18(images_torch)
        x = x.view(x.size(0), -1)
        x = x.cpu().data.numpy()
        results.append(x)
    torch.cuda.empty_cache()
    return np.concatenate(results, axis=0)

def resnet18_features(images, model, batch_size=60, gpu=True):
    results = []
    if gpu:
        model = model.cuda()
    for images_chunk in chunks(images, batch_size):
        if len(images_chunk.shape) < 4:
            images_chunk = images[np.newaxis, :, :, :]
        images_chunk = images_chunk.astype("float32").transpose(0, 3, 1, 2)
        images_torch = torch.from_numpy(images_chunk)
        if gpu:
            images_torch = images_torch.cuda()
        x = model.conv1(images_torch)
        x = model.bn1(x)
        x = model.relu(x)
        x = model.maxpool(x)
        x = model.layer1(x)
        x = model.layer2(x)
        x = model.layer3(x)
        x = model.layer4(x)
        x = model.avgpool(x)
        x = x.view(x.size(0), -1)
        x = x.cpu().data.numpy()
        results.append(x)
    torch.cuda.empty_cache()
    return np.concatenate(results, axis=0)

def full_featurize(dataset, model_ft, model_type, batch_size, num_workers):
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )
    output_features = []
    data = {}
    ids = []
    print(len(dataloader))
    for j, X_batch in enumerate(dataloader):
        print("batch:", j)
        t = time.time()
        print(X_batch.shape)
        X_batch = resize_images(X_batch.numpy())
        print(X_batch.shape)
        if model_type == 'resnet152':
            X_features = resnet152_features(X_batch, model_ft)
        elif model_type == 'resnet18multitask':
            X_features = resnet18multitask_features(X_batch, model_ft)
        else:
            X_features = resnet18_features(X_batch, model_ft)
        e = time.time()
        print(f"batch: {j} took {e - t}")
        output_features.append(X_features)
    bio_features = python_io.BytesIO()
    np.save(bio_features, np.vstack(output_features))
    data["X"] = bio_features.getvalue()
    return data

In [16]:
# temporarily place this here for convenience

class MultiTaskModel(nn.Module):
    def __init__(self):
        super(MultiTaskModel, self).__init__()
        #shared part
        self.resnet18 = models.resnet18(pretrained=False)
        num_ftrs = self.resnet18.fc.in_features
        self.resnet18.fc = nn.Identity()
        
        self.sampling = nn.ModuleList()
        self.sampling.add_module('UAR', nn.Linear(num_ftrs, len(tasks_UAR)))
        self.sampling.add_module('POP', nn.Linear(num_ftrs, len(tasks_POP)))

    def forward(self, X, sampling):
        # shared part
        resnet_output = self.resnet18(X)

        # sampling specific parts
        if sampling == 'UAR':
            return self.sampling.UAR(resnet_output)
        elif sampling == 'POP':
            return self.sampling.POP(resnet_output)

In [7]:
model_path = "../../data/int/deep_models/a175be07-88cb-4b9c-89c9-c5c3dcae333d/checkpoints/test/epoch_40_POP.pickle"
with open(model_path, "rb") as f:
    model_checkpoint = pickle.load(f)
print(model_checkpoint["val_r2"])
    
tasks_UAR = ["treecover", "elevation", "population",]
tasks_POP = ["nightlights", "income", "roads", "housing",]
    
# bio = b_io.BytesIO(model_checkpoint["model_bytes"])
# multitaskmodel = MultiTaskModel()
# multitaskmodel.load_state_dict(torch.load(bio))

[0.74134808 0.35807314 0.39815477 0.42959545]


In [61]:
task_name = "housing"

resnet18_path = f"../../data/output/cnn_comparison/resnet18_{task_name}.pickle"
with open(resnet18_path, "rb") as f:
    resnet18_results = pickle.load(f)
    resnet18_model = nightlights_results["model"]
    print(resnet18_model)
    test_r2 = resnet18_results["test_r2"]
    train_r2 = resnet18_results["train_r2"]
    initial_lr = resnet18_results["initial_lr"]
    print(test_r2)
    print(train_r2)
    print(initial_lr)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [62]:
model_type = "resnet18"
out_file = c.features_dir + f"/{model_type}_self_storage.pkl"
this_c = c.features["pretrained"]

if model_type == "resnet152":
    model_ft = getattr(models, this_c["model_type"])(pretrained=True)
elif model_type == "resnet18multitask":
    model_ft = multitaskmodel
else:
    model_ft = resnet18_model
    out_file = c.features_dir + f"/{model_type}_{task_name}_self_storage.pkl"

    
dataset = SelfStorageImageDataset(image_dir, ll_formatted)

print(len(dataset))

results_dict = full_featurize(dataset, model_ft, model_type, this_c["batch_size"], 4)
results_dict["latlon"] = ll_formatted
results_dict["ids_X"] = places_ids
with open(out_file, "wb") as f:
    dill.dump(results_dict, f, protocol=4)

2296
18
batch: 0
torch.Size([128, 3, 640, 640])
(128, 224, 224, 3)
batch: 0 took 4.1650307178497314
batch: 1
torch.Size([128, 3, 640, 640])
(128, 224, 224, 3)
batch: 1 took 3.6168813705444336
batch: 2
torch.Size([128, 3, 640, 640])
(128, 224, 224, 3)
batch: 2 took 3.0275909900665283
batch: 3
torch.Size([128, 3, 640, 640])
(128, 224, 224, 3)
batch: 3 took 3.346144199371338
batch: 4
torch.Size([128, 3, 640, 640])
(128, 224, 224, 3)
batch: 4 took 2.6356213092803955
batch: 5
torch.Size([128, 3, 640, 640])
(128, 224, 224, 3)
batch: 5 took 3.473970413208008
batch: 6
torch.Size([128, 3, 640, 640])
(128, 224, 224, 3)
batch: 6 took 2.8530502319335938
batch: 7
torch.Size([128, 3, 640, 640])
(128, 224, 224, 3)
batch: 7 took 3.300962209701538
batch: 8
torch.Size([128, 3, 640, 640])
(128, 224, 224, 3)
batch: 8 took 3.070094585418701
batch: 9
torch.Size([128, 3, 640, 640])
(128, 224, 224, 3)
batch: 9 took 3.1492321491241455
batch: 10
torch.Size([128, 3, 640, 640])
(128, 224, 224, 3)
batch: 10 took 3

In [63]:
# see that we can load the data correctly

with open(out_file, "rb") as f:
    data = np.load(f, allow_pickle=True)
print(len(data["X"]))
print(type(data["X"]))
print(len(data["latlon"]))
temp = b_io.BytesIO(data["X"])
features = np.load(temp, allow_pickle=True)
print(features.shape)
latlons_samp = data["latlon"]
ids_x = data["ids_X"]

X = pd.DataFrame(features, index=ids_x)
lls = pd.DataFrame(latlons_samp, index=ids_x, columns=["lat", "lon"])

lls
X

4702336
<class 'bytes'>
2296
(2296, 512)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
ChIJkxtdA8mzQIYROlwVeGwwh6I,0.362102,0.776538,0.003151,0.648336,0.374993,0.403151,0.309506,1.370439,0.809958,0.380229,...,0.011459,0.230209,0.428672,0.683530,0.651230,0.212848,0.396864,0.305671,0.296939,0.341683
ChIJBXG-W2DBIogRs552_nWUOws,0.366421,0.812872,0.000000,0.639938,0.390527,0.226219,0.291094,1.524958,0.812397,0.410513,...,0.000000,0.287815,0.440244,0.632527,0.801431,0.084567,0.480261,0.351250,0.310578,0.420584
ChIJJ6ZCoiSCRoYRjFNyzETgjMc,0.399603,0.824294,0.000943,0.591977,0.385562,0.390371,0.342763,1.413088,0.776307,0.348283,...,0.022370,0.222215,0.460231,0.689687,0.737539,0.183094,0.424870,0.312123,0.348687,0.299461
ChIJGxPXPm9TrIkR8DTsN0sjT24,0.345631,0.772980,0.012099,0.571844,0.389972,0.338027,0.282987,1.463810,0.770247,0.505308,...,0.010233,0.260444,0.476785,0.657909,0.761358,0.139124,0.448243,0.216099,0.376791,0.445477
ChIJQe7mgp2OyIkRyP-yL8dVN00,0.375611,0.837803,0.001239,0.582366,0.311899,0.229566,0.344947,1.429547,0.847129,0.352953,...,0.000000,0.272994,0.426854,0.744326,0.688746,0.179017,0.442166,0.373351,0.346714,0.337356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ChIJRR3QlPfX54kRk_-jdB-UnJk,0.360553,0.735428,0.000000,0.557830,0.293469,0.266132,0.382882,1.373090,0.916476,0.364418,...,0.001090,0.348099,0.490418,0.693179,0.623266,0.166420,0.447391,0.501788,0.363978,0.398947
ChIJ176SKpF744kRqd4b-EaZuW4,0.404990,0.738164,0.001969,0.588642,0.361527,0.326554,0.468034,1.213148,0.864551,0.316781,...,0.029658,0.207367,0.486504,0.730461,0.736059,0.165427,0.398496,0.425707,0.398306,0.444022
ChIJQa049JwR3okRPMVp5oeCFns,0.437733,0.726147,0.023331,0.500812,0.309464,0.362536,0.353452,1.084302,0.796487,0.463814,...,0.051688,0.401305,0.384011,0.653364,0.529130,0.210341,0.414739,0.465969,0.322216,0.318723
ChIJr_-EegO354kRitPgb5lyFmo,0.296701,0.739057,0.000000,0.674663,0.393355,0.245173,0.343771,1.373771,0.687160,0.392951,...,0.003943,0.224305,0.407906,0.598342,0.825396,0.105832,0.424477,0.246631,0.332108,0.499387
