## Steps

So note that we have two different sampling methods, "UAR" and "POP", and the labels we have for our 7 tasks are split between them. Thus, we must train on two different image datasets at once.

Our multitask model will also have 7 heads, so we must edit our ResNet18 to accomplish that

We should also save the model parameters for future use.

## Creating our training and test dataset

In [1]:
import io as python_io
import time
import math
from tqdm import tqdm
import sklearn.metrics
import torch
from loguru import logger
from torchvision import transforms
from torch import nn
from torchvision import models
import numpy as np
from pathlib import Path
import pickle
from mosaiks import config as c
from mosaiks import transforms as m_transforms
from mosaiks.featurization import RemoteSensingSubgridDataset
from mosaiks.utils import io, spatial
from mosaiks.solve import data_parser as parse

env variable MOSAIKS_HOME not defined; setting to: "/home/ubuntu/cs230/mosaiks-paper"
If not desired, please reset os.environ["MOSAIKS_NAME"]


In [174]:
import pandas as pd
from torchinfo import summary
from tensorboardX import SummaryWriter
from tqdm.contrib import tzip
import torch.nn.functional as F
import uuid

In [97]:
tasks_UAR = ["treecover", "elevation", "population",]
tasks_POP = ["nightlights", "income", "roads", "housing",]


tasks = [
    "treecover",
    "elevation",
    "population",
    "nightlights",
    "income",
    "roads",
    "housing",
]

data_home = Path(c.data_dir) / "raw" / "imagery"
data_home_UAR = data_home / "CONTUS_UAR"
data_home_POP = data_home / "CONTUS_POP"

In [98]:
def grab_labels(task):
    c_local = io.get_filepaths(c, task)
    c_app = getattr(c_local, task)
    Y = io.get_Y(c_local, c_app["colname"])
    lons, lats = spatial.ids_to_ll(
        Y.index,
        c.grid_dir,
        c_local.grid["area"],
        c_local.images["zoom_level"],
        c_local.images["n_pixels"],
    )
    latlons = np.vstack((np.array(lats), np.array(lons))).T.astype("float64")
    ids, Y, latlons = m_transforms.dropna_and_transform(
        Y.index.values, Y.values, latlons, c_app
    )
    return Y, latlons, ids

In [99]:
def split_train_test(ids, Y, ratio=0.8):
    seed = 0
    r = np.random.RandomState(seed=seed)
    
    n = ids.shape[0]
    
    test_n = round((1 - ratio) * n)
    train_n = n - test_n
    
    shuffled_idx = r.choice(n, n, replace=False)
    train_idx = shuffled_idx[:train_n]
    test_idx = shuffled_idx[train_n:]
    
    return ids[train_idx], Y[train_idx], ids[test_idx], Y[test_idx]

In [100]:
dfs_UAR = pd.DataFrame()
for task in tasks_UAR:
    Y_task, ll_task, ids_task = grab_labels(task)

    Y_and_ids = np.vstack([Y_task, ids_task]).T
    
    df = pd.DataFrame(Y_and_ids, columns=[task, "id"])
    df = df.set_index("id")
    if dfs_UAR.empty:
        dfs_UAR = df
    else:
        dfs_UAR = dfs_UAR.merge(df, how='outer', on='id')
        
display(dfs_UAR)
    
dfs_POP = pd.DataFrame()
for task in tasks_POP:
    Y_task, ll_task, ids_task = grab_labels(task)

    Y_and_ids = np.vstack([Y_task, ids_task]).T
    
    df = pd.DataFrame(Y_and_ids, columns=[task, "id"])
    df = df.set_index("id")
    if dfs_POP.empty:
        dfs_POP = df
    else:
        dfs_POP = dfs_POP.merge(df, how='outer', on='id')
        
display(dfs_POP)

Unnamed: 0_level_0,treecover,elevation,population
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000105,91.223158,1462.463364,
10001067,11.715897,1919.119465,2.234753
10001080,0.0,1814.329174,0.385035
10001219,0.162632,2079.739485,0.010085
1000122,89.316842,1544.912699,
...,...,...,...
99976,96.171579,156.416732,
999914,0.003158,1534.612079,1.980177
999942,0.0,1524.94773,
999949,0.0,1430.260728,


Unnamed: 0_level_0,nightlights,income,roads,housing
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000114,0.0,53277.708883,0.0,
10001457,2.793769,85920.0,5856.211968,5.003458
10001458,2.374133,85920.0,3708.122387,5.144845
10001459,2.813775,62022.434843,5458.893879,5.091291
10001463,1.613503,61151.985279,2711.385912,5.042022
...,...,...,...,...
999962,3.164506,74250.611331,6009.25,
999963,2.678835,73374.52862,5856.974,
999964,3.302048,73351.713585,8132.549,
999965,3.07826,,6581.161,


In [144]:
(ids_train_UAR, Y_train_UAR, ids_test_UAR, Y_test_UAR) = split_train_test(dfs_UAR.index.to_numpy(), dfs_UAR.loc[:, dfs_UAR.columns != 'id'].to_numpy(dtype='float32'))
(ids_train_POP, Y_train_POP, ids_test_POP, Y_test_POP) = split_train_test(dfs_POP.index.to_numpy(), dfs_POP.loc[:, dfs_POP.columns != 'id'].to_numpy(dtype='float32'))

print(Y_train_UAR.shape)
print(Y_train_POP.shape)
print(Y_train_UAR.dtype)
print(Y_train_POP.dtype)

for i in range(len(tasks_POP)):
    print(f"NaN count in train, task {tasks_POP[i]}: {np.sum(np.isnan(Y_train_POP[:,i]))}/80000")
    print(f"NaN count in test, task {tasks_POP[i]}: {np.sum(np.isnan(Y_test_POP[:,i]))}/20000")

(80000, 3)
(80000, 4)
float32
float32
NaN count in train, task nightlights: 1/80000
NaN count in test, task nightlights: 0/20000
NaN count in train, task income: 6927/80000
NaN count in test, task income: 1696/20000
NaN count in train, task roads: 0/80000
NaN count in test, task roads: 0/20000
NaN count in train, task housing: 38064/80000
NaN count in test, task housing: 9581/20000


In [136]:
def transform_img_inputs(augment):
    out = [transforms.ToPILImage(), transforms.CenterCrop(224)]
    if augment:
        out += [transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.RandomRotation(20)]
    out += [transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]
    return transforms.Compose(out)

In [112]:
def get_dataloader(data_home, Y, ids, batch_size=16, shuffle=True, num_workers=4, augment=False):
    transform = transform_img_inputs(augment)
    r_grid = RemoteSensingSubgridDataset(data_home, Y, ids, transform=transform)
    return torch.utils.data.DataLoader(r_grid, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

## Model

In [113]:
class MultiTaskModel(nn.Module):
    def __init__(self):
        super(MultiTaskModel, self).__init__()
        #shared part
        self.resnet18 = models.resnet18(pretrained=False)
        num_ftrs = self.resnet18.fc.in_features
        self.resnet18.fc = nn.Identity()
        
        self.sampling = nn.ModuleList()
        self.sampling.add_module('UAR', nn.Linear(num_ftrs, len(tasks_UAR)))
        self.sampling.add_module('POP', nn.Linear(num_ftrs, len(tasks_POP)))

    def forward(self, X, sampling):
        # shared part
        resnet_output = self.resnet18(X)

        # sampling specific parts
        if sampling == 'UAR':
            return self.sampling.UAR(resnet_output)
        elif sampling == 'POP':
            return self.sampling.POP(resnet_output)

In [205]:
class MultiTaskModelTwo(nn.Module):
    def __init__(self):
        super(MultiTaskModelTwo, self).__init__()
        #shared part
        self.resnet18 = models.resnet18(pretrained=False)
        num_ftrs = self.resnet18.fc.in_features
        self.resnet18.fc = nn.Identity()
        num_hidden = int(num_ftrs / 2)
        
        self.sampling = nn.ModuleList()
        self.sampling.add_module('UAR', nn.Sequential(nn.Linear(num_ftrs, num_hidden), nn.ReLU(), nn.Dropout(p=0.2), nn.Linear(num_hidden, len(tasks_UAR))))
        self.sampling.add_module('POP', nn.Sequential(nn.Linear(num_ftrs, num_hidden), nn.ReLU(), nn.Dropout(p=0.2), nn.Linear(num_hidden, len(tasks_POP))))

    def forward(self, X, sampling):
        # shared part
        resnet_output = self.resnet18(X)

        # sampling specific parts
        if sampling == 'UAR':
            return self.sampling.UAR(resnet_output)
        elif sampling == 'POP':
            return self.sampling.POP(resnet_output)

In [114]:
class MultiTaskModelWrapper(nn.Module):
    def __init__(self):
        super(MultiTaskModelWrapper, self).__init__()
        self.log_vars = nn.Parameter(torch.zeros((len(tasks))))
        
    def forward(self, outputs, labels, criterion, sampling):
        mask = (torch.isnan(labels) == False)
        outputs = outputs * mask
        labels = torch.nan_to_num(labels, nan=0.0)
        if sampling == 'UAR':
            loss_treecover = criterion(outputs[:,0], labels[:, 0])
            precision_treecover = torch.exp(-self.log_vars[0])
            loss_treecover = precision_treecover * loss_treecover + self.log_vars[0]
            
            loss_elevation = criterion(outputs[:,1], labels[:, 1])
            precision_elevation = torch.exp(-self.log_vars[1])
            loss_elevation = precision_elevation * loss_elevation + self.log_vars[1]
            
            loss_population = criterion(outputs[:,2], labels[:, 2])
            precision_population = torch.exp(-self.log_vars[2])
            loss_population = precision_population * loss_population + self.log_vars[2]
            
            return loss_treecover + loss_elevation + loss_population
        elif sampling == "POP":
            loss_nightlights = criterion(outputs[:,0], labels[:, 0])
            precision_nightlights = torch.exp(-self.log_vars[3])
            loss_nightlights = precision_nightlights * loss_nightlights + self.log_vars[3]
            
            loss_income = criterion(outputs[:,1], labels[:, 1])
            precision_income = torch.exp(-self.log_vars[4])
            loss_income = precision_income * loss_income + self.log_vars[4]
            
            loss_roads = criterion(outputs[:,2], labels[:, 2])
            precision_roads = torch.exp(-self.log_vars[5])
            loss_roads = precision_roads * loss_roads + self.log_vars[5]
            
            loss_housing = criterion(outputs[:,3], labels[:, 3])
            precision_housing = torch.exp(-self.log_vars[6])
            loss_housing = precision_housing * loss_housing + self.log_vars[6]
            
            return loss_nightlights + loss_income + loss_roads + loss_housing

In [206]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_ft = MultiTaskModel().to(device)
model_2_ft = MultiTaskModelTwo().to(device)
loss_ft = MultiTaskModelWrapper().to(device)

In [207]:
print(summary(model_ft, input_size=(32, 3, 224, 224), sampling="POP"))
print(summary(loss_ft))
for name, param in model_ft.named_parameters():
    if param.requires_grad:
        print(name)
print(summary(model_2_ft, sampling="POP"))
for name, param in model_2_ft.named_parameters():
    if param.requires_grad:
        print(name)

Layer (type:depth-idx)                        Output Shape              Param #
MultiTaskModel                                --                        --
├─ModuleList: 1-1                             --                        --
├─ResNet: 1-2                                 [32, 512]                 --
│    └─Conv2d: 2-1                            [32, 64, 112, 112]        9,408
│    └─BatchNorm2d: 2-2                       [32, 64, 112, 112]        128
│    └─ReLU: 2-3                              [32, 64, 112, 112]        --
│    └─MaxPool2d: 2-4                         [32, 64, 56, 56]          --
│    └─Sequential: 2-5                        [32, 64, 56, 56]          --
│    │    └─BasicBlock: 3-1                   [32, 64, 56, 56]          73,984
│    │    └─BasicBlock: 3-2                   [32, 64, 56, 56]          73,984
│    └─Sequential: 2-6                        [32, 128, 28, 28]         --
│    │    └─BasicBlock: 3-3                   [32, 128, 28, 28]         230,144
│  

In [121]:
def train_model(
    model,
    loss_model,
    criterion,
    train_dataloaders,
    test_dataloaders,
    optimizer,
    scheduler,
    mean_UAR,
    std_UAR,
    mean_POP,
    std_POP,
    num_epochs=50,
    log_loc="./pytorch.logs",
    save_dir=Path(c.data_dir)/"int"/"deep_models",
):
    since = time.time()
    summary_writer = SummaryWriter(Path(log_loc)/"1234")
    global_step = 0
    
    preds = {}
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logger.debug("Using torch.device: {}".format(device))
    for epoch in range(num_epochs):
        logger.debug("Epoch {}/{}".format(epoch + 1, num_epochs))
        logger.debug("-" * 10)
        
        for phase in ["train", "test"]:
            if phase == "train":
                model.train()
            else:
                model.eval()

            all_labels = {sample : [] for sample in tasks}
            all_predictions = {sample : [] for sample in tasks}
            all_ids = {sample : [] for sample in tasks}

            counter = 0
            lr = optimizer.param_groups[0]["lr"]
            summary_writer.add_scalar(
                tag="learning_rate", scalar_value=lr, global_step=global_step
            )

            dataloaders = train_dataloaders if phase == "train" else test_dataloaders

            num_batches = len(dataloaders["UAR"])
            logger.debug("Total batches: {}".format(num_batches))
            end_time = time.time()
            debug_time = time.time()
            
            loss_sum = 0
            
            for data_UAR, data_POP in tqdm(tzip(dataloaders["UAR"], dataloaders["POP"]), total=num_batches):
                counter += 1
                global_step += 1
                
                ids_UAR, inputs_UAR, labels_UAR = data_UAR
                ids_POP, inputs_POP, labels_POP = data_POP

                for i in range(len(tasks_UAR)):
                    all_labels[tasks_UAR[i]] += list(labels_UAR.numpy()[:,i])
                    all_ids[tasks_UAR[i]] += list(ids_UAR)
                    
                for i in range(len(tasks_POP)):
                    all_labels[tasks_POP[i]] += list(labels_POP.numpy()[:,i])
                    all_ids[tasks_POP[i]] += list(ids_POP)

                inputs_UAR = inputs_UAR.float()
                labels_UAR = labels_UAR.float()
                inputs_UAR = inputs_UAR.to(device)
                labels_UAR = labels_UAR.to(device)
                
                inputs_POP = inputs_POP.float()
                labels_POP = labels_POP.float()
                inputs_POP = inputs_POP.to(device)
                labels_POP = labels_POP.to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == "train"):
                    outputs_UAR = model.forward(inputs_UAR, "UAR")
                    outputs_POP = model.forward(inputs_POP, "POP")

                    for i in range(len(tasks_UAR)):
                        all_predictions[tasks_UAR[i]] += list(outputs_UAR.detach().cpu().numpy()[:,i])

                    for i in range(len(tasks_POP)):
                        all_predictions[tasks_POP[i]] += list(outputs_POP.detach().cpu().numpy()[:,i])

                    reg_UAR = torch.norm(model.sampling.UAR.weight, p=1)
                    reg_POP = torch.norm(model.sampling.POP.weight, p=1) 
                        
                    loss_UAR = loss_model.forward(outputs_UAR, labels_UAR, criterion, "UAR")
                    loss_POP = loss_model.forward(outputs_POP, labels_POP, criterion, "POP")
                    loss = loss_UAR + loss_POP + (reg_UAR + reg_POP) * 0.1
                    
                    if phase == "train":
                        loss.backward()
                        optimizer.step()
                        summary_writer.add_scalar(
                            tag="train_loss",
                            scalar_value=loss.item(),
                            global_step=global_step,
                        )
                    else:
                        summary_writer.add_scalar(
                            tag="val_loss",
                            scalar_value=loss.item(),
                            global_step=global_step,
                        )
                loss_sum += loss.item()
                yeet = 100
                if counter % yeet == 0:
                    logger.debug("Time for {} batches: {}. Avg loss: {}".format(yeet, time.time() - debug_time, loss_sum / yeet))
                    loss_sum = 0
                    debug_time = time.time()

                    # Testin some stuff here
                    for i in range(len(tasks_UAR)):
                        samp = tasks_UAR[i]
                        # print(samp)
                        temp_labels = np.array(all_labels[samp])
                        # print(temp_labels[:10])
                        temp_pred = np.array(all_predictions[samp])
                        # print(temp_pred[:10])

                        mask = (np.isnan(temp_labels) == False)
                        # print(mask[:10])
                        temp_pred = temp_pred[mask]
                        temp_labels = temp_labels[mask]

                        temp_pred = temp_pred * std_UAR[i] + mean_UAR[i]
                        temp_labels = temp_labels * std_UAR[i] + mean_UAR[i]

                        r2_score = sklearn.metrics.r2_score(temp_labels, temp_pred)
                        logger.debug("Aggregate R2 for {0}: {1}".format(samp, r2_score))

                    for i in range(len(tasks_POP)):
                        samp = tasks_POP[i]
                        temp_labels = np.array(all_labels[samp])
                        temp_pred = np.array(all_predictions[samp])

                        mask = (np.isnan(temp_labels) == False)
                        temp_pred = temp_pred[mask]
                        temp_labels = temp_labels[mask]

                        temp_pred = temp_pred * std_POP[i] + mean_POP[i]
                        temp_labels = temp_labels * std_POP[i] + mean_POP[i]

                        r2_score = sklearn.metrics.r2_score(temp_labels, temp_pred)
                        logger.debug("Aggregate R2 for {0}: {1}".format(samp, r2_score))

            logger.debug(
                "Epoch {0} Phase {1} complete".format(epoch, phase)
            )
    
    time_elapsed = time.time() - since
    logger.debug(
        "Training complete in {:.0f}m {:.0f}s".format(
            time_elapsed // 60, time_elapsed % 60
        )
    )
    
    return model
            

In [208]:
def train_model_POP(
    model_uuid,
    model,
    loss_model,
    criterion,
    train_dataloaders,
    test_dataloaders,
    optimizer,
    scheduler,
    mean_POP,
    std_POP,
    num_epochs=50,
    log_loc="./pytorch.logs",
    save_dir=Path(c.data_dir)/"int"/"deep_models",
    use_loss_model = False
):
    since = time.time()
    summary_writer = SummaryWriter(Path(log_loc)/"1234POP")
    global_step = 0
    
    preds = {}
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logger.debug("Using torch.device: {}".format(device))
    logger.debug("model uuid: {}".format(model_uuid))
    
    task_num = len(tasks_POP)
    
    for epoch in range(num_epochs):
        logger.debug("Epoch {}/{}".format(epoch + 1, num_epochs))
        logger.debug("-" * 10)
        
        for phase in ["train", "test"]:
            if phase == "train":
                model.train()
            else:
                model.eval()

            all_labels = {sample : [] for sample in tasks}
            all_predictions = {sample : [] for sample in tasks}
            all_ids = {sample : [] for sample in tasks}

            counter = 0
            lr = optimizer.param_groups[0]["lr"]
            summary_writer.add_scalar(
                tag="learning_rate", scalar_value=lr, global_step=global_step
            )

            dataloaders = train_dataloaders if phase == "train" else test_dataloaders

            num_batches = len(dataloaders["POP"])
            logger.debug("Total batches: {}".format(num_batches))
            end_time = time.time()
            debug_time = time.time()
            
            loss_sum = 0
            
            for data_POP in tqdm(dataloaders["POP"]):
                counter += 1
                global_step += 1
                
                ids_POP, inputs_POP, labels_POP = data_POP
                    
                for i in range(len(tasks_POP)):
                    all_labels[tasks_POP[i]] += list(labels_POP.numpy()[:,i])
                    all_ids[tasks_POP[i]] += list(ids_POP)
                
                inputs_POP = inputs_POP.float()
                labels_POP = labels_POP.float()
                inputs_POP = inputs_POP.to(device)
                labels_POP = labels_POP.to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == "train"):
                    outputs_POP = model.forward(inputs_POP, "POP")

                    for i in range(task_num):
                        all_predictions[tasks_POP[i]] += list(outputs_POP.detach().cpu().numpy()[:,i])
                        
                    if not use_loss_model:
                        mask = (torch.isnan(labels_POP) == False)
                        outputs_POP = outputs_POP * mask
#                         print(f"output: {outputs_POP.shape}")
                        labels_POP = torch.nan_to_num(labels_POP, nan=0.0)
#                         print(f"labels: {labels_POP.shape}")
                        loss_POP = criterion(outputs_POP, labels_POP)
#                         print(f"loss: {loss_POP.shape}")
                        weight = F.softmax(torch.randn(task_num), dim=-1).to(device)
#                         print(f"weight: {weight.shape}")
#                         reg_POP = torch.norm(model.sampling.POP.weight, p=1)
                        reg_2_POP = torch.norm(model.sampling.POP[3].weight, p=1)
#                         print(f"reg: {reg_POP.shape}")
#                         loss = torch.sum(loss_POP*weight) + reg_POP * 0.1
                        loss = torch.sum(loss_POP*weight) + reg_2_POP * 0.1
#                         print(f"loss final: {loss.shape}")
                        
                    else:
                        reg_POP = torch.norm(model.sampling.POP.weight, p=1) 
                        loss_POP = loss_model.forward(outputs_POP, labels_POP, criterion, "POP")
                        loss = loss_POP + reg_POP * 0.1
                    
                    if phase == "train":
                        loss.backward()
                        optimizer.step()
                        summary_writer.add_scalar(
                            tag="train_loss",
                            scalar_value=loss.item(),
                            global_step=global_step,
                        )
                    else:
                        summary_writer.add_scalar(
                            tag="val_loss",
                            scalar_value=loss.item(),
                            global_step=global_step,
                        )
                loss_sum += loss.item()
                
            logger.debug("Time for {} batches: {}. Avg loss: {}".format(num_batches, time.time() - debug_time, loss_sum / num_batches))
            loss_sum = 0
            debug_time = time.time()

            # Testin some stuff here
            r2_scores = []
            for i in range(len(tasks_POP)):
                samp = tasks_POP[i]
                temp_labels = np.array(all_labels[samp])
                temp_pred = np.array(all_predictions[samp])

                mask = (np.isnan(temp_labels) == False)
                temp_pred = temp_pred[mask]
                temp_labels = temp_labels[mask]

                temp_pred = temp_pred * std_POP[i] + mean_POP[i]
                temp_labels = temp_labels * std_POP[i] + mean_POP[i]

                r2_score = sklearn.metrics.r2_score(temp_labels, temp_pred)
                r2_scores.append(r2_score)
                logger.debug("Aggregate R2 for {0}: {1}".format(samp, r2_score))
            
            r2_scores = np.array(r2_scores)
            
            preds[phase] = (all_labels, all_predictions, all_ids)

            bio = python_io.BytesIO()
            torch.save(model.state_dict(), bio)
            model_checkpoint = {}
            model_checkpoint["model_bytes"] = bio.getvalue()
            model_checkpoint["val_r2"] = r2_scores
            model_checkpoint["epoch"] = epoch
            model_checkpoint["preds"] = preds[phase]
            model_checkpoint["domain_names"] = tasks_POP

            if save_dir is not None:
                this_save_path = (
                    save_dir
                    / str(model_uuid)
                    / "checkpoints"
                    / phase
                    / f"epoch_{epoch}_POP.pickle"
                )
                this_save_path.parent.mkdir(exist_ok=True, parents=True)
                with open(this_save_path, "wb") as f:
                    pickle.dump(model_checkpoint, f, protocol=4)
                
            if phase == "test":
                scheduler.step()

            logger.debug(
                "Epoch {0} Phase {1} complete".format(epoch + 1, phase)
            )
    
    time_elapsed = time.time() - since
    logger.debug(
        "Training complete in {:.0f}m {:.0f}s".format(
            time_elapsed // 60, time_elapsed % 60
        )
    )
    
    return model
            

In [122]:
def run(
    ids_train_UAR,
    Y_train_UAR,
    ids_test_UAR,
    Y_test_UAR,
    ids_train_POP,
    Y_train_POP,
    ids_test_POP,
    Y_test_POP,
    model,
    data_home_UAR,
    data_home_POP,
    loss,
    num_epochs=25,
    initial_lr=0.001,
    log_loc="./pytorch.logs",
    save_dir=Path(c.data_dir)/"int"/"deep_models",
    batch_size=16,
):
    mean_UAR = np.nanmean(Y_train_UAR, axis=0)
    std_UAR = np.nanstd(Y_train_UAR, axis=0)
    Y_train_UAR = (Y_train_UAR - mean_UAR) / std_UAR
    Y_test_UAR = (Y_test_UAR - mean_UAR) / std_UAR
    
    mean_POP = np.nanmean(Y_train_POP, axis=0)
    std_POP = np.nanstd(Y_train_POP, axis=0)
    Y_train_POP = (Y_train_POP - mean_POP) / std_POP
    Y_test_POP = (Y_test_POP - mean_POP) / std_POP
    
    train_dataloaders = {}
    test_dataloaders = {}
    
    train_dataloaders["UAR"] = get_dataloader(
        data_home_UAR,
        Y_train_UAR,
        ids_train_UAR,
        batch_size=batch_size,
        augment=True)
    
    test_dataloaders["UAR"] = get_dataloader(
        data_home_UAR,
        Y_test_UAR,
        ids_test_UAR,
        batch_size=batch_size)
    
    train_dataloaders["POP"] = get_dataloader(
        data_home_POP,
        Y_train_POP,
        ids_train_POP,
        batch_size=batch_size,
        augment=True)
    
    test_dataloaders["POP"] = get_dataloader(
        data_home_POP,
        Y_test_POP,
        ids_test_POP,
        batch_size=batch_size)
    
    if loss == "mse":
        criterion = torch.nn.MSELoss()
    else:
        criterion = torch.nn.L1Loss()
        
    loss_model = MultiTaskModelWrapper()
        
    optimizer_ft = torch.optim.SGD(list(model.parameters()) + list(loss_model.parameters()), lr=initial_lr, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer_ft, milestones=[10], gamma=0.5)
    
    if torch.cuda.is_available():
        model.cuda()
    
    return train_model(
        model,
        loss_model,
        criterion,
        train_dataloaders,
        test_dataloaders,
        optimizer_ft,
        scheduler,
        mean_UAR,
        std_UAR,
        mean_POP,
        std_POP,
        num_epochs=num_epochs,
        log_loc=log_loc,
        save_dir=save_dir,
    )

In [194]:
def run_POP(
    model_uuid,
    ids_train_POP,
    Y_train_POP,
    ids_test_POP,
    Y_test_POP,
    model,
    data_home_POP,
    loss,
    num_epochs=50,
    initial_lr=0.001,
    log_loc="./pytorch.logs",
    save_dir=Path(c.data_dir)/"int"/"deep_models",
    batch_size=16,
):
    mean_POP = np.nanmean(Y_train_POP, axis=0)
    std_POP = np.nanstd(Y_train_POP, axis=0)
    Y_train_POP = (Y_train_POP - mean_POP) / std_POP
    Y_test_POP = (Y_test_POP - mean_POP) / std_POP
    
    train_dataloaders = {}
    test_dataloaders = {}
    
    train_dataloaders["POP"] = get_dataloader(
        data_home_POP,
        Y_train_POP,
        ids_train_POP,
        batch_size=batch_size,
        augment=True)
    
    test_dataloaders["POP"] = get_dataloader(
        data_home_POP,
        Y_test_POP,
        ids_test_POP,
        batch_size=batch_size)
    
    if loss == "mse":
        criterion = torch.nn.MSELoss(reduction='none')
    else:
        criterion = torch.nn.L1Loss()
        
    loss_model = MultiTaskModelWrapper()
        
    optimizer_ft = torch.optim.SGD(list(model.parameters()) + list(loss_model.parameters()), lr=initial_lr, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer_ft, milestones=[10], gamma=0.5)
    
    if torch.cuda.is_available():
        model.cuda()
    
    return train_model_POP(
        model_uuid,
        model,
        loss_model,
        criterion,
        train_dataloaders,
        test_dataloaders,
        optimizer_ft,
        scheduler,
        mean_POP,
        std_POP,
        num_epochs=num_epochs,
        log_loc=log_loc,
        save_dir=save_dir,
    )

In [209]:
trained_model = run(ids_train_UAR,
    Y_train_UAR,
    ids_test_UAR,
    Y_test_UAR,
    ids_train_POP,
    Y_train_POP,
    ids_test_POP,
    Y_test_POP,
    MultiTaskModel(),
    data_home_UAR,
    data_home_POP,
    'mse',
    num_epochs=25)

2022-05-26 08:42:39.705 | DEBUG    | __main__:train_model:23 - Using torch.device: cuda:0
2022-05-26 08:42:39.706 | DEBUG    | __main__:train_model:25 - Epoch 1/25
2022-05-26 08:42:39.706 | DEBUG    | __main__:train_model:26 - ----------
2022-05-26 08:42:39.708 | DEBUG    | __main__:train_model:47 - Total batches: 5000
  0%|                                                                                                                               | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|                                                                                                                               | 0/5000 [00:02<?, ?it/s]


KeyboardInterrupt: 

In [None]:
model_uuid = uuid.uuid4()
trained_model_POP = run_POP(
    model_uuid,
    ids_train_POP,
    Y_train_POP,
    ids_test_POP,
    Y_test_POP,
    MultiTaskModelTwo(),
    data_home_POP,
    'mse',
    num_epochs=50)

2022-05-26 08:42:44.353 | DEBUG    | __main__:train_model_POP:23 - Using torch.device: cuda:0
2022-05-26 08:42:44.354 | DEBUG    | __main__:train_model_POP:24 - model uuid: 100b5f83-e18a-47fc-8cef-14c1ea992e36
2022-05-26 08:42:44.355 | DEBUG    | __main__:train_model_POP:29 - Epoch 1/50
2022-05-26 08:42:44.356 | DEBUG    | __main__:train_model_POP:30 - ----------
2022-05-26 08:42:44.358 | DEBUG    | __main__:train_model_POP:51 - Total batches: 5000
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [15:48<00:00,  5.27it/s]
2022-05-26 08:58:32.814 | DEBUG    | __main__:train_model_POP:118 - Time for 5000 batches: 948.4551112651825. Avg loss: 11.71222709441185
2022-05-26 08:58:32.834 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for nightlights: 0.441328720127262
2022-05-26 08:58:32.852 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for income: -0.002148097491162959
2022-05-26 08:58:32.

2022-05-26 10:00:10.264 | DEBUG    | __main__:train_model_POP:168 - Epoch 4 Phase test complete
2022-05-26 10:00:10.265 | DEBUG    | __main__:train_model_POP:29 - Epoch 5/50
2022-05-26 10:00:10.266 | DEBUG    | __main__:train_model_POP:30 - ----------
2022-05-26 10:00:10.267 | DEBUG    | __main__:train_model_POP:51 - Total batches: 5000
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [15:49<00:00,  5.27it/s]
2022-05-26 10:15:59.765 | DEBUG    | __main__:train_model_POP:118 - Time for 5000 batches: 949.4961717128754. Avg loss: 10.857405980825424
2022-05-26 10:15:59.785 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for nightlights: 0.5433707800175563
2022-05-26 10:15:59.802 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for income: 0.07503911667573482
2022-05-26 10:15:59.818 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for roads: 0.18181730839688193
2022-05-26 10:15:59.83

2022-05-26 11:17:29.153 | DEBUG    | __main__:train_model_POP:29 - Epoch 9/50
2022-05-26 11:17:29.154 | DEBUG    | __main__:train_model_POP:30 - ----------
2022-05-26 11:17:29.156 | DEBUG    | __main__:train_model_POP:51 - Total batches: 5000
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [15:50<00:00,  5.26it/s]
2022-05-26 11:33:19.930 | DEBUG    | __main__:train_model_POP:118 - Time for 5000 batches: 950.7729995250702. Avg loss: 10.734818002128602
2022-05-26 11:33:19.951 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for nightlights: 0.5443408534857533
2022-05-26 11:33:19.971 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for income: 0.09927401077332987
2022-05-26 11:33:19.991 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for roads: 0.18384569977907317
2022-05-26 11:33:20.011 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for housing: 0.06889958567226684
2022

2022-05-26 12:34:55.181 | DEBUG    | __main__:train_model_POP:30 - ----------
2022-05-26 12:34:55.183 | DEBUG    | __main__:train_model_POP:51 - Total batches: 5000
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [15:49<00:00,  5.26it/s]
2022-05-26 12:50:45.017 | DEBUG    | __main__:train_model_POP:118 - Time for 5000 batches: 949.8326027393341. Avg loss: 10.187632575416565
2022-05-26 12:50:45.035 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for nightlights: 0.5638359959593466
2022-05-26 12:50:45.052 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for income: 0.1483532910583758
2022-05-26 12:50:45.071 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for roads: 0.2008479040491129
2022-05-26 12:50:45.088 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for housing: 0.11939812208096034
2022-05-26 12:50:46.930 | DEBUG    | __main__:train_model_POP:168 - Epoch 13 Phase t

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [15:52<00:00,  5.25it/s]
2022-05-26 14:08:18.565 | DEBUG    | __main__:train_model_POP:118 - Time for 5000 batches: 952.104722738266. Avg loss: 10.125953396081924
2022-05-26 14:08:18.585 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for nightlights: 0.5706864708751189
2022-05-26 14:08:18.602 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for income: 0.1538489742382564
2022-05-26 14:08:18.618 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for roads: 0.20819643392819764
2022-05-26 14:08:18.635 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for housing: 0.12479318765534353
2022-05-26 14:08:20.437 | DEBUG    | __main__:train_model_POP:168 - Epoch 17 Phase train complete
2022-05-26 14:08:20.439 | DEBUG    | __main__:train_model_POP:51 - Total batches: 1250
100%|███████████████████████████████████████████████████████████

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [15:54<00:00,  5.24it/s]
2022-05-26 15:25:55.514 | DEBUG    | __main__:train_model_POP:118 - Time for 5000 batches: 954.1262092590332. Avg loss: 10.01568753399849
2022-05-26 15:25:55.534 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for nightlights: 0.5686541022124891
2022-05-26 15:25:55.553 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for income: 0.16327176646148822
2022-05-26 15:25:55.570 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for roads: 0.20437989662358302
2022-05-26 15:25:55.589 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for housing: 0.1340051041175616
2022-05-26 15:25:57.436 | DEBUG    | __main__:train_model_POP:168 - Epoch 21 Phase train complete
2022-05-26 15:25:57.437 | DEBUG    | __main__:train_model_POP:51 - Total batches: 1250
100%|███████████████████████████████████████████████████████████

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [15:52<00:00,  5.25it/s]
2022-05-26 16:43:29.166 | DEBUG    | __main__:train_model_POP:118 - Time for 5000 batches: 952.6131336688995. Avg loss: 10.03398869857788
2022-05-26 16:43:29.188 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for nightlights: 0.5743995247926896
2022-05-26 16:43:29.207 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for income: 0.17114439075403598
2022-05-26 16:43:29.227 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for roads: 0.20873486330805846
2022-05-26 16:43:29.248 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for housing: 0.15818969958450246
2022-05-26 16:43:31.169 | DEBUG    | __main__:train_model_POP:168 - Epoch 25 Phase train complete
2022-05-26 16:43:31.171 | DEBUG    | __main__:train_model_POP:51 - Total batches: 1250
100%|██████████████████████████████████████████████████████████

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [15:52<00:00,  5.25it/s]
2022-05-26 18:01:04.243 | DEBUG    | __main__:train_model_POP:118 - Time for 5000 batches: 952.5454728603363. Avg loss: 9.942668542718888
2022-05-26 18:01:04.263 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for nightlights: 0.5758922396903252
2022-05-26 18:01:04.281 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for income: 0.17976759751205074
2022-05-26 18:01:04.298 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for roads: 0.22134803700396533
2022-05-26 18:01:04.316 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for housing: 0.16688128201478447
2022-05-26 18:01:06.152 | DEBUG    | __main__:train_model_POP:168 - Epoch 29 Phase train complete
2022-05-26 18:01:06.154 | DEBUG    | __main__:train_model_POP:51 - Total batches: 1250
100%|██████████████████████████████████████████████████████████

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [15:50<00:00,  5.26it/s]
2022-05-26 19:18:31.499 | DEBUG    | __main__:train_model_POP:118 - Time for 5000 batches: 950.6912062168121. Avg loss: 9.806932395887374
2022-05-26 19:18:31.522 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for nightlights: 0.5880681523640847
2022-05-26 19:18:31.543 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for income: 0.1734016886084584
2022-05-26 19:18:31.564 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for roads: 0.22580293793037354
2022-05-26 19:18:31.585 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for housing: 0.1864382202415571
2022-05-26 19:18:33.496 | DEBUG    | __main__:train_model_POP:168 - Epoch 33 Phase train complete
2022-05-26 19:18:33.497 | DEBUG    | __main__:train_model_POP:51 - Total batches: 1250
100%|████████████████████████████████████████████████████████████

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [15:51<00:00,  5.25it/s]
2022-05-26 20:35:56.920 | DEBUG    | __main__:train_model_POP:118 - Time for 5000 batches: 951.8877396583557. Avg loss: 9.650676646280289
2022-05-26 20:35:56.940 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for nightlights: 0.5962076759456973
2022-05-26 20:35:56.958 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for income: 0.18896941742244855
2022-05-26 20:35:56.974 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for roads: 0.2378899246786096
2022-05-26 20:35:56.990 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for housing: 0.19689914001490672
2022-05-26 20:35:58.859 | DEBUG    | __main__:train_model_POP:168 - Epoch 37 Phase train complete
2022-05-26 20:35:58.861 | DEBUG    | __main__:train_model_POP:51 - Total batches: 1250
100%|███████████████████████████████████████████████████████████

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [15:51<00:00,  5.25it/s]
2022-05-26 21:53:23.475 | DEBUG    | __main__:train_model_POP:118 - Time for 5000 batches: 951.657045841217. Avg loss: 9.67161221113205
2022-05-26 21:53:23.494 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for nightlights: 0.5765196370593665
2022-05-26 21:53:23.513 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for income: 0.19816995321754838
2022-05-26 21:53:23.529 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for roads: 0.24020066133619278
2022-05-26 21:53:23.546 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for housing: 0.2192686897961078
2022-05-26 21:53:25.425 | DEBUG    | __main__:train_model_POP:168 - Epoch 41 Phase train complete
2022-05-26 21:53:25.427 | DEBUG    | __main__:train_model_POP:51 - Total batches: 1250
100%|█████████████████████████████████████████████████████████████

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [15:52<00:00,  5.25it/s]
2022-05-26 23:10:54.437 | DEBUG    | __main__:train_model_POP:118 - Time for 5000 batches: 952.6325891017914. Avg loss: 9.56696378698349
2022-05-26 23:10:54.457 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for nightlights: 0.59614227691977
2022-05-26 23:10:54.475 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for income: 0.19573453877551394
2022-05-26 23:10:54.492 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for roads: 0.249931527078756
2022-05-26 23:10:54.510 | DEBUG    | __main__:train_model_POP:138 - Aggregate R2 for housing: 0.2191161523145717
2022-05-26 23:10:56.345 | DEBUG    | __main__:train_model_POP:168 - Epoch 45 Phase train complete
2022-05-26 23:10:56.347 | DEBUG    | __main__:train_model_POP:51 - Total batches: 1250
100%|████████████████████████████████████████████████████████████████

In [None]:
# try save params
model_param_path = c.code_dir + "/cs230/temp_trained_pop_params.pt"
torch.save(trained_model_POP.state_dict(), model_param_path)

# now try load
loaded_model_POP = MultiTaskModel()
loaded_model_POP.load_state_dict(torch.load(model_param_path))

In [None]:
summary(trained_model_POP)
summary(loaded_model_POP)

NameError: name 'all_labels' is not defined