In [1]:
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
try:
    from torchvision import transforms, utils
except:
    !conda install --yes torchvision --no-channel-priority
    from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
try:
    from torchvision import transforms, utils
except:
    !pip install torchvision
    from torchvision import transforms, utils
from PIL import Image

import warnings
warnings.filterwarnings("ignore")

In [2]:
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 50

In [3]:
ladiDataImport = pd.read_csv("http://ladi.s3-us-west-2.amazonaws.com/Labels/ladi_aggregated_responses_url.tsv",delimiter='\t',header='infer')


In [4]:
# Strip off bracket and comma from the Answer catagory
ladiDataImport["Answer"] = ladiDataImport["Answer"].str.strip('[|]')
# split list of responses into multiple rows
ladiDataImport["Answer"] = ladiDataImport["Answer"].str.split(",",expand = True)
# remove the single quote character from either end of string
ladiDataImport["Answer"] = ladiDataImport["Answer"].str.strip('\'')
# add a column to help with aggregation when pivoting
ladiDataImport["response_count"] = 1
# Create a matrix with the number of workers who answered given label for given image
# using pivot table; filling in nan values with 0
label_matrix = ladiDataImport.pivot_table(values='response_count', 
                                          index='url', 
                                          columns='Answer', 
                                          aggfunc='sum',
                                          fill_value=0)

In [5]:
#labels = ['infrastructure:bridge','infrastructure:building','infrastructure:dam-levee','infrastructure:none','infrastructure:railway','infrastructure:road','infrastructure:utility-line']
#labelsWithoutNone = ['infrastructure:bridge','infrastructure:building','infrastructure:dam-levee','infrastructure:railway','infrastructure:road','infrastructure:utility-line']
#communication tower,no pipe,no water tower
labels = ['infrastructure:bridge','infrastructure:building','infrastructure:none','infrastructure:road']
labelsWithoutNone = ['infrastructure:bridge','infrastructure:building','infrastructure:road']
#communication tower,no pipe,no water tower

def filterRows(row):

    arr = [row[x] for x in labels]
    return arr.count(0)

def proc(row): 
    arr = [row[x] for x in labelsWithoutNone]
    if row['infrastructure:none'] > max(arr):
#         print('true')
#         print(row)
        for x in labelsWithoutNone:
            row[x] = 0
        row['infrastructure:none'] = 1
    else:
#         print('false')
#         print(row)
        row['infrastructure:none'] = 0
        for x in labelsWithoutNone:
            if row[x] > 0: row[x] = 1
    return row
    

infrastructure_matrix = label_matrix[labels]
infrastructure_matrix = infrastructure_matrix[infrastructure_matrix.apply(filterRows,axis=1) != len(labels)]
infrastructure_matrix = infrastructure_matrix.apply(proc,axis =1)
# m= label_matrix[['infrastructure:bridge','infrastructure:building','infrastructure:dam-levee','infrastructure:none','infrastructure:railway','infrastructure:road','infrastructure:utility-line']]
# m = m[m.apply(filterRows,axis=1) != len(labels)]
# m.head(1000)
# infrastructure_matrix = infrastructure_matrix[infrastructure_matrix.apply(filterRows,axis=1) != 6]
# for i in labels:
#     print("")
#     print(i)
#     print(infrastructure_matrix[infrastructure_matrix[i] != 0].count())
#     print("")
# #1982 bridges
# #6294 buildings
# #65 comm towers 
# #137 dam-levees 
# #3325 none
# #101 pipe
# #108 railway
# #1609 road
# #192 utility line
# #75 water tower

In [6]:
sample_size=1200
samples = infrastructure_matrix[infrastructure_matrix['infrastructure:bridge'] == 1].sample(sample_size)
for col in infrastructure_matrix.columns[1:]:
    samples = samples.append(infrastructure_matrix[infrastructure_matrix[col] == 1].sample(sample_size))
samples = samples.reset_index().drop_duplicates(subset='url', keep='first').set_index('url')


In [7]:
metadata_csv = 'infrastructure_sample_metadata.csv'
label_csv = 'infrastructure_sample_label.csv'

In [8]:
# Load ladi_images_metadata.csv
metadata = pd.read_csv('http://ladi.s3-us-west-2.amazonaws.com/Labels/ladi_images_metadata.csv')

# sampling
# sample_size=1000
# flood_sample = flood_examples.sample(sample_size)
# non_flood_sample = non_flood_examples.sample(sample_size)

# creating a df with True/False labels for flooding
# training_flood = pd.DataFrame(index=flood_sample.index, data={'label':True}).reset_index()
# training_non_flood = pd.DataFrame(index=non_flood_sample.index, data={'label':False}).reset_index()
# label_df = pd.concat([training_flood, training_non_flood], ignore_index=True)

samples.to_csv(label_csv)

# create list of urls to download
samples.reset_index()['url'].to_csv('infrastructure_urls_to_download.csv', index=False, header=False)

#get metada
infrastructure_metadata = metadata[metadata['url'].isin(samples.index)]
# flood_metadata = metadata[metadata['url'].isin(flood_sample.index)]
# not_flood_metadata = metadata[metadata['url'].isin(non_flood_sample.index)]
# training_metadata = pd.concat([flood_metadata, not_flood_metadata], ignore_index=True)

infrastructure_metadata.to_csv(metadata_csv)

In [9]:
!mkdir -p training_images
!wget --content-disposition --trust-server-names -i infrastructure__to_download.csv -P training_images/

--2021-07-31 17:14:05--  https://ladi.s3-us-west-2.amazonaws.com/Images/FEMA_CAP/9099/615182/DSC_1098_928ca14d-8a78-46f2-8268-ade3d7b4644a.jpg
Resolving ladi.s3-us-west-2.amazonaws.com (ladi.s3-us-west-2.amazonaws.com)... 52.218.224.1
Connecting to ladi.s3-us-west-2.amazonaws.com (ladi.s3-us-west-2.amazonaws.com)|52.218.224.1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7437762 (7.1M) [image/jpeg]
Saving to: ‘training_images/DSC_1098_928ca14d-8a78-46f2-8268-ade3d7b4644a.jpg’


2021-07-31 17:14:06 (15.0 MB/s) - ‘training_images/DSC_1098_928ca14d-8a78-46f2-8268-ade3d7b4644a.jpg’ saved [7437762/7437762]

--2021-07-31 17:14:06--  https://ladi.s3-us-west-2.amazonaws.com/Images/FEMA_CAP/9161/616222/FEMA_REG_5d8e4494-f1fc-4aac-80ba-4f47f3ba7f04.jpg
Reusing existing connection to ladi.s3-us-west-2.amazonaws.com:443.
HTTP request sent, awaiting response... 200 OK
Length: 5855795 (5.6M) [image/jpeg]
Saving to: ‘training_images/FEMA_REG_5d8e4494-f1fc-4aac-80ba-4f47f3b

In [9]:
scale = transforms.Resize(768)
crop = transforms.RandomCrop(512)
rotate = transforms.RandomRotation(20)
flip_demo = transforms.RandomHorizontalFlip(1) # flip with 100% chance just to demo
flip = transforms.RandomHorizontalFlip(p=0.5)
composed = transforms.Compose([scale,
                               crop,
                               rotate,
                               flip_demo])

In [10]:
# convenient function for showing the images
def show_image(image):
    plt.imshow(image)
    # pause a bit so that plots are updated
    plt.pause(0.01)

def convert_url_to_local_path(url):
    '''
    gets the location of the downloaded image
    '''
    return 'training_images/'+url.split('/')[-1]

class InfrastructureSampleDataset(Dataset):

    def __init__(self, metadata_csv, label_csv, transform = None):
        """
        Args:
            metadata_csv (string): Path to the csv file with metadata.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.infrastructure_sample_metadata = pd.read_csv(metadata_csv)
        # get the path in the shared directory
        self.infrastructure_sample_metadata['local_path'] = self.infrastructure_sample_metadata['url'].apply(convert_url_to_local_path)
        self.infrastructure_sample_label = pd.read_csv(label_csv)
        self.infrastructure_sample_data = pd.merge(self.infrastructure_sample_metadata, 
                                        self.infrastructure_sample_label,
                                       on="url")
        self.transform = transform
        
    def __len__(self):
        return len(self.infrastructure_sample_metadata)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        ## Load images from local directory. There is no need to redownload images to local machine. ##
        local_path = self.infrastructure_sample_metadata.iloc[idx]['local_path']
        url = self.infrastructure_sample_metadata.iloc[idx]['url']
        try:
            image = Image.fromarray(io.imread(local_path))
            img_name = local_path
        except:
            image = Image.fromarray(io.imread(url))
            img_name = url
        uuid = self.infrastructure_sample_data.iloc[idx, 1]
        timestamp = self.infrastructure_sample_data.iloc[idx, 2]
        gps_lat = self.infrastructure_sample_data.iloc[idx, 3]
        gps_lon = self.infrastructure_sample_data.iloc[idx, 4]
        gps_alt = self.infrastructure_sample_data.iloc[idx, 5]
        file_size = self.infrastructure_sample_data.iloc[idx, 6]
        width = self.infrastructure_sample_data.iloc[idx, 7]
        height = self.infrastructure_sample_data.iloc[idx, 8]
        label = torch.tensor(self.infrastructure_sample_data.iloc[idx, -4:].tolist(), dtype=torch.float)
        
        if self.transform:
            image = self.transform(image)

        sample = {'image': image, 
                  'image_name': img_name, 
                  'infrastructure_labels': label, 
                  'uuid': uuid, 
                  'timestamp': timestamp, 
                  'gps_lat': gps_lat, 
                  'gps_lon': gps_lon, 
                  'gps_alt': gps_alt, 
                  'orig_file_size': file_size, 
                  'orig_width': width, 
                  'orig_height': height}

        return sample
    def __str__(self):
        return self.infrastructure_sample_data.to_string()
    def get_labels(self):
        return torch.tensor(self.infrastructure_sample_data.iloc[:, -4:].values)
    def dataset(self):
        return self.infrastructure_sample_data
    def dataset_type(self):
        return type(self.infrastructure_sample_data)
    def get_columns(self):
        return self.infrastructure_sample_data.columns

In [11]:
infrastructure_sample_dataset = InfrastructureSampleDataset(metadata_csv = metadata_csv, label_csv = label_csv)

In [12]:
transformed_dataset = InfrastructureSampleDataset(metadata_csv = metadata_csv, 
                                       label_csv = label_csv, 
                                       transform = transforms.Compose([scale, 
                                                                       crop, 
                                                                       rotate, 
                                                                       flip, 
                                                                       transforms.ToTensor()]
                                                                     )
                                      )

In [13]:
# dataloader = DataLoader(transformed_dataset, batch_size=4, shuffle=True, num_workers=4)

In [14]:
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader

batch_size = 16
test_split_ratio = .1
shuffle_dataset = True
random_seed = 42
# num_workers = 1

# Creating data indices for training and validation splits:
dataset_size = len(transformed_dataset)
indices = list(range(dataset_size))
split = int(np.floor(test_split_ratio * dataset_size))
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, test_indices = indices[split:], indices[:split]

# Creating data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)

train_loader = torch.utils.data.DataLoader(transformed_dataset, batch_size=batch_size,
                                           sampler=train_sampler)
test_loader = torch.utils.data.DataLoader(transformed_dataset, batch_size=batch_size,
                                                sampler=test_sampler)

In [15]:
import torch.nn as nn
import torch.nn.functional as F
try:
    from cnn_finetune import make_model
except:
    !pip install cnn-finetune
    from cnn_finetune import make_model

In [16]:
net = make_model('resnet50', num_classes=4, pretrained=True).cuda()

In [17]:
def pos_weights(labels):
    pos = labels.sum(axis=0)
    neg = torch.ones(labels.shape[-1])*labels.shape[0] - pos
    return (neg / pos).cuda()
print(pos_weights(transformed_dataset.get_labels()))

tensor([2.2655, 0.8917, 2.6083, 2.5726], device='cuda:0')


In [18]:
import torch.optim as optim

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weights(transformed_dataset.get_labels()))
optimizer = optim.Adam(net.parameters(), lr=1e-4)

In [None]:
def get_checkpoint_path(epoch):
    return f'epoch_checkpoints/infrastructure_checkpoint_epoch{epoch}.pth'

torch.backends.cudnn.benchmark = True # flag for some GPU optimizations
starting_epoch = 1
additional_epochs = 5
net.train()
if starting_epoch > 1:
    net.load_state_dict(torch.load(get_checkpoint_path(starting_epoch-1)))
for epoch in range(starting_epoch, starting_epoch+additional_epochs):  # loop over the dataset multiple times
    running_loss, running_acc = 0.0, torch.tensor([0.0, 0.0, 0.0, 0.0]).cuda()
    print("start")
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
#         print(data['image'].shape)
#         print(data['damage_labels'])
#         print(data['damage_labels'].shape)
        inputs = data['image'].cuda()
        
#         labels = zip(data['damage:flood/water'], data['damage:rubble'], data['damage:smoke/fire'], data['damage:none']).cuda()
#         labels = data[['damage:flood/water','damage:rubble','damage:smoke/fire','damage:none']].cuda()
        labels = data['infrastructure_labels'].cuda()
        
#         labels = t.cuda()
        # casting int to long for loss calculation#
#         labels = labels.float()

        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
#         outputs = torch.sigmoid(net(inputs))
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # print statistics
        
        running_loss += loss.item()
        
        pred = (torch.sigmoid(outputs) > 0.5).float()
        correct = (pred == labels).float().sum(axis=0)
        running_acc += correct
        print_freq = 1
        if (i+1) % print_freq == 0:  
            print(f'[Epoch {epoch}/{additional_epochs}, Batch {i+1}]    Loss: {running_loss/print_freq}    Binary Accuracies: {running_acc/(len(labels)*print_freq)}    Combined Accuracy: {(running_acc/(len(labels)*print_freq)).mean()}')
            running_loss, running_acc = 0.0, torch.tensor([0.0, 0.0, 0.0, 0.0]).cuda()
    # save the model
    PATH = get_checkpoint_path(epoch)
    torch.save(net.state_dict(), PATH)
print('Finished Training')

start
[Epoch 1/5, Batch 1]    Loss: 0.8708738684654236    Binary Accuracies: tensor([0.6875, 0.6250, 0.6875, 0.6250], device='cuda:0')    Combined Accuracy: 0.65625
[Epoch 1/5, Batch 2]    Loss: 0.7518773078918457    Binary Accuracies: tensor([0.8125, 0.7500, 0.8125, 0.6250], device='cuda:0')    Combined Accuracy: 0.75
[Epoch 1/5, Batch 3]    Loss: 0.8640252947807312    Binary Accuracies: tensor([0.5625, 0.7500, 0.6875, 0.5000], device='cuda:0')    Combined Accuracy: 0.625
[Epoch 1/5, Batch 4]    Loss: 1.0157722234725952    Binary Accuracies: tensor([0.3750, 0.5625, 0.5625, 0.4375], device='cuda:0')    Combined Accuracy: 0.484375
[Epoch 1/5, Batch 5]    Loss: 0.716407299041748    Binary Accuracies: tensor([0.6875, 0.8125, 0.8125, 0.6875], device='cuda:0')    Combined Accuracy: 0.75
[Epoch 1/5, Batch 6]    Loss: 0.7304362058639526    Binary Accuracies: tensor([0.7500, 0.6875, 0.8125, 0.5000], device='cuda:0')    Combined Accuracy: 0.6875
[Epoch 1/5, Batch 7]    Loss: 0.8558260202407837 

In [20]:
net.load_state_dict(torch.load(PATH))

outputs = net(images.cuda())
_, predicted = torch.max(outputs, 1)

print('Predicted: ', ' '.join('%5s' % predicted[j].cpu()
                              for j in range(batch_size)))

NameError: name 'images' is not defined

In [None]:
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        images = data['image'].cuda()
        labels = data['damage:flood/water'].cuda()
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the test images: %d %%' % (
    100 * correct / total))

In [None]:
truth_labels = []
predicted_labels = []
with torch.no_grad():
    for data in test_loader:
        images = data['image'].cuda()
        labels = data['damage:flood/water'].cuda()
        outputs = net(images)
        _, predicted = torch.max(outputs, 1)
        truth_labels.append(labels.cpu())
        predicted_labels.append(predicted.cpu())
truth_labels = np.concatenate([x.numpy() for x in truth_labels])
predicted_labels = np.concatenate([x.numpy() for x in predicted_labels])

In [None]:
import sklearn.metrics
confusion_matrix = sklearn.metrics.confusion_matrix(truth_labels, predicted_labels)
disp = sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix, ['flood','no flood'])
disp.plot()