In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.transforms import ToTensor
import torchvision.models as models
from PIL import Image
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sb

from functools import partial
from collections import OrderedDict

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import time

hidden_width = 32
hidden_nblocks = 4
train_max_epoch = 15

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
L2_param = 1e-4

In [2]:
def preprocess_df(df):
    # convert timecodes to year and month columns
    datetimes = pd.to_datetime(df['time'])
    df['month'] = datetimes.dt.month
    df['year'] = datetimes.dt.year

    df['month_cyclic'] = 7 - abs(df['month'] - 7)

df = pd.read_csv('data_stephen_fix_header.csv', header=[0])
preprocess_df(df)

In [3]:
df.head()

Unnamed: 0,latitude,longitude,time,borehole,depth,frozen,cryostructures,visible_ice,ASTM_2488,materials,organic_cover,top_of_interval,bottom_of_interval,month,year,month_cyclic
0,69.16162,-133.08682,2012-03-21T00:00:00Z,0170-1-10,0.15,0,,,TOPSOIL,Organics,0.3,0.0,0.3,3,2012,3
1,69.16162,-133.08682,2012-03-21T00:00:00Z,0170-1-10,0.85,1,,Pure ice,ICE,Ice,0.3,0.3,1.4,3,2012,3
2,69.16162,-133.08682,2012-03-21T00:00:00Z,0170-1-10,1.9,1,Nf,No visible ice,SW-SM,Coarse till,0.3,1.4,2.4,3,2012,3
3,69.16162,-133.08682,2012-03-21T00:00:00Z,0170-1-10,5.4,1,Nf,No visible ice,GW-GM,Coarse till,0.3,2.4,8.4,3,2012,3
4,69.16105,-133.0888,2012-03-21T00:00:00Z,0170-1-12,1.2,1,Nf,No visible ice,GP-GM,Coarse till,0.0,0.0,2.4,3,2012,3


In [4]:
df_cp = df.filter(['depth', 'month_cyclic', 'latitude', 'longitude', 'year'])
df_cp.head()

Unnamed: 0,depth,month_cyclic,latitude,longitude,year
0,0.15,3,69.16162,-133.08682,2012
1,0.85,3,69.16162,-133.08682,2012
2,1.9,3,69.16162,-133.08682,2012
3,5.4,3,69.16162,-133.08682,2012
4,1.2,3,69.16105,-133.0888,2012


In [5]:
from torch.utils.data import Dataset
class Geo90Dataset(Dataset):
    def __init__(self, data_root, df, base_lat, base_lng, chip_size=32, label_name = 'frozen'):
        
        self.base_lat = base_lat
        self.base_lng = base_lng
        
        self.df = df
        
        self.chip_size = chip_size
        self.label_name = label_name
        
        self.trans = transforms.ToTensor()
        
        self.n_channels = len(os.listdir(data_root))
        self.preloaded = torch.zeros(self.n_channels, 6000, 6000)
        
        for i, file in enumerate(os.listdir(data_root)):
            # name = file.split('_')[0]
            # print(name)
            self.preloaded[i] = self.trans(Image.open(data_root + os.path.sep + file))
        
        print('Dataset initialized')
        
    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        bh_id = row.at['borehole']
        lat = row.at['latitude']
        lng = row.at['longitude']
        

        pixel_len = 5/6000
        

        lat_index_start = np.round((self.base_lat - lat) / pixel_len - self.chip_size/2).astype(int)
        lat_index_end = lat_index_start + self.chip_size
        
        lng_index_start = np.round((lng - self.base_lng) / pixel_len - self.chip_size/2).astype(int)
        lng_index_end = lng_index_start + self.chip_size
        
        image = self.preloaded[:, lat_index_start:lat_index_end,lng_index_start:lng_index_end]
        
        # surface = torch.tensor(row.filter(['depth'])).float()
        surface = torch.tensor(row.filter(['depth', 'month_cyclic', 'latitude', 'longitude', 'year'])).float()
        
        frozen = torch.tensor(row.at['frozen']).float()
        
        # bh_id = row.at['borehole']
        
        return {'image': image, 'surface_data': surface, 'frozen': frozen}#, 'bh_id': bh_id}

In [6]:
base_lat = 70
base_lng = -135

full_dataset = Geo90Dataset("geomorph_data", df, base_lat, base_lng, chip_size = 32)

Dataset initialized


In [37]:
# get rid of -9999 values
image = full_dataset[0]['image']
n_channels = list(image.shape)[0]

for data in full_dataset:
    image = data['image']
    for i in range(n_channels):
        channel = image[i]
        ind = (channel == -9999)
#         mean_val = torch.mean(channel[~ind])
        channel[ind] = 0
        data['image'][i] = channel
        
        

In [9]:
batchsize = 20
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size

train_data, test_data = torch.utils.data.random_split(full_dataset, [train_size, test_size], generator=torch.Generator().manual_seed(42))

# Unit Variance

In [7]:
infinity = float('inf')

image = full_dataset[0]['image']
n_channels = list(image.shape)[0]

for j, data in enumerate(full_dataset):
    image = data['image']
    for i in range(n_channels):
        channel = image[i]
        ind = (channel == infinity)
        if torch.any(ind):
            print("sample {}, channel {}".format(j,i))

In [10]:
image = full_dataset[0]['image']
n_samples = len(train_data)
n_channels = list(image.shape)[0]

scalers = []
for i in range(n_channels):
    
    scaler = StandardScaler()
    X = torch.empty((n_samples, full_dataset.chip_size, full_dataset.chip_size))
    
    for j, data in enumerate(train_data):
        #print(data['image'][i].shape)
        # print(X[j].shape)
        X[j] = data['image'][i]
    X = torch.reshape(X, (-1,1))
    scaler.fit(X)
    scalers.append(scaler)
    
    def scale_data(subset):
        for data in subset:
            X = data['image'][i]
            X_flat = torch.reshape(X, (-1,1))
            X_trans = scaler.transform(X_flat)
            data['image'][i] = torch.reshape(torch.Tensor(X_trans), (full_dataset.chip_size, full_dataset.chip_size))
    
    scale_data(train_data)
    scale_data(test_data)
    print("Channel {} scaled.".format(i))
       

Channel 0 scaled.
Channel 1 scaled.
Channel 2 scaled.
Channel 3 scaled.
Channel 4 scaled.
Channel 5 scaled.
Channel 6 scaled.
Channel 7 scaled.
Channel 8 scaled.
Channel 9 scaled.
Channel 10 scaled.
Channel 11 scaled.
Channel 12 scaled.
Channel 13 scaled.
Channel 14 scaled.


# MaxMin

In [39]:
a = train_data[87]['image'][10]
a

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [35]:
torch.min(a)

tensor(nan)

In [32]:
train_data[87]['surface_data']

tensor([  11.5500,    3.0000,   69.4475, -133.7629, 2013.0000])

In [11]:
image_flat = torch.reshape(train_data[87]['image'], (n_channels, -1))
torch.min(image_flat[:, :1022], dim = -1)


torch.return_types.min(
values=tensor([0.0976, 0.0977, 0.0980, 0.0983, 0.0974, 0.0976, 0.0976, 0.0976, 0.0940,
        0.0976, 0.0953, 0.0976, 0.0976, 0.0941, 0.0976]),
indices=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [12]:
test_data[500]['image'][10]

tensor([[0.0953, 0.0953, 0.0953,  ..., 0.0953, 0.0953, 0.0953],
        [0.0953, 0.0953, 0.0953,  ..., 0.0953, 0.0953, 0.0953],
        [0.0953, 0.0953, 0.0953,  ..., 0.0953, 0.0953, 0.0953],
        ...,
        [0.0953, 0.0953, 0.0953,  ..., 0.0953, 0.0953, 0.0953],
        [0.0953, 0.0953, 0.0953,  ..., 0.0953, 0.0953, 0.0953],
        [0.0953, 0.0953, 0.0953,  ..., 0.0953, 0.0953, 0.0953]])

In [43]:
image = train_data[0]['image']
n_channels = list(image.shape)[0]
image_scaler = torch.zeros(n_channels, 3)
image_flat = torch.reshape(image, (n_channels, -1))

# ind = (image_flat == -9999)
# mean_val = torch.mean(image_flat[~ind], dim = -1)
# image_flat[ind] = mean_val

image_scaler[:,0] = torch.min(image_flat, dim=-1).values
image_scaler[:,1] = torch.max(image_flat, dim=-1).values

for i, data in enumerate(train_data):
    image = data['image']
    image_flat = torch.reshape(image, (n_channels, -1))
    
#     ind = (image_flat == -9999)
#     mean_val = torch.mean(image_flat[~ind], dim = -1)
#     image_flat[ind] = mean_val

    image_min = torch.min(image_flat, dim=-1).values
    image_max = torch.max(image_flat, dim=-1).values
    image_scaler[:,0] = torch.minimum(image_min, image_scaler[:,0])
    # print("iteration {}".format(i))
    #print(torch.minimum(image_min, image_scaler[:,0]))
    # print(torch.maximum(image_max, image_scaler[:,1]))
    image_scaler[:,1] = torch.maximum(image_max, image_scaler[:,1])

image_scaler[:,2] = image_scaler[:,1] - image_scaler[:,0]

#if min == max, let the divisor (max-min) be 1
ind = image_scaler[:,2] == 0
mean_divisor = torch.mean(image_scaler[~ind, 2])
image_scaler[ind,2] = mean_divisor

image_scaler

tensor([[-9.9986e-01,  9.9995e-01,  1.9998e+00],
        [-9.9988e-01,  9.9991e-01,  1.9998e+00],
        [ 0.0000e+00,  3.5995e+02,  3.5995e+02],
        [-9.1768e+01,  7.7070e+01,  1.6884e+02],
        [-4.0262e+00,  1.0211e+01,  1.4237e+01],
        [-3.3761e+00,  8.4137e+00,  1.1790e+01],
        [ 0.0000e+00,  3.9970e+03,  3.9970e+03],
        [-3.4000e-03,  2.8610e-03,  6.2610e-03],
        [-1.0609e-03,  1.5045e-03,  2.5653e-03],
        [-1.6431e-01,  2.1378e-01,  3.7809e-01],
        [-3.7510e-03,  2.0001e-03,  5.7511e-03],
        [-1.8733e-01,  2.1753e-01,  4.0485e-01],
        [-1.6195e-01,  2.0886e-01,  3.7081e-01],
        [ 0.0000e+00,  1.8424e+01,  1.8424e+01],
        [ 0.0000e+00,  3.9216e-02,  3.9216e-02],
        [-1.8360e-01,  2.1204e-01,  3.9564e-01],
        [-4.0668e-03,  1.8909e-03,  5.9576e-03],
        [ 0.0000e+00,  6.6115e+00,  6.6115e+00],
        [ 0.0000e+00,  3.9970e+03,  3.9970e+03],
        [ 0.0000e+00,  5.2410e+01,  5.2410e+01],
        [ 0.0000e+00

In [44]:
def normalize_chips(data_subset):
    for data in data_subset:
        image = data['image']
        
        for i in range(n_channels):
            image[i] = (image[i] - image_scaler[i,0]) / image_scaler[i,2]
        
        data['image'] = image

normalize_chips(train_data)
normalize_chips(test_data)

In [None]:
len(train_data)

In [50]:
test_data[34]['image'][2]

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 4.3452e-24, 4.3452e-24,
         7.0579e-24],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 8.7998e-09, 1.0749e-08,
         1.0749e-08],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.0451e-08, 1.0451e-08,
         1.1758e-08]])