<a href="https://colab.research.google.com/github/amedyukhina/biomassters/blob/main/biomassters_dataprep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%writefile requirements.txt

pandas==1.3.5
tqdm==4.64.0
numpy==1.21.6
scikit-image==0.18.3
matplotlib==3.2.2
rasterio==1.2.0
boto3==1.26.16

Overwriting requirements.txt


In [2]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from google.colab import drive
import os
import rasterio
import pandas as pd
from tqdm import tqdm
import numpy as np
from skimage import io

import boto3
from botocore import UNSIGNED
from botocore.config import Config

from cachetools import cached, TTLCache

import warnings
from tqdm.notebook import tqdm
import shutil

# Our rasters contain no geolocation info, so silence this warning from rasterio.
# The filter is added to the process-wide warning filters, so it stays active
# for every cell executed afterwards.
warnings.filterwarnings("ignore", category=rasterio.errors.NotGeoreferencedWarning)



In [4]:
# Mount Google Drive at /content/gdrive.
# NOTE(review): the relative 'gdrive/...' paths used by later cells only resolve
# into this mount when the working directory is /content — presumably the Colab
# default; confirm.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Prepare list of chip IDs

In [5]:
# CSV with the chip metadata; loaded with pandas right after this cell.
feature_path = 'gdrive/MyDrive/biomassters/info/features_metadata.csv'
# Root output folder. It ends with '/' because later cells build some paths by
# plain string concatenation (data_path + name).
data_path = 'gdrive/MyDrive/biomassters/data/'
# Folder names, used both as S3 key prefixes and as sub-folders of data_path.
train_img_dir = 'train_features'
test_img_dir = 'test_features'
label_dir = 'train_agbm'
# Number of shuffled training chips downloaded for the training set.
ntrain = 200
# Number of shuffled training chips (taken right after the first ntrain) downloaded for validation.
nval = 20
# Number of training chips, counted from the start of the download loop, whose
# images are kept in memory for the mean/std statistics.
nmeans = 100

In [6]:
# Load the metadata table; the following cells rely on its 'split' and 'chip_id' columns.
df = pd.read_csv(feature_path)

In [7]:
# Rows of the test split and the sorted, distinct chip ids they contain.
df_pred = df.loc[df['split'].eq('test')].reset_index(drop=True)
pred_ids = np.unique(df_pred['chip_id'])
(len(df_pred), len(pred_ids))

(63348, 2773)

In [8]:
# Narrow `df` down to the training split (the name is rebound to the filtered
# frame) and collect the sorted, distinct training chip ids.
df = df.loc[df['split'].eq('train')].reset_index(drop=True)
all_ids = np.unique(df['chip_id'])
(len(df), len(all_ids))

(189078, 8689)

In [9]:
# Seed NumPy's global RNG so the in-place shuffle below gives the same order on
# every run; `all_ids` comes out of np.unique, i.e. it starts from a sorted order.
# The train/validation chips are later taken from the front of this shuffled array.
np.random.seed(42)
np.random.shuffle(all_ids)

In [10]:
# The first `ntrain` shuffled chips form the training set, the next `nval` the validation set.
train_ids, val_ids = all_ids[:ntrain], all_ids[ntrain:ntrain + nval]

### Set up data loading

In [11]:
# Name of the S3 bucket the rasters are downloaded from.
BUCKET_NAME = 'drivendata-competition-biomassters-public-us'
# NOTE(review): this variable is presumably read by GDAL/rasterio rather than by
# boto3 — confirm whether it is still needed.
os.environ["AWS_NO_SIGN_REQUEST"] = 'YES'
# Client configured with UNSIGNED signatures, i.e. requests are sent without AWS credentials.
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

In [12]:
# Cache used to memoise downloaded rasters: at most 1000 entries, each expiring
# after ttl=86400 (24 hours if the cache timer counts seconds — presumably the
# cachetools default; confirm).
cache = TTLCache(maxsize=1000, ttl=86400)

In [13]:
@cached(cache)
def get_image(fn):
    """Fetch one raster from the S3 bucket and return its pixel data.

    Results are memoised in ``cache``, keyed by ``fn``.

    Args:
        fn: Object key of the raster inside ``BUCKET_NAME``.

    Returns:
        The array returned by ``read()`` on the opened raster.
    """
    response = s3.get_object(Bucket=BUCKET_NAME, Key=fn)
    with rasterio.open(response['Body']) as dataset:
        return dataset.read()

### Calculate mean and std of the dataset + save images to gdrive

In [14]:
# training data
# For every training chip: download the 12 monthly S1 and S2 rasters plus the
# label, copy them to the output folder, and keep the images of the first
# `nmeans` chips in memory for the mean/std statistics.
os.makedirs(data_path + train_img_dir, exist_ok=True)
os.makedirs(data_path + label_dir, exist_ok=True)
imgs_s1 = []
imgs_s2 = []
it = 0  # number of chips completely processed so far
with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # suppress every warning raised inside the loop
    for chip_id in tqdm(train_ids):
        for month in range(12):
            # S1: no error handling, so a failed download or save stops the run.
            fname = f"{chip_id}_S1_{month:02d}.tif"
            # S3 object keys always use "/", independent of the local OS separator.
            img = get_image(f"{train_img_dir}/{fname}").astype(np.float32)
            io.imsave(os.path.join(data_path, train_img_dir, fname), img)
            if it < nmeans:
                imgs_s1.append(img)

            # S2: best effort — a raster that cannot be fetched or saved is skipped.
            fname = f"{chip_id}_S2_{month:02d}.tif"
            try:
                img = get_image(f"{train_img_dir}/{fname}").astype(np.float32)
                # Stored on disk as uint16; the float32 copy is kept for the statistics.
                io.imsave(os.path.join(data_path, train_img_dir, fname),
                          img.astype(np.uint16))
                if it < nmeans:
                    imgs_s2.append(img)
            except Exception:
                # Deliberately not a bare `except:` — that would also swallow
                # KeyboardInterrupt/SystemExit and make this long-running loop
                # impossible to interrupt.
                pass

        fname = f"{chip_id}_agbm.tif"
        label = get_image(f"{label_dir}/{fname}")
        io.imsave(os.path.join(data_path, label_dir, fname), label.astype(np.uint16))
        it += 1

# Stack the collected images into single arrays for the statistics cells.
imgs_s1 = np.array(imgs_s1)
imgs_s2 = np.array(imgs_s2)

  0%|          | 0/200 [00:00<?, ?it/s]

In [15]:
# Shapes of the stacked S1 and S2 arrays collected for the statistics.
imgs_s1.shape, imgs_s2.shape

((1200, 4, 256, 256), (973, 11, 256, 256))

In [16]:
# Per-band statistics of the collected S1 images, saved next to the data.
imgs = imgs_s1
# Mean of each band taken over all images and pixels at once.
MEANS = np.array([imgs[:, band].mean() for band in range(imgs.shape[1])])
# Std is computed separately for every (image, band) pair and then averaged
# over the images.
# NOTE(review): this is the mean of per-image stds, not the std of the pooled
# pixels — confirm this is the intended normalisation constant.
STDS = np.array([
    [imgs[idx, band].std() for band in range(imgs.shape[1])]
    for idx in range(imgs.shape[0])
]).mean(0)
np.save(data_path + 's1_mean_std.npy', [MEANS, STDS])
MEANS, STDS

(array([  -11.454795,   -18.013327, -1193.3553  , -1199.2169  ],
       dtype=float32),
 array([2.2330995, 2.964378 , 2.0271204, 2.6701245], dtype=float32))

In [17]:
# Per-band statistics of the collected S2 images, saved next to the data.
imgs = imgs_s2
# Mean of each band taken over all images and pixels at once.
MEANS = np.array([imgs[:, band].mean() for band in range(imgs.shape[1])])
# Std is computed separately for every (image, band) pair and then averaged
# over the images.
# NOTE(review): this is the mean of per-image stds, not the std of the pooled
# pixels — confirm this is the intended normalisation constant.
STDS = np.array([
    [imgs[idx, band].std() for band in range(imgs.shape[1])]
    for idx in range(imgs.shape[0])
]).mean(0)
np.save(data_path + 's2_mean_std.npy', [MEANS, STDS])
MEANS, STDS

(array([1642.8818  , 1621.115   , 1598.5988  , 1918.794   , 2490.597   ,
        2604.4023  , 2753.5115  , 2696.2065  , 1048.9395  ,  710.3785  ,
          21.115498], dtype=float32),
 array([1026.1313  ,  994.45984 , 1044.3563  , 1076.0272  , 1126.5492  ,
        1129.6202  , 1215.7893  , 1130.656   ,  521.7572  ,  426.1423  ,
          11.847122], dtype=float32))

In [20]:
# validation data
# For every validation chip: download the 12 monthly S1 and S2 rasters plus the
# label and copy them to the output folder (same folders as the training chips).
# Create the folders here as well so this cell does not depend on another cell
# having created them.
os.makedirs(data_path + train_img_dir, exist_ok=True)
os.makedirs(data_path + label_dir, exist_ok=True)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # suppress every warning raised inside the loop
    for chip_id in tqdm(val_ids):
        for month in range(12):
            # S1: no error handling, so a failed download or save stops the run.
            fname = f"{chip_id}_S1_{month:02d}.tif"
            # S3 object keys always use "/", independent of the local OS separator.
            img = get_image(f"{train_img_dir}/{fname}").astype(np.float32)
            io.imsave(os.path.join(data_path, train_img_dir, fname), img)

            # S2: best effort — a raster that cannot be fetched or saved is skipped.
            fname = f"{chip_id}_S2_{month:02d}.tif"
            try:
                img = get_image(f"{train_img_dir}/{fname}").astype(np.float32)
                io.imsave(os.path.join(data_path, train_img_dir, fname),
                          img.astype(np.uint16))
            except Exception:
                # Deliberately not a bare `except:` — that would also swallow
                # KeyboardInterrupt/SystemExit and make this long-running loop
                # impossible to interrupt.
                pass

        fname = f"{chip_id}_agbm.tif"
        label = get_image(f"{label_dir}/{fname}")
        io.imsave(os.path.join(data_path, label_dir, fname), label.astype(np.uint16))


  0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
# inference data
# For every test chip: download the 12 monthly S1 and S2 rasters and copy them
# to the output folder. No labels are downloaded for these chips.
os.makedirs(data_path + test_img_dir, exist_ok=True)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # suppress every warning raised inside the loop
    for chip_id in tqdm(pred_ids):
        for month in range(12):
            # S1: no error handling, so a failed download or save stops the run.
            fname = f"{chip_id}_S1_{month:02d}.tif"
            # S3 object keys always use "/", independent of the local OS separator.
            img = get_image(f"{test_img_dir}/{fname}").astype(np.float32)
            io.imsave(os.path.join(data_path, test_img_dir, fname), img)

            # S2: best effort — a raster that cannot be fetched or saved is skipped.
            fname = f"{chip_id}_S2_{month:02d}.tif"
            try:
                img = get_image(f"{test_img_dir}/{fname}").astype(np.float32)
                io.imsave(os.path.join(data_path, test_img_dir, fname),
                          img.astype(np.uint16))
            except Exception:
                # Deliberately not a bare `except:` — that would also swallow
                # KeyboardInterrupt/SystemExit and make this long-running loop
                # impossible to interrupt.
                pass

  0%|          | 0/2773 [00:00<?, ?it/s]