In [1]:
import cv2
from glob import glob
import pandas as pd
import numpy as np
import random

import torch

import rasterio
from rasterio.mask import mask
import geopandas as gpd

import matplotlib.pyplot as plt
%matplotlib inline

# Load Data

In [2]:
# Read the labelled training polygons, the unlabelled test polygons and the AOI grid.
df_train = gpd.read_file("datasets/Varuna Hackathon 2022/training_area/traindata.shp")
df_test = gpd.read_file("datasets/Varuna Hackathon 2022/testing_area/testdata.shp")

grid_aoi = gpd.read_file("datasets/Varuna Hackathon 2022/Grid_AOI/AOI.shp")

# Attach each polygon's area (in the units of the layer's coordinate system) to both frames.
for polygons in (df_train, df_test):
    polygons['area'] = polygons['geometry'].apply(lambda geom: geom.area)

df_train

Unnamed: 0,years,crop_type,geometry,area
0,2021,1,"POLYGON ((774604.014 1671240.055, 774606.506 1...",7395.766301
1,2021,1,"POLYGON ((773367.837 1670278.438, 773389.600 1...",12760.987757
2,2021,1,"POLYGON ((771648.671 1673990.960, 771707.027 1...",41971.752161
3,2021,3,"POLYGON ((771306.117 1674174.738, 771426.678 1...",8416.322100
4,2021,2,"POLYGON ((763750.339 1678588.825, 763879.691 1...",13915.426881
...,...,...,...,...
1312,2021,3,"POLYGON ((769004.617 1679027.544, 769130.199 1...",19370.802033
1313,2021,3,"POLYGON ((772624.509 1673384.254, 772770.291 1...",5249.190746
1314,2021,1,"POLYGON ((774333.243 1669788.192, 774337.269 1...",17355.845596
1315,2021,3,"POLYGON ((768880.988 1687424.354, 768996.565 1...",19346.460336


In [3]:
def extract_feature(x):
    """Return ``[band, file_format]`` parsed from an image file name.

    The band is the third ``_``-separated token of the text before the first
    ``.``; the file format is the text after the last ``.`` (the whole name
    when it contains no ``.``). Raises ``IndexError`` when the stem has fewer
    than three ``_``-separated tokens.
    """
    stem = x.partition('.')[0]
    extension = x.rpartition('.')[2]
    band = stem.split('_')[2]
    return [band, extension]

# Index every Sentinel-2 file found under <year>/<yyyymmdd>/<filetype>/<filename>.
# glob() returns paths in arbitrary, filesystem-dependent order; sort them so that
# `dates` (derived from this frame) has a stable order, because later cells use the
# position of a date in `dates` to index precomputed arrays.
# NOTE(review): assumes the precomputed .npy arrays were built in this same sorted
# date order (the rendered output below is in sorted order) — confirm.
file_path = sorted(glob('datasets/Varuna Hackathon 2022/sentinel-2 image/*/*/*/*'))
df_img = pd.DataFrame([[i]+i.split('/')[-4:] for i in file_path],columns=['raw_path','year','yyyymmdd','filetype','filename'])
df_img = df_img[df_img['filetype']=="IMG_DATA"].reset_index(drop=True)

# Parse band and extension from every file name, then keep only the .jp2 rasters.
df_file_feat = pd.DataFrame([extract_feature(name) for name in df_img['filename']],columns=['band','file_format'])
df_img = pd.concat([df_img,df_file_feat],axis=1)
df_img = df_img[df_img.file_format=="jp2"].reset_index(drop=True)

# Month and day as zero-padded strings sliced out of the yyyymmdd folder name.
df_img['mm'] = df_img.yyyymmdd.str[4:6]
df_img['dd'] = df_img.yyyymmdd.str[6:]
df_img

Unnamed: 0,raw_path,year,yyyymmdd,filetype,filename,band,file_format,mm,dd
0,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2020,20200102,IMG_DATA,47PQS_20200102_AOT.jp2,AOT,jp2,01,02
1,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2020,20200102,IMG_DATA,47PQS_20200102_B01.jp2,B01,jp2,01,02
2,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2020,20200102,IMG_DATA,47PQS_20200102_B02.jp2,B02,jp2,01,02
3,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2020,20200102,IMG_DATA,47PQS_20200102_B03.jp2,B03,jp2,01,02
4,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2020,20200102,IMG_DATA,47PQS_20200102_B04.jp2,B04,jp2,01,02
...,...,...,...,...,...,...,...,...,...
2125,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2021,20211227,IMG_DATA,47PQS_20211227_B12.jp2,B12,jp2,12,27
2126,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2021,20211227,IMG_DATA,47PQS_20211227_B8A.jp2,B8A,jp2,12,27
2127,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2021,20211227,IMG_DATA,47PQS_20211227_SCL.jp2,SCL,jp2,12,27
2128,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2021,20211227,IMG_DATA,47PQS_20211227_TCI.jp2,TCI,jp2,12,27


In [4]:
# Distinct acquisition dates as 'yyyymmdd' strings, in order of first appearance in df_img.
# Later cells look up a date's position in this array to index the per-date arrays.
dates = df_img.yyyymmdd.unique()

In [5]:
# Precomputed arrays loaded from .npy files; how they were generated is not shown here.
# Their shapes are printed in cell In [10].
# presumably one raster mask per training / test polygon — TODO confirm against the generating notebook
masks_train = np.load("./datasets/masks_train.npy")
masks_test = np.load("./datasets/masks_test.npy")
# presumably one cloud mask per acquisition date, ordered like `dates` — TODO confirm
CLOUD_masks = np.load("./datasets/CLOUD_mask.npy")
# Alternative feature file, currently disabled.
# image_features = np.load("./datasets/image_features.npy")
# Per-date feature rasters; the file name suggests Blue, NIR08, NDVI, GNDVI and SAVI channels — verify.
image_features = np.load("./datasets/image_features_Blue_NIR08_NDVI_GNDVI_SAVI.npy")

# Configuration

In [6]:
class config:
    # Side length of the square windows cut by slicing_array; also the minimum
    # height/width that pad_zero_square_img pads images up to.
    input_size = 16
    # Step between consecutive windows when slicing training areas
    # (the test-set cell hard-codes stride = 1 instead).
    stride = 3
    # Passed to slicing_array as area_thresh: a window is kept only when the
    # mean mask value inside it exceeds this threshold.
    area_thresh = 0.5

# Compute the cloud index of each area for each day

In [40]:
# Same assignment as cell In [4]; repeated so this section can be run on its own.
dates = df_img.yyyymmdd.unique()

In [41]:
def compute_cloud_index(mask,CLOUD_mask):
    """Return the fraction of an area's pixels that are flagged as cloud.

    Parameters
    ----------
    mask : array-like
        Area mask; any non-zero entry counts as "inside the area".
    CLOUD_mask : array-like
        Cloud mask of the same shape; any non-zero entry counts as cloud.

    Returns
    -------
    float
        ``(# pixels inside the area and cloudy) / (# pixels inside the area)``,
        or ``nan`` when the area mask has no pixels.
    """
    # Casting to bool makes the ratio a pixel fraction whatever the stored
    # dtype is (bool, 0/1 or 0/255 integers, floats); bool inputs are not copied.
    area = np.asarray(mask, dtype=bool)
    cloud = np.asarray(CLOUD_mask, dtype=bool)
    n_area = int(area.sum())
    if n_area == 0:
        # Empty area: the ratio is undefined; return NaN without a 0/0 warning.
        return float('nan')
    return int((area & cloud).sum()) / n_area
# One row per (acquisition date, training area): the area's cloud index on that date.
# CLOUD_masks is walked positionally, so entry k is labelled with dates[k].
data = [
    [dates[idx_day], idx_area, compute_cloud_index(area_mask, CLOUD_mask)]
    for idx_day, CLOUD_mask in enumerate(CLOUD_masks)
    for idx_area, area_mask in enumerate(masks_train)
]
df_CLOUD_mask_score_train = pd.DataFrame(data,columns=['yyyymmdd','area_id','score'])
df_CLOUD_mask_score_train

Unnamed: 0,yyyymmdd,area_id,score
0,20200102,0,1.000000
1,20200102,1,1.000000
2,20200102,2,0.554762
3,20200102,3,0.976190
4,20200102,4,0.400000
...,...,...,...
187009,20211227,1312,0.000000
187010,20211227,1313,0.000000
187011,20211227,1314,0.000000
187012,20211227,1315,0.000000


In [42]:
# Same table as for the training areas, computed for the test-area masks.
data = [
    [dates[idx_day], idx_area, compute_cloud_index(area_mask, CLOUD_mask)]
    for idx_day, CLOUD_mask in enumerate(CLOUD_masks)
    for idx_area, area_mask in enumerate(masks_test)
]
df_CLOUD_mask_score_test = pd.DataFrame(data,columns=['yyyymmdd','area_id','score'])
df_CLOUD_mask_score_test

Unnamed: 0,yyyymmdd,area_id,score
0,20200102,0,0.045161
1,20200102,1,0.000000
2,20200102,2,1.000000
3,20200102,3,1.000000
4,20200102,4,1.000000
...,...,...,...
80225,20211227,560,0.061538
80226,20211227,561,0.000000
80227,20211227,562,0.049342
80228,20211227,563,0.000000


In [17]:
# Send a notification via the line_notify module once the preceding cells have run.
# NOTE(review): the import sits mid-notebook and the helper's implementation is not shown here.
from line_notify import send_line_notify
send_line_notify('Finish Prepare Data')

'{"status":200,"message":"ok"}'

In [43]:
# Persist both cloud-score tables without the row index so later cells can reload them.
df_CLOUD_mask_score_train.to_csv('./datasets/CLOUD_mask_score_train.csv', index=False)
df_CLOUD_mask_score_test.to_csv('./datasets/CLOUD_mask_score_test.csv', index=False)

In [44]:
# Reload the training cloud scores and derive a month key: the date's text form
# with its last two characters (the day) removed.
df_CLOUD_mask_score_train = pd.read_csv('./datasets/CLOUD_mask_score_train.csv')
df_CLOUD_mask_score_train['yyyymm'] = df_CLOUD_mask_score_train['yyyymmdd'].astype(str).str[:-2]
df_CLOUD_mask_score_train

Unnamed: 0,yyyymmdd,area_id,score,yyyymm
0,20200102,0,1.000000,202001
1,20200102,1,1.000000,202001
2,20200102,2,0.554762,202001
3,20200102,3,0.976190,202001
4,20200102,4,0.400000,202001
...,...,...,...,...
187009,20211227,1312,0.000000,202112
187010,20211227,1313,0.000000,202112
187011,20211227,1314,0.000000,202112
187012,20211227,1315,0.000000,202112


In [45]:
# For every (month, area) pair keep the row with the lowest cloud score.
# A single groupby pass replaces one full-frame boolean scan per pair.
# idxmin returns the label of the first minimum, so ties still resolve to the
# earliest row of the group, and sort=False keeps groups in order of first
# appearance (month by month, areas in id order for this table).
best_row_labels = (
    df_CLOUD_mask_score_train
    .groupby(['yyyymm', 'area_id'], sort=False)['score']
    .idxmin()
)
df_date_selected_train = df_CLOUD_mask_score_train.loc[best_row_labels].reset_index(drop=True)
df_date_selected_train

Unnamed: 0,yyyymmdd,area_id,score,yyyymm
0,20200107,0,0.0,202001
1,20200107,1,0.0,202001
2,20200107,2,0.0,202001
3,20200107,3,0.0,202001
4,20200107,4,0.0,202001
...,...,...,...,...
31603,20211207,1312,0.0,202112
31604,20211207,1313,0.0,202112
31605,20211202,1314,0.0,202112
31606,20211207,1315,0.0,202112


In [46]:
# Store the per-month date selection for the training areas (row index omitted).
df_date_selected_train.to_csv('./datasets/date_selected_train.csv', index=False)

In [47]:
# Reload the test cloud scores and derive the month key (date text minus the day).
df_CLOUD_mask_score_test = pd.read_csv('./datasets/CLOUD_mask_score_test.csv')
df_CLOUD_mask_score_test['yyyymm'] = df_CLOUD_mask_score_test['yyyymmdd'].astype(str).str[:-2]

# For every (month, area) pair keep the row with the lowest cloud score.
# A single groupby pass replaces one full-frame boolean scan per pair; idxmin
# returns the first minimum, so ties resolve to the earliest row of the group,
# and sort=False keeps groups in order of first appearance.
best_row_labels = (
    df_CLOUD_mask_score_test
    .groupby(['yyyymm', 'area_id'], sort=False)['score']
    .idxmin()
)
df_date_selected_test = df_CLOUD_mask_score_test.loc[best_row_labels].reset_index(drop=True)
df_date_selected_test.to_csv('./datasets/date_selected_test.csv',index=None)
df_date_selected_test

Unnamed: 0,yyyymmdd,area_id,score,yyyymm
0,20200107,0,0.0,202001
1,20200102,1,0.0,202001
2,20200107,2,0.0,202001
3,20200107,3,0.0,202001
4,20200107,4,0.0,202001
...,...,...,...,...
13555,20211202,560,0.0,202112
13556,20211207,561,0.0,202112
13557,20211207,562,0.0,202112
13558,20211202,563,0.0,202112


# Preprocessing

In [7]:
# Reload the per-(month, area) date selections written by the cloud-index section,
# so preprocessing can start from the CSVs without recomputing the scores.
df_date_selected_train = pd.read_csv('./datasets/date_selected_train.csv')
df_date_selected_test = pd.read_csv('./datasets/date_selected_test.csv')

In [8]:
# Add a 'year' column (first four characters of the month key) and keep only 2021 rows.
df_date_selected_train = (
    df_date_selected_train
    .assign(year=df_date_selected_train.yyyymm.astype(str).str[:4])
    .loc[lambda frame: frame['year'] == '2021']
    .reset_index(drop=True)
)
df_date_selected_test = (
    df_date_selected_test
    .assign(year=df_date_selected_test.yyyymm.astype(str).str[:4])
    .loc[lambda frame: frame['year'] == '2021']
    .reset_index(drop=True)
)
df_date_selected_train

Unnamed: 0,yyyymmdd,area_id,score,yyyymm,year
0,20210101,0,0.0,202101,2021
1,20210101,1,0.0,202101,2021
2,20210101,2,0.0,202101,2021
3,20210101,3,0.0,202101,2021
4,20210111,4,0.0,202101,2021
...,...,...,...,...,...
15799,20211207,1312,0.0,202112,2021
15800,20211207,1313,0.0,202112,2021
15801,20211202,1314,0.0,202112,2021
15802,20211207,1315,0.0,202112,2021


In [9]:
# (distinct training areas, distinct selected dates in 2021, total acquisition dates)
len(df_date_selected_train.area_id.unique()),len(df_date_selected_train.yyyymmdd.unique()),len(dates)

(1317, 52, 142)

In [10]:
# Shapes of the training masks, the cloud masks and the per-date feature rasters.
masks_train.shape,CLOUD_masks.shape,image_features.shape

((1317, 2051, 2051), (142, 2051, 2051), (142, 2051, 2051, 5))

In [11]:
def crop_data(x,full_feat = None):
    """Crop a 2-D mask to the bounding box of its non-zero rows and columns.

    Parameters
    ----------
    x : array indexed as ``x[row, col]``
        Rows and columns whose sum is non-zero delimit the bounding box.
    full_feat : array, optional
        Array whose first two axes are cropped with the same bounding box.

    Returns
    -------
    The cropped ``x``, or the tuple ``(cropped x, cropped full_feat)`` when
    ``full_feat`` is given. For NumPy inputs the results are slices of the
    inputs, not copies.

    Raises
    ------
    ValueError
        If ``x`` has no row or column with a non-zero sum.
    """
    col_idx = np.nonzero(x.sum(axis=0))[0]
    row_idx = np.nonzero(x.sum(axis=1))[0]
    if col_idx.size == 0 or row_idx.size == 0:
        raise ValueError("crop_data: mask has no non-zero pixels, so there is no bounding box to crop to")
    # np.nonzero yields indices in ascending order, so the first and last
    # entries are the bounds; no Python-level min()/max() scan is needed.
    rows = slice(row_idx[0], row_idx[-1] + 1)
    cols = slice(col_idx[0], col_idx[-1] + 1)
    if full_feat is not None:
        return x[rows, cols], full_feat[rows, cols]
    return x[rows, cols]

def pad_zero_square_img(img,min_size = 16):
    """Zero-pad an image so that its height and width are at least ``min_size``.

    Parameters
    ----------
    img : numpy.ndarray
        Array of shape ``(H, W)`` or ``(H, W, C)``; only the first two axes
        are padded.
    min_size : int
        Minimum height and width of the result.

    Returns
    -------
    numpy.ndarray
        ``img`` itself when both sides are already >= ``min_size``; otherwise
        a new array with zeros added around the original content. When the
        missing amount is odd, the extra row/column goes to the top/left.
        The input dtype is kept, so padded and unpadded results share a dtype
        (padding by concatenating float64 zeros would upcast only the padded
        ones).
    """
    H, W = img.shape[:2]
    missing_h = max(min_size - H, 0)
    missing_w = max(min_size - W, 0)
    if missing_h == 0 and missing_w == 0:
        return img
    pad_width = [
        (missing_h - missing_h // 2, missing_h // 2),  # (top, bottom)
        (missing_w - missing_w // 2, missing_w // 2),  # (left, right)
    ]
    # Leave any trailing axes (e.g. channels) untouched.
    pad_width += [(0, 0)] * (img.ndim - 2)
    return np.pad(img, pad_width, mode='constant', constant_values=0)
    
def slicing_array(x,window_size = 16, stride = 1, mask = None, area_thresh = 0.7):
    """Cut ``x`` into square windows taken on a regular grid.

    Windows of ``window_size`` x ``window_size`` start at every multiple of
    ``stride`` along the first two axes for which the window fits entirely
    inside ``x``. When ``mask`` is given, a window is kept only if the sum of
    the mask inside it, divided by the window area, is strictly greater than
    ``area_thresh``; without a mask every window is kept.

    Returns the list of windows in row-major order (empty when ``x`` is
    smaller than one window).
    """
    height, width = x.shape[0], x.shape[1]
    tops = [k*stride for k in range((height-window_size)//stride+1)]
    lefts = [k*stride for k in range((width-window_size)//stride+1)]

    patches = []
    for top in tops:
        rows = slice(top, top+window_size)
        for left in lefts:
            cols = slice(left, left+window_size)
            if mask is not None:
                coverage = mask[rows, cols].sum()/window_size/window_size
                if not coverage > area_thresh:
                    continue
            patches.append(x[rows, cols])
    return patches
    
# Build the supervised training patches. For every training area: take the
# selected date of each month, crop that date's features to the area's
# bounding box, stack all months along the channel axis, zero the pixels
# outside the area, pad to the window size, slice into windows and save each
# window as train_<area>_<window>_<crop_type>.npy.
month_order = sorted(df_date_selected_train.yyyymm.unique())
for area_id in df_date_selected_train.area_id.unique():
    mask = masks_train[area_id]
    rows_of_area = df_date_selected_train[df_date_selected_train.area_id==area_id]

    monthly_features = []
    for yyyymm in month_order:
        # First record for this area and month (IndexError if there is none).
        selected_date = rows_of_area.loc[rows_of_area.yyyymm==yyyymm, 'yyyymmdd'].iloc[0]
        date_pos = np.where(dates==str(selected_date))[0][0]
        cmask, month_feature = crop_data(mask.copy(),full_feat = image_features[date_pos])
        monthly_features.append(month_feature)

    stacked = np.concatenate(monthly_features,axis=-1)
    padded = pad_zero_square_img(stacked*cmask[:,:,None],min_size = config.input_size)
    cmask = pad_zero_square_img(cmask,min_size = config.input_size)

    # Small areas keep every window; medium ones use half the coverage
    # threshold; larger ones use the configured threshold.
    shortest_side = min(padded.shape[:2])
    if shortest_side<24:
        window_mask, window_thresh = None, config.area_thresh
    elif shortest_side<32:
        window_mask, window_thresh = cmask, 0.5*config.area_thresh
    else:
        window_mask, window_thresh = cmask, config.area_thresh
    windows = slicing_array(padded,window_size = config.input_size, stride = config.stride, mask = window_mask, area_thresh = window_thresh)

    # NOTE(review): an area whose windows all fail the coverage test writes no file.
    for idx,window in enumerate(windows):
        np.save(f"./datasets/training_data_5d/train/train_{area_id:04d}_{idx:03d}_{df_train.crop_type[area_id]}.npy",window)

In [12]:
# Build the test patches with the same pipeline as the training cell, except
# that windows are taken with stride 1 and file names carry no label.
month_order = sorted(df_date_selected_test.yyyymm.unique())
for area_id in df_date_selected_test.area_id.unique():
    mask = masks_test[area_id]
    rows_of_area = df_date_selected_test[df_date_selected_test.area_id==area_id]

    monthly_features = []
    for yyyymm in month_order:
        # First record for this area and month (IndexError if there is none).
        selected_date = rows_of_area.loc[rows_of_area.yyyymm==yyyymm, 'yyyymmdd'].iloc[0]
        date_pos = np.where(dates==str(selected_date))[0][0]
        cmask, month_feature = crop_data(mask.copy(),full_feat = image_features[date_pos])
        monthly_features.append(month_feature)

    stacked = np.concatenate(monthly_features,axis=-1)
    padded = pad_zero_square_img(stacked*cmask[:,:,None],min_size = config.input_size)
    cmask = pad_zero_square_img(cmask,min_size = config.input_size)

    # Same size-dependent filtering rule as for the training areas.
    shortest_side = min(padded.shape[:2])
    if shortest_side<24:
        window_mask, window_thresh = None, config.area_thresh
    elif shortest_side<32:
        window_mask, window_thresh = cmask, 0.5*config.area_thresh
    else:
        window_mask, window_thresh = cmask, config.area_thresh
    windows = slicing_array(padded,window_size = config.input_size, stride = 1, mask = window_mask, area_thresh = window_thresh)

    # NOTE(review): a test area whose windows all fail the coverage test writes no file.
    for idx,window in enumerate(windows):
        np.save(f"./datasets/training_data_5d/test/test_{area_id:04d}_{idx:03d}.npy",window)

In [13]:
# Send a notification via the line_notify module once the patch files have been written.
# NOTE(review): this repeats the import already made in an earlier cell.
from line_notify import send_line_notify
send_line_notify('Data is ready to train')

'{"status":200,"message":"ok"}'

In [14]:
# Count the patch files produced by the two preprocessing cells.
# The glob import repeats the one in the first cell.
from glob import glob
training_data = glob('./datasets/training_data_5d/train/*.npy')
testing_data = glob('./datasets/training_data_5d/test/*.npy')
# (number of training patches, number of test patches)
len(training_data),len(testing_data)

(5919, 11816)

In [15]:
# Archive the generated patch directory as datasets/training_data_5d.zip
# (zip flags: -q quiet, -r recursive). The %cd pair enters ./datasets and returns afterwards.
%cd ./datasets
!zip -qr training_data_5d.zip training_data_5d/
%cd ..

/content/datasets
/content


# Preprocessing for Self-Supervised Learning

In [175]:
# Sizes of the inputs used to build the self-supervised dataset.
# NOTE(review): the recorded output of this cell shows 3 feature channels, whereas
# In [10] shows 5 — a different image_features file was loaded when this section
# was run; confirm which one is intended.
len(dates),image_features.shape,masks_train.shape,masks_test.shape

(142, (142, 2051, 2051, 3), (1317, 2051, 2051), (565, 2051, 2051))

In [182]:
# Calendar table of all acquisition dates: the full 'yyyymmdd' string together
# with its month key (last two characters dropped) and year (last four dropped).
date_records = [
    {'full_date': stamp, 'yyyymm': stamp[:-2], 'year': stamp[:-4]}
    for stamp in dates
]
df_dates = pd.DataFrame(date_records, columns=['full_date','yyyymm','year'])
df_dates

Unnamed: 0,full_date,yyyymm,year
0,20200102,202001,2020
1,20200107,202001,2020
2,20200112,202001,2020
3,20200117,202001,2020
4,20200122,202001,2020
...,...,...,...
137,20211207,202112,2021
138,20211212,202112,2021
139,20211217,202112,2021
140,20211222,202112,2021


In [228]:
# Self-supervised pairs: for every area and year, build two "views" of the same
# area, each stacking one randomly chosen acquisition date per month, and save
# the matching windows of both views together as one array of shape (2, ...).

# Fixed seed for a private RNG, so re-running this cell regenerates the same
# pairs without touching the global `random` state.
SSL_SEED = 0


def _ssl_view(mask, month_dates, rng):
    """Return the windows of one randomly dated yearly view of an area.

    mask        -- full-size area mask (cropped to its bounding box here).
    month_dates -- list with, for each month, the candidate date strings.
    rng         -- random.Random instance used to draw one date per month.
    """
    monthly_features = []
    for candidates in month_dates:
        selected_date = rng.choice(candidates)
        month_feature = image_features[np.where(dates==str(selected_date))[0][0]]
        # crop_data only reads the mask, so no defensive copy is required.
        cmask, month_feature = crop_data(mask,full_feat = month_feature)
        monthly_features.append(month_feature)
    features = np.concatenate(monthly_features,axis=-1)
    masked_features = features*cmask[:,:,None]
    padded_masked_features = pad_zero_square_img(masked_features,min_size = config.input_size)
    # mask=None: every window is kept, so both views always yield the same number of windows.
    return slicing_array(padded_masked_features,window_size = config.input_size, stride = config.stride, mask = None, area_thresh = config.area_thresh)


def _write_ssl_pairs(masks, path_for, rng):
    """Generate and save the view pairs of every area in `masks` for 2020 and 2021.

    path_for(area_id, year, idx) must return the output file path of one pair.
    """
    for year in ["2020","2021"]:
        df_year_dates = df_dates[df_dates.year==year]
        # Candidate dates per month depend only on the year: build them once.
        month_dates = [
            list(df_year_dates[df_year_dates.yyyymm==yyyymm].full_date)
            for yyyymm in sorted(df_year_dates.yyyymm.unique())
        ]
        for area_id in range(len(masks)):
            mask = masks[area_id]
            first_view = _ssl_view(mask, month_dates, rng)
            second_view = _ssl_view(mask, month_dates, rng)
            for idx,(i,j) in enumerate(zip(first_view, second_view)):
                np.save(path_for(area_id, year, idx),np.concatenate([i[None,:],j[None,:]]))


ssl_rng = random.Random(SSL_SEED)

# Training areas: the file name also carries the area's crop_type from df_train.
_write_ssl_pairs(
    masks_train,
    lambda area_id, year, idx: f"./datasets/self_supervised_training_data/train_{area_id:04d}_{year}_{idx:04d}_{df_train.crop_type[area_id]}.npy",
    ssl_rng,
)

# Test areas: no label in the file name.
_write_ssl_pairs(
    masks_test,
    lambda area_id, year, idx: f"./datasets/self_supervised_training_data/test_{area_id:04d}_{year}_{idx:04d}.npy",
    ssl_rng,
)

                        