In [1]:
import cv2
from glob import glob
import pandas as pd
import numpy as np

import torch

import rasterio
from rasterio.mask import mask
import geopandas as gpd

import matplotlib.pyplot as plt
%matplotlib inline

# Load Data

In [2]:
# Read the labelled training polygons, the unlabelled test polygons and the
# area-of-interest grid from their shapefiles.
df_train = gpd.read_file("datasets/Varuna Hackathon 2022/training_area/traindata.shp")
df_test = gpd.read_file("datasets/Varuna Hackathon 2022/testing_area/testdata.shp")
grid_aoi = gpd.read_file("datasets/Varuna Hackathon 2022/Grid_AOI/AOI.shp")

# Attach each polygon's planar area as an extra column on both frames
# (units follow the layer's CRS; presumably square metres here — TODO confirm).
for frame in (df_train, df_test):
    frame['area'] = [polygon.area for polygon in frame['geometry']]

df_train

Unnamed: 0,years,crop_type,geometry,area
0,2021,1,"POLYGON ((774604.014 1671240.055, 774606.506 1...",7395.766301
1,2021,1,"POLYGON ((773367.837 1670278.438, 773389.600 1...",12760.987757
2,2021,1,"POLYGON ((771648.671 1673990.960, 771707.027 1...",41971.752161
3,2021,3,"POLYGON ((771306.117 1674174.738, 771426.678 1...",8416.322100
4,2021,2,"POLYGON ((763750.339 1678588.825, 763879.691 1...",13915.426881
...,...,...,...,...
1312,2021,3,"POLYGON ((769004.617 1679027.544, 769130.199 1...",19370.802033
1313,2021,3,"POLYGON ((772624.509 1673384.254, 772770.291 1...",5249.190746
1314,2021,1,"POLYGON ((774333.243 1669788.192, 774337.269 1...",17355.845596
1315,2021,3,"POLYGON ((768880.988 1687424.354, 768996.565 1...",19346.460336


In [3]:
def extract_feature(x):
    """Pull the band code and the file extension out of an image file name.

    The text before the first ``.`` is split on ``_`` and its third token is
    taken as the band; the text after the last ``.`` is taken as the format.
    Example: ``"47PQS_20200102_B01.jp2"`` -> ``["B01", "jp2"]``.

    Args:
        x: File name string (without directory components).

    Returns:
        A two-item list ``[band, file_format]``.

    Raises:
        IndexError: If the part before the first ``.`` has fewer than three
            ``_``-separated tokens.
    """
    stem = x.partition('.')[0]
    extension = x.rpartition('.')[2]
    band = stem.split('_')[2]
    return [band, extension]

# Inventory every file four directory levels below the imagery root. The last
# four path components become the year / yyyymmdd / filetype / filename columns.
file_path = glob('datasets/Varuna Hackathon 2022/sentinel-2 image/*/*/*/*')
df_img = pd.DataFrame(
    [[path, *path.split('/')[-4:]] for path in file_path],
    columns=['raw_path','year','yyyymmdd','filetype','filename'],
)

# Keep only files that sit in an IMG_DATA folder.
df_img = df_img[df_img['filetype'] == "IMG_DATA"].reset_index(drop=True)

# Parse band and extension from each file name and attach them column-wise;
# both frames carry a fresh 0..n-1 index, so the concat lines up row by row.
df_file_feat = pd.DataFrame(
    [extract_feature(name) for name in df_img['filename']],
    columns=['band','file_format'],
)
df_img = pd.concat([df_img, df_file_feat], axis=1)

# Keep only .jp2 rasters, then slice month and day out of the yyyymmdd string.
df_img = df_img[df_img['file_format'] == "jp2"].reset_index(drop=True)
df_img['mm'] = df_img['yyyymmdd'].str[4:6]
df_img['dd'] = df_img['yyyymmdd'].str[6:]
df_img

Unnamed: 0,raw_path,year,yyyymmdd,filetype,filename,band,file_format,mm,dd
0,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2020,20200102,IMG_DATA,47PQS_20200102_AOT.jp2,AOT,jp2,01,02
1,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2020,20200102,IMG_DATA,47PQS_20200102_B01.jp2,B01,jp2,01,02
2,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2020,20200102,IMG_DATA,47PQS_20200102_B02.jp2,B02,jp2,01,02
3,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2020,20200102,IMG_DATA,47PQS_20200102_B03.jp2,B03,jp2,01,02
4,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2020,20200102,IMG_DATA,47PQS_20200102_B04.jp2,B04,jp2,01,02
...,...,...,...,...,...,...,...,...,...
2125,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2021,20211227,IMG_DATA,47PQS_20211227_B12.jp2,B12,jp2,12,27
2126,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2021,20211227,IMG_DATA,47PQS_20211227_B8A.jp2,B8A,jp2,12,27
2127,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2021,20211227,IMG_DATA,47PQS_20211227_SCL.jp2,SCL,jp2,12,27
2128,datasets/Varuna Hackathon 2022/sentinel-2 imag...,2021,20211227,IMG_DATA,47PQS_20211227_TCI.jp2,TCI,jp2,12,27


In [9]:
# Distinct acquisition dates (yyyymmdd strings) in order of first appearance in df_img.
df_img['yyyymmdd'].unique()

array(['20200102', '20200107', '20200112', '20200117', '20200122',
       '20200127', '20200201', '20200206', '20200211', '20200216',
       '20200221', '20200226', '20200302', '20200307', '20200312',
       '20200317', '20200322', '20200327', '20200401', '20200406',
       '20200411', '20200421', '20200426', '20200501', '20200506',
       '20200511', '20200516', '20200521', '20200526', '20200531',
       '20200605', '20200610', '20200615', '20200620', '20200625',
       '20200630', '20200705', '20200710', '20200715', '20200720',
       '20200725', '20200730', '20200804', '20200809', '20200814',
       '20200819', '20200824', '20200829', '20200903', '20200908',
       '20200913', '20200918', '20200923', '20200928', '20201003',
       '20201008', '20201013', '20201018', '20201023', '20201028',
       '20201102', '20201107', '20201112', '20201117', '20201122',
       '20201127', '20201202', '20201207', '20201212', '20201217',
       '20201227', '20210101', '20210106', '20210111', '202101

In [39]:
# Load the pre-computed mask stacks from disk.
# presumably masks_train / masks_test hold one raster mask per polygon and
# CLOUD_masks one cloud mask per acquisition date — TODO confirm against the
# notebook that produced these .npy files.
masks_train, masks_test, CLOUD_masks = (
    np.load("./datasets/masks_train.npy"),
    np.load("./datasets/masks_test.npy"),
    np.load("./datasets/CLOUD_mask.npy"),
)

# Configuration

In [5]:
class config:
    """Namespace holding notebook-wide settings as class attributes."""

    # presumably the side length (in pixels) of the model input — TODO confirm
    input_size = 16

# Compute the cloud index of each area for each day

In [40]:
# dates[i] is used as the label for CLOUD_masks[i] in the scoring loops that
# follow, so the order of df_img rows must match the order of the cloud masks.
# NOTE(review): that ordering cannot be checked from this notebook — confirm.
dates = df_img['yyyymmdd'].unique()

In [None]:
def compute_cloud_index(mask,CLOUD_mask):
    """Return the share of an area's pixels that are also flagged as cloud.

    Computed as ``(mask & CLOUD_mask).sum() / mask.sum()``.

    Args:
        mask: Array marking the pixels of one area.
            (assumes a boolean or 0/1 integer array — TODO confirm)
        CLOUD_mask: Array marking cloudy pixels; it is combined with ``mask``
            through ``&`` and must therefore be broadcast-compatible with it.

    Returns:
        The cloud fraction, or NaN when ``mask`` selects no pixels at all
        (the ratio is undefined there; returning NaN explicitly avoids the
        0/0 division and the RuntimeWarning it triggers).
    """
    area_pixels = mask.sum()
    if area_pixels == 0:
        # Empty area: nothing to divide by.
        return float('nan')
    return (mask & CLOUD_mask).sum() / area_pixels
# Score every (day, training area) pair: days form the outer loop, areas the
# inner one. The date label is looked up by the cloud mask's position, and the
# area id is the mask's position inside masks_train.
data = [
    [dates[day_pos], area_pos, compute_cloud_index(area_pixels, day_cloud)]
    for day_pos, day_cloud in enumerate(CLOUD_masks)
    for area_pos, area_pixels in enumerate(masks_train)
]
df_CLOUD_mask_score_train = pd.DataFrame(data, columns=['yyyymmdd','area_id','score'])
df_CLOUD_mask_score_train

In [None]:
# Score every (day, test area) pair: days form the outer loop, areas the inner
# one. The date label is looked up by the cloud mask's position, and the area
# id is the mask's position inside masks_test.
data = [
    [dates[day_pos], area_pos, compute_cloud_index(area_pixels, day_cloud)]
    for day_pos, day_cloud in enumerate(CLOUD_masks)
    for area_pos, area_pixels in enumerate(masks_test)
]
df_CLOUD_mask_score_test = pd.DataFrame(data, columns=['yyyymmdd','area_id','score'])
df_CLOUD_mask_score_test

In [17]:
# Send a notification through the `line_notify` helper once the cells above have run.
# NOTE(review): `line_notify` is imported here rather than in the top import cell;
# presumably it is a local helper module — confirm it is importable from the notebook's path.
from line_notify import send_line_notify
send_line_notify('Finish Prepare Data')

'{"status":200,"message":"ok"}'

In [None]:
# Cache both score tables as CSV (no index column) so later cells can reload
# them without recomputing the scores.
for score_frame, csv_path in (
    (df_CLOUD_mask_score_train, './datasets/CLOUD_mask_score_train.csv'),
    (df_CLOUD_mask_score_test, './datasets/CLOUD_mask_score_test.csv'),
):
    score_frame.to_csv(csv_path, index=None)

In [None]:
# Reload the cached training scores and derive the month key by dropping the
# last two characters (the day) from the stringified yyyymmdd value.
df_CLOUD_mask_score_train = pd.read_csv('./datasets/CLOUD_mask_score_train.csv')
df_CLOUD_mask_score_train['yyyymm'] = [
    str(stamp)[:-2] for stamp in df_CLOUD_mask_score_train['yyyymmdd']
]
df_CLOUD_mask_score_train

In [None]:
# For every (month, area) pair keep the single row with the lowest cloud score
# (ties resolve to the first such row, as before).
#
# A single groupby replaces re-filtering the whole frame once per pair.
# Rows whose score is NaN are ignored; a pair with no valid score at all is
# left out of the result instead of raising, and pairs that never occur in the
# table are simply absent.
train_scores_valid = (
    df_CLOUD_mask_score_train
    .dropna(subset=['score'])
    .reset_index(drop=True)  # unique 0..n-1 labels so idxmin labels are unambiguous
)
# sort=False keeps groups in order of first appearance (month first, then area).
train_best_rows = (
    train_scores_valid
    .groupby(['yyyymm', 'area_id'], sort=False)['score']
    .idxmin()
)
df_date_selected_train = train_scores_valid.loc[train_best_rows].reset_index(drop=True)
df_date_selected_train

In [None]:
# Persist the per-month, per-area date selection for the training areas (no index column).
df_date_selected_train.to_csv(
    path_or_buf='./datasets/date_selected_train.csv',
    index=None,
)

In [None]:
# Reload the cached test scores and derive the month key by dropping the last
# two characters (the day) from the stringified yyyymmdd value.
df_CLOUD_mask_score_test = pd.read_csv('./datasets/CLOUD_mask_score_test.csv')
df_CLOUD_mask_score_test['yyyymm'] = df_CLOUD_mask_score_test['yyyymmdd'].apply(lambda x: str(x)[:-2])

# For every (month, area) pair keep the single row with the lowest cloud score
# (ties resolve to the first such row, as before).
#
# A single groupby replaces re-filtering the whole frame once per pair.
# Rows whose score is NaN are ignored; a pair with no valid score at all is
# left out of the result instead of raising, and pairs that never occur in the
# table are simply absent.
test_scores_valid = (
    df_CLOUD_mask_score_test
    .dropna(subset=['score'])
    .reset_index(drop=True)  # unique 0..n-1 labels so idxmin labels are unambiguous
)
# sort=False keeps groups in order of first appearance (month first, then area).
test_best_rows = (
    test_scores_valid
    .groupby(['yyyymm', 'area_id'], sort=False)['score']
    .idxmin()
)
df_date_selected_test = test_scores_valid.loc[test_best_rows].reset_index(drop=True)

# Persist the selection (no index column) and display it.
df_date_selected_test.to_csv('./datasets/date_selected_test.csv',index=None)
df_date_selected_test