# Make Baseline Dataset

Create the dataset that we will use to develop a baseline score. 

### Imports

In [17]:
from src.data.make_baseline_dataset import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



## Steps:

The following steps are done for all timesteps (TS) and bands (B). 

There are 11 timesteps and 14 bands - `11 x 14 = 154`, which will be interesting later

### 1. Mask raster with shapefile

Using the given `train` and `test` shapefiles, "mask" the polygons from the raster (image).

### 2. Reduce masked bands to descriptive stats

Each masked band gives a numpy array of variable shape. For the baseline, we will calculate descriptive stats for each mask, such as the `mean` and `std` of the values. 

Exactly _what_ we calculate will most likely be fiddled with to create better models.

### 3. Save to interim


### 0. Load Data

In [18]:
dataset = 'train'

print('Reading shapefile...', end='')
shp_df = read_shapefile(dataset)
print('done.')

safe_dirs = get_safe_dirs()
safe_dir = safe_dirs[0]

# Get timestamp (str)
date = date_from_safedir(safe_dir)

print('Reading image bands...', end='')
img_band_fpaths = glob(f'{safe_dir}/**/IMG_DATA/*.jp2', recursive=True)
print('done.')

print(f'Found {len(img_band_fpaths)} image bands to process')

Reading shapefile...done.
Reading image bands...done.
Found 14 image bands to process


In [19]:
shp_df.geometry.head()

Field_Id
1    POLYGON ((613924.3324000851 6793991.295069702,...
2    POLYGON ((614404.0717397591 6794020.579634549,...
3    POLYGON ((614320.2146632321 6793964.242091182,...
4    POLYGON ((614560.5326844405 6794002.249461887,...
7    POLYGON ((614596.6564746551 6794269.865436315,...
Name: geometry, dtype: object

In [21]:
# Prototype on one band
img_fpath = img_band_fpaths[1]

date = date_from_safedir(safe_dir)

band = band_from_imgpath(img_fpath)
print('Processing band ', band)
with rasterio.open(img_fpath) as raster:
    
    print('Masking raster...', end='')
    masks = mask_raster(shp_df.geometry, raster)
    print(f'{len(masks)} farms successfully masked')
    
    check_mask_data(masks[list(masks.keys())[0]])

    # Calculate descriptive stats for each mask
    # and get the names of features calculated
    mask_data = {idx: reduce_img(img) for idx, img in masks.items()}

    mask_df = to_dataframe(mask_data, band)
    
mask_df.head()

Processing band  TCI
Masking raster...

Multi-band data might cause kak


2344 farms successfully masked


Unnamed: 0_level_0,TCI_width,TCI_height,TCI_size,TCI_mean,TCI_std
Field_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
19,20,13,260,91.308974,49.464187
20,11,23,253,65.762846,38.705559
21,21,6,126,39.497354,44.557866
22,23,14,322,74.063147,39.331146
24,8,28,224,62.53125,35.584728


In [22]:
# Now do that for multiple bands and combine
mask_dfs = []
for img_fpath in img_band_fpaths[:3]:
    
    band = band_from_imgpath(img_fpath)
    print('Processing band ', band)
    with rasterio.open(img_fpath) as raster:

        print('Masking raster...', end='')
        masks = mask_raster(shp_df.geometry, raster)
        print(f'{len(masks)} farms successfully masked')
        
        check_mask_data(masks[list(masks.keys())[0]])

        # Calculate descriptive stats for each mask
        # and get the names of features calculated
        mask_data = {idx: reduce_img(img) for idx, img in masks.items()}

        mask_df = to_dataframe(mask_data, band)
    
    mask_dfs.append(mask_df)

Processing band  B08
Masking raster...2344 farms successfully masked
Processing band  TCI
Masking raster...

Multi-band data might cause kak


2344 farms successfully masked
Processing band  B09
Masking raster...2344 farms successfully masked


In [23]:
df = pd.concat(mask_dfs, axis=1)
df = df.merge(shp_df.Crop_Id_Ne, left_index=True, right_on='Field_Id', how='left')
df.head()

Unnamed: 0_level_0,B08_width,B08_height,B08_size,B08_mean,B08_std,TCI_width,TCI_height,TCI_size,TCI_mean,TCI_std,B09_width,B09_height,B09_size,B09_mean,B09_std,Crop_Id_Ne
Field_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
19,20,13,260,1949.619231,991.590718,20,13,260,91.308974,49.464187,4,3,12,500.0,500.239776,8
20,11,23,253,1591.383399,931.566898,11,23,253,65.762846,38.705559,3,5,15,192.466667,384.978591,8
21,21,6,126,863.079365,970.21506,21,6,126,39.497354,44.557866,4,2,8,112.5,297.647022,8
22,23,14,322,1608.388199,846.305729,23,14,322,74.063147,39.331146,5,3,15,472.666667,442.541097,8
24,8,28,224,1491.866071,847.789315,8,28,224,62.53125,35.584728,3,5,15,288.066667,408.429589,8
