# Find farm IDs by label

Find which farm IDs are present in each label (JEP and JFP), for both the train and test datasets.

Save a separate shapefile for each dataset/label combination in `interim/shapefiles/{train,test}_{jfp,jep}`


In [13]:
import os
import gc
import rasterio
import pandas as pd

import sys
sys.path.append('../')

# import geopandas as gpd
from src.utils import read_shapefile, mask_raster

from config import interim_data_dir

In [13]:
# Farm ids per dataset and label; each list is filled in by the loop below.
farm_ids = {
    'train': {'JEP': [], 'JFP': []},
    'test': {'JEP': [], 'JFP': []},
}

for dataset, ids_by_label in farm_ids.items():
    shp_df = read_shapefile(dataset)

    for label in ids_by_label:
        # Use one band of one date as the raster for this label.
        img_fpath = f'../data/interim/images/2017-01-01/B02_{label}.jp2'

        with rasterio.open(img_fpath) as raster:
            masks = mask_raster(shp_df.geometry, raster, return_missing=False)

        # The keys of the returned masks are kept as this label's farm ids.
        ids_by_label[label] = list(masks.keys())

        # Drop the masks before moving on to the next raster.
        del masks
        gc.collect()

In [15]:
import pickle

# Write the farm id lookup to disk as a pickle.
pkl_fpath = '../data/interim/farm_ids.pkl'
with open(pkl_fpath, 'wb') as pkl_file:
    pickle.dump(farm_ids, pkl_file)

## Wrap the logic in functions

In [4]:
def create_farm_ids(output_fpath):
    """
    Creates a pickle dump of the farm ids present in each
    dataset (train/test) and label (JEP & JFP).

    For every dataset the shapefile is loaded with `read_shapefile`
    and its geometries are masked against the
    `images/2017-01-01/B02_{label}.jp2` raster found under
    `interim_data_dir`. The keys of the result returned by
    `mask_raster` are stored as the farm ids of that dataset/label.

    Args:
        output_fpath: path the pickled result is written to.

    Returns:
        dict of the form
        {'train': {'JEP': [...], 'JFP': [...]}, 'test': {...}}.

    Raises:
        FileNotFoundError: if the image of a label is missing from
            the interim data dir.
    """
    # Imported locally (as get_farm_ids does) so this function does not
    # rely on some other cell or module having imported pickle first.
    import pickle

    datasets = ['train', 'test']
    labels = ['JEP', 'JFP']

    # Resolve and check the image of every label once, up front, so that a
    # missing image fails before any shapefile is loaded.
    img_fpaths = {}
    for label in labels:
        img_fpath = os.path.join(interim_data_dir, f'images/2017-01-01/B02_{label}.jp2')

        if not os.path.isfile(img_fpath):
            raise FileNotFoundError('''
                Images have not been reordered into interim data dir.
                \n\nSee readme to run invoke command for re-odering image data. 
                ''')

        img_fpaths[label] = img_fpath

    farm_ids = {dataset: {label: [] for label in labels} for dataset in datasets}

    for dataset in datasets:
        shp_df = read_shapefile(dataset)

        for label in labels:
            with rasterio.open(img_fpaths[label]) as raster:
                masks = mask_raster(shp_df.geometry, raster, return_missing=False)

            farm_ids[dataset][label] = list(masks.keys())

            # Drop the masks before the next raster is processed.
            del masks
            gc.collect()

    with open(output_fpath, 'wb') as f:
        pickle.dump(farm_ids, f)

    return farm_ids


def get_farm_ids(dataset='train', label='JFP'):
    """
    Returns the list of farm ids stored for the given
    dataset and label.

    The ids are read from `farm_ids.pkl` in the interim data dir.
    If that file does not exist yet, it is built (and written)
    by `create_farm_ids` first.
    """

    import pickle
    from config import interim_data_dir

    ids_fpath = os.path.join(interim_data_dir, 'farm_ids.pkl')

    if os.path.isfile(ids_fpath):
        # Reuse the ids saved by an earlier run.
        with open(ids_fpath, 'rb') as pkl_file:
            ids_lookup = pickle.load(pkl_file)
    else:
        # Nothing saved yet: compute the ids and save them at ids_fpath.
        ids_lookup = create_farm_ids(ids_fpath)

    return ids_lookup[dataset][label]

In [5]:
# Fetch the farm ids of the train dataset for the JEP label
# (this rebinds farm_ids to that single list).
farm_ids = get_farm_ids(dataset='train', label='JEP')

## Generate new train and test shapefiles for JEP and JFP

In [9]:
# Load the train and test shapefiles that are subset per label below.
train_shp, test_shp = read_shapefile('train'), read_shapefile('test')

## JEP

In [32]:
# Keep only the train rows whose farm id is listed for JEP.
train_ids = get_farm_ids(dataset='train', label='JEP')
train_shp_jep = train_shp.loc[train_ids]
print(train_shp_jep.shape)

# Same selection for the test shapefile.
test_ids = get_farm_ids(dataset='test', label='JEP')
test_shp_jep = test_shp.loc[test_ids]
print(test_shp_jep.shape)

(2344, 4)
(1004, 3)


In [33]:
# Write each JEP subset to its own location under <interim_data_dir>/shapefiles.
for shp_name, shp_subset in (('train_jep', train_shp_jep), ('test_jep', test_shp_jep)):
    shp_subset.to_file(os.path.join(interim_data_dir, 'shapefiles', shp_name))

## JFP

In [34]:
# Keep only the train rows whose farm id is listed for JFP.
train_ids = get_farm_ids(dataset='train', label='JFP')
train_shp_jfp = train_shp.loc[train_ids]
print(train_shp_jfp.shape)

# Same selection for the test shapefile.
test_ids = get_farm_ids(dataset='test', label='JFP')
test_shp_jfp = test_shp.loc[test_ids]
print(test_shp_jfp.shape)

(489, 4)
(232, 3)


In [35]:
# Write each JFP subset to its own location under <interim_data_dir>/shapefiles.
for shp_name, shp_subset in (('train_jfp', train_shp_jfp), ('test_jfp', test_shp_jfp)):
    shp_subset.to_file(os.path.join(interim_data_dir, 'shapefiles', shp_name))

In [30]:
import geopandas as gpd
# Read the saved JEP test shapefile back in.
test_jep_fpath = os.path.join(interim_data_dir, 'shapefiles', 'test_jep/test_jep.shp')
test = gpd.read_file(test_jep_fpath)

In [31]:
# Preview the first rows of the shapefile that was just read back in.
test.head()

Unnamed: 0,Area,Subregion,geometry
0,1.32259,3,"POLYGON ((614428.5851635702 6794207.23529591, ..."
1,0.955864,3,"POLYGON ((614617.1386412303 6794163.687688111,..."
2,11.5098,3,"POLYGON ((614352.4574846183 6794416.783909474,..."
3,1.4776,3,"POLYGON ((610535.7965434708 6799766.182449248,..."
4,5.0423,3,"POLYGON ((610595.6245513126 6799764.027455708,..."
