# Find farm IDs by label

Find the farm IDs present in each label (JEP and JFP), separately for the train and test datasets.

In [12]:
import os
import gc
import rasterio
import pandas as pd

import sys
sys.path.append('../')

# import geopandas as gpd
from src.utils import read_shapefile, mask_raster

In [13]:
# Nested lookup: farm_ids[dataset][label] -> list of farm ids.
farm_ids = {
    split: {name: [] for name in ('JEP', 'JFP')}
    for split in ('train', 'test')
}

for dataset, ids_by_label in farm_ids.items():
    # The shapefile is read once per dataset and reused for both labels.
    shp_df = read_shapefile(dataset)

    for label in ids_by_label:
        # A single band of a single date is used as the reference raster
        # for this label.
        img_fpath = f'../data/interim/images/2017-01-01/B02_{label}.jp2'

        with rasterio.open(img_fpath) as raster:
            masks = mask_raster(shp_df.geometry, raster, return_missing=False)

        # Only the keys of the masks are kept as the farm ids.
        ids_by_label[label] = [*masks.keys()]

        # Drop the masks before the next raster is processed.
        del masks
        gc.collect()

In [15]:
import pickle

# Persist the nested farm-id lookup so later runs can load it from disk.
with open('../data/interim/farm_ids.pkl', 'wb') as pkl_file:
    pickle.dump(farm_ids, pkl_file)

## Put into functions

In [25]:
def create_farm_ids(output_fpath):
    """
    Creates a pickle dump of farm ids
    present in datasets (train/test) and 
    labels (JEP & JFP)

    For every dataset the shapefile is read once, then for every label a
    reference image (band B02 of 2017-01-01) is opened and masked with the
    shapefile geometries. The keys of the mapping returned by
    ``mask_raster`` are stored as the farm ids for that dataset/label.

    Parameters
    ----------
    output_fpath : str
        Path of the pickle file to write.

    Returns
    -------
    dict
        ``{'train': {'JEP': [...], 'JFP': [...]}, 'test': {...}}``

    Raises
    ------
    FileNotFoundError
        If a reference image is missing from the interim data dir.
    """

    # Imported locally, as get_farm_ids does, so the function does not rely
    # on names that merely happen to exist at module/notebook level.
    # `interim_data_dir` was previously not in scope here at all, which made
    # every call fail with a NameError.
    import pickle
    from config import interim_data_dir

    farm_ids = {'train': {'JEP': [], 'JFP': []}, 'test': {'JEP': [], 'JFP': []}}

    for dataset in ['train', 'test']:
        shp_df = read_shapefile(dataset)

        for label in ['JEP', 'JFP']:

            # Select an image to load
            img_fpath = os.path.join(interim_data_dir, f'images/2017-01-01/B02_{label}.jp2')

            if not os.path.isfile(img_fpath):
                raise FileNotFoundError('''
                Images have not been reordered into interim data dir.
                \n\nSee readme to run invoke command for re-odering image data. 
                ''')

            with rasterio.open(img_fpath) as raster:
                masks = mask_raster(shp_df.geometry, raster, return_missing=False)

            farm_ids[dataset][label] = list(masks.keys())

            # Release the masks before the next raster is processed.
            del masks
            gc.collect()

    # Written only after every dataset/label succeeded, so a failure above
    # never leaves a partial cache file behind.
    with open(output_fpath, 'wb') as f:
        pickle.dump(farm_ids, f)

    return farm_ids


def get_farm_ids(dataset='train', label='JFP'):
    """
    Return the farm ids stored for one dataset/label pair.

    The ids are read from the ``farm_ids.pkl`` file in the interim data
    dir; if that file does not exist yet it is first built with
    ``create_farm_ids``.
    """

    import pickle
    from config import interim_data_dir

    cache_fpath = os.path.join(interim_data_dir, 'farm_ids.pkl')

    if os.path.isfile(cache_fpath):
        # Reuse the ids computed by an earlier run.
        with open(cache_fpath, 'rb') as cache_file:
            ids_lookup = pickle.load(cache_file)
    else:
        # No cache yet: compute the ids and write the cache file.
        ids_lookup = create_farm_ids(cache_fpath)

    return ids_lookup[dataset][label]

In [26]:
# Example: farm ids of the train dataset for the JEP label.
farm_ids = get_farm_ids(
    dataset='train',
    label='JEP',
)