In [1]:
import os
import yaml
import copy
import glob
import pickle
import datetime
import argparse
import numpy as np
import multiprocessing
import geopandas as gpd
import rasterio
from rasterio.windows import Window
from rasterio.merge import merge

from src.data.load import clean_test_shapefiles, clean_random_shapefile
from src.data.prepare import prepare_data, get_crop_type_x_y_pos, get_valid_cropland_x_y
from src.utils.logger import get_log_dir, get_logger
from src.model.crop_type import test, predict, sample_unlabeled_idx #, evaluate_by_feature_importance

In [2]:
def croptype_predict(args):
    """Load the crop-type configuration and derive the settings used by later cells.

    Despite its name, this function does not run a prediction itself: it parses
    the YAML config, creates a logger and builds the raster directory paths.

    Args:
        args: Namespace providing ``config_filename``, ``tile_id``,
            ``vis_stack``, ``vis_profile`` and ``vis_afterprocess``.

    Returns:
        A 15-tuple ``(pretrained, predict_dir, test_near_dir, test_far_dir,
        logger, smooth, engineer_feature, scaling, new_bands_name,
        fill_missing, check_missing, args.vis_stack, args.vis_profile,
        'apple', args.vis_afterprocess)``.
    """
    # read configure file
    # yaml.load() without an explicit Loader is deprecated (it produced the warning
    # visible in this cell's output) and newer PyYAML releases reject it outright.
    # safe_load parses plain mappings/lists/scalars, which is all this config contains.
    with open(args.config_filename) as f:
        config = yaml.safe_load(f)
    data_kwargs = config.get('data')
    model_kwargs = config.get('model')
    train_kwargs = config.get('train')
    predict_kwargs = config.get('predict')

    # NOTE: several of the values read below are neither used nor returned by this
    # function (e.g. the train and predict settings).
    # data path kwargs
    print("getting data kwargs")
    img_dir = data_kwargs.get('img_dir')
    ancillary_dir = data_kwargs.get('ancillary_dir')
    # train kwargs
    print("getting train kwargs")
    cv_type = train_kwargs.get('cv_type')
    tiles_x = train_kwargs.get('tiles_x')
    tiles_y = train_kwargs.get('tiles_y')
    shape = train_kwargs.get('shape')
    buffer_radius = train_kwargs.get('buffer_radius')
    n_fold = train_kwargs.get('n_fold')
    random_state = train_kwargs.get('random_state')
    hp_search_by = train_kwargs.get('hp_search_by')
    train_from = train_kwargs.get('train_from')
    # model kwargs
    print("getting model kwargs")
    fill_missing = model_kwargs.get('fill_missing')
    check_missing = model_kwargs.get('check_missing')
    scaling = model_kwargs.get('scaling')
    study_scaling = model_kwargs.get('study_scaling')
    engineer_feature = model_kwargs.get('engineer_feature')
    new_bands_name = model_kwargs.get('new_bands_name')
    smooth = model_kwargs.get('smooth')
    models_name = model_kwargs.get('models_name')
    pretrained = model_kwargs.get('pretrained')
    # predict kwargs
    print("getting predict kwargs")
    predict_labels_only = predict_kwargs.get('predict_labels_only')
    color_by_height = predict_kwargs.get('color_by_height')

    # Switch to the *_sample rasters and a "testing" log name when True.
    testing = False

    # logger
    log_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_prefix = 'crop_type_testing' if testing else 'crop_type'
    log_filename = f'{log_prefix}_{log_time}_on_{pretrained}.log'
    logger = get_logger(get_log_dir('./logs/'), __name__, log_filename, level='INFO')
    logger.info(config)

    logger.info('#### Test Crop Type Model')
    # Plain string concatenation is kept on purpose so the resulting paths are
    # exactly what the config's img_dir prefix produces.
    raster_subdir = '/raster_sample/' if testing else '/raster/'
    test_near_dir = img_dir + '43SFR' + raster_subdir
    test_far_dir = img_dir + '43RGQ' + raster_subdir
    predict_dir = img_dir + args.tile_id + raster_subdir

    # NOTE(review): 'apple' is a hard-coded placeholder in the 14th slot; the caller
    # unpacks it next to vis_profile -- confirm what it is meant to carry.
    return (pretrained, predict_dir, test_near_dir, test_far_dir, logger, smooth,
            engineer_feature, scaling, new_bands_name, fill_missing, check_missing,
            args.vis_stack, args.vis_profile, 'apple', args.vis_afterprocess)

In [3]:
def _str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` is not usable with argparse: ``bool('False')`` is ``True``
    because any non-empty string is truthy.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised boolean word.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


parser = argparse.ArgumentParser()
parser.add_argument('--config_filename', type=str,
                    default='./data/config/crop_type.yaml')

parser.add_argument('--tile_ids', nargs='+', default=['43SFR'])
parser.add_argument('--action', type=str, default='test_from_scratch',
                    choices=['test_from_cropland', 'test_from_scratch', 'predict_from_scratch', 'predict_from_cropland', 'test_together'])
parser.add_argument('--vis_stack', type=_str2bool, default=False)
parser.add_argument('--vis_profile', type=_str2bool, default=False)
parser.add_argument('--vis_afterprocess', type=_str2bool, default=False)

# There is no command line inside a notebook: parse an empty argument list so
# that every option takes its default.
args = parser.parse_args([])

# One argument namespace per tile; after the loop args.tile_id holds the last tile id.
args_list, tile_ids = [], args.tile_ids
print(f'Parallizing to {len(tile_ids)} processes...')
for tile_id in tile_ids:
    args.tile_id = tile_id
    args_list.append(copy.deepcopy(args))  # deep copy

# The parallel run is currently disabled, so no worker pool is created (the old
# code started a pool that was never used nor closed). To re-enable it:
# with multiprocessing.Pool(processes=len(tile_ids)) as process_pool:
#     process_pool.map(croptype_predict, args_list)

# The 14th element returned by croptype_predict is the literal 'apple'; it gets its
# own name here so it no longer overwrites vis_profile.
(pretrained, predict_dir, test_near_dir, test_far_dir, logger, smooth,
 engineer_feature, scaling, new_bands_name, fill_missing, check_missing,
 vis_stack, vis_profile, vis_profile_type, vis_afterprocess) = croptype_predict(args)

Parallizing to 1 processes...
getting data kwargs
getting train kwargs
getting model kwargs
getting predict kwargs
2022-03-07 10:44:59,850 - INFO - Log directory: ./logs/


  config = yaml.load(f)


2022-03-07 10:44:59,872 - INFO - {'data': {'img_dir': 'N:/dataorg-datasets/MLsatellite/sentinel2_images/images_danya/', 'ancillary_dir': 'K:/2021-data-org/4. RESEARCH_n/ML/MLsatellite/Data/layers_india/ancilliary_data/'}, 'train': {'cv_type': 'block', 'tiles_x': 4, 'tiles_y': 4, 'shape': 'square', 'buffer_radius': 0, 'n_fold': 3, 'random_state': 24, 'hp_search_by': 'grid', 'train_from': 'cropland'}, 'model': {'fill_missing': 'linear', 'check_missing': False, 'scaling': 'as_reflectance', 'study_scaling': False, 'engineer_feature': 'temporal+spatial', 'new_bands_name': ['ndvi'], 'smooth': False, 'check_SAC': False, 'models_name': ['ocsvm', 'pul', 'pul-w'], 'pretrained': '20220105-135132_rfc'}, 'predict': {'predict_labels_only': True, 'color_by_height': True}}
2022-03-07 10:44:59,892 - INFO - #### Test Crop Type Model


### Prepare data

In [4]:
if args.action == 'test_from_scratch':
    # Combined survey polygons; clean_random_shapefile returns three paths that are
    # unpacked here as one per district.
    label_path = './data/ground_truth/test_labels_combined/polygons_surveys_20210716_20210825_20211213_20220103.shp'
    mandi_path, shimla_path, kullu_path = clean_random_shapefile(label_path)

    # Only Kullu is enabled; the Mandi/Shimla entries are switched off.
    label_path_dict = {'kullu': kullu_path}  # , 'mandi': mandi_path, 'shimla': shimla_path}
    test_dir_dict = {'kullu': test_near_dir}  # , 'mandi': test_far_dir, 'shimla': test_far_dir}

    # A single model name is wrapped in a list (the model-loading cell reads pretrained[0]).
    if isinstance(pretrained, str):
        pretrained = [pretrained]

    for district, test_dir in test_dir_dict.items():
        logger.info(f'### Test on {district}')

        # prepare data -- every iteration overwrites df_te/meta/feature_names/polygons_list,
        # so only the last district's data is available to the following cells.
        df_te, meta, feature_names, polygons_list = prepare_data(
            logger=logger,
            dataset=f'test_{district}',
            feature_dir=test_dir,
            label_path=label_path_dict[district],
            window=None,
            smooth=smooth,
            engineer_feature=engineer_feature,
            scaling=scaling,
            new_bands_name=new_bands_name,
            fill_missing=fill_missing,
            check_missing=check_missing,
            vis_stack=args.vis_stack,
            vis_profile=args.vis_profile,
            vis_profile_type='cropland',
            vis_afterprocess=args.vis_afterprocess,
        )
        n_feature = len(feature_names)

Converting 1 multi-polygons to polygons...


  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,


2022-03-07 10:46:09,727 - INFO - ### Test on kullu
2022-03-07 10:46:09,753 - INFO - ### Prepare data
2022-03-07 10:46:09,801 - INFO - # Stack timestamps weekly
2022-03-07 10:47:16,787 - INFO -   [1/53] 2019-12-30 (x2020-01-03, 2020-01-05, )
2022-03-07 10:49:02,363 - INFO -   [2/53] 2020-01-06 (2020-01-10, x2020-01-08, )
2022-03-07 10:52:15,608 - INFO -   [3/53] 2020-01-13 (x2020-01-13, 2020-01-15, x2020-01-18, )
2022-03-07 10:56:25,154 - INFO -   [4/53] 2020-01-20 (2020-01-20, x2020-01-23, 2020-01-25, )
2022-03-07 10:58:55,267 - INFO -   [5/53] 2020-01-27 (x2020-02-02, x2020-01-28, )
2022-03-07 11:01:56,618 - INFO -   [6/53] 2020-02-03 (2020-02-09, 2020-02-04, x2020-02-07, )
2022-03-07 11:04:15,471 - INFO -   [7/53] 2020-02-10 (x2020-02-12, 2020-02-14, )
2022-03-07 11:06:46,189 - INFO -   [8/53] 2020-02-17 (2020-02-19, x2020-02-22, x2020-02-17, )
2022-03-07 11:11:48,826 - INFO -   [9/53] 2020-02-24 (2020-02-24, 2020-02-24, x2020-02-27, )
2022-03-07 11:16:24,056 - INFO -   [10/53] 2020-

### Get positive samples

In [6]:
def get_crop_type_x_y_pos_test(logger, df, n_feature, dataset):
    """Select the labeled rows (``label != 0``) of ``df``.

    Args:
        logger: Logger handed to ``count_classes``.
        df: DataFrame with a ``label`` column; its first ``n_feature`` columns
            are taken as the features.
        n_feature: Number of leading feature columns.
        dataset: Not used by the active code (only the disabled shape logging
            referred to it).

    Returns:
        ``(df_valid, x_valid, y_valid)``: the filtered frame, its feature values
        and its ``label`` values as arrays.
    """
    # was in prepare.py
    labeled_rows = df[df.label.values != 0]
    features = labeled_rows.iloc[:, :n_feature].values
    labels = labeled_rows.loc[:, 'label'].values
    # Report the class distribution of the retained rows.
    count_classes(logger, labeled_rows.label.values)
    return labeled_rows, features, labels


# Keep only the labeled rows of the test frame.
df_pos, x_pos, y_pos = get_crop_type_x_y_pos_test(
    logger, df=df_te, n_feature=n_feature, dataset='from_scratch'
)


2022-03-04 14:46:57,624 - INFO -   label = 1, pixel number = 6743, percentage = 100.0%


In [7]:
def get_unlabeled_pixels(logger, df, size, dataset, n_feature=None, random_state=None):
    """Draw unlabeled rows (``label == 0``) from ``df``.

    Args:
        logger: Logger handed to ``count_classes``.
        df: DataFrame with a ``label`` column; its first ``n_feature`` columns
            are taken as the features.
        size: Number of unlabeled rows to draw without replacement. ``None``
            selects every unlabeled row, in their original order.
        dataset: Currently unused.
        n_feature: Number of leading feature columns. When omitted, the
            module-level ``n_feature`` is used, as this function did before.
        random_state: Optional seed for a reproducible draw. When omitted, the
            global NumPy random state is used.

    Returns:
        ``(df_unl, x_unl, y_unl)``: the sampled frame, its feature values and
        its ``label`` values as arrays.

    Raises:
        ValueError: if ``size`` exceeds the number of unlabeled rows.
        KeyError: if ``n_feature`` is omitted and no module-level value exists.
    """
    # was in prepare.py
    if n_feature is None:
        # Backward compatibility with callers that rely on the notebook global.
        n_feature = globals()['n_feature']

    idx_to_sample = df.index[df.label.values == 0]
    if size is None:
        sampled_idx = idx_to_sample
    else:
        # Could also apply gaussian filter here
        chooser = np.random if random_state is None else np.random.RandomState(random_state)
        sampled_idx = chooser.choice(idx_to_sample, size, replace=False)

    # sampled_idx holds index *labels*, so the lookup must be label-based; a
    # positional .iloc only happens to work when df has a default 0..n-1 index.
    df_unl = df.loc[sampled_idx, :]
    x_unl = df_unl.iloc[:, :n_feature].values
    y_unl = df_unl.loc[:, 'label'].values
    count_classes(logger, df_unl.label.values)

    return df_unl, x_unl, y_unl


# Draw as many unlabeled rows as there are positive rows.
df_unl, x_unl, y_unl = get_unlabeled_pixels(
    logger, df=df_te, size=df_pos.shape[0], dataset='from_scratch'
)


2022-03-04 14:49:32,491 - INFO -   label = 0, pixel number = 6743, percentage = 100.0%


In [10]:
# concatenate data
import pandas as pd

# Stack positives first, then unlabeled rows, in the same order for all three containers.
df_pu = pd.concat([df_pos, df_unl], axis=0)
x_pu = np.concatenate((x_pos, x_unl), axis=0)
y_pu = np.concatenate((y_pos, y_unl), axis=0)

# Shuffle with ONE shared permutation. Shuffling only df_pu (as before) breaks the
# row correspondence between df_pu.index and x_pu/y_pu, and df_pu.index is what
# places each prediction when the results are written out.
shuffle_order = np.random.permutation(len(df_pu))
df_pu = df_pu.iloc[shuffle_order]
x_pu = x_pu[shuffle_order]
y_pu = y_pu[shuffle_order]
#coords_pu = gpd.GeoDataFrame(pd.concat([coords_pos, coords_unl], axis=0))

In [22]:
# load pretrained model

logger.info("Loading the best pretrained model...")
# SECURITY: unpickling executes code stored in the file -- only load model files
# from a trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open(f'model/{pretrained[0]}.pkl', 'rb') as model_file:
    best_estimator = pickle.load(model_file)

2022-03-04 15:04:01,722 - INFO - Loading the best pretrained model...


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


### Test

In [38]:
from src.model.util import convert_partial_predictions
from src.data.write import save_predictions_geotiff
from sklearn.metrics import recall_score, accuracy_score

def evaluate_by_recall(y_test, y_test_pred):
    """Return the macro-averaged recall of ``y_test_pred`` against ``y_test``."""
    macro_recall = recall_score(y_test, y_test_pred, average='macro')
    return macro_recall


def evaluate_by_accuracy(y_test, y_test_pred):
    """Return the accuracy score of ``y_test_pred`` against ``y_test``."""
    accuracy = accuracy_score(y_test, y_test_pred)
    return accuracy
    

def test2(logger, model, x_test, y_test, meta, index,
         pred_name, color_by_height, region_indicator=None):
    """Predict on ``x_test``, write the predictions to a GeoTIFF and log the accuracy.

    Args:
        logger: Logger used for progress and result messages.
        model: Estimator exposing ``predict``.
        x_test: Feature rows to predict on.
        y_test: Reference labels compared against the predictions.
        meta: Raster metadata; ``meta['height']`` and ``meta['width']`` are read
            here and the whole mapping is passed on to the GeoTIFF writer.
        index: Passed to ``convert_partial_predictions`` together with the
            predictions (presumably the pixel index of every row -- confirm).
        pred_name: File stem; the output goes to ``./preds/{pred_name}.tiff``.
        color_by_height: Forwarded to ``save_predictions_geotiff``.
        region_indicator: Forwarded to ``save_predictions_geotiff``.

    Returns:
        None. The accuracy is only logged.
    """
    logger.info("## Testing")

    # predict
    predictions = model.predict(x_test)
    print(predictions)
    n_pixels = meta['height'] * meta['width']
    predictions_full = convert_partial_predictions(predictions, index, n_pixels)

    # save prediction
    pred_path = f'./preds/{pred_name}.tiff'
    save_predictions_geotiff(
        predictions_full,
        save_path=pred_path,
        meta=meta,
        region_indicator=region_indicator,
        color_by_height=color_by_height,
    )
    logger.info(f'Saved predictions to {pred_path}')

    # evaluate
    # Recall-based evaluation is disabled; the previous attempt was:
    #     try:
    #         logger.info('Evaluating by recall...')
    #         recall = evaluate_by_recall(y_test, y_test_pred)
    #         logger.info(f'\n{recall}')
    #     except:
    logger.info('Evaluating by accuracy...')
    score = evaluate_by_accuracy(y_test, predictions)
    logger.info(f'\n{score}')

In [39]:
# Evaluate the pretrained model on the positive + unlabeled sample of the last district.
test2(
    logger, best_estimator, x_pu, y_pu, meta, df_pu.index,
    pred_name='first_test',
    color_by_height=True,
    region_indicator=label_path_dict[district],
)

2022-03-04 15:21:55,514 - INFO - ## Testing
[2 2 2 ... 3 2 3]
Aligned raster!
2022-03-04 15:22:01,688 - INFO - Saved predictions to ./preds/first_test.tiff
2022-03-04 15:22:01,708 - INFO - Evaluating by accuracy...
2022-03-04 15:22:01,741 - INFO - 
0.6192347619753819


### Test from cropland

In [None]:
def get_valid_cropland_x_y(logger, df, n_feature, dataset):
    """Select the rows of ``df`` whose ``gt_cropland`` value is non-zero.

    Note: this definition replaces the ``get_valid_cropland_x_y`` imported from
    ``src.data.prepare`` at the top of the notebook.

    Args:
        logger: Logger used for the shape/class summaries and passed to
            ``count_classes``.
        df: DataFrame with ``gt_cropland`` and ``label`` columns; its first
            ``n_feature`` columns are taken as the features.
        n_feature: Number of leading feature columns.
        dataset: Name inserted into the log messages.

    Returns:
        ``(df_valid, x_valid, y_valid)``: the filtered frame (original index
        kept), its feature values and its ``gt_cropland`` values as arrays.
    """
    cropland_rows = df[df.gt_cropland.values != 0]  # index is intentionally not reset
    features = cropland_rows.iloc[:, :n_feature].values
    targets = cropland_rows.loc[:, 'gt_cropland'].values

    logger.info(
        f'df_{dataset}.shape {cropland_rows.shape}, x_{dataset}.shape {features.shape}, y_{dataset}.shape {targets.shape}')

    # Class summaries: first by `label`, then by `gt_cropland`.
    logger.info(f'y_{dataset} with 3 classes:')
    count_classes(logger, cropland_rows.label.values)
    logger.info(f'y_{dataset} with 2 classes:')
    count_classes(logger, cropland_rows.gt_cropland.values)

    return cropland_rows, features, targets
    

# get cropland masked data
df_masked, x_masked, y_masked = get_valid_cropland_x_y(
    logger, df_te, n_feature=n_feature, dataset='from_cropland'
)

In [None]:
# get positive samples
df_pos, x_pos, y_pos = \
    get_crop_type_x_y_pos(logger, df=df_masked, n_feature=n_feature, dataset='from_cropland')

# get (all) unlabeled samples
# An explicit count is passed instead of size=None: np.random.choice(..., None)
# returns a single scalar, not "everything".
# NOTE(review): df_masked keeps the row labels of df_te (its index is not reset),
# so the sampler has to look rows up by label rather than by position.
n_unlabeled = int((df_masked.label.values == 0).sum())
df_unl, x_unl, y_unl = \
    get_unlabeled_pixels(logger, df=df_masked, size=n_unlabeled, dataset='from_cropland')

# concatenate data
df_pu = pd.concat([df_pos, df_unl], axis=0)
x_pu = np.concatenate((x_pos, x_unl), axis=0)
y_pu = np.concatenate((y_pos, y_unl), axis=0)

# Shuffle all three with the same permutation; shuffling df_pu alone would detach
# df_pu.index from the rows of x_pu/y_pu.
shuffle_order = np.random.permutation(len(df_pu))
df_pu = df_pu.iloc[shuffle_order]
x_pu = x_pu[shuffle_order]
y_pu = y_pu[shuffle_order]

# color_by_height is not defined at notebook level (it is a local of
# croptype_predict), so the value used by the earlier test2 call is passed here.
# NOTE(review): the earlier call passes label_path_dict[district] as
# region_indicator while this one passes label_path -- confirm which is intended.
test(logger, best_estimator, x_pu, y_pu, meta, df_pu.index,
        pred_name='first_test', color_by_height=True, region_indicator=label_path)

### Old -- Get unlabeled samples

In [None]:
from src.utils.scv import construct_valid_grid

# Build a 1x1 square grid from the polygons returned by prepare_data.
polygons_gpd, grid = construct_valid_grid(
    polygons_list,
    tiles_x=1,
    tiles_y=1,
    shape='square',
)

In [None]:

grid.geometry
grid.shape

# Open question: feed grid.geometry or the polygon list directly into the grid shape,
# OR use the polygons' total bounds.

# Then check whether it makes sense to sample from df, or whether new points should instead be appended to df and shuffled;
# features would then need to be created for those points.

# Other idea: take the polygon list, create a bounding box, and select unlabeled pixels inside that box.

In [None]:
def sample_unlabeled_idx(coords, grid, size, meta):
    """Sample index labels of ``coords`` that fall inside the grid geometries.

    Note: this definition replaces the ``sample_unlabeled_idx`` imported from
    ``src.model.crop_type`` at the top of the notebook.

    Args:
        coords: Object with an ``index`` holding one entry per raster pixel in
            flattened order (the boolean mask built here has
            ``meta['height'] * meta['width']`` entries).
        grid: Object whose ``geometry`` attribute iterates over the shapes to burn in.
        size: Number of index labels to draw without replacement.
        meta: Raster metadata providing ``height``, ``width`` and ``transform``.

    Returns:
        Array of ``size`` index labels drawn from the pixels covered by ``grid``.

    Raises:
        ValueError: if ``size`` exceeds the number of covered pixels.
    """
    # `import rasterio` alone does not guarantee that the `features` submodule is
    # loaded, so import it explicitly where it is used.
    from rasterio import features

    # Burn every grid geometry into the raster with the value 1.
    shapes = ((geometry, 1) for geometry in grid.geometry)
    burned = features.rasterize(shapes, out_shape=(meta['height'], meta['width']),
                                transform=meta['transform'])
    inside_grid = burned.reshape(-1) == 1

    valid_idx = coords.index[inside_grid]

    return np.random.choice(valid_idx, size, replace=False)

# Draw as many unlabeled pixel indices as there are positive samples.
unl_idx_test = sample_unlabeled_idx(
    df_te.coords,
    grid,
    x_pos.shape[0],
    meta,
)

In [None]:
coords_pos = gpd.GeoDataFrame({'geometry': df_pos.coords.values})

# sample unlabeled data
# sample_unlabeled_idx takes (coords, grid, size, meta); the grid argument was missing.
unl_idx = sample_unlabeled_idx(df_te.coords, grid, x_pos.shape[0], meta)
# The indices are drawn from df_te.coords, so the lookups below go through df_te
# (the previously referenced `df` and `df_kullu` are not defined in this notebook).
df_unl = df_te.loc[unl_idx, :]
coords_unl = gpd.GeoDataFrame({'geometry': df_unl.loc[unl_idx, 'coords'].values})
x_unl = df_te.loc[unl_idx, feature_names].values
y_unl = np.zeros(unl_idx.shape[0], dtype=int)

# concatenate
df_pu = pd.concat([df_pos, df_unl], axis=0)
x_pu = np.concatenate((x_pos, x_unl), axis=0)
y_pu = np.concatenate((y_pos, y_unl), axis=0)
coords_pu = gpd.GeoDataFrame(pd.concat([coords_pos, coords_unl], axis=0))

# color_by_height is not defined at notebook level (it is a local of
# croptype_predict), so the value used by the earlier test2 call is passed here.
# The stray, unterminated triple quote that ended this cell has been removed.
test(logger, best_estimator, x_pu, y_pu, meta, df_pu.index,
    pred_name='first_test', color_by_height=True, region_indicator=label_path)

In [None]:
# get cropland mask
# NOTE(review): `df`, `pred_path_top` and `color_by_height` are not assigned at
# notebook level in any visible cell, so this cell presumably fails on a fresh
# kernel -- confirm where they are meant to come from before running it.
df_mask, x_mask, y_mask = \
get_crop_type_x_y_pos(logger, df, n_feature=n_feature, dataset='train_val_kullu')

# Feature columns of every row (not only the masked ones).
x = df.loc[:, feature_names]

# predict
pred_path = pred_path_top + 'testing.tiff' #str(row) + '_' + str(col) + '.tiff'
# The string literal below is a disabled earlier variant of the predict call
# (keyword-based, on the masked features); it has no effect at runtime.
"""
predict(logger, best_estimator, x_mask, meta, color_by_height,
        pred_path=pred_path, ancillary_dir=ancillary_dir,  
        cropland_mask=y_mask, region_indicator=None)
"""
# Active call: predicts on all rows and passes y_mask as the fifth positional argument.
predict(logger, best_estimator, x, meta, y_mask,
        pred_path, color_by_height, region_indicator=None)

### Geosampler

In [None]:
import argparse
import csv
import json
from multiprocessing.dummy import Pool, Lock
import os
from collections import OrderedDict
import time
from datetime import datetime, timedelta
import warnings
warnings.simplefilter('ignore', UserWarning)

import ee
import numpy as np
import rasterio
import urllib3
from rasterio.transform import Affine
from skimage.exposure import rescale_intensity
from torchvision.datasets.utils import download_and_extract_archive
import shapefile
from shapely.geometry import shape, Point

from datasets.seco_dataset import RGB_BANDS, ALL_BANDS

In [None]:
class GeoSampler:
    """Interface for objects that produce geographic sample points."""

    def sample_point(self):
        """Return one sampled point; concrete samplers must override this."""
        raise NotImplementedError()


class GaussianSampler(GeoSampler):
    """Sample points normally distributed around rows drawn from a data set.

    Args:
        df: Candidate centre points, one per row.
            NOTE(review): ``sample_point`` unpacks a drawn row into
            ``(lon, lat)``, so every row is assumed to hold exactly those two
            values -- confirm against the caller.
        labeled_mask: Boolean mask over the rows of ``df`` (must support ``~``),
            or ``None``. When given, centres are drawn from the masked rows only.
        std: Standard deviation of the jitter in kilometres; it is converted to
            degrees with ``km2deg`` before sampling.
        seed: Optional seed for reproducible sampling.
    """

    def __init__(self, df, labeled_mask, std=5, seed=None):
        self.df = df
        if labeled_mask is None:
            self.df_labeled = None
            self.df_unlabeled = df
        else:
            self.df_labeled = df[labeled_mask]
            self.df_unlabeled = df[~labeled_mask]
        self.std = std
        self._rng = np.random.default_rng(seed)

    def sample_point(self):
        """Return ``[lon, lat]``: one randomly chosen centre row plus Gaussian jitter."""
        source = self.df if self.df_labeled is None else self.df_labeled
        # Draw a single row. The previous code passed an undefined `out_shape`
        # as the sample size, which raised NameError.
        centre = self._rng.choice(np.asarray(source))
        std = self.km2deg(self.std)
        # The same generator is used for the jitter so a seed controls both draws.
        lon, lat = self._rng.normal(loc=centre, scale=std)
        return [lon, lat]

    @staticmethod
    def km2deg(kms, radius=6371):
        """Convert a distance in km to degrees of arc on a sphere of ``radius`` km."""
        return kms / (2.0 * radius * np.pi / 360.0)


In [None]:
import numpy as np

# Scratch check: add Gaussian noise to an array of points.
points = [[1, 0, 1], [0, 0, 1]]
std = 0.5
print("test")
# A scalar scale broadcasts against any shape of `points`. The previous
# scale=[std, std] has shape (2,), which cannot broadcast against the (2, 3)
# points and raised ValueError.
noisy_points = np.random.normal(loc=points, scale=std)
print(noisy_points)