In [1]:

# https://swanlund.space/parallelizing-python
from datetime import date, timedelta
import eeconvert as eeconvert
import ee
import geemap
import multiprocessing as mp
import geopandas as gpd
import os
import shutil
import numpy as np
import pickle
from progress.bar import Bar
from pathlib import Path

import rioxarray as rxr
from shapely.geometry import mapping, shape
from shapely.geometry.polygon import Polygon
from shapely.geometry.multipolygon import MultiPolygon

import pandas as pd
import rasterio
import matplotlib.pyplot as plt
from rasterio.mask import mask
from rasterio import features
import math, time
import os

from rasterio import Affine
from rasterio.plot import show
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
import pickle
import json 

ee.Initialize()


# Imagehelpers
CLOUD_FILTER = 50
CLD_PRB_THRESH = 70 # Cloud probability (%); pixel values greater than are considered cloud
NIR_DRK_THRESH = 0.2 # Near-infrared reflectance; values less than are considered potential cloud shadow
CLD_PRJ_DIST = 5 # Maximum distance (km) to search for cloud shadows from cloud edges
BUFFER = 50 # Distance (m) to dilate the edge of cloud-identified objects

def remove_ag(img):
    afg_adm = ee.FeatureCollection("FAO/GAUL/2015/level0").filter(ee.Filter.eq('ADM0_NAME', 'Afghanistan'))
    land2019 = ee.Image("COPERNICUS/Landcover/100m/Proba-V-C3/Global/2019")
    afg_ag = land2019.select('discrete_classification')\
    .updateMask(land2019.select(['discrete_classification']).eq(40)).clip(afg_adm)
    
    return img.updateMask(afg_ag.select('discrete_classification').mask())

def mask_edges(img):
    """
    Mask the edges
    The masks for the 10m bands sometimes do not exclude bad data at
    scene edges, so we apply masks from the 20m and 60m bands as well.
    Example asset that needs this operation:
    COPERNICUS/S2_CLOUD_PROBABILITY/20190301T000239_20190301T000238_T55GDP
    """
    return img.updateMask(
      img.select('B8A').mask().updateMask(img.select('B9').mask()))

def remove_cloud_shadow(img):
    
    """
    Add cld_prb and is_cloud bands to image
    """
    temp = None
    
    # Get s2cloudless image, subset the probability band.    
    cld_prb = ee.Image(img.get('s2cloudless')).select('probability')

    # Condition s2cloudless by the probability threshold value.    
    is_cloud = cld_prb.gt(CLD_PRB_THRESH).rename('clouds')

    # Add the cloud probability layer and cloud mask as image bands.
    temp = img.addBands(ee.Image([cld_prb, is_cloud]))
    
    
    """
    Add dark_pixels, cld_proj, shadows
    """
    not_water=temp.select('SCL').neq(6)
    
    # Identify dark NIR pixels that are not water (potential cloud shadow pixels).
    SR_BAND_SCALE = 1e4
    dark_pixels = temp.select('B8').lt(NIR_DRK_THRESH*SR_BAND_SCALE).multiply(not_water).rename('dark_pixels')
    
    # Determine the direction to project cloud shadow from clouds (assumes UTM projection).
    shadow_azimuth = ee.Number(90).subtract(ee.Number(temp.get('MEAN_SOLAR_AZIMUTH_ANGLE')));
    
    # Project shadows from clouds for the distance specified by the CLD_PRJ_DIST input.
    cld_proj = (temp.select('clouds').directionalDistanceTransform(shadow_azimuth, CLD_PRJ_DIST*10)
        .reproject(**{'crs': temp.select(0).projection(), 'scale': 100})
        .select('distance')
        .mask()
        .rename('cloud_transform'))

    # Identify the intersection of dark pixels with cloud shadow projection.
    shadows = cld_proj.multiply(dark_pixels).rename('shadows')
    
    temp = temp.addBands(ee.Image([dark_pixels, cld_proj, shadows]))
    
    
    """
    Add is_cld_shdw
    """
    
     # Combine cloud and shadow mask, set cloud and shadow as value 1, else 0.
    is_cld_shdw = temp.select('clouds').add(temp.select('shadows')).gt(0)
    
    
    # Remove small cloud-shadow patches and dilate remaining pixels by BUFFER input.
    # 20 m scale is for speed, and assumes clouds don't require 10 m precision.
    is_cld_shdw = (is_cld_shdw.focal_min(2).focal_max(BUFFER*2/20)
        .reproject(**{'crs': img.select([0]).projection(), 'scale': 20})
        .rename('cloudmask'))
    
    
    """
    Update image to only include those pixels with no cloud or shadow
    """
    temp = temp.addBands(is_cld_shdw)
    
    not_cld_shdw = temp.select('cloudmask').Not()
    
    return temp.select('B.*').updateMask(not_cld_shdw)
    
    
def add_ndvi(image):
    ndvi = image.normalizedDifference(['B8', 'B4']).rename('ndvi')
    return image.addBands([ndvi])

def mask_ndvi(image, ltOrGt, value):
    ndvi = image.select('ndvi')
    if(ltOrGt == 'gt'):
        return image.updateMask(ndvi.gt(value))
    else: 
        return image.updateMask(ndvi.lt(value))
    
def getSentinelCollection(aoi, start_date, end_date):
    # Import and filter S2 SR.
    s2_sr_col = (ee.ImageCollection('COPERNICUS/S2_SR')
        .filterBounds(aoi)
        .filterDate(start_date, end_date)
        .filter(ee.Filter.lte('CLOUDY_PIXEL_PERCENTAGE', CLOUD_FILTER)) 
        .map(mask_edges)) # add this 
    

    # Import and filter s2cloudless.
    s2_cloudless_col = (ee.ImageCollection('COPERNICUS/S2_CLOUD_PROBABILITY')
        .filterBounds(aoi)
        .filterDate(start_date, end_date))

    # Join the filtered s2cloudless collection to the SR collection by the 'system:index' property.
    return ee.ImageCollection(ee.Join.saveFirst('s2cloudless').apply(**{
        'primary': s2_sr_col,
        'secondary': s2_cloudless_col,
        'condition': ee.Filter.equals(**{
            'leftField': 'system:index',
            'rightField': 'system:index'
        })
    }))


class RasterGenerationHelper:

    def __init__(self, parent_path, child_path,  raster_output_dir, n_cores, clean=False, post_period_days=[30,45]):
        self.parent = gpd.read_file(parent_path)
        self.child = gpd.read_file(child_path)
        self.raster_output_dir = raster_output_dir + "/"
        self.post_period_days = post_period_days

        self.n_cores = n_cores
        self._make_dir()
        if not clean:
            self._ready()
            
    def get_rasters(self):            
        if not os.path.isdir(self.raster_output_dir):
            os.makedirs(self.raster_output_dir)
            print("Created new directory..".format(self.raster_output_dir))
        cpus = self.n_cores
#         cpus = 6
        parent_chunks = np.array_split(self.parent, cpus)
        pool = mp.Pool(processes=cpus)
        chunk_processes = [pool.apply_async(self._get_rasters_for_chunk, args=(chunk, self.parent)) for chunk in parent_chunks]
        chunk_results = [chunk.get() for chunk in chunk_processes]
    
    def _get_rasters_for_chunk(self, gdf_chunk, gdf_complete):
        
        for i, tile in gdf_chunk.iterrows():
            temp = self.child.sjoin(self.parent[self.parent['pgrid_id']==tile['pgrid_id']], how="inner", predicate="intersects")
#             temp = self.child.sjoin(self.parent)
            
            
            datewise_counts = temp.groupby(['BSD']).count()['grid_id'].sort_values(ascending=False)
            datewise_counts = datewise_counts.reset_index()
            datewise_counts = datewise_counts.reset_index().rename(columns={'index': 'Date Combo Code', 'grid_id': 'count'})
#             print("## TILE PGRID_ID present?", tile['pgrid_id'] in set(list(np.unique(self.parent['pgrid_id']))))
#             print("## PARENT PGRID_IDs", self.parent['pgrid_id'])
            images = []
            for index, row in datewise_counts.iterrows():
                
                bsd = row['BSD']
                
#                 bed = row['BED']
#                 print("###DATE", date.fromisoformat(bsd).isoformat())
                preStart = (date.fromisoformat(bsd) + timedelta(days = -7)).isoformat()
                preEnd = (date.fromisoformat(bsd) + timedelta(days = +7)).isoformat()
                postStart = (date.fromisoformat(bsd) + timedelta(days = +self.post_period_days[0])).isoformat()
                postEnd = (date.fromisoformat(bsd) + timedelta(days = +self.post_period_days[1])).isoformat()
                
                
                tileIDS = temp[(temp['BSD'] == bsd)]['grid_id'].to_list()
                
                aoiInput = eeconvert.gdfToFc(self.child[self.child['grid_id'].isin(tileIDS)])
                
                # Get pre
                preImage = getSentinelCollection(aoiInput, preStart, preEnd)
                pre_w_ndvi = (preImage.map(remove_cloud_shadow).map(add_ndvi).reduce(ee.Reducer.median()))

                # Get Post
                postImage = getSentinelCollection(aoiInput, postStart, postEnd)
                post_w_ndvi = (postImage.map(remove_cloud_shadow).map(add_ndvi).reduce(ee.Reducer.median()))
#                 postImage = getSentinelCollection(aoiInput, postStart, postEnd).first()
#                 postImage = ee.ImageCollection.fromImages([postImage]);

                combined = pre_w_ndvi.addBands([post_w_ndvi])   
    
                if len(combined.bandNames().getInfo()) > 13:
                
                    tmpNDVI = combined.select(['ndvi_median']).multiply(5000).rename('ndvi')
                    tmpNDVI_1 = combined.select(['ndvi_median_1']).multiply(5000).rename('ndvi1')
                    combined = combined.select('B.*', 'ndvi.*')

                    # Clip to AOI
                    combined_clip = combined.clip(aoiInput)
                    images.append(combined_clip)
            
            bounds = self.parent[self.parent['pgrid_id'] == tile['pgrid_id']].bounds
            minx, miny, maxx, maxy = np.max(bounds['minx']), np.max(bounds['miny']),np.max(bounds['maxx']),np.max(bounds['maxy'])
            aoi = ee.Geometry.Rectangle([minx, miny, maxx, maxy])

            mosaicked = ee.ImageCollection([*images]).mosaic()
#             print("BandsBrah", mosaicked.bandNames().getInfo())
            geemap.ee_export_image(
                mosaicked, 
                filename=self.raster_output_dir + str(tile['pgrid_id'])+".tif", 
                scale=10, 
                region=aoi, 
                file_per_band=False
            )
            
            
    def _filename_to_ids(self, filename):
        return int(filename.split(".tif")[0])
    
    def _ready(self):
        print(list(filter(lambda x: x.endswith(".tif"), os.listdir(self.raster_output_dir))))
        
        files = list(map(self._filename_to_ids, list(filter(lambda x: x.endswith(".tif"), os.listdir(self.raster_output_dir)))))
        old_size = self.parent.shape[0]
        self.parent = self.parent[~self.parent['pgrid_id'].isin(files)]
        new_size = self.parent.shape[0]
        
        print("Ignoring {} tiles as rasters for them are already generated; will only generate rasters for remaining {} tiles".format(old_size - new_size, new_size))
        
        
    def _make_dir(self):
        Path(self.raster_output_dir).mkdir(parents=True, exist_ok=True)
    





class MergeRasterSingleAoi:
    
    def __init__(self, data_dir, shp_path, tiles_path):
        self.shp_path = shp_path
        self.tiles_path = tiles_path
        self.data_dir = data_dir
        self.raster_out_dir = f"{self.data_dir}/interim"
        
    def merge(self, filename="merged"):
        aoi = gpd.read_file(self.shp_path)
        parent = gpd.read_file(f"{self.data_dir}/interim/parent.gpkg")
        
        aoi = gpd.read_file(self.shp_path)
        df = gpd.sjoin(parent, aoi)
        rasters = list(self.tiles_path+'/' + (df['pgrid_id']).astype('str') + ".tif")
        
        temp_dir = f"{self.raster_out_dir}/temp"
        
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        
        for file in rasters:
            if os.path.exists(file):
                shutil.copy(file, temp_dir+"/")

        
        if os.path.exists(f"{self.raster_out_dir}/{filename}.vrt"):
            os.remove(f"{self.raster_out_dir}/{filename}.vrt")
            
        if os.path.exists(f"{self.raster_out_dir}/{filename}.tif"):
            os.remove(f"{self.raster_out_dir}/{filename}.tif")
            
        os.system(f'gdalbuildvrt {self.raster_out_dir}/{filename}.vrt {temp_dir}/*.tif -srcnodata "0"')
        os.system(f'gdal_merge.py -o {self.raster_out_dir}/{filename}.tif {self.raster_out_dir}/{filename}.vrt')
        
        raster = rxr.open_rasterio(f'{self.raster_out_dir}/{filename}.tif').squeeze()
        raster = raster.rio.clip(aoi.geometry.apply(mapping), aoi.crs)
        raster.rio.to_raster(f'{self.raster_out_dir}/{filename}.tif')
        
        shutil.rmtree(temp_dir)
        
class MergeRaster:
    """
    Makes analysis ready rasters for a 
    given shapefile. Assumes vectors and 
    rasters are already generated for 
    all of Afghanistan and they are stored 
    in self.out_dir.
    
    Asssumes the following files are present:
        * district shapefiles in self.aoi_path
        * vectors/parent.gpkg
        * rasters/bestdates_tiles/*.tif (2500mx2500m GeoTIFFs)
    """
    
    
    def __init__(self, aoi_path, out_dir, n_cores):
        self.aoi_path = aoi_path
        self.out_dir = out_dir
        self.start_time = time.time()
        self.n_cores = n_cores

    
            
    def _handle_chunk(self, aoi_chunk, aoi_complete):
        parent = gpd.read_file(self.out_dir + "/vectors/parent.gpkg")
        RASTER_OUTPUT_DIR = self.out_dir + "/rasters/bestdates_tiles"

        for file in aoi_chunk:
            poly = gpd.read_file(file)
            df = gpd.sjoin(parent, poly)
            rasters = list(RASTER_OUTPUT_DIR+'/' + (df['pgrid_id']).astype('str') + ".tif")
            dist_id = file.split("/districts/")[1].split(".gpkg")[0]
            temp_dir = self.out_path + "/temp_"+  dist_id
            Path(temp_dir).mkdir(parents=True, exist_ok=True)
            
            for file in rasters:
                shutil.copy(file, temp_dir+"/")

            
            print(f"--- Making VRT for District {dist_id}: {time.time() - self.start_time} seconds ---")
            os.system(f'gdalbuildvrt {temp_dir}/temp_{dist_id}.vrt {temp_dir}/*.tif -srcnodata "0"')
            print(f"--- Merging for District {dist_id}: {time.time() - self.start_time} seconds ---")
            os.system(f'gdal_merge.py -o {self.out_path}/merged_{dist_id}.tif {temp_dir}/temp_{dist_id}.vrt')
            
            print(f"--- Clipping for District {dist_id}: {time.time() - self.start_time} seconds ---")
            raster = rxr.open_rasterio(f'{self.out_path}/merged_{dist_id}.tif').squeeze()
            raster = raster.rio.clip(poly.geometry.apply(mapping), poly.crs)
            raster.rio.to_raster(f'{self.out_path}/merged_{dist_id}.tif')

    
    def merge_rasters(self, out_path): 
        self.out_path = out_path
        aois = list(map(lambda x: self.aoi_path + "/" + x, os.listdir(self.aoi_path)))
        print(aois)
        cpus = self.n_cores
        aoi_chunks = np.array_split(aois, cpus)
        pool = mp.Pool(processes=cpus)
        chunk_processes = [pool.apply_async(self._handle_chunk, args=(chunk, aois)) for chunk in aoi_chunks]
        chunk_results = [chunk.get() for chunk in chunk_processes]



class Masker:
    def __init__(self, data_dir, input_shp, input_raster, mask_raster):
        self.input_raster = rasterio.open(input_raster)
        self.input_shp = gpd.read_file(input_shp)
        self.mask_raster = rasterio.open(mask_raster)
        self.data_dir = data_dir
        
    def mask(self, filename="masked"):
        out_img, out_transform = mask(self.mask_raster, shapes=self.input_shp.geometry, crop=True)
        out_img[out_img == 0] = 255
        is_valid = (out_img != 255.0).astype(np.uint8)
        cropland = []
        for coords, value in features.shapes(is_valid, transform=out_transform):
            if value != 0:
                geom = shape(coords)
                cropland.append({"geometry": geom})
                
        cropland = gpd.GeoDataFrame(cropland).set_crs("epsg:4326")
        out_img, out_transform = mask(self.input_raster, cropland.geometry, crop=True)
        out_img[np.isnan(out_img)] = 0
        out_img = out_img[0:24]
        with rasterio.open(
            f'{self.data_dir}/interim/{filename}.tif',
            'w',
            driver='GTiff',
            height=out_img.shape[1],
            width=out_img.shape[2],
            count=out_img.shape[0],
            dtype='float32',
            crs=self.input_raster.crs,
            transform=out_transform,
        ) as dst:
            dst.write(out_img[0:24])



class Sampler:
    def __init__(self, data_dir, input_raster):
        self.input_raster = input_raster
        self.data_dir = data_dir
        
    def _add_ndvi(self, df, b8, b4, label):
        df[label] = (df[b8] - df[b4]) / (df[b8] + df[b4]) 
        return df
   
    def _add_dates(self, df):
        child_gdf = gpd.read_file(f"{self.data_dir}/interim/child.gpkg")
        gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.reset_index().x, df.reset_index().y))
        gdf = gdf.set_crs("epsg:4326").sjoin(child_gdf)
        gdf = gdf.drop(['grid_id', 'geometry'], axis=1)
        gdf['dayofyear'] = pd.to_datetime(gdf['BSD']).dt.dayofyear
        gdf = gdf.drop('BSD', axis=1)
        gdf = gdf.drop(['index_right'], axis=1)
        return gdf

    def _generate_df_from_raster(self):
        with rasterio.open(self.input_raster) as src:
            profile = src.profile
            
        image = rxr.open_rasterio(self.input_raster)
        df = image.to_dataframe(name="value")
        df = df.reset_index()
        df.columns = ['band', 'y', 'x', 'spatial_ref', 'value']
        df = pd.pivot(df, index = ['y', 'x'], columns=['band'], values=['value']).reset_index()
        print(df.columns)
        cols = [*df.columns.get_level_values(0)[0:2], *df.columns.get_level_values(1)[2:]]
        df.columns = df.columns.to_flat_index()
        df.columns = cols
        
        
        df = df.set_index(['y', 'x'])
        df = self._add_ndvi(df, 8, 4, "ndvi_pre")
        df = self._add_ndvi(df, 20, 16, "ndvi_post")
        df = self._add_dates(df)
        
        return df, profile
        
    def sample(self, sample_size, sample_filename="sample", full_filename="full", save_full = True):
        df, profile = self._generate_df_from_raster()
        sample_length = int(len(df) * sample_size)
        sample = df.sample(sample_length, random_state=7)
        sample.to_pickle(f'{self.data_dir}/interim/{sample_filename}.tgz')
        if save_full:
            df.to_pickle(f'{self.data_dir}/interim/{full_filename}.tgz')
       

*** Earth Engine *** FINAL DEADLINE: ee.Authenticate will fail after 2022-06-06. Please upgrade. https://developers.google.com/earth-engine/guides/python_install


In [7]:
import pickle
import pandas as pd
import numpy as np
import rasterio
import rioxarray as rxr
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
from pathlib import Path

class DataHelper:
    def __init__(self, data_dir, raw):
        self.data_dir = data_dir
        self.raw = pd.read_pickle(raw)
        
    def pre_only(self):
        return self.raw[[*range(1,13),'ndvi_pre', 'dayofyear']]
    
    def post_only(self):
        return self.raw[[*range(12,25),'ndvi_pre','ndvi_post', 'dayofyear']]
    
    def diff_bands(self):
        copy = self.raw.copy()
        for col in range(1,13):
            copy[col] = copy[col+12] - copy[col]
        return copy[[*range(1,13), 'ndvi_pre','dayofyear']]
    
    def diff_all(self):
        copy = self.raw.copy()
        for col in range(1,13):
            copy[col] = copy[col+12] - copy[col]
        copy['diff_ndvi'] = copy['ndvi_post'] - copy['ndvi_pre']
        return copy[[*range(1,13), 'ndvi_pre', 'diff_ndvi', 'dayofyear']]
    
    def ndvi_and_day(self):
        return self.raw[['ndvi_pre', 'dayofyear']]
    
    def save(self, df, label, filename=None):
        print(f"------ Saved {label}; Columns: {df.columns}")
        if filename != None:
            df.to_pickle(filename)
        else:
            df.to_pickle(f'{self.data_dir}/interim/data_{label}.tgz')
        
class ModelingHelper:
    def __init__(self, data_dir, raw, run_name):
        self.raw = pd.read_pickle(raw)
        self.data_dir = data_dir
        self.run_name = run_name
        with rasterio.open(f"{self.data_dir}/interim/temp.tif") as src:
            self.profile = src.profile
    
    def ready_data(self, data):
        dataset = data[data['ndvi_pre'] > 0.3]
        dataset = dataset.loc[~(dataset==0).all(axis=1)]
        dataset.columns = dataset.columns.astype('str')
        return dataset
        
    def fit(self, data, model_type, n, save=True, drop_ndvi=False):
        clean = self.ready_data(data)
        scaler = StandardScaler()
        scaler.fit(clean)
        normalised_data = scaler.transform(clean)
        
        if model_type=='kmeans':
            model = KMeans(n_clusters=n)
            model.fit(normalised_data)

        if model_type=='gmm':
            model = GaussianMixture(n, covariance_type='full', random_state=0)
            model.fit(normalised_data)
        
        self.model = model
        
        MODEL_DIR = f'{self.data_dir}/outputs/models/{self.run_name}'
        Path(MODEL_DIR).mkdir(parents=True, exist_ok=True)

        with open(f'{MODEL_DIR}/model.pickle', 'wb') as f:
            pickle.dump(model, f)
        
        with open(f'{MODEL_DIR}/scaler.pickle', 'wb') as f:
            pickle.dump(scaler, f)
            
        return model
    
    def load_model(self, path):
        with open(path, 'rb') as f:
            model = pickle.load(f)
    
        return model
    
    def load_scaler(self, path):
        with open(path, 'rb') as f:
            scaler = pickle.load(f)
    
        return scaler
    
    def predict(self, data, model, scaler):
        dataset = self.ready_data(data)
        cluster_assignments =  model.predict(scaler.transform(dataset))
        result = dataset.copy()
        result['clust'] = cluster_assignments
        result = result[['clust']]
        result = pd.merge(data.reset_index(), result.reset_index(), how="left").set_index(['y','x'])
        return result
        
    def save_raster(self, results, filename):
        crs = self.profile['crs']
        transform = self.profile['transform']
        
        clust_assignments = pd.DataFrame(results['clust'])
        clust_assignments = pd.melt(clust_assignments, value_vars=['clust'], value_name='value', ignore_index=False)   
        clust_assignments = clust_assignments.drop('variable', axis=1) 
        clust_assignments[np.isnan(clust_assignments['value'])] = -99
#         return clust_assignments
        clust_assignments = clust_assignments.reset_index().drop_duplicates(subset=['y', 'x']).set_index(['y', 'x']).to_xarray()
        clust_assignments.rio.to_raster(f"{filename}.tif")
        
        with rasterio.open(f"{filename}.tif", "r+") as src:
            src.crs = crs
            src.nodata=-99
    
    def save_ndvi_plot(self, results, model_type, n, filename):
        fig, ax = plt.subplots(n, 1, dpi=70, figsize=(9,9))
        ax=ax.flatten()
            
        for i in range(0,n):
            data = results[results['clust']==i]['ndvi_pre']
            ax[i].hist(data, bins=100)
            ax[i].set_title(f"{model_type} with k={n}; cluster={i}")
        plt.tight_layout()
        plt.savefig(f"{filename}.png")
        plt.close('all')
        
    def get_poppy_pixels(self, dist_id, year):
        poppyPixels = pd.read_csv(self.data_dir + "/inputs/poppy_1994-2020.csv")
        poppyPixels = poppyPixels[poppyPixels['distid'] == dist_id]

        return poppyPixels[f'X{year}'].iloc[0] 

    def save_comparison_results(self, results, dist, year, filename):
        results = pd.DataFrame(results.groupby('clust').count()['ndvi_pre']).rename(columns={'ndvi_pre': 'pixels_from_clustering'})
        results['clustering_ha'] = results['pixels_from_clustering']/100
#         results['unodc_ha'] = self.get_poppy_pixels(dist, year)
        results.to_csv(f"{filename}.csv")
        print(f"--------- Comparison saved to {filename}.csv")
        
class PredictionHelper:
    def _init_():
        print("Init")
    def prep_raster():
        print("Prep raster")
    def load_models():
        print("Load raster")
    def predict():
        print("Predict")
    def save():
        print("Save")
                    

In [3]:
# import argparse
# import gc
# parser = argparse.ArgumentParser()
# parser.add_argument("data_dir")
# parser.add_argument("shp_path")
# parser.add_argument("model_path")
# parser.add_argument("data_file_suffix")
# # parser.add_argument("--tiles_path")
# parser.add_argument("--model_type")
# parser.add_argument("--num")
# parser.add_argument("--year")
# parser.add_argument("--name")
# parser.add_argument("--dist")
# parser.add_argument("--skip_raster_generation", action=argparse.BooleanOptionalAction)
# args = parser.parse_args()       



# from utils.rasters import MergeRasterSingleAoi, Masker, Sampler
# from utils.data import ModelingHelper, DataHelper
# from pathlib import Path
# import os
# import shutil

# DATA_DIR = args.data_dir
# SHP_PATH = args.shp_path
# MODEL_PATH = args.model_path
# DATA_FILE_SUFFIX = args.data_file_suffix
# TILES_PATH = f"{DATA_DIR}/interim/tiles"
# YEAR = '2019'
# NAME = MODEL_PATH.split("/")[-1]
# SKIP_RASTER_GENERATION = False
# N = 3
# MODEL_TYPE = 'kmeans'
# DIST = '2308'

# if args.year:
#     YEAR = args.year
# if args.name:
#     NAME = args.name
# if args.skip_raster_generation:
#     SKIP_RASTER_GENERATION = True
# if args.model_type:
#     MODEL_TYPE=args.model_type
# if args.num:
#     N=int(args.num)
# if args.dist:
#     DIST = args.dist
# # if args.tiles_path:
# #     TILES_PATH = arg.tiles_path

    


In [4]:
DATA_DIR = "../../data"
SHP_PATH = "/data/tmp/arogya/data/inputs/aoi.gpkg"
MODEL_PATH = "/data/tmp/arogya/data/outputs/models/kmeans_3_diff_bands/"
DATA_FILE_SUFFIX = "diff_bands"
TILES_PATH = f"{DATA_DIR}/interim/tiles"
YEAR = '2019'
NAME = MODEL_PATH.split("/")[-1]
SKIP_RASTER_GENERATION = False
N = 3
MODEL_TYPE = 'kmeans'
DIST = '2308'

In [5]:


if not SKIP_RASTER_GENERATION:
    
    
    if os.path.exists(f"{DATA_DIR}/interim/temp"):
        shutil.rmtree(f"{DATA_DIR}/interim/temp")
        
    if os.path.exists(f"{DATA_DIR}/interim/temp.vrt"):
        os.remove(f"{DATA_DIR}/interim/temp.vrt")
    
    if os.path.exists(f"{DATA_DIR}/interim/temp.tif"):
        os.remove(f"{DATA_DIR}/interim/temp.tif")
    
    if os.path.exists(f"{DATA_DIR}/interim/merged.tif"):
        os.remove(f"{DATA_DIR}/interim/merged.tif")
        
    if os.path.exists(f"{DATA_DIR}/interim/masked.tif"):
        os.remove(f"{DATA_DIR}/interim/masked.tif")
    
    
    print("In Merger")
    
    # Create mereged raster for AOI
    mrs = MergeRasterSingleAoi(DATA_DIR, SHP_PATH, TILES_PATH)
    mrs.merge("temp")

    # Create Masked Raster for AOI
    _CROP_MASK_PATH = f'{DATA_DIR}/inputs/{YEAR}_E060N40_PROBAV_LC100_global_v3.0.1_2019.tif'
    _INPUT_RASTER_PATH = f'{DATA_DIR}/interim/temp.tif'
    print("In Masker")

    masker = Masker(DATA_DIR, SHP_PATH, _INPUT_RASTER_PATH, _CROP_MASK_PATH)
    masker.mask("temp")

# print("In Sampler")


In Merger
In Masker


In [6]:
print("In Sampler")
sampler = Sampler(DATA_DIR, f'{DATA_DIR}/interim/temp.tif')
sampler.sample(1, sample_filename = "temp", save_full=False)

dh = DataHelper(DATA_DIR, f'{DATA_DIR}/interim/temp.tgz')

if DATA_FILE_SUFFIX=="diff_bands":
    dh.save(dh.diff_bands(), "diff_bands", filename=f'{DATA_DIR}/interim/temp.tgz')
if DATA_FILE_SUFFIX=="pre_only":
    dh.save(dh.pre_only(), "pre_only", filename=f'{DATA_DIR}/interim/temp.tgz')
if DATA_FILE_SUFFIX=="post_only":
    dh.save(dh.post_only(), "post_only", filename=f'{DATA_DIR}/interim/temp.tgz')
if DATA_FILE_SUFFIX=="ndvi_and_day":
    dh.save(dh.ndvi_and_day(), "ndvi_and_day", filename=f'{DATA_DIR}/interim/temp.tgz')
if DATA_FILE_SUFFIX=="diff_all":
    dh.save(dh.ndvi_and_day(), "diff_all", filename=f'{DATA_DIR}/interim/temp.tgz')

    
    
_DATA_FILE_PATH = f'{DATA_DIR}/interim/temp.tgz'


# Path(f"{DATA_DIR}/outputs/predictions/{NAME}").mkdir(parents=True, exist_ok=True)

# mh.save_raster(results, f"{DATA_DIR}/outputs/predictions/{NAME}/{NAME}")
# mh.save_comparison_results(results, DIST, YEAR, f"{DATA_DIR}/outputs/predictions/{NAME}/{NAME}")
# mh.save_ndvi_plot(results, MODEL_TYPE, N, f"{DATA_DIR}/outputs/predictions/{NAME}/{NAME}")

# mrs=None
# masker=None
# sampler=None
# dh=None
# mh=None
# model=None
# scaler=None
# results=None
# gc.collect(generation=2)

In Sampler
MultiIndex([(    'y', ''),
            (    'x', ''),
            ('value',  1),
            ('value',  2),
            ('value',  3),
            ('value',  4),
            ('value',  5),
            ('value',  6),
            ('value',  7),
            ('value',  8),
            ('value',  9),
            ('value', 10),
            ('value', 11),
            ('value', 12),
            ('value', 13),
            ('value', 14),
            ('value', 15),
            ('value', 16),
            ('value', 17),
            ('value', 18),
            ('value', 19),
            ('value', 20),
            ('value', 21),
            ('value', 22),
            ('value', 23),
            ('value', 24),
            ('value', 25),
            ('value', 26)],
           names=[None, 'band'])
------ Saved diff_bands; Columns: Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 'ndvi_pre', 'dayofyear'], dtype='object')


In [8]:
mh = ModelingHelper(DATA_DIR, _DATA_FILE_PATH, "dummy")
model = mh.load_model(f"{MODEL_PATH}/model.pickle")
scaler = mh.load_scaler(f"{MODEL_PATH}/scaler.pickle")


results = mh.predict(mh.raw, model, scaler)
results.reset_index().drop_duplicates(subset=['y', 'x']).set_index(['y', 'x'])

Path(f"{DATA_DIR}/outputs/predictions/{NAME}").mkdir(parents=True, exist_ok=True)

clust_assignments = mh.save_raster(results, f"{DATA_DIR}/outputs/predictions/{NAME}/{NAME}")
type(clust_assignments)
# mh.save_comparison_results(results, DIST, YEAR, f"{DATA_DIR}/outputs/predictions/{NAME}/{NAME}")
# mh.save_ndvi_plot(results, MODEL_TYPE, N, f"{DATA_DIR}/outputs/predictions/{NAME}/{NAME}")

# mrs=None
# masker=None
# sampler=None
# dh=None
# mh=None
# model=None
# scaler=None
# results=None
# gc.collect(generation=2)

pandas.core.frame.DataFrame

In [14]:
clust_assignments.reset_index().drop_duplicates(subset=['y', 'x']).set_index(['y', 'x']).to_xarray().rio.to_raster("test.tif")

In [10]:
clust_assignments.to_xarray()

ValueError: cannot convert a DataFrame with a non-unique MultiIndex into xarray