In [1]:
import geopandas as gps
import rasterio                  # I/O raster data (netcdf, height, geotiff, ...)
import rasterio.mask
import rasterio.warp             # Reproject raster samples
import rasterio.merge
from rasterio.transform import rowcol
from rasterio import features
import fiona                     # I/O vector data (shape, geojson, ...)
import pyproj                    # Change coordinate reference system
from osgeo import gdal,ogr,osr
import pandas as pd
import shapely
from shapely.geometry import box, Point
import json
import json
from sklearn.model_selection import train_test_split
import shutil
import numpy as np               # numerical array manipulation
import time
import os
from PIL import Image
import PIL.ImageDraw
from core.visualize import display_images

import matplotlib.pyplot as plt  # plotting tools
%matplotlib inline
from tqdm import tqdm_notebook as tqdm
from tqdm import trange
import warnings                  # ignore annoying warnings
warnings.filterwarnings("ignore")

%reload_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from multiprocessing import Pool

In [2]:
# Required configurations (including the input and output paths) are stored in a separate file (such as config/Preprocessing.py)
# Please provide required info in the file before continuing with this notebook. 
# hbh: in this scene,a new config named Preprocessing_within is created to distinguish from the original
from config import Preprocessing   
# In case you are using a different folder name such as configLargeCluster, then you should import from the respective folder 
# Eg. from configLargeCluster import Preprocessing
config = Preprocessing.Configuration()

In [14]:
base_dir = r'D:\lakemapping'
output_dir=r'D:\sample'
raw_img_dir=os.path.join(base_dir , r'sample750_pad\tif')
training_area_fn =os.path.join(base_dir , r'2_dataset\sampleAnnotations\SampleV9\raw_total_area.shp')
training_polygon_fn = os.path.join(base_dir ,r'2_dataset\sampleAnnotations\SampleV9\raw_total_polygon.shp')


raw_area_prefix='raw_area_'
area_prefix='area_'
pad_area_prefix='pad_area_'
raw_image_prefix='sample_'
image_prefix='image_'
pad_image_prefix='pad_image_'
annotation_prefix='annotation_'
area_file_type='.shp'
image_file_type='.tif'
annotation_file_type='.png'
type_num=5

model_dir = os.path.join(base_dir,r'3_training\U_Net')
dataset_dir=os.path.join(base_dir ,'2_dataset')  
patch_size = (576,576,6) 
patch_dir = os.path.join(model_dir,'patches{}'.format(patch_size[0])) 
# The training areas are divided into training, validation and testing set. Note that training area can have different sizes, so it doesn't guarantee that the final generated patches (when using sequential stratergy) will be in the same ratio.
test_ratio = 0.2
val_ratio = 0.25

## 提取shapefile
将shapefile按类别存储为单个文件，以便后续裁剪

In [4]:
for i in range(0,type_num):
    writePath=os.path.join(output_dir,r'image\output{}'.format(i))
    if not os.path.exists(writePath):
        os.makedirs(writePath)
    writePath=os.path.join(output_dir,r'raw_image\output{}'.format(i))
    if not os.path.exists(writePath):
        os.makedirs(writePath)
    writePath2=os.path.join(output_dir,r'shp\output{}'.format(i))
    if not os.path.exists(writePath2):
        os.makedirs(writePath2)

In [8]:
trainingArea = gps.read_file(training_area_fn)
for i in tqdm(trainingArea.index):
    fn=trainingArea.loc[i]['file_name']
    ty=trainingArea.loc[i]['type']
    writePath=os.path.join(output_dir,r'shp\output{}'.format(ty))
    shape=trainingArea[i:i+1]
    fn=os.path.join(writePath,raw_area_prefix+fn+area_file_type)
    if not os.path.exists(fn):
        print(fn)
        shape.to_file()

  0%|          | 0/747 [00:00<?, ?it/s]

## 裁剪sampe影像，获得image影像

In [9]:
def Image_Compress(path_image, path_out_image):
    """
    :param path_image: 输入需要压缩的影像路径
    :param path_out_image: 输出压缩后的影像路径
    :return: None
    """
    ds = gdal.Open(path_image)
    # 打开影像数据
    driver = gdal.GetDriverByName('GTiff')
    # 创建输出的数据驱动
    driver.CreateCopy(path_out_image, ds, strict=1, 
                      options=["TILED=YES", "COMPRESS=LZW", "BIGTIFF=YES"])
    # 设置压缩参数
    """
    PACKBITS：连续字节压缩，快速无损压缩
    LZW：所有信息全部保留（可逆），以某一数值代替字符串，快速无损压缩
    """
    del ds


for i in range(0,type_num):
    all_files = []
    for root, dirs, files in os.walk(os.path.join(output_dir,'shp\output{}'.format(i))):
        for file in files:
            if file.endswith(area_file_type)and file.startswith(raw_area_prefix):
                 all_files.append((os.path.join(root, file), file))
                    
    for in_shp_path, filename in all_files:
        print(in_shp_path)
        raster_fn=os.path.join(raw_img_dir,filename.replace(raw_area_prefix,raw_image_prefix).replace(area_file_type,image_file_type))
        input_raster = gdal.Open(raster_fn)
        out_raw_raster_path=in_shp_path.replace(raw_area_prefix,image_prefix).replace(area_file_type,image_file_type).replace('shp','raw_image')
        out_raster_path=out_raw_raster_path.replace('raw_','')
        if not os.path.exists(out_raw_raster_path):
            print(raster_fn)
            print(out_raw_raster_path)
                # 利用gdal.Warp进行裁剪
                # https://gdal.org/api/python/osgeo.gdal.html#osgeo.gdal.Warp
            result = gdal.Warp(
                    out_raw_raster_path,
                    input_raster,
                    format = 'GTiff',
                    cutlineDSName = in_shp_path, # 用于裁剪的矢量
                    cropToCutline = True, # 是否使用cutlineDSName的extent作为输出的界线
    #                 dstNodata = -999 # 输出数据的nodata值
                    )
            result.FlushCache()
            del result
        if not os.path.exists(out_raster_path):
            print(out_raster_path)
            Image_Compress(out_raw_raster_path,out_raster_path)
            print('compressed')


D:\sample\shp\output0\raw_area_10031.shp
D:\sample\shp\output0\raw_area_10150.shp
D:\sample\shp\output0\raw_area_10174.shp
D:\sample\shp\output0\raw_area_10332.shp
D:\sample\shp\output0\raw_area_10358.shp
D:\sample\shp\output0\raw_area_10383.shp
D:\sample\shp\output0\raw_area_10408.shp
D:\sample\shp\output0\raw_area_10689.shp
D:\sample\shp\output0\raw_area_10713.shp
D:\sample\shp\output0\raw_area_10856.shp
D:\sample\shp\output0\raw_area_10892.shp
D:\sample\shp\output0\raw_area_10928.shp
D:\sample\shp\output0\raw_area_11020_3.shp
D:\sample\shp\output0\raw_area_11081.shp
D:\sample\shp\output0\raw_area_11289.shp
D:\sample\shp\output0\raw_area_11306.shp
D:\sample\shp\output0\raw_area_11324.shp
D:\sample\shp\output0\raw_area_11383.shp
D:\sample\shp\output0\raw_area_11428.shp
D:\sample\shp\output0\raw_area_11490.shp
D:\sample\shp\output0\raw_area_11601.shp
D:\sample\shp\output0\raw_area_11664.shp
D:\sample\shp\output0\raw_area_11740.shp
D:\sample\shp\output0\raw_area_11773.shp
D:\sample\shp\

D:\sample\shp\output1\raw_area_14624_3.shp
D:\sample\shp\output1\raw_area_14624_4.shp
D:\sample\shp\output1\raw_area_14674.shp
D:\sample\shp\output1\raw_area_14682.shp
D:\sample\shp\output1\raw_area_14698.shp
D:\sample\shp\output1\raw_area_149.shp
D:\sample\shp\output1\raw_area_15054.shp
D:\sample\shp\output1\raw_area_15277.shp
D:\sample\shp\output1\raw_area_15346.shp
D:\sample\shp\output1\raw_area_15355.shp
D:\sample\shp\output1\raw_area_1538.shp
D:\sample\shp\output1\raw_area_15414.shp
D:\sample\shp\output1\raw_area_15419.shp
D:\sample\shp\output1\raw_area_15457.shp
D:\sample\shp\output1\raw_area_15496.shp
D:\sample\shp\output1\raw_area_15536.shp
D:\sample\shp\output1\raw_area_15573.shp
D:\sample\shp\output1\raw_area_15601.shp
D:\sample\shp\output1\raw_area_15663.shp
D:\sample\shp\output1\raw_area_15682.shp
D:\sample\shp\output1\raw_area_15721.shp
D:\sample\shp\output1\raw_area_15744.shp
D:\sample\shp\output1\raw_area_15769.shp
D:\sample\shp\output1\raw_area_15841.shp
D:\sample\shp\o

## 提取shapefile
读取image影像，保存影像边界，pad width 为92个像素

In [11]:
for i in range(0,type_num):
    writePath=os.path.join(output_dir,r'image\output{}'.format(i))
    shpWritePath=os.path.join(output_dir,r'shp\output{}'.format(i))
    
    all_files = []
    for root, dirs, files in os.walk(writePath):
        for file in files:
            if file.endswith(image_file_type)and file.startswith(image_prefix):
                 all_files.append((os.path.join(root, file), file))
    # print(all_files)

    for fullPath, filename in all_files:
        print(filename)
        shp_fn=filename.replace(image_file_type,area_file_type).replace(image_prefix,pad_area_prefix).replace('image','shp')
#         print(shp_fn)
        dataset = gdal.Open(fullPath)
        geocd=dataset.GetGeoTransform()
        outfilename =os.path.join(shpWritePath,shp_fn)
        if not os.path.exists(outfilename):
            print(outfilename)
            oDriver = ogr.GetDriverByName('ESRI Shapefile')
            oDS = oDriver.CreateDataSource(outfilename)
            srs = osr.SpatialReference(wkt=dataset.GetProjection())
            oLayer = oDS.CreateLayer("polygon", srs, ogr.wkbPolygon)
            oDefn = oLayer.GetLayerDefn()
            row = dataset.RasterXSize+92
            line = dataset.RasterYSize+92
            geoxmin = geocd[0]-92*geocd[1]-92*geocd[2]
            geoymin = geocd[3]-92*geocd[4]-92*geocd[5]
            geoxmax = geocd[0] + (row) * geocd[1] + (line) * geocd[2]
            geoymax = geocd[3] + (row) * geocd[4] + (line) * geocd[5]
            ring = ogr.Geometry(ogr.wkbLinearRing)
            ring.AddPoint(geoxmin, geoymin)
            ring.AddPoint(geoxmax, geoymin)
            ring.AddPoint(geoxmax, geoymax)
            ring.AddPoint(geoxmin, geoymax)
            ring.CloseRings()
            poly = ogr.Geometry(ogr.wkbPolygon)
            poly.AddGeometry(ring)
            outfeat = ogr.Feature(oDefn)
            outfeat.SetGeometry(poly)
            oLayer.CreateFeature(outfeat)
            outfeat = None

image_10031.tif
image_10150.tif
image_10174.tif
image_10332.tif
image_10358.tif
image_10383.tif
image_10408.tif
image_10689.tif
image_10713.tif
image_10856.tif
image_10892.tif
image_10928.tif
image_11020_3.tif
image_11081.tif
image_11289.tif
image_11306.tif
image_11324.tif
image_11383.tif
image_11428.tif
image_11490.tif
image_11601.tif
image_11664.tif
image_11740.tif
image_11773.tif
image_11904.tif
image_12000.tif
image_12234.tif
image_12262.tif
image_12277.tif
image_12312.tif
image_12344.tif
image_12351.tif
image_12389.tif
image_12395.tif
image_12443.tif
image_12455.tif
image_12596.tif
image_1260.tif
image_12602.tif
image_12637.tif
image_12666.tif
image_12688.tif
image_12821.tif
image_12843.tif
image_13177.tif
image_13224.tif
image_13246.tif
image_13251.tif
image_13307.tif
image_13362.tif
image_13407.tif
image_13411_2.tif
image_13554.tif
image_13582.tif
image_13618.tif
image_13705.tif
image_13742.tif
image_13748.tif
image_13830.tif
image_13928.tif
image_13944.tif
image_13962.tif
image

In [12]:
# all_files = []
# shpWritePath=r'D:\lakemapping\2_dataset\shp\val_2'
# for root, dirs, files in os.walk(r'D:\lakemapping\2_dataset\patchesReshape\val'):
#     for file in files:
#         if file.endswith(image_file_type)and file.startswith(image_prefix):
#              all_files.append((os.path.join(root, file), file))
# print(all_files)
for i in range(0,type_num):
    writePath=os.path.join(output_dir,r'image\output{}'.format(i))
    shpWritePath=os.path.join(output_dir,r'shp\output{}'.format(i))
    
    all_files = []
    for root, dirs, files in os.walk(writePath):
        for file in files:
            if file.endswith(image_file_type)and file.startswith(image_prefix):
                 all_files.append((os.path.join(root, file), file))
    # print(all_files)

    for fullPath, filename in all_files:
        print(filename)
        shp_fn=filename.replace(image_file_type,area_file_type).replace(image_prefix,area_prefix).replace('image','shp')
#         print(shp_fn)
        dataset = gdal.Open(fullPath)
        geocd=dataset.GetGeoTransform()
        outfilename =os.path.join(shpWritePath,shp_fn)
        if not os.path.exists(outfilename):
            print(outfilename)
            oDriver = ogr.GetDriverByName('ESRI Shapefile')
            oDS = oDriver.CreateDataSource(outfilename)
            srs = osr.SpatialReference(wkt=dataset.GetProjection())
            oLayer = oDS.CreateLayer("polygon", srs, ogr.wkbPolygon)
            oDefn = oLayer.GetLayerDefn()
            row = dataset.RasterXSize
            line = dataset.RasterYSize
            geoxmin = geocd[0]
            geoymin = geocd[3]
            geoxmax = geocd[0] + (row) * geocd[1] + (line) * geocd[2]
            geoymax = geocd[3] + (row) * geocd[4] + (line) * geocd[5]
            ring = ogr.Geometry(ogr.wkbLinearRing)
            ring.AddPoint(geoxmin, geoymin)
            ring.AddPoint(geoxmax, geoymin)
            ring.AddPoint(geoxmax, geoymax)
            ring.AddPoint(geoxmin, geoymax)
            ring.CloseRings()
            poly = ogr.Geometry(ogr.wkbPolygon)
            poly.AddGeometry(ring)
            outfeat = ogr.Feature(oDefn)
            outfeat.SetGeometry(poly)
            oLayer.CreateFeature(outfeat)
            outfeat = None

image_10031.tif
image_10150.tif
image_10174.tif
image_10332.tif
image_10358.tif
image_10383.tif
image_10408.tif
image_10689.tif
image_10713.tif
image_10856.tif
image_10892.tif
image_10928.tif
image_11020_3.tif
image_11081.tif
image_11289.tif
image_11306.tif
image_11324.tif
image_11383.tif
image_11428.tif
image_11490.tif
image_11601.tif
image_11664.tif
image_11740.tif
image_11773.tif
image_11904.tif
image_12000.tif
image_12234.tif
image_12262.tif
image_12277.tif
image_12312.tif
image_12344.tif
image_12351.tif
image_12389.tif
image_12395.tif
image_12443.tif
image_12455.tif
image_12596.tif
image_1260.tif
image_12602.tif
image_12637.tif
image_12666.tif
image_12688.tif
image_12821.tif
image_12843.tif
image_13177.tif
image_13224.tif
image_13246.tif
image_13251.tif
image_13307.tif
image_13362.tif
image_13407.tif
image_13411_2.tif
image_13554.tif
image_13582.tif
image_13618.tif
image_13705.tif
image_13742.tif
image_13748.tif
image_13830.tif
image_13928.tif
image_13944.tif
image_13962.tif
image

## 裁剪sampe影像，获得pad image影像

In [13]:
for i in range(0,type_num):
    all_files = []
    for root, dirs, files in os.walk(os.path.join(output_dir,'shp\output{}'.format(i))):
        for file in files:
            if file.endswith(area_file_type)and file.startswith(pad_area_prefix):
                 all_files.append((os.path.join(root, file), file))
#     print(all_files)                    
    for fullPath, filename in all_files:
        print(filename)
        out_raw_raster_path=fullPath.replace(pad_area_prefix,pad_image_prefix).replace(area_file_type,image_file_type).replace('shp','raw_image')
        out_raster_path=out_raw_raster_path.replace('raw_','')
        
        if not os.path.exists(out_raw_raster_path):
            raster_fn=os.path.join(raw_img_dir,filename.replace(pad_area_prefix,raw_image_prefix).replace(area_file_type,image_file_type))#.replace('_1','').replace('_2','')
            print(raster_fn)
            input_raster = gdal.Open(raster_fn)
            in_shp_path=fullPath
            print(out_raw_raster_path)
                # 利用gdal.Warp进行裁剪
                # https://gdal.org/api/python/osgeo.gdal.html#osgeo.gdal.Warp
            result = gdal.Warp(
                    out_raw_raster_path,
                    input_raster,
                    format = 'GTiff',
                    cutlineDSName = in_shp_path, # 用于裁剪的矢量
                    cropToCutline = True, # 是否使用cutlineDSName的extent作为输出的界线
    #                 dstNodata = -999 # 输出数据的nodata值
                    )
            result.FlushCache()
            del result
        if not os.path.exists(out_raster_path):
            Image_Compress(out_raw_raster_path,out_raster_path)
            print('compressed')

pad_area_10031.shp
pad_area_10150.shp
pad_area_10174.shp
pad_area_10332.shp
pad_area_10358.shp
pad_area_10383.shp
pad_area_10408.shp
pad_area_10689.shp
pad_area_10713.shp
pad_area_10856.shp
pad_area_10892.shp
pad_area_10928.shp
pad_area_11020_3.shp
pad_area_11081.shp
pad_area_11289.shp
pad_area_11306.shp
pad_area_11324.shp
pad_area_11383.shp
pad_area_11428.shp
pad_area_11490.shp
pad_area_11601.shp
pad_area_11664.shp
pad_area_11740.shp
pad_area_11773.shp
pad_area_11904.shp
pad_area_12000.shp
pad_area_12234.shp
pad_area_12262.shp
pad_area_12277.shp
pad_area_12312.shp
pad_area_12344.shp
pad_area_12351.shp
pad_area_12389.shp
pad_area_12395.shp
pad_area_12443.shp
pad_area_12455.shp
pad_area_12596.shp
pad_area_1260.shp
pad_area_12602.shp
pad_area_12637.shp
pad_area_12666.shp
pad_area_12688.shp
pad_area_12821.shp
pad_area_12843.shp
pad_area_13177.shp
pad_area_13224.shp
pad_area_13246.shp
pad_area_13251.shp
pad_area_13307.shp
pad_area_13362.shp
pad_area_13407.shp
pad_area_13411_2.shp
pad_area_

## 生成annotation

In [31]:
#Read the training area 、 training polygons
trainingArea = gps.read_file(r'D:\lakemapping\2_dataset\sampleAnnotations\SampleV4\train_area.shp')
trainingPolygon = gps.read_file(r'D:\lakemapping\2_dataset\sampleAnnotations\SampleV4\train_polygon.shp')

print(trainingPolygon.shape,trainingArea.shape)# area:id, geomerry;   polygon:id, geometry 
trainingPolygon
trainingArea
# print(f'Read a total of {trainingPolygon.shape[0]} object polygons and {trainingArea.shape[0]} training areas.')
# print(f'Polygons will be assigned to training areas in the next steps.')

(131449, 3) (445, 6)


Unnamed: 0,CLASS_NAME,area,geometry
0,Lake,0.018956,"POLYGON Z ((-66.54333 -2.42330 0.00000, -66.54..."
1,LakeLikeRiver,0.040392,"POLYGON Z ((-66.49653 -2.42321 0.00000, -66.49..."
2,LakeLikeRiver,0.230842,"POLYGON Z ((-66.49959 -2.42312 0.00000, -66.49..."
3,Lake,0.029674,"POLYGON Z ((-66.43051 -2.42132 0.00000, -66.43..."
4,Lake,0.005558,"POLYGON Z ((-66.55762 -2.43021 0.00000, -66.55..."
...,...,...,...
131444,Reservoir,0.189746,"POLYGON Z ((33.00752 24.34722 0.00000, 33.0074..."
131445,Reservoir,0.000091,"POLYGON Z ((33.02126 24.33500 0.00000, 33.0212..."
131446,Reservoir,0.915970,"POLYGON Z ((33.02135 24.33509 0.00000, 33.0213..."
131447,Lake,0.225691,"POLYGON Z ((174.36704 64.65400 0.00000, 174.36..."


Unnamed: 0,id,type,file_name,dataset,area,geometry
0,2913,1,2913_1,train,122.289714,"POLYGON Z ((-3.69199 16.13554 0.00000, -3.5400..."
1,65,1,65,train,446.782976,"POLYGON Z ((-76.80614 6.22074 0.00000, -76.646..."
2,85,1,85,train,359.016936,"POLYGON Z ((-74.20875 6.79620 0.00000, -74.051..."
3,96,1,96,train,567.541389,"POLYGON Z ((-74.00941 7.49429 0.00000, -73.840..."
4,102,1,102,train,176.831955,"POLYGON Z ((-72.33154 3.79646 0.00000, -72.264..."
...,...,...,...,...,...,...
440,18682,0,18682,train,414.393331,"POLYGON Z ((-70.93654 -48.43438 0.00000, -70.6..."
441,18710,5,18710,train,375.745158,"POLYGON Z ((-68.81463 -40.45141 0.00000, -68.6..."
442,18818,2,18818,train,185.891121,"POLYGON Z ((172.52783 -43.44513 0.00000, 172.6..."
443,18856,3,18856,train,192.827160,"POLYGON Z ((-71.99683 -53.30585 0.00000, -71.8..."


In [32]:
# Check if the training areas and the training polygons have the same crs     
if trainingArea.crs  != trainingPolygon.crs:
    print('Training area CRS does not match training_polygon CRS')
    targetCRS = trainingPolygon.crs #Areas are less in number so conversion should be faster
    trainingArea = trainingArea.to_crs(targetCRS)
print(trainingPolygon.crs)
print(trainingArea.crs)
assert trainingPolygon.crs == trainingArea.crs

epsg:4326
epsg:4326


In [33]:
# As input we received two shapefile, first one contains the training areas/rectangles and other contains the polygon of lakes/objects in those training areas
# The first task is to determine the parent training area for each polygon.

def dividePolygonsInTrainingAreas(trainingPolygon, trainingArea):
    '''Assign annotated ploygons in to the training areas.'''
    # For efficiency, assigned polygons are removed from the list, we make a copy here. 
    cpTrainingPolygon = trainingPolygon.copy()
    splitPolygons = {}
    for i in tqdm(trainingArea.index):
        spTemp = [] 
        allocated = []
        print("area's index:",i)
        for j in cpTrainingPolygon.index:
            if cpTrainingPolygon.loc[j]['geometry'].intersects(trainingArea.loc[i]['geometry']):
                spTemp.append(cpTrainingPolygon.loc[j])
                allocated.append(j)      
        splitPolygons[i] = {'polygons':spTemp,'bounds':list(trainingArea.bounds.loc[i]),'file_name':trainingArea.loc[i]['file_name'] ,'type':trainingArea.loc[i]['type']}
        cpTrainingPolygon = cpTrainingPolygon.drop(allocated)#assigned polygons are removed from the list
    return splitPolygons

# areasWithPolygons contains the object polygons for each area!
areasWithPolygons = dividePolygonsInTrainingAreas(trainingPolygon, trainingArea)
print(f'Assigned training polygons in {len(areasWithPolygons)} training areas')

  0%|          | 0/445 [00:00<?, ?it/s]

area's index: 0
area's index: 1
area's index: 2
area's index: 3
area's index: 4
area's index: 5
area's index: 6
area's index: 7
area's index: 8
area's index: 9
area's index: 10
area's index: 11
area's index: 12
area's index: 13
area's index: 14
area's index: 15
area's index: 16
area's index: 17
area's index: 18
area's index: 19
area's index: 20
area's index: 21
area's index: 22
area's index: 23
area's index: 24
area's index: 25
area's index: 26
area's index: 27
area's index: 28
area's index: 29
area's index: 30
area's index: 31
area's index: 32
area's index: 33
area's index: 34
area's index: 35
area's index: 36
area's index: 37
area's index: 38
area's index: 39
area's index: 40
area's index: 41
area's index: 42
area's index: 43
area's index: 44
area's index: 45
area's index: 46
area's index: 47
area's index: 48
area's index: 49
area's index: 50
area's index: 51
area's index: 52
area's index: 53
area's index: 54
area's index: 55
area's index: 56
area's index: 57
area's index: 58
area's 

In [17]:
# print(areasWithPolygons) 

In [28]:
def extractAreasThatOverlapWithTrainingData(areaInfo):
    """Iterates over raw NDWI images and using findOverlap() extract areas that overlap with training data. 
    The overlapping areas in raw images are written in a separate file, and annotation file are created from polygons in the overlapping areas.
    """   
    polygonsInAreaDf = gps.GeoDataFrame(areaInfo['polygons'])
    file_name=str(areaInfo['file_name'])
    bboxArea = box(*areaInfo['bounds'])
    area_type=areaInfo['type']
    writePath=os.path.join(output_dir,'image\output{}'.format(area_type))
    if not os.path.exists(writePath):
        os.makedirs(writePath)
    #draw image: set pad width of the sample images
    raw_img = rasterio.open(os.path.join(writePath,image_prefix+file_name+image_file_type))
    profile = raw_img.profile  
    profile['dtype'] = rasterio.uint8
#     profile['nodata']= None
    profile['count'] = 1
    polygons = []
    for i in polygonsInAreaDf.index:
        gm = polygonsInAreaDf.loc[i]['geometry']
        polygons.append(gm)
    output_filename=os.path.join(writePath,annotation_prefix+file_name+annotation_file_type)
    if not os.path.exists(output_filename):
        print(output_filename)
        with rasterio.open(output_filename, 'w+', **profile) as out:
            out_arr = out.read(1)
            burned = features.rasterize(polygons, fill=0, default_value=1,all_touched=False,out=out_arr, transform=out.transform)
            out.write_band(1, burned)

In [34]:
for value in areasWithPolygons.values():
    extractAreasThatOverlapWithTrainingData(value)

D:\sample\image\output1\annotation_2913_1.png
D:\sample\image\output1\annotation_65.png
D:\sample\image\output1\annotation_85.png
D:\sample\image\output1\annotation_96.png
D:\sample\image\output1\annotation_102.png
D:\sample\image\output1\annotation_115.png
D:\sample\image\output1\annotation_149.png
D:\sample\image\output0\annotation_168.png
D:\sample\image\output0\annotation_178.png
D:\sample\image\output1\annotation_213.png
D:\sample\image\output1\annotation_306.png
D:\sample\image\output0\annotation_343.png
D:\sample\image\output1\annotation_520.png
D:\sample\image\output1\annotation_646.png
D:\sample\image\output1\annotation_648.png
D:\sample\image\output1\annotation_862.png
D:\sample\image\output0\annotation_863.png
D:\sample\image\output2\annotation_933.png
D:\sample\image\output0\annotation_953.png
D:\sample\image\output1\annotation_1090.png
D:\sample\image\output0\annotation_1260.png
D:\sample\image\output1\annotation_1397.png
D:\sample\image\output1\annotation_1402_2.png
D:\sa

D:\sample\image\output3\annotation_8629.png
D:\sample\image\output3\annotation_8662.png
D:\sample\image\output0\annotation_8667.png
D:\sample\image\output1\annotation_8671.png
D:\sample\image\output3\annotation_8727.png
D:\sample\image\output0\annotation_8870.png
D:\sample\image\output5\annotation_8904.png
D:\sample\image\output1\annotation_8966.png
D:\sample\image\output0\annotation_8992.png
D:\sample\image\output0\annotation_9010.png
D:\sample\image\output0\annotation_9044.png
D:\sample\image\output4\annotation_9068.png
D:\sample\image\output1\annotation_9078.png
D:\sample\image\output1\annotation_9089.png
D:\sample\image\output0\annotation_9093.png
D:\sample\image\output1\annotation_9105.png
D:\sample\image\output0\annotation_9115.png
D:\sample\image\output1\annotation_9117.png
D:\sample\image\output1\annotation_9169.png
D:\sample\image\output1\annotation_9211_1.png
D:\sample\image\output1\annotation_9238.png
D:\sample\image\output1\annotation_9263.png
D:\sample\image\output3\annota

D:\sample\image\output3\annotation_15253.png
D:\sample\image\output1\annotation_15277.png
D:\sample\image\output1\annotation_15346.png
D:\sample\image\output3\annotation_15388.png
D:\sample\image\output1\annotation_15414.png
D:\sample\image\output1\annotation_15457.png
D:\sample\image\output1\annotation_15496.png
D:\sample\image\output1\annotation_15573.png
D:\sample\image\output1\annotation_15601.png
D:\sample\image\output0\annotation_15612.png
D:\sample\image\output1\annotation_15663.png
D:\sample\image\output1\annotation_15682.png
D:\sample\image\output0\annotation_15693.png
D:\sample\image\output0\annotation_15816.png
D:\sample\image\output1\annotation_15864.png
D:\sample\image\output0\annotation_15898.png
D:\sample\image\output3\annotation_16048.png
D:\sample\image\output0\annotation_16071.png
D:\sample\image\output1\annotation_16092.png
D:\sample\image\output5\annotation_16118.png
D:\sample\image\output1\annotation_16142.png
D:\sample\image\output0\annotation_16178.png
D:\sample\

## 划分数据集

In [22]:
def moveFileinType(lakeType,patchType,fn):
    output_dir=os.path.join(dataset_dir,r'patchesReshape/{}/type{}'.format(patchType,lakeType))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    move(output_dir,fn)
    move(output_dir,fn.replace(image_prefix,annotation_prefix).replace(image_file_type,annotation_file_type))
    move(output_dir,fn.replace(image_prefix,pad_image_prefix))
    
def move(output_dir,fn):
    old=os.path.join(patch_output,fn)
    if(os.path.exists(old)):
        new=os.path.join(output_dir,fn)
        shutil.move(old, new)

In [24]:
# split dataset and copy images & annotations to new directory
for i in range(0,type_num):
    frames = []
    frames_json = os.path.join(patch_dir,'sample_list_supplement_{}.json'.format(i))
#     print(frames_json)
    patch_output = os.path.join(dataset_dir,r'output\output{}'.format(i))
    all_files = os.listdir(patch_output)
    all_image_files = [fn for fn in all_files if fn.startswith(image_prefix) and fn.endswith(image_file_type)]
    len(all_image_files)
    print(all_image_files)

    if os.path.isfile(frames_json):
        print("dataset type{} had been splited".format(i))
#             train data are classified into different dirs for following process.
        with open(frames_json, 'r') as file:
            fjson = json.load(file)#
            for train_fn in fjson['training_frames']:
                moveFileinType(i,'train',train_fn)
            for testing_fn in fjson['testing_frames']:
                moveFileinType(i,'test',testing_fn)
            for validation_fn in fjson['validation_frames']:
                moveFileinType(i,'val',validation_fn)
    else:
            print("Creating and writing train-test split from file")
            frames_list = list(range(len(all_image_files)))
            
            # Divide into training and test set       
            training_frames, testing_frames = train_test_split(frames_list, test_size=test_ratio)
            # Further divide into training set into training and validataion set              
            training_frames, validation_frames = train_test_split(training_frames, test_size=val_ratio)
            
            training_frames_name=[all_image_files[id] for id in training_frames]
            testing_frames_name=[all_image_files[id] for id in testing_frames]
            validation_frames_name=[all_image_files[id] for id in validation_frames]
            
            # train data are classified into different dirs for following process.
            for train_fn in training_frames_name:
                moveFileinType(i,'train',train_fn)
            for testing_fn in testing_frames_name:
                moveFileinType(i,'test',testing_fn) 
            for validation_fn in validation_frames_name:
                moveFileinType(i,'val',validation_fn)
                
            frame_split = {
                'training_frames': training_frames_name,
                'testing_frames': testing_frames_name,
                'validation_frames': validation_frames_name
            }
            
            if not os.path.exists(patch_dir):
                os.makedirs(patch_dir)
            with open(frames_json, 'w') as f:
                json.dump(frame_split, f)
                
            print('training_frames', training_frames_name)
            print('validation_frames',validation_frames_name )
            print('testing_frames', testing_frames_name)

80

['image_13177.tif', 'image_13224.tif', 'image_13246.tif', 'image_13251.tif', 'image_13307.tif', 'image_13362.tif', 'image_13407.tif', 'image_13554.tif', 'image_13582.tif', 'image_13618.tif', 'image_13705.tif', 'image_13742.tif', 'image_13748.tif', 'image_13830.tif', 'image_13928.tif', 'image_13944.tif', 'image_13962.tif', 'image_13979.tif', 'image_14006.tif', 'image_14077.tif', 'image_14084.tif', 'image_14111.tif', 'image_14206.tif', 'image_14422.tif', 'image_14449.tif', 'image_14494.tif', 'image_14689.tif', 'image_14735.tif', 'image_14753.tif', 'image_14781.tif', 'image_14807.tif', 'image_14836.tif', 'image_14857.tif', 'image_14862.tif', 'image_14872.tif', 'image_14914.tif', 'image_14926.tif', 'image_14947.tif', 'image_14975.tif', 'image_14997.tif', 'image_15009.tif', 'image_15612.tif', 'image_15693.tif', 'image_15816.tif', 'image_15898.tif', 'image_16071.tif', 'image_16178.tif', 'image_16227.tif', 'image_16329.tif', 'image_16333.tif', 'image_16388.tif', 'image_16458.tif', 'image_1648

91

['image_1090.tif', 'image_11963.tif', 'image_13010.tif', 'image_13031.tif', 'image_13036.tif', 'image_13063.tif', 'image_13294.tif', 'image_13324.tif', 'image_13346.tif', 'image_13347.tif', 'image_13444.tif', 'image_13451.tif', 'image_13478.tif', 'image_13494.tif', 'image_13512.tif', 'image_13538.tif', 'image_13663.tif', 'image_13816.tif', 'image_13885.tif', 'image_13891.tif', 'image_14042.tif', 'image_14147.tif', 'image_14241.tif', 'image_14277.tif', 'image_14312.tif', 'image_14346.tif', 'image_14370.tif', 'image_14416.tif', 'image_14500.tif', 'image_14541.tif', 'image_14557.tif', 'image_14583.tif', 'image_1460.tif', 'image_14600.tif', 'image_14624_2.tif', 'image_14624_3.tif', 'image_14674.tif', 'image_14682.tif', 'image_14698.tif', 'image_15054.tif', 'image_15277.tif', 'image_15346.tif', 'image_15355.tif', 'image_15414.tif', 'image_15419.tif', 'image_15457.tif', 'image_15496.tif', 'image_15536.tif', 'image_15573.tif', 'image_15601.tif', 'image_15663.tif', 'image_15682.tif', 'image_15

30

['image_13105.tif', 'image_13119.tif', 'image_13153.tif', 'image_13198.tif', 'image_13624.tif', 'image_16287.tif', 'image_16371.tif', 'image_16417.tif', 'image_16583.tif', 'image_16738.tif', 'image_17210.tif', 'image_17328.tif', 'image_17363.tif', 'image_17374.tif', 'image_17388.tif', 'image_17920.tif', 'image_18569.tif', 'image_18605.tif', 'image_18618.tif', 'image_18818.tif', 'image_2080.tif', 'image_2604.tif', 'image_3421.tif', 'image_3477.tif', 'image_4052.tif', 'image_4851.tif', 'image_4892.tif', 'image_4949.tif', 'image_5264.tif', 'image_6198.tif']
dataset type2 had been splited


25

['image_12847.tif', 'image_13803.tif', 'image_14172.tif', 'image_14309.tif', 'image_14542.tif', 'image_14668.tif', 'image_15074.tif', 'image_15078.tif', 'image_15107.tif', 'image_15137.tif', 'image_15171.tif', 'image_15183.tif', 'image_15204.tif', 'image_15225.tif', 'image_15253.tif', 'image_15312.tif', 'image_15381.tif', 'image_15388.tif', 'image_16048.tif', 'image_18120.tif', 'image_18641.tif', 'image_18667.tif', 'image_18809.tif', 'image_18856.tif', 'image_9436.tif']
dataset type3 had been splited


0

[]
Creating and writing train-test split from file


9

['image_16118.tif', 'image_17140.tif', 'image_17434.tif', 'image_17481.tif', 'image_17683.tif', 'image_17753.tif', 'image_17883.tif', 'image_18710.tif', 'image_18725.tif']
dataset type5 had been splited
