# Split tiles

In [4]:
import geopandas as gpd
import os 
import numpy as np
import pandas as pd

# Folder holding the raw patch-id and file-id block layers.
indir = r"F:\acoca\research\gee\dataset\test\MOD09_250m500m\geodata\blocks\384\raw"

patchid_path = os.path.join(indir, 'patchid.geojson')
fileid_path = os.path.join(indir, 'fileid.geojson')

df_patchid = gpd.read_file(patchid_path)
df_fileid = gpd.read_file(fileid_path)

# Intersect the two layers so each resulting geometry carries the attributes
# of both the patch layer and the file layer.
df_all = gpd.overlay(df_patchid, df_fileid, how='intersection')

In [5]:
# Build the combined tile key "<patch_id>_<file_id>" used by all split files.
patch_part = df_all['patch_id'].astype(str)
file_part = df_all['file_id'].astype(str)
df_all['id_fn'] = patch_part + '_' + file_part

In [6]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection`, which the fold cell below already uses.
from sklearn.model_selection import train_test_split

# Hold out one sixth of the tile keys as the evaluation set; the remainder
# (`train_all`) is later divided into train/test folds.
train_all, eval_data = train_test_split(df_all['id_fn'], test_size=1/6, random_state=42)

outdir = r"F:\acoca\research\gee\dataset\test\MOD09_250m500m\geodata\blocks\384\splits"
# exist_ok avoids the separate existence check.
os.makedirs(outdir, exist_ok=True)

# One tile key per line.
np.savetxt(os.path.join(outdir, "eval.tileids"), np.array(eval_data).astype(str), fmt='%s')



In [7]:
# Train / test folds drawn from the non-evaluation tiles.
from sklearn.model_selection import ShuffleSplit

# Convert once, before splitting, so the loop indexes a plain array instead of
# re-converting (and rebinding) `train_all` on every iteration.
train_all = np.array(train_all)

# ShuffleSplit draws every split independently, so the test sets of different
# folds may overlap; each fold keeps 1/5 of `train_all` for testing.
rs = ShuffleSplit(n_splits=10, test_size=1/5, random_state=42)

for fold, (train_index, test_index) in enumerate(rs.split(train_all)):
    train_data = train_all[train_index]
    test_data = train_all[test_index]
    # Files are numbered from 0: train_fold0.tileids ... train_fold9.tileids.
    np.savetxt(os.path.join(outdir, "train_fold" + str(fold) + ".tileids"), train_data.astype(str), fmt='%s')
    np.savetxt(os.path.join(outdir, "test_fold" + str(fold) + ".tileids"), test_data.astype(str), fmt='%s')

In [8]:
import numpy as np
import os
from collections import Counter
import pandas as pd
# NOTE: `indir` is re-pointed at the splits folder from here on.
indir = r"F:\acoca\research\gee\dataset\test\MOD09_250m500m\geodata\blocks\384\splits"

# Read the fold-0 train/test lists and the evaluation list back as string arrays.
traintiles, testtiles, evaltiles = (
    np.loadtxt(os.path.join(indir, list_name), dtype='str')
    for list_name in ("train_fold0.tileids", "test_fold0.tileids", "eval.tileids")
)

In [9]:
# Collect the base name (the part before the dot) of every .gz file found
# anywhere under the data09 folder.
gz_root = r"F:\acoca\research\gee\dataset\test\MOD09_250m500m\gz\24\MCD12Q1v6\data09"
filesnm = [
    stem
    for _dirpath, _subdirs, entries in os.walk(gz_root)
    for entry in entries
    if entry.endswith(".gz")
    # The two-name unpack requires exactly one dot in the file name and raises
    # ValueError otherwise.
    for stem, _suffix in [entry.split('.')]
]

In [10]:
# One row per collected .gz base name, in a single column (default label 0).
filesnm_df = pd.DataFrame(data=filesnm)

In [11]:
# Labels for the two pieces produced by the split below.
names = {0: 'file', 1: 'id_fn'}

# Split each base name once, at the first underscore: the text before it
# becomes `file`, everything after it becomes `id_fn`.
# `n` must be passed by keyword: pandas >= 2.0 accepts only `pat` positionally.
filesnm_df = (
    filesnm_df.iloc[:, 0]
    .str.split('_', n=1, expand=True)
    .rename(columns=names)
)
# Re-join both pieces to recover the full base name for writing to the lists.
filesnm_df['file_nm'] = filesnm_df['file'].astype(str) + '_' + filesnm_df['id_fn'].astype(str)

In [12]:
outdir = r"F:\acoca\research\gee\dataset\test\MOD09_250m500m\gz\24\MCD12Q1v6\tileids"
if not os.path.exists(outdir):
    os.makedirs(outdir)

# For each split, keep the files whose `id_fn` appears in that split's tile
# list and write their full base names, one per line.
for list_name, split_ids in (
    ("train_fold" + str(0) + ".tileids", traintiles),
    ("test_fold" + str(0) + ".tileids", testtiles),
    ("eval.tileids", evaltiles),
):
    in_split = filesnm_df['id_fn'].isin(split_ids)
    selected_files = filesnm_df.loc[in_split, 'file_nm']
    np.savetxt(os.path.join(outdir, list_name), np.array(selected_files).astype(str), fmt='%s')

In [13]:
# Count the files whose `id_fn` belongs to any of the three tile lists.
all_split_ids = np.concatenate([traintiles, testtiles, evaltiles])
matched_files = filesnm_df.loc[filesnm_df['id_fn'].isin(all_split_ids), 'file_nm']
print('Total files = ', matched_files.shape)

Total files =  (4608,)


In [14]:
# Tag every geometry with its split code: 0 = train, 1 = test, 2 = eval.
# Lists are applied in that order, so a key present in several lists ends up
# with the code of the last one; keys in none of them are left unset.
for split_code, split_ids in enumerate((traintiles, testtiles, evaltiles)):
    df_all.loc[df_all['id_fn'].isin(split_ids), 'split'] = split_code

In [158]:
# Export the tagged geometries as a shapefile into whatever folder `indir`
# currently names (the closest assignment above sets it to the splits folder).
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours — confirm `indir` holds the intended folder on a fresh run.
shp_path = os.path.join(indir, "split.shp")
df_all.to_file(filename=shp_path, driver='ESRI Shapefile')

In [15]:
# Export the same tagged geometries as GeoJSON next to the block layers.
geojson_path = os.path.join(
    r'F:\acoca\research\gee\dataset\test\MOD09_250m500m\geodata\blocks\384',
    'split.geojson',
)
df_all.to_file(filename=geojson_path, driver="GeoJSON")

# create config

In [16]:
import configparser

In [17]:
# Dataset description written to dataset.ini; every value is stored as text.
# NOTE(review): the section is named '2009' and the folder/table names end in
# 09, yet `sqlwhere` filters on year=2001 — confirm which year is intended.
config2 = configparser.ConfigParser()
config2.read_dict({
    '2009': {
        'pix250': '24',
        'nbands250': '7',
        'nbands500': '5',
        'nobs': '46',
        'datadir': 'data09',
        # The double quotes are part of the stored value.
        'sqlwhere': '"where date is not null and year=2001"',
        'tiletable': 'tiles23',
        'fieldtable': 'fields2009',
        'level': 'L1C',
    },
})

In [18]:
# Persist the configuration beside the data folders (overwrites any existing file).
ini_path = r'F:\acoca\research\gee\dataset\test\MOD09_250m500m\gz\24\MCD12Q1v6\dataset.ini'
with open(ini_path, mode='w') as configfile:
    config2.write(configfile)

# create geotransform

In [19]:
import os
import pandas as pd
# Gather the base name (text before the dot) of each .gz file under data09.
filesnm = []
gz_root = r"F:\acoca\research\gee\dataset\test\MOD09_250m500m\gz\24\MCD12Q1v6\data09"
for _dirpath, _subdirs, entries in os.walk(gz_root):
    gz_entries = [entry for entry in entries if entry.endswith(".gz")]
    for entry in gz_entries:
        # Exactly one dot is expected; any other count raises ValueError here.
        stem, _suffix = entry.split('.')
        filesnm.append(stem)

In [20]:
# First column of the geotransform table: one row per .gz base name.
col_1 = pd.DataFrame(data=filesnm)

In [21]:
# Append seven constant columns, identical for every file.
# NOTE(review): columns '1'-'6' look like a six-coefficient affine geotransform
# (250 / -250 as pixel sizes) and '7' like an EPSG code — confirm with the
# consumer of geotransforms.csv.
for column_label, constant in (
    ('1', 0),
    ('2', 250),
    ('3', 0),
    ('4', 0),
    ('5', 0),
    ('6', -250),
    ('7', 32632),
):
    col_1[column_label] = constant

In [22]:
# Write the table as bare rows: no index column and no header line.
# `index` and `header` are documented as booleans, so pass False explicitly
# instead of relying on None being treated as falsy.
col_1.to_csv(
    r"F:\acoca\research\gee\dataset\test\MOD09_250m500m\gz\24\MCD12Q1v6\geotransforms.csv",
    index=False,
    header=False,
)

# split tiles 250m data

In [13]:
import geopandas as gpd
import os 
import numpy as np
import pandas as pd

# Folder with the final id layers for the 250 m dataset.
indir = r"F:\acoca\research\gee\dataset\final\geodata\ids\p207_250m\final"

# Layer read here is expected to provide the `split` and `id_fn` columns used
# by the cells below.
split_layer_path = os.path.join(indir, 'AMZ_p207k0_250m.geojson')
df_split = gpd.read_file(split_layer_path)

In [14]:
# Base names (text before the dot) of all .gz files below the data09 folder.
gz_root = r"F:\acoca\research\gee\dataset\final\MOD13Q1_250m\gz\69\MCD12Q1v6\data09"
filesnm = [
    stem
    for _dirpath, _subdirs, entries in os.walk(gz_root)
    for entry in entries
    if entry.endswith(".gz")
    # Unpacking into two names enforces "exactly one dot" (ValueError otherwise).
    for stem, _suffix in [entry.split('.')]
]

In [15]:
# One row per collected .gz base name.
filesnm_df = pd.DataFrame(filesnm)
# Labels for the two pieces produced by the split below.
names = {0: 'file', 1: 'id_fn'}

# Split each base name once, at the first underscore: the text before it
# becomes `file`, everything after it becomes `id_fn`.
# `n` must be passed by keyword: pandas >= 2.0 accepts only `pat` positionally.
filesnm_df = (
    filesnm_df.iloc[:, 0]
    .str.split('_', n=1, expand=True)
    .rename(columns=names)
)
# Re-join both pieces to recover the full base name for writing to the lists.
filesnm_df['file_nm'] = filesnm_df['file'].astype(str) + '_' + filesnm_df['id_fn'].astype(str)

In [16]:
outdir = r"F:\acoca\research\gee\dataset\final\MOD13Q1_250m\gz\69\MCD12Q1v6\tileids"
if not os.path.exists(outdir):
    os.makedirs(outdir)

# For each split code in `df_split` (0, 1, 2), keep the files whose `id_fn`
# carries that code and write their full base names, one per line.
for list_name, split_code in (
    ("train_fold" + str(0) + ".tileids", 0),
    ("test_fold" + str(0) + ".tileids", 1),
    ("eval.tileids", 2),
):
    split_ids = np.array(df_split[df_split['split'] == split_code].id_fn)
    selected_files = filesnm_df.loc[filesnm_df['id_fn'].isin(split_ids), 'file_nm']
    np.savetxt(os.path.join(outdir, list_name), np.array(selected_files).astype(str), fmt='%s')

In [17]:
# Display how many files fall into split code 2 (the list written to eval.tileids).
eval_ids = np.array(df_split[df_split['split'] == 2].id_fn)
eval_files = filesnm_df.loc[filesnm_df['id_fn'].isin(eval_ids), 'file_nm']
np.array(eval_files).shape

(1170,)

In [8]:
import tensorflow as tf

In [6]:
import glob, os

In [11]:
def _count_records(path, options):
    """Return the number of records in one TFRecord file by iterating over it."""
    return sum(1 for _ in tf.python_io.tf_record_iterator(path, options=options))


# The record files are GZIP-compressed.
options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.GZIP)

# List both file sets ordered by modification time (oldest first).
fileNames_250m = sorted(
    glob.glob(r'F:\acoca\research\gee\dataset\final\MOD09_250m500m\raw\192\data09\*.gz'),
    key=os.path.getmtime,
)
fileNames_500m = sorted(
    glob.glob(r'F:\acoca\research\gee\dataset\final\MOD09_250m500m\raw\96\data09\*.gz'),
    key=os.path.getmtime,
)

# Only the first and last file of each set are actually read; the total treats
# every file except the last as holding as many records as the first one.
n_patches_first = _count_records(fileNames_250m[0], options)
n_patches_last = _count_records(fileNames_250m[-1], options)

# Keep the record count of the first `fileNames_250m` file as the merge batch size.
batchsize_merge = n_patches_first

tfiles_250m = n_patches_first * (len(fileNames_250m) - 1) + n_patches_last

# Same estimate for the second file set (the two counters are reused).
n_patches_first = _count_records(fileNames_500m[0], options)
n_patches_last = _count_records(fileNames_500m[-1], options)

tfiles_500m = n_patches_first * (len(fileNames_500m) - 1) + n_patches_last

print(tfiles_250m, tfiles_500m)

3978 3978
