In [1]:
import sys  
sys.path.insert(0, '../src')

In [2]:
import utils as ut
import rasterio as rs
from rasterio.windows import from_bounds
import numpy as np
import pandas as pd

import os, sys, warnings
from tqdm import tqdm

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
%pylab inline
plt.style.use("bmh")

Populating the interactive namespace from numpy and matplotlib


In [5]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [6]:
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [7]:
data_path = '../features/'

In [8]:
patches_imgs = ut.list_files_with_absolute_paths(data_path,endswith='stack_ee.tif')
patches_imgs

['../features/p1_stack_ee.tif',
 '../features/p0_stack_ee.tif',
 '../features/p2_stack_ee.tif',
 '../features/p3_stack_ee.tif']

In [9]:
col_names = ["lulc", "blue", "green","red","red_e1","red_e2","red_e3","nir1","swir1","swir2","nir2"]

# Prepare training data

In [10]:
df_imgs = [pd.DataFrame(rs.open(f'../data/{img}').read().reshape(11,-1).T,columns=col_names) for img in patches_imgs[-3:]]  
len(df_imgs)

3

In [26]:
patches_imgs[-3:]

['../features/p0_stack_ee.tif',
 '../features/p2_stack_ee.tif',
 '../features/p3_stack_ee.tif']

In [11]:
df_all = pd.concat(df_imgs)
df_all.shape, df_all.columns

((5135073, 11),
 Index(['lulc', 'blue', 'green', 'red', 'red_e1', 'red_e2', 'red_e3', 'nir1',
        'swir1', 'swir2', 'nir2'],
       dtype='object'))

In [12]:
df_all['NDVI'] = df_all[['nir1','red']].apply(ut.ndvi,axis=1)

In [13]:
df_all['NDWI'] = df_all[['nir1','swir2']].apply(ut.ndwi,axis=1)

In [14]:
df_all.tail()

Unnamed: 0,lulc,blue,green,red,red_e1,red_e2,red_e3,nir1,swir1,swir2,nir2,NDVI,NDWI
2201290,306,348,480,497,966,1573,1777,1994,1751,1040,2073,0.60096,0.31444
2201291,306,310,452,467,949,1598,1834,2010,1790,1079,2078,0.62293,0.30139
2201292,306,314,435,446,888,1483,1700,1855,1798,1104,1932,0.61234,0.2538
2201293,306,317,428,437,864,1440,1641,1762,1795,1102,1878,0.60255,0.23045
2201294,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0.0,0.0


In [15]:
df_all.replace(65535,np.nan,inplace=True)

In [16]:
df_all.isna().sum()

lulc      4891
blue      4990
green     4990
red       4990
red_e1    4990
red_e2    4990
red_e3    4990
nir1      4990
swir1     4990
swir2     4990
nir2      4990
NDVI         0
NDWI         0
dtype: int64

In [17]:
df_all.dropna(inplace=True)
df_all.isna().sum()

lulc      0
blue      0
green     0
red       0
red_e1    0
red_e2    0
red_e3    0
nir1      0
swir1     0
swir2     0
nir2      0
NDVI      0
NDWI      0
dtype: int64

In [18]:
df_all[['lulc', 'blue', 'green', 'red', 'red_e1', 'red_e2', 'red_e3', 'nir1','swir1', 'swir2', 'nir2']]=\
df_all[['lulc', 'blue', 'green', 'red', 'red_e1', 'red_e2', 'red_e3', 'nir1','swir1', 'swir2', 'nir2']].astype('int64')

In [19]:
df_all.lulc.value_counts(normalize=True)

305   0.48235
303   0.35818
304   0.05735
306   0.04144
307   0.02926
302   0.01396
202   0.00528
203   0.00455
501   0.00390
401   0.00208
301   0.00158
403   0.00007
404   0.00000
Name: lulc, dtype: float64

`305`: `forest, shrub (1)`\
`303`: `arable land, orchard (2)`\
`304`: `meadow/grassland, sandy area, stony (pebble) area, open area (3)`\
`306`: `swamp/mire, bog, quaking bog, marshy grassland (4)`\
`307`: `peat field, abandoned peat field (5)`\
`302-401-301-403-404`: `residential building area (6)`\
`202-203`: `lake, artificial lake, pond, etc (7)`\
`501`: `road area or square (8)`

In [20]:
df_all.lulc.replace([202,203],202,inplace=True) # water
df_all.lulc.replace([302,401,301,403,404],302,inplace=True) # residential area

In [21]:
df_all.lulc.unique()

array([305, 306, 304, 303, 501, 302, 202, 307])

In [22]:
labels=[305,303,304,306,307,302,202,501]
codes=[1,2,3,4,5,6,7,8]
labels_dict = {k:i for i,k in zip(codes,labels)}
df_all.lulc = df_all.lulc.map(labels_dict)

In [23]:
df_all.lulc.unique()

array([1, 4, 3, 2, 8, 6, 7, 5])

In [24]:
df_all.to_parquet('../features/training_data_ee_LULC.parquet')

In [25]:
df_all.shape

(5130083, 13)

# Prepare testing data

In [27]:
out_of_sample = pd.DataFrame(rs.open('../features/p1_stack_ee.tif').read().reshape(11,-1).T,columns=col_names)
out_of_sample.shape, out_of_sample.columns

((1245382, 11),
 Index(['lulc', 'blue', 'green', 'red', 'red_e1', 'red_e2', 'red_e3', 'nir1',
        'swir1', 'swir2', 'nir2'],
       dtype='object'))

In [28]:
out_of_sample['NDVI'] = out_of_sample[['nir1','red']].apply(ut.ndvi,axis=1)

In [29]:
out_of_sample['NDWI'] = out_of_sample[['nir1','swir2']].apply(ut.ndwi,axis=1)

In [31]:
out_of_sample.head()

Unnamed: 0,lulc,blue,green,red,red_e1,red_e2,red_e3,nir1,swir1,swir2,nir2,NDVI,NDWI
0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0.0,0.0
1,306,490,691,788,1048,1477,1663,1887,2181,1423,1957,0.41084,0.14018
2,306,475,631,774,1031,1406,1553,1743,2285,1514,1824,0.38498,0.07031
3,306,397,465,639,729,1013,1143,1325,2099,1372,1356,0.34929,24.28217
4,306,320,384,506,614,832,936,1078,2131,1396,1159,0.36111,26.36136


In [33]:
out_of_sample.replace(65535,np.nan,inplace=True)

In [34]:
out_of_sample.isna().sum()

lulc      2232
blue      2232
green     2232
red       2232
red_e1    2232
red_e2    2232
red_e3    2232
nir1      2232
swir1     2232
swir2     2232
nir2      2232
NDVI         0
NDWI         0
dtype: int64

In [35]:
out_of_sample.dropna(inplace=True)
out_of_sample.isna().sum()

lulc      0
blue      0
green     0
red       0
red_e1    0
red_e2    0
red_e3    0
nir1      0
swir1     0
swir2     0
nir2      0
NDVI      0
NDWI      0
dtype: int64

In [36]:
out_of_sample[['lulc', 'blue', 'green', 'red', 'red_e1', 'red_e2', 'red_e3', 'nir1','swir1', 'swir2', 'nir2']]=\
out_of_sample[['lulc', 'blue', 'green', 'red', 'red_e1', 'red_e2', 'red_e3', 'nir1','swir1', 'swir2', 'nir2']].astype('int64')

In [37]:
out_of_sample.lulc.replace([202,203],202,inplace=True) # water
out_of_sample.lulc.replace([302,401,301,403,404],302,inplace=True) # residential area

In [38]:
out_of_sample.lulc.unique()

array([306, 305, 304, 307, 303, 302, 202, 501])

In [39]:
labels=[305,303,304,306,307,302,202,501]
codes=[1,2,3,4,5,6,7,8]
labels_dict = {k:i for i,k in zip(codes,labels)}
out_of_sample.lulc = out_of_sample.lulc.map(labels_dict)

In [40]:
out_of_sample.lulc.unique()

array([4, 1, 3, 5, 2, 6, 7, 8])

In [41]:
out_of_sample.to_parquet('../features/testing_data_ee_LULC.parquet')

In [42]:
out_of_sample.shape

(1243150, 13)