In [12]:
import sys  
sys.path.insert(0, '../src')
import utils as ut
import rasterio as rs
from rasterio.windows import from_bounds
import numpy as np
import pandas as pd

import os, sys, warnings
from tqdm import tqdm

In [13]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. `utils` from ../src) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
# NOTE(review): %pylab star-imports numpy and matplotlib into the interactive
# namespace (see the "Populating the interactive namespace" message it prints).
# `plt` on the next line is not imported anywhere else in this notebook, so it
# only exists because of this magic; any later cell that uses bare numpy or
# matplotlib names without a module prefix depends on this cell having run.
%pylab inline
plt.style.use("bmh")

Populating the interactive namespace from numpy and matplotlib


In [15]:
# Render floats in displayed frames/series with exactly five decimal places.
pd.set_option('display.float_format', '{:.5f}'.format)

In [16]:
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [17]:
# Directory (relative to this notebook) that is searched for the per-patch
# `*stack_ee.tif` feature rasters in the next cell.
data_path = '../features/'

In [18]:
# List the feature rasters under `data_path` whose names end in 'stack_ee.tif'
# (presumably what the helper's `endswith` argument filters on — confirm in utils).
# NOTE(review): the returned order is not sorted (p03, p02, p04, p01 in the run
# shown below), so any positional slicing of this list depends on that order.
patches_imgs = ut.list_files_with_absolute_paths(data_path,endswith='stack_ee.tif')
patches_imgs

['../features/p03_stack_ee.tif',
 '../features/p02_stack_ee.tif',
 '../features/p04_stack_ee.tif',
 '../features/p01_stack_ee.tif']

In [19]:
# Column names assigned, in order, to the 19 bands read from each stacked raster.
col_names = [
    # derived indices
    "ndvi", "ndre", "evi", "sbi", "gvi", "wet", "satvi", "ndmi",
    # land-use / land-cover label
    "lulc",
    # spectral bands
    "blue", "green", "red",
    "red_e1", "red_e2", "red_e3",
    "nir1", "swir1", "swir2", "nir2",
]

# Prepare training data

In [20]:
# Preview the last three listed patches (p02, p04, p01 in the run shown);
# p03 is read separately in the testing section.
patches_imgs[-3:]

['../features/p02_stack_ee.tif',
 '../features/p04_stack_ee.tif',
 '../features/p01_stack_ee.tif']

In [78]:
# Build one pixel table per training patch: each raster band becomes a column,
# each pixel a row.
#
# The training patches are chosen by file name (everything except the hold-out
# patch that the testing section reads) rather than by position: the listing in
# `patches_imgs` is not sorted, so a positional slice could silently pull the
# hold-out patch into the training set if the directory order changed.
holdout_name = 'p03_stack_ee.tif'
train_imgs = [img for img in patches_imgs if os.path.basename(img) != holdout_name]

df_imgs = []
for img in train_imgs:
    # The context manager closes the raster handle as soon as the pixels are read.
    with rs.open(img) as src:
        bands = src.read()  # band axis first; the reshape below relies on that
    # Flatten the spatial dimensions. Using the raster's real band count means a
    # band/column mismatch raises in the DataFrame constructor instead of being
    # reshaped into the wrong layout.
    df_imgs.append(pd.DataFrame(bands.reshape(bands.shape[0], -1).T, columns=col_names))
len(df_imgs)

3

In [79]:
# Stack the per-patch pixel tables row-wise into a single training frame and
# show its shape and column names.
# NOTE(review): each patch keeps its own row labels (no ignore_index), so index
# values repeat across patches — the tail shown further down ends at label
# 1059737 although the frame has 4506415 rows.
df_all = pd.concat(df_imgs)
df_all.shape, df_all.columns

((4506415, 19),
 Index(['ndvi', 'ndre', 'evi', 'sbi', 'gvi', 'wet', 'satvi', 'ndmi', 'lulc',
        'blue', 'green', 'red', 'red_e1', 'red_e2', 'red_e3', 'nir1', 'swir1',
        'swir2', 'nir2'],
       dtype='object'))

In [80]:
df_all.tail()

Unnamed: 0,ndvi,ndre,evi,sbi,gvi,wet,satvi,ndmi,lulc,blue,green,red,red_e1,red_e2,red_e3,nir1,swir1,swir2,nir2
1059733,0.78243,0.62744,2.18628,2128.4458,892.74548,-430.02701,-1185.73657,0.70341,24.0,265.0,330.0,211.0,497.0,1730.0,2182.0,2171.0,871.0,378.0,2372.0
1059734,0.79396,0.62939,2.25899,2200.91162,929.11627,-469.6123,-1210.11499,0.67468,24.0,256.0,356.0,211.0,507.0,1739.0,2210.0,2229.0,905.0,433.0,2421.0
1059735,0.80643,0.64031,2.40902,2314.8186,1061.11938,-384.5195,-1190.11182,0.69198,24.0,259.0,375.0,231.0,530.0,1788.0,2186.0,2417.0,885.0,440.0,2381.0
1059736,0.80385,0.62212,2.44157,2325.33228,1026.09802,-479.10431,-1263.08972,0.67966,24.0,260.0,331.0,236.0,557.0,1937.0,2322.0,2391.0,991.0,456.0,2527.0
1059737,0.0,0.0,-0.0,151405.51562,-29071.32617,-9941.65918,-32767.5,0.0,24.0,65535.0,65535.0,65535.0,65535.0,65535.0,65535.0,65535.0,65535.0,65535.0,65535.0


In [81]:
# Turn the 65535 fill value (see the last row of the tail above) and +/-infinity
# into NaN so the affected pixels can be dropped in a later cell.
# `np.inf` is used instead of a bare `inf`, which only exists when `%pylab`
# has populated the namespace.
df_all.replace([65535, np.inf, -np.inf], np.nan, inplace=True)

In [82]:
df_all.isna().sum()

ndvi         0
ndre         0
evi        180
sbi          0
gvi          0
wet          0
satvi        0
ndmi         1
lulc         0
blue      5842
green     5842
red       5842
red_e1    5842
red_e2    5842
red_e3    5842
nir1      5842
swir1     5842
swir2     5842
nir2      5842
dtype: int64

In [84]:
# Discard every pixel (row) that holds a NaN in any column, then display the
# remaining per-column NaN counts (all zeros expected).
df_all.dropna(axis="index", how="any", inplace=True)
df_all.isnull().sum(axis=0)

ndvi      0
ndre      0
evi       0
sbi       0
gvi       0
wet       0
satvi     0
ndmi      0
lulc      0
blue      0
green     0
red       0
red_e1    0
red_e2    0
red_e3    0
nir1      0
swir1     0
swir2     0
nir2      0
dtype: int64

In [85]:
# The class label and the raw band values are whole numbers; store them as int64.
# The derived index columns are left as floats.
integer_columns = [
    'lulc',
    'blue', 'green', 'red',
    'red_e1', 'red_e2', 'red_e3',
    'nir1', 'swir1', 'swir2', 'nir2',
]
df_all[integer_columns] = df_all[integer_columns].astype('int64')

In [86]:
df_all.lulc.value_counts(normalize=True)

25   0.25567
12   0.22788
24   0.16733
29   0.09576
21   0.08348
36   0.05973
18   0.04389
23   0.02655
20   0.02311
35   0.00804
3    0.00341
2    0.00303
41   0.00211
Name: lulc, dtype: float64

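Original LULC codes are merged into six classes; the number in parentheses is the class id assigned by the mapping cell below:

`23-24-25-29`: `forest (1)`\
`12`: `Non-irrigated arable land (3)`\
`18-20-21`: `agricultural areas (4)`\
`35-36`: `Wetlands (2)`\
`2-3`: `Urban fabric (5)`\
`41`: `Water bodies (6)`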

In [87]:
# Collapse fine-grained LULC codes onto one representative code per group.
# The result is assigned back to the column explicitly: calling
# `replace(..., inplace=True)` on `df_all.lulc` acts on an intermediate Series,
# which warns in recent pandas and does not update the frame under Copy-on-Write.
lulc_merge = {
    24: 23, 25: 23, 29: 23,   # forest            -> 23
    20: 18, 21: 18,           # agricultural areas -> 18
    36: 35,                   # Wetlands           -> 35
    3: 2,                     # Urban fabric       -> 2
}
df_all['lulc'] = df_all['lulc'].replace(lulc_merge)

In [88]:
df_all.lulc.unique()

array([23, 35, 12, 18,  2, 41])

In [89]:
# Encode the merged LULC codes as consecutive class ids:
#   23 -> 1 (forest), 35 -> 2 (wetlands), 12 -> 3 (non-irrigated arable land),
#   18 -> 4 (agricultural areas), 2 -> 5 (urban fabric), 41 -> 6 (water bodies)
# Any code missing from the dictionary becomes NaN under Series.map.
labels = [23, 35, 12, 18, 2, 41]
codes = list(range(1, len(labels) + 1))
labels_dict = dict(zip(labels, codes))
df_all['lulc'] = df_all['lulc'].map(labels_dict)

In [90]:
df_all.lulc.unique()

array([1, 2, 3, 4, 5, 6])

In [91]:
# Write the cleaned, class-encoded training table to Parquet.
df_all.to_parquet('../features/training_data_ee_LULC.parquet')

In [92]:
df_all.shape

(4500392, 19)

# Prepare testing data

In [93]:
# Read the hold-out patch (p03) into a pixel table: one row per pixel, one
# column per band. The context manager closes the raster handle once the
# pixels are in memory.
with rs.open('../features/p03_stack_ee.tif') as src:
    bands = src.read()  # band axis first; the reshape below relies on that
# Using the raster's real band count means a band/column mismatch raises in the
# DataFrame constructor instead of being reshaped into the wrong layout.
out_of_sample = pd.DataFrame(bands.reshape(bands.shape[0], -1).T, columns=col_names)
out_of_sample.shape, out_of_sample.columns

((1874040, 19),
 Index(['ndvi', 'ndre', 'evi', 'sbi', 'gvi', 'wet', 'satvi', 'ndmi', 'lulc',
        'blue', 'green', 'red', 'red_e1', 'red_e2', 'red_e3', 'nir1', 'swir1',
        'swir2', 'nir2'],
       dtype='object'))

In [94]:
out_of_sample.tail()

Unnamed: 0,ndvi,ndre,evi,sbi,gvi,wet,satvi,ndmi,lulc,blue,green,red,red_e1,red_e2,red_e3,nir1,swir1,swir2,nir2
1874035,0.84997,0.64323,2.29542,3012.896,1395.21082,-808.2771,-1771.87024,0.66862,23.0,254.0,376.0,203.0,680.0,2535.0,3225.0,3132.0,1389.0,622.0,3545.0
1874036,0.8634,0.6563,2.41563,3176.22827,1556.4845,-754.27887,-1810.85461,0.68897,23.0,248.0,400.0,217.0,702.0,2650.0,3371.0,3383.0,1399.0,623.0,3623.0
1874037,0.89383,0.69917,2.52629,3693.66162,2074.66162,-636.60321,-1959.29358,0.73016,23.0,234.0,438.0,224.0,739.0,2909.0,3754.0,4174.0,1500.0,651.0,3920.0
1874038,0.89505,0.7366,2.36102,3856.20142,2254.59985,-622.29742,-1998.28564,0.72981,23.0,245.0,407.0,196.0,671.0,2800.0,3627.0,4424.0,1625.0,691.0,3998.0
1874039,0.88711,0.70675,2.41761,3193.30542,1712.37341,-660.83582,-1756.27905,0.7084,23.0,211.0,385.0,182.0,606.0,2655.0,3215.0,3527.0,1391.0,602.0,3514.0


In [95]:
# Turn the 65535 fill value and +/-infinity into NaN so the affected pixels can
# be dropped in a later cell. `np.inf` is used instead of a bare `inf`, which
# only exists when `%pylab` has populated the namespace.
out_of_sample.replace([65535, np.inf, -np.inf], np.nan, inplace=True)

In [96]:
out_of_sample.isna().sum()

ndvi         0
ndre         1
evi         52
sbi          0
gvi          0
wet          0
satvi        0
ndmi         1
lulc         0
blue      1496
green     1496
red       1496
red_e1    1496
red_e2    1496
red_e3    1496
nir1      1496
swir1     1496
swir2     1496
nir2      1496
dtype: int64

In [97]:
# Discard every pixel (row) that holds a NaN in any column, then display the
# remaining per-column NaN counts (all zeros expected).
out_of_sample.dropna(axis="index", how="any", inplace=True)
out_of_sample.isnull().sum(axis=0)

ndvi      0
ndre      0
evi       0
sbi       0
gvi       0
wet       0
satvi     0
ndmi      0
lulc      0
blue      0
green     0
red       0
red_e1    0
red_e2    0
red_e3    0
nir1      0
swir1     0
swir2     0
nir2      0
dtype: int64

In [98]:
# The class label and the raw band values are whole numbers; store them as int64.
# The derived index columns are left as floats.
integer_columns = [
    'lulc',
    'blue', 'green', 'red',
    'red_e1', 'red_e2', 'red_e3',
    'nir1', 'swir1', 'swir2', 'nir2',
]
out_of_sample[integer_columns] = out_of_sample[integer_columns].astype('int64')

In [99]:
# Collapse fine-grained LULC codes onto one representative code per group.
# The result is assigned back to the column explicitly: calling
# `replace(..., inplace=True)` on `out_of_sample.lulc` acts on an intermediate
# Series, which warns in recent pandas and does not update the frame under
# Copy-on-Write.
lulc_merge = {
    24: 23, 25: 23, 29: 23,   # forest            -> 23
    20: 18, 21: 18,           # agricultural areas -> 18
    36: 35,                   # Wetlands           -> 35
    3: 2, 7: 2,               # Urban fabric       -> 2
}
out_of_sample['lulc'] = out_of_sample['lulc'].replace(lulc_merge)

In [100]:
out_of_sample.lulc.unique()

array([23, 12, 18,  2, 35])

In [101]:
# Encode the merged LULC codes as consecutive class ids:
#   23 -> 1 (forest), 35 -> 2 (wetlands), 12 -> 3 (non-irrigated arable land),
#   18 -> 4 (agricultural areas), 2 -> 5 (urban fabric), 41 -> 6 (water bodies)
# Any code missing from the dictionary becomes NaN under Series.map.
labels = [23, 35, 12, 18, 2, 41]
codes = list(range(1, len(labels) + 1))
labels_dict = dict(zip(labels, codes))
out_of_sample['lulc'] = out_of_sample['lulc'].map(labels_dict)

In [102]:
out_of_sample.lulc.unique()

array([1, 3, 4, 5, 2])

In [103]:
# Write the cleaned, class-encoded hold-out (p03) table to Parquet.
out_of_sample.to_parquet('../features/testing_data_ee_LULC.parquet')

In [104]:
out_of_sample.shape

(1872491, 19)