## using NDVI for feature reduction

### NDVI = (NIR - RED) / (RED + NIR)

### NDVI = (BAND5 - BAND1) / (BAND1 + BAND5)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import h5py
import pandas as pd
import scipy
from PIL import Image
from scipy import ndimage
import gdal
import os
import geopandas as gpd
from skimage import io
from skimage.io import imread
%matplotlib inline

In [2]:
np.random.seed(1)

In [3]:
from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import KMeans



# reading the masked tif images and forming a dataset.
### here we are only reading the masked images which are of concern to us, that is, band1(red) and band5(CIR) for each image

In [4]:
path = ["E:\\Internship_Harvesting\\Dataset\\Bathinda_Cropland\\Masked_images\\I1B1.tif",
       "E:\\Internship_Harvesting\\Dataset\\Bathinda_Cropland\\Masked_images\\I1B5.tif",
       "E:\\Internship_Harvesting\\Dataset\\Bathinda_Cropland\\Masked_images\\I2B1.tif",
       "E:\\Internship_Harvesting\\Dataset\\Bathinda_Cropland\\Masked_images\\I2B5.tif",
       "E:\\Internship_Harvesting\\Dataset\\Bathinda_Cropland\\Masked_images\\I3B1.tif",
       "E:\\Internship_Harvesting\\Dataset\\Bathinda_Cropland\\Masked_images\\I3B5.tif",
       "E:\\Internship_Harvesting\\Dataset\\Bathinda_Cropland\\Masked_images\\I4B1.tif",
       "E:\\Internship_Harvesting\\Dataset\\Bathinda_Cropland\\Masked_images\\I4B5.tif",
       "E:\\Internship_Harvesting\\Dataset\\Bathinda_Cropland\\Masked_images\\I5B1.tif",
       "E:\\Internship_Harvesting\\Dataset\\Bathinda_Cropland\\Masked_images\\I5B5.tif",
       "E:\\Internship_Harvesting\\Dataset\\Bathinda_Cropland\\Masked_images\\I6B1.tif",
       "E:\\Internship_Harvesting\\Dataset\\Bathinda_Cropland\\Masked_images\\I6B5.tif"]

In [5]:
I1B1 = gdal.Open(path[0])
I1B1

<osgeo.gdal.Dataset; proxy of <Swig Object of type 'GDALDatasetShadow *' at 0x0000023273499090> >

In [6]:
I1B1.RasterYSize, I1B1.RasterXSize, I1B1.RasterCount

(8963, 8298, 1)

In [7]:
I1B1_array = I1B1.GetRasterBand(1).ReadAsArray()
I1B1_array

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [8]:
np.count_nonzero(np.isnan(I1B1_array)), np.count_nonzero(~np.isnan(I1B1_array))

(40574909, 33800065)

# creating an array which only consists the non-nan values indices

In [9]:
# I1B1_nan_index = np.argwhere(np.isnan(I1B1))

I1B1_non_nan_index = np.argwhere(~np.isnan(I1B1_array))

In [10]:
I1B1_non_nan_index

array([[   0, 5738],
       [   0, 5739],
       [   0, 5740],
       ...,
       [8962, 5788],
       [8962, 5789],
       [8962, 5790]], dtype=int64)

In [11]:
I1B1_array[0, 5738], I1B1_array[0, 5739], I1B1_array[8962, 5789], I1B1_array[8962, 5790]

(1249.0, 1242.0, 1248.0, 1237.0)

In [12]:
I1B1_non_nan_index.shape

(33800065, 2)

# we have indices now we will make the dataset using this indices
### all the images of our concern, band1 and band5 of each image and concatenating them in one array we would have an array with 12 bands and 33800065 pixel values(all non-nan or agriculture)

In [13]:
test_x = np.zeros((1, 33800065))

In [14]:
for p in path:
    print(p)
    
    image = gdal.Open(p)
    
    image_array = image.GetRasterBand(1).ReadAsArray()
    print(image_array.shape)
    sample = []
    
    for i in range(I1B1_non_nan_index.shape[0]):
        sample.append(image_array[I1B1_non_nan_index[i,0], I1B1_non_nan_index[i,1]])
        
    sample = np.array(sample)
    
    sample = sample.reshape(1, 33800065)
        
    test_x = np.concatenate((sample, test_x), axis = 0)
    
    print(test_x.shape)
    
    print("***************")
    

E:\Internship_Harvesting\Dataset\Bathinda_Cropland\Masked_images\I1B1.tif
(8963, 8298)
(2, 33800065)
***************
E:\Internship_Harvesting\Dataset\Bathinda_Cropland\Masked_images\I1B5.tif
(8963, 8298)
(3, 33800065)
***************
E:\Internship_Harvesting\Dataset\Bathinda_Cropland\Masked_images\I2B1.tif
(8963, 8298)
(4, 33800065)
***************
E:\Internship_Harvesting\Dataset\Bathinda_Cropland\Masked_images\I2B5.tif
(8963, 8298)
(5, 33800065)
***************
E:\Internship_Harvesting\Dataset\Bathinda_Cropland\Masked_images\I3B1.tif
(8963, 8298)
(6, 33800065)
***************
E:\Internship_Harvesting\Dataset\Bathinda_Cropland\Masked_images\I3B5.tif
(8963, 8298)
(7, 33800065)
***************
E:\Internship_Harvesting\Dataset\Bathinda_Cropland\Masked_images\I4B1.tif
(8963, 8298)
(8, 33800065)
***************
E:\Internship_Harvesting\Dataset\Bathinda_Cropland\Masked_images\I4B5.tif
(8963, 8298)
(9, 33800065)
***************
E:\Internship_Harvesting\Dataset\Bathinda_Cropland\Masked_images

In [15]:
test_x = test_x[:12, :]

In [16]:
test_x.shape

(12, 33800065)

In [17]:
test_x

array([[2929.5, 3312. , 3197.5, ..., 3368.5, 3290. , 3261. ],
       [1082.5, 1059. , 1041. , ...,  992.5,  987.5,  996.5],
       [2953. , 3448. , 3508. , ..., 3022. , 2969.5, 2969. ],
       ...,
       [1447. , 1467. , 1492. , ..., 1253. , 1236. , 1277. ],
       [2622. , 2757. , 2787. , ..., 2800. , 2755.5, 2731. ],
       [1249. , 1242. , 1209. , ..., 1251. , 1248. , 1237. ]])

## forming an array which consist NDVI of each image.
### using the array formed above we will use the NDVI formula for calculating NDVI for each image, that would yield us an array of 33800065 x 6 (6 meaning one for each image) and (33800065 meaning all the non-nan pixels or agriculture pixels)

In [18]:
test_NDVI = np.zeros((6, 33800065))

In [19]:
for i in range(test_NDVI.shape[0]):
    
    test_NDVI[i, :] = (test_x[i*2+1, :] - test_x[i*2, :]) / (test_x[i*2+1, :] + test_x[i*2, :])

test_NDVI

array([[-0.46036889, -0.51544269, -0.50878849, ..., -0.54482917,
        -0.53828171, -0.53188491],
       [-0.415628  , -0.48781014, -0.50461077, ..., -0.44731801,
        -0.44518798, -0.4458242 ],
       [-0.33561254, -0.40274196, -0.39168063, ..., -0.25868726,
        -0.25013001, -0.24177546],
       [ 0.08495947,  0.07220217,  0.07153637, ..., -0.04786463,
        -0.05802489, -0.05458058],
       [-0.11727924, -0.1212938 , -0.13744761, ..., -0.3277897 ,
        -0.32935431, -0.3141783 ],
       [-0.35468871, -0.37884471, -0.39489489, ..., -0.38237472,
        -0.37654552, -0.3765121 ]])

In [20]:
test_NDVI = test_NDVI.T

In [21]:
test_NDVI.shape

(33800065, 6)

# we have our test data, we will apply K-means unsupervised algorithm on this and capture details

# https://towardsdatascience.com/machine-learning-algorithms-part-9-k-means-example-in-python-f2ad05ed5203

# reference for the below mentioned code

In [22]:
kmeans = KMeans(n_clusters=2, init='k-means++', max_iter=50, n_init=5, random_state=0)
test_NDVI_predict = kmeans.fit_predict(test_NDVI)

In [23]:
np.unique(test_NDVI_predict)

array([0, 1])

In [24]:
test_NDVI_predict.shape, test_NDVI.shape

((33800065,), (33800065, 6))

In [25]:
test_NDVI_predict = test_NDVI_predict.reshape(33800065, 1)
test_NDVI_predict.shape

(33800065, 1)

### since we have got the prediction of the agriculture pixels, we would map it to corresponding index

In [26]:
I1B1_non_nan_index.shape

(33800065, 2)

In [27]:
I1B1_non_nan_index

array([[   0, 5738],
       [   0, 5739],
       [   0, 5740],
       ...,
       [8962, 5788],
       [8962, 5789],
       [8962, 5790]], dtype=int64)

In [28]:
result_index = np.concatenate((I1B1_non_nan_index, test_NDVI_predict), axis = 1)
result_index

array([[   0, 5738,    0],
       [   0, 5739,    0],
       [   0, 5740,    0],
       ...,
       [8962, 5788,    1],
       [8962, 5789,    1],
       [8962, 5790,    1]], dtype=int64)

### here, result_index is an array which consists the x_index as first column, y_index as 2nd column and its value(0/1) as the 3rd column

In [29]:
result_index.shape

(33800065, 3)

### mapping result_index to a 8963 x 8298 dimension which can be later saved in the form of an image

In [30]:
result = np.zeros((8963, 8298))

In [31]:
result[:] = 128

In [32]:
result = result.astype("int")

In [33]:
for i in range(result_index.shape[0]):
    result[result_index[i,0], result_index[i,1]] = result_index[i, 2]

In [34]:
result

array([[128, 128, 128, ..., 128, 128, 128],
       [128, 128, 128, ..., 128, 128, 128],
       [128, 128, 128, ..., 128, 128, 128],
       ...,
       [128, 128, 128, ..., 128, 128, 128],
       [128, 128, 128, ..., 128, 128, 128],
       [128, 128, 128, ..., 128, 128, 128]])

In [35]:
np.unique(result)

array([  0,   1, 128])

In [36]:
result[result[:] == 0] = 255

In [37]:
np.unique(result)

array([  1, 128, 255])

In [38]:
result[result[:] == 1] = 0

In [39]:
np.unique(result)

array([  0, 128, 255])

### here,
### 0 --> 1 during classification(minority, non-wheat)
### 255 --> 0 during classification(majority, wheat)
### 128 --> nan values (non-agri plus outside boundary, nan)

In [41]:
# number of pixels belonging to majority class
np.count_nonzero(result == 255)

23206785

In [42]:
# number of pixels belonging to minority class
np.count_nonzero(result == 0)

10593280

In [43]:
# number of pixels belonging to non-agriculture class
np.count_nonzero(result == 128)

40574909

In [44]:
result = result.astype("uint8")

In [45]:
result.shape

(8963, 8298)

### saving result array in .jpg and .tif form

In [63]:
io.imsave("E:\\Internship_Harvesting\\Unsupervised_Result_Image\\NDVI_red_kmeans_result.jpg", result)

In [46]:
io.imsave("E:\\Internship_Harvesting\\Unsupervised_Result_Image\\NDVI_red_kmeans_result.tif", result)