# Collect Sentinel-2 data

- Uses the land cover labels provided by Nuno (Binga).
- The label file contains only the class labels; this notebook collects the corresponding Sentinel-2 band values for each labelled point.

In [None]:
from google.colab import drive
import ee
import numpy as np
import pandas as pd
import geopandas as gpd
import geemap
import time

## Setup

In [None]:
# Mount Google Drive so the project folder under MyDrive is reachable from Colab.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project folder so the relative shapefile/CSV paths used below resolve.
%cd /content/drive/MyDrive/land_cover_classification_kaza/automl

/content/drive/MyDrive/land_cover_classification_kaza/automl


In [None]:
# Run the Earth Engine authentication flow for this session.
ee.Authenticate()

In [None]:
# Initialise the Earth Engine client against the 'ee-alexvmt' project.
ee.Initialize(project='ee-alexvmt')

## Load and clean labels

In [None]:
# Base name shared by the input shapefile (<file_name>.shp) and the exported CSV (<file_name>.csv).
file_name = 'RND_Binga_1000'
# Read the labelled points into a GeoDataFrame.
gdf = gpd.read_file('{}.shp'.format(file_name))

In [None]:
# Preview the raw labels as loaded from the shapefile.
gdf

Unnamed: 0,id,LC2023,LC2022,LC2021,LC2020,Comments,LC2023_Num,LC2023_N2,geometry
0,0.0,Shrub,,,,,5.0,5.0,POINT (27.77476 -17.22082)
1,1.0,Forest,,,,,3.0,3.0,POINT (27.80260 -18.07115)
2,2.0,Crop,,,,,2.0,2.0,POINT (28.06489 -17.48651)
3,3.0,Shrub,,,,,5.0,5.0,POINT (27.68538 -17.65882)
4,4.0,Forest,,,,,3.0,3.0,POINT (27.21559 -18.16223)
...,...,...,...,...,...,...,...,...,...
1260,,Crop,,,,,,,POINT (27.36553 -17.73099)
1261,,Crop,,,,,,,POINT (27.35226 -17.73310)
1262,,Crop,,,,,,,POINT (27.36824 -17.72848)
1263,,Crop,,,,,,,POINT (27.39716 -17.69348)


In [None]:
# List the rows that cannot be used because the 2023 label or the point geometry is missing.
gdf.loc[gdf['LC2023'].isna() | gdf['geometry'].isna()]

Unnamed: 0,id,LC2023,LC2022,LC2021,LC2020,Comments,LC2023_Num,LC2023_N2,geometry
1000,0.0,Grassland,,,,,4.0,4.0,
1239,,,Crop,,,,,,POINT (27.45679 -17.69189)


In [None]:
# Row/column count before cleaning.
gdf.shape

(1265, 9)

In [None]:
# Keep only rows that have both a 2023 label and a point geometry, then renumber the index from 0.
gdf = gdf.loc[~(gdf['LC2023'].isna() | gdf['geometry'].isna())].reset_index(drop=True)

In [None]:
# Row/column count after cleaning.
gdf.shape

(1263, 9)

In [None]:
# Number of labelled points per 2023 land cover class.
gdf['LC2023'].value_counts()

Forest       317
Shrub        304
Crop         223
Water        113
BuiltUp      105
Grassland     77
Wetlands      65
Bare          59
Name: LC2023, dtype: int64

In [None]:
# Alphabetically sorted list of the distinct 2023 land cover labels.
land_cover_classes = sorted(gdf['LC2023'].unique())
land_cover_classes

['Bare',
 'BuiltUp',
 'Crop',
 'Forest',
 'Grassland',
 'Shrub',
 'Water',
 'Wetlands']

In [None]:
# Integer code for each land cover class: names in alphabetical order, numbered from 0.
classes_dict = {
    name: code
    for code, name in enumerate(
        ['Bare', 'BuiltUp', 'Crop', 'Forest', 'Grassland', 'Shrub', 'Water', 'Wetlands']
    )
}

In [None]:
# Translate each label into its integer code from classes_dict.
# NOTE(review): a label that is not a key of classes_dict would map to NaN rather than raise.
gdf['class'] = gdf['LC2023'].map(classes_dict)

In [None]:
# Preview the cleaned labels with the new 'class' column.
gdf

Unnamed: 0,id,LC2023,LC2022,LC2021,LC2020,Comments,LC2023_Num,LC2023_N2,geometry,class
0,0.0,Shrub,,,,,5.0,5.0,POINT (27.77476 -17.22082),5
1,1.0,Forest,,,,,3.0,3.0,POINT (27.80260 -18.07115),3
2,2.0,Crop,,,,,2.0,2.0,POINT (28.06489 -17.48651),2
3,3.0,Shrub,,,,,5.0,5.0,POINT (27.68538 -17.65882),5
4,4.0,Forest,,,,,3.0,3.0,POINT (27.21559 -18.16223),3
...,...,...,...,...,...,...,...,...,...,...
1258,,Crop,,,,,,,POINT (27.36553 -17.73099),2
1259,,Crop,,,,,,,POINT (27.35226 -17.73310),2
1260,,Crop,,,,,,,POINT (27.36824 -17.72848),2
1261,,Crop,,,,,,,POINT (27.39716 -17.69348),2


## Load and prepare Sentinel-2 data

In [None]:
# Acquisition window used to filter the Sentinel-2 collection (calendar year 2023, matching the LC2023 labels).
start_date, end_date = '2023-01-01', '2023-12-31'

In [None]:
# Sentinel-2 bands selected from the collection and exported as feature columns.
bands = [
    'B2', 'B3', 'B4',
    'B5', 'B6', 'B7',
    'B8', 'B8A',
    'B11', 'B12',
]

In [None]:
# Open questions for the compositing step:
# TODO: filter to less cloudy images (e.g. at most 20 % cloud cover)?
# TODO: mask clouds?
# TODO: build separate median composites for the rainy and the dry season
#       (rainy season: November to March/April).
#
# Per-band median over every image in the date window.
# filterDate() treats its end date as exclusive, so advance end_date by one day
# to keep images acquired on end_date itself in the composite.
s2 = (
    ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
    .filterDate(ee.Date(start_date), ee.Date(end_date).advance(1, 'day'))
    .select(bands)
    .median()
)

## Collect and export data

In [None]:
def get_band_values_from_points(columns, gdf, image, chunks):
  """Sample `image` at the points in `gdf` and return the values as a DataFrame.

  The rows of `gdf` are split into `chunks` roughly equal batches. Each batch
  is converted to an Earth Engine feature collection, sampled with
  `image.sampleRegions` at a 10 m scale, and pulled back into pandas.

  Args:
    columns: Column names that the result must contain; they are placed first
      and are filled with NaN when Earth Engine returned no such property.
    gdf: GeoDataFrame with the geometries to sample.
    image: Object exposing `sampleRegions(collection=..., scale=...)`, e.g. an
      `ee.Image`.
    chunks: Number of batches to split `gdf` into (must be > 0).

  Returns:
    A DataFrame with a fresh 0..n-1 index holding `columns` followed by any
    additional properties returned for the sampled features.

  NOTE(review): sampleRegions may omit points that fall on masked pixels, so
  the result can have fewer rows than `gdf` - confirm against the output.
  """
  # Split by row position instead of handing the GeoDataFrame itself to
  # np.array_split, which recent pandas versions warn about.
  row_groups = np.array_split(np.arange(len(gdf)), chunks)

  frames = []
  for i, rows in enumerate(row_groups):
    if rows.size == 0:
      # More chunks than points: there is nothing to send for this batch.
      continue
    print('Processing chunk {}/{}...'.format(i+1, chunks))
    fc = geemap.geopandas_to_ee(gdf.iloc[rows])
    fc_with_bands = image.sampleRegions(collection=fc, scale=10)
    frames.append(geemap.ee_to_pandas(fc_with_bands))

  if not frames:
    return pd.DataFrame(columns=columns)

  # Concatenate once at the end instead of growing a DataFrame inside the loop.
  df_with_bands = pd.concat(frames, ignore_index=True)

  # Requested columns first, then whatever else came back, as before.
  extra_columns = [c for c in df_with_bands.columns if c not in columns]
  return df_with_bands.reindex(columns=list(columns) + extra_columns)

In [None]:
# Columns kept in the export: the band values, then the numeric class code and the text label.
columns = [*bands, 'class', 'LC2023']

In [None]:
# Time the sampling run; the points are processed in 10 chunks.
start_time = time.perf_counter()
df_with_bands = get_band_values_from_points(columns, gdf, s2, 10)
end_time = time.perf_counter()
# Convert the elapsed seconds to minutes, rounded to two decimals.
run_time = round((end_time - start_time) / 60, 2)
print('Run time: {} minutes.'.format(run_time))

In [None]:
# Restrict the table to the export columns, in the order given by `columns`, and preview it.
df_with_bands = df_with_bands.loc[:, columns]
df_with_bands

Unnamed: 0,B2,B3,B4,B5,B6,B7,B8,B8A,B11,B12,class,LC2023
0,542.294118,686.076923,869.000000,1314.500000,1749.250000,1914.444444,1948.250000,2277.333333,2616.000000,1867.000000,5,Shrub
1,566.842105,808.500000,1061.200000,1638.714286,2009.000000,2412.250000,2648.750000,2837.666667,3107.000000,1995.800000,3,Forest
2,1250.285714,1635.000000,2214.000000,2658.333333,2989.333333,3210.000000,3216.000000,3572.000000,4027.000000,2747.000000,2,Crop
3,673.200000,861.000000,1196.000000,1561.400000,1857.892857,2120.166667,2334.400000,2563.433333,2975.250000,2087.666667,5,Shrub
4,795.500000,991.000000,1188.125000,1569.285714,2205.368421,2464.100000,2594.357143,2784.600000,2832.400000,2037.833333,3,Forest
...,...,...,...,...,...,...,...,...,...,...,...,...
1258,921.880000,1497.058824,2719.764706,3164.312500,3425.750000,3591.891667,3683.636364,3777.244444,5072.571429,4572.187500,2,Crop
1259,795.681818,1210.539130,2206.444444,2655.533333,2907.400000,3107.285714,3170.500000,3291.800000,4558.333333,3892.666667,2,Crop
1260,854.928571,1364.608696,2593.454545,3039.000000,3304.100000,3490.909091,3549.142857,3736.571429,4980.000000,4364.250000,2,Crop
1261,796.954545,1239.200000,2445.200000,2837.000000,2992.666667,3182.000000,3275.200000,3427.000000,4977.400000,4601.750000,2,Crop


In [None]:
# Write the sampled table to <file_name>.csv in the working directory, without the index column.
df_with_bands.to_csv('{}.csv'.format(file_name), index=False)