# Collect Sentinel-2 data

- using the labels provided by Nuno (Mufunta)
- the labels are there but the actual band values need to be collected using this notebook

In [1]:
from google.colab import drive
import ee
import numpy as np
import pandas as pd
import time
import json
import geopandas as gpd
from shapely.geometry import Point
import geemap

## Setup

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd /content/drive/MyDrive/land_cover_classification_kaza

/content/drive/MyDrive/land_cover_classification_kaza


In [4]:
ee.Authenticate()

True

In [5]:
ee.Initialize(project='ee-alexvmt')

## Load and clean labels

In [6]:
file_name = 'data.csv'
df = pd.read_csv('data/{}'.format(file_name))
df.shape

(267431, 5)

In [7]:
df

Unnamed: 0,Set,LC_Nr,LC_Out,Landcover,.geo
0,train,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
1,train,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
2,train,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
3,train,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
4,train,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
...,...,...,...,...,...
267426,test,8,Wetland,Wetland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
267427,test,8,Wetland,Wetland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
267428,test,8,Wetland,Wetland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
267429,test,8,Wetland,Wetland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."


In [8]:
# remove land cover class deforestation
df = df[df['LC_Nr'] != 2]
df.shape

(260580, 5)

In [9]:
df['Landcover'] = df['Landcover'].str.capitalize()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Landcover'] = df['Landcover'].str.capitalize()


In [10]:
df['lon'] = df['.geo'].apply(lambda x: json.loads(x)['coordinates'][0])
df['lat'] = df['.geo'].apply(lambda x: json.loads(x)['coordinates'][1])
geometry = [Point(xy) for xy in zip(df['lon'], df['lat'])]
df = df.drop(['LC_Out', '.geo', 'lon', 'lat'], axis=1)
gdf = gpd.GeoDataFrame(df, crs='EPSG:4326', geometry=geometry)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lon'] = df['.geo'].apply(lambda x: json.loads(x)['coordinates'][0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lat'] = df['.geo'].apply(lambda x: json.loads(x)['coordinates'][1])


In [11]:
gdf

Unnamed: 0,Set,LC_Nr,Landcover,geometry
0,train,4,Cropland,POINT (25.09098 -15.07000)
1,train,4,Cropland,POINT (25.09107 -15.07000)
2,train,4,Cropland,POINT (25.09116 -15.07000)
3,train,4,Cropland,POINT (25.09125 -15.07000)
4,train,4,Cropland,POINT (25.09134 -15.07000)
...,...,...,...,...
267426,test,8,Wetland,POINT (24.97788 -15.13486)
267427,test,8,Wetland,POINT (24.97797 -15.13486)
267428,test,8,Wetland,POINT (24.97761 -15.13477)
267429,test,8,Wetland,POINT (24.97779 -15.13477)


## Downsample and balance labels

In [12]:
gdf['Landcover'].value_counts()

Forest      117266
Cropland     58759
Wetland      45501
Shrub        27552
Grass        10826
Water          376
Built up       300
Name: Landcover, dtype: int64

In [13]:
gdf[['Set', 'Landcover']].groupby(['Set', 'Landcover']).size()

Set    Landcover
test   Built up        87
       Cropland     17668
       Forest       35378
       Grass         3275
       Shrub         8123
       Water          120
       Wetland      13740
train  Built up       213
       Cropland     41091
       Forest       81888
       Grass         7551
       Shrub        19429
       Water          256
       Wetland      31761
dtype: int64

In [14]:
gdf[['Set', 'LC_Nr']].groupby(['Set', 'LC_Nr']).size()

Set    LC_Nr
test   1          120
       3           87
       4        17668
       5         3275
       6         8123
       7        35378
       8        13740
train  1          256
       3          213
       4        41091
       5         7551
       6        19429
       7        81888
       8        31761
dtype: int64

In [15]:
train = gdf[gdf['Set'] == 'train']
test = gdf[gdf['Set'] == 'test']

In [16]:
random_state = 42
class_1 = train.sample(200, random_state=random_state)
class_3 = train.sample(200, random_state=random_state)
class_4 = train.sample(500, random_state=random_state)
class_5 = train.sample(500, random_state=random_state)
class_6 = train.sample(500, random_state=random_state)
class_7 = train.sample(500, random_state=random_state)
class_8 = train.sample(500, random_state=random_state)
test = test.sample(1000, random_state=random_state)

In [17]:
gdf_balanced = pd.concat([class_1, class_3, class_4, class_5, class_6, class_7, class_8, test], ignore_index=True)
gdf_balanced

Unnamed: 0,Set,LC_Nr,Landcover,geometry
0,train,4,Cropland,POINT (25.18252 -15.02913)
1,train,8,Wetland,POINT (25.28250 -15.26682)
2,train,7,Forest,POINT (25.32948 -14.81695)
3,train,7,Forest,POINT (25.00330 -15.07997)
4,train,6,Shrub,POINT (25.36182 -15.08078)
...,...,...,...,...
3895,test,7,Forest,POINT (25.17973 -15.05761)
3896,test,4,Cropland,POINT (25.18764 -15.07387)
3897,test,7,Forest,POINT (25.30011 -15.44289)
3898,test,7,Forest,POINT (25.47716 -14.72954)


## Load and prepare Sentinel-2 data

In [18]:
start_date = '2023-01-01'
end_date = '2023-12-31'

In [19]:
bands = ['B2',
         'B3',
         'B4',
         'B5',
         'B6',
         'B7',
         'B8',
         'B8A',
         'B11',
         'B12']

In [20]:
s2 = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED').filterDate(ee.Date(start_date), ee.Date(end_date)).filter(ee.Filter.lte('CLOUDY_PIXEL_PERCENTAGE', 20)).select(bands).median()

## Collect and export data

In [21]:
def get_band_values_from_points(columns, gdf, image, chunks):

  df_with_bands = pd.DataFrame(columns=columns)

  list_gdf = np.array_split(gdf, chunks)

  for i, gdf in enumerate(list_gdf):
    print('Processing chunk {}/{}...'.format(i+1, chunks))
    fc = geemap.geopandas_to_ee(gdf)
    fc_with_bands = image.sampleRegions(collection=fc, scale=10)
    df_with_bands_temp = geemap.ee_to_df(fc_with_bands)
    df_with_bands = pd.concat([df_with_bands, df_with_bands_temp])

  df_with_bands = df_with_bands.reset_index(drop=True)

  return(df_with_bands)

In [22]:
columns = bands + ['LC_Nr'] + ['Landcover']

In [23]:
start_time = time.perf_counter()
df_with_bands = get_band_values_from_points(columns, gdf_balanced, s2, 40)
end_time = time.perf_counter()
run_time = round((end_time - start_time) / 60, 2)
print('Run time: {} minutes.'.format(run_time))

Processing chunk 1/40...
Processing chunk 2/40...
Processing chunk 3/40...
Processing chunk 4/40...
Processing chunk 5/40...
Processing chunk 6/40...
Processing chunk 7/40...
Processing chunk 8/40...
Processing chunk 9/40...
Processing chunk 10/40...
Processing chunk 11/40...
Processing chunk 12/40...
Processing chunk 13/40...
Processing chunk 14/40...
Processing chunk 15/40...
Processing chunk 16/40...


Exception: User memory limit exceeded.

In [None]:
df_with_bands = df_with_bands[columns]
df_with_bands

In [None]:
df_with_bands.to_csv('df_with_bands.csv', index=False)