# Train and test set

### Feature groups
- raw Sentinel-2 bands
- indices based on raw Sentinel-2 bands (e.g. NDVI, NDWI, SAVI, EVI)

### More advanced ideas
- seasonal quarterly and monthly time series
- min, max, median indices

## Setup

In [None]:
from google.colab import drive
import pandas as pd

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch to the project folder on Drive so the relative 'data/...' paths used below resolve.
%cd /content/drive/MyDrive/land_cover_classification_kaza

/content/drive/MyDrive/land_cover_classification_kaza


## Load and prepare raw data

We're using the labels provided by Nuno and the corresponding Sentinel-2 bands.

In [None]:
# Load the labelled samples (path is relative to the project folder) and show (rows, columns).
raw_data = pd.read_csv('data/raw_data.csv')
raw_data.shape

(267442, 15)

In [None]:
# Preview the loaded table (pandas truncates the display to the first and last rows).
raw_data

Unnamed: 0,system:index,B11,B12,B2,B3,B4,B5,B6,B7,B8,B8A,LC_Nr,LC_Out,Landcover,.geo
0,00000000000000000010_0,2970.5,2088.5,763.0,1005.5,1389.0,1640.0,2035.5,2232.0,2549.0,2571.0,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
1,00000000000000000010_1,3070.0,2153.0,764.0,992.0,1448.0,1702.5,2092.5,2293.5,2560.0,2608.0,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
2,00000000000000000010_2,3070.0,2153.0,746.0,978.0,1385.0,1702.5,2092.5,2293.5,2542.0,2608.0,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
3,00000000000000000010_3,3024.5,2069.0,724.0,923.0,1372.0,1659.5,2010.0,2228.0,2447.5,2552.5,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
4,00000000000000000010_4,3024.5,2069.0,714.5,913.0,1347.0,1659.5,2010.0,2228.0,2427.5,2552.5,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267437,00000000000000000561_158,2304.5,1836.5,500.0,605.5,722.5,1036.0,1484.5,1662.5,1650.0,1935.5,8,Wetland,Wetland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
267438,00000000000000000561_159,2197.5,1720.5,497.5,618.0,734.0,1005.0,1422.5,1600.0,1651.0,1843.0,8,Wetland,Wetland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
267439,00000000000000000561_160,2197.5,1720.5,515.0,622.5,759.0,1005.0,1422.5,1600.0,1642.0,1843.0,8,Wetland,Wetland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
267440,00000000000000000561_161,2304.5,1836.5,504.5,571.0,689.0,1036.0,1484.5,1662.5,1702.0,1935.5,8,Wetland,Wetland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."


In [None]:
# Discard the two columns that are not needed as features or labels.
raw_data = raw_data.drop(columns=['system:index', '.geo'])

In [None]:
# Deforestation samples are not needed in this use case, so keep every other class.
keep_mask = raw_data['Landcover'].ne('Deforestation')
raw_data = raw_data.loc[keep_mask]
raw_data.shape

(261066, 13)

In [None]:
# Normalise the label casing (first character upper case, the rest lower case).
# raw_data is the result of a boolean filter at this point, so assigning a column
# into it can raise pandas' SettingWithCopyWarning; .assign builds a new frame instead.
raw_data = raw_data.assign(Landcover=raw_data['Landcover'].str.capitalize())

## Downsample and balance raw data

In [None]:
# Number of samples per land cover label, to see how imbalanced the classes are.
raw_data['Landcover'].value_counts()

Forest      117277
Cropland     58759
Wetland      45501
Shrub        27552
Grass        10826
Bare           475
Water          376
Built up       300
Name: Landcover, dtype: int64

In [None]:
# Number of samples per numeric class code (LC_Nr).
raw_data['LC_Nr'].value_counts()

7    117277
4     58759
8     45501
6     27552
5     10826
2       475
1       376
3       300
Name: LC_Nr, dtype: int64

In [None]:
def sample_train_and_test_data(df, land_cover_class, train_fraction, desired_train_samples=None, random_state=42):
  """Split the rows of one land cover class into a train and a test set.

  The rows whose 'Landcover' equals ``land_cover_class`` are shuffled, the first
  ``train_fraction`` share becomes the train set and the remainder the test set.
  Optionally the train set is then downsampled; the test set is never downsampled.

  Args:
    df: DataFrame with a 'Landcover' column.
    land_cover_class: value of 'Landcover' to select.
    train_fraction: share of the class rows (0..1) assigned to the train set.
    desired_train_samples: if given, the train set is randomly reduced to this
      many rows. When the class has fewer train rows than requested, all of
      them are kept instead of raising.
    random_state: seed used for shuffling and downsampling.

  Returns:
    Tuple ``(train_samples, test_samples)``, both with a fresh 0..n-1 index.

  Raises:
    ValueError: if ``train_fraction`` is outside [0, 1] or
      ``desired_train_samples`` is negative.
  """
  if not 0 <= train_fraction <= 1:
    raise ValueError(f'train_fraction must be between 0 and 1, got {train_fraction}')
  if desired_train_samples is not None and desired_train_samples < 0:
    raise ValueError(f'desired_train_samples must be non-negative, got {desired_train_samples}')

  # filter and shuffle df
  class_rows = df[df['Landcover'] == land_cover_class]
  class_rows = class_rows.sample(frac=1, random_state=random_state).reset_index(drop=True)

  # calculate n train samples
  n_train_samples = int(round(class_rows.shape[0] * train_fraction, 0))

  # split the shuffled rows; reset the test index so both outputs are indexed alike
  train_samples = class_rows.iloc[:n_train_samples]
  test_samples = class_rows.iloc[n_train_samples:].reset_index(drop=True)

  # downsample train samples if specified
  if desired_train_samples is not None:
    # cap at what is available: sampling without replacement cannot exceed the population
    n_keep = min(desired_train_samples, train_samples.shape[0])
    train_samples = train_samples.sample(n=n_keep, random_state=random_state).reset_index(drop=True)

  print(f'land cover class: {land_cover_class}; train samples: {train_samples.shape[0]}; test samples: {test_samples.shape[0]}')

  return train_samples, test_samples

In [None]:
train_fraction = 0.7

# Training-set size limit per land cover class; None leaves the train split as it is.
train_caps = {
    'Forest': 500,
    'Cropland': 500,
    'Wetland': 500,
    'Shrub': 500,
    'Grass': 500,
    'Bare': None,
    'Water': None,
    'Built up': None,
}

# One (train, test) pair per class, computed in the order listed above.
splits = {
    land_cover: sample_train_and_test_data(raw_data, land_cover, train_fraction, desired_train_samples=cap)
    for land_cover, cap in train_caps.items()
}

train_forest, test_forest = splits['Forest']
train_cropland, test_cropland = splits['Cropland']
train_wetland, test_wetland = splits['Wetland']
train_shrub, test_shrub = splits['Shrub']
train_grass, test_grass = splits['Grass']
train_bare, test_bare = splits['Bare']
train_water, test_water = splits['Water']
train_built_up, test_built_up = splits['Built up']

land cover class: Forest; train samples: 500; test samples: 35183
land cover class: Cropland; train samples: 500; test samples: 17628
land cover class: Wetland; train samples: 500; test samples: 13650
land cover class: Shrub; train samples: 500; test samples: 8266
land cover class: Grass; train samples: 500; test samples: 3248
land cover class: Bare; train samples: 332; test samples: 143
land cover class: Water; train samples: 263; test samples: 113
land cover class: Built up; train samples: 210; test samples: 90


In [None]:
# Stack the per-class training frames into one table with a fresh running index.
train_parts = [
    train_forest,
    train_cropland,
    train_wetland,
    train_shrub,
    train_grass,
    train_bare,
    train_water,
    train_built_up,
]
train = pd.concat(train_parts, ignore_index=True)
train

Unnamed: 0,B11,B12,B2,B3,B4,B5,B6,B7,B8,B8A,LC_Nr,LC_Out,Landcover
0,2006.000000,1215.000000,522.100000,701.937500,810.666667,1098.000000,1695.750000,1901.000000,2257.333333,2226.000000,7,Forest,Forest
1,1903.000000,1237.500000,404.000000,529.000000,551.000000,952.666667,1618.800000,1868.500000,1861.500000,2135.000000,7,Forest,Forest
2,2244.500000,1337.666667,476.500000,696.375000,735.000000,1156.250000,1856.625000,2145.750000,2619.000000,2523.500000,7,Forest,Forest
3,2029.000000,1238.000000,450.000000,622.000000,693.000000,1056.000000,1666.000000,1925.000000,2076.000000,2252.000000,7,Forest,Forest
4,2249.000000,1340.750000,498.000000,707.000000,797.500000,1241.500000,1904.416667,2156.500000,2371.600000,2507.500000,7,Forest,Forest
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3300,3092.666667,2532.000000,866.500000,1101.500000,1475.000000,1689.750000,1899.000000,2054.833333,2287.000000,2368.250000,3,Built-up,Built up
3301,3588.000000,3187.500000,1151.000000,1325.000000,2030.700000,2052.250000,2444.166667,2571.083333,2662.666667,2812.333333,3,Built-up,Built up
3302,3474.400000,2907.500000,1121.000000,1458.833333,1844.000000,2013.000000,2354.500000,2526.928571,2717.166667,2771.857143,3,Built-up,Built up
3303,3429.250000,3303.000000,1203.333333,1398.000000,1782.000000,1736.000000,1926.000000,2068.000000,2502.000000,2315.000000,3,Built-up,Built up


In [None]:
# Stack the per-class test frames into one table with a fresh running index.
test_parts = [
    test_forest,
    test_cropland,
    test_wetland,
    test_shrub,
    test_grass,
    test_bare,
    test_water,
    test_built_up,
]
test = pd.concat(test_parts, ignore_index=True)
test

Unnamed: 0,B11,B12,B2,B3,B4,B5,B6,B7,B8,B8A,LC_Nr,LC_Out,Landcover
0,1492.200000,754.666667,326.363636,454.545455,359.500000,741.000000,1498.500,1754.666667,1896.500000,1956.000000,7,Forest,Forest
1,1393.000000,654.000000,286.000000,404.000000,324.000000,656.000000,1472.000,1695.000000,1758.000000,1890.000000,7,Forest,Forest
2,1879.500000,1112.333333,387.750000,539.750000,563.666667,967.500000,1588.750,1797.000000,1792.000000,2065.250000,7,Forest,Forest
3,1854.666667,1055.000000,358.000000,486.600000,515.000000,882.333333,1581.000,1811.250000,1846.000000,2100.250000,7,Forest,Forest
4,2185.600000,1432.500000,491.800000,686.384615,810.200000,1173.285714,1643.125,1868.714286,2091.500000,2164.222222,7,Forest,Forest
...,...,...,...,...,...,...,...,...,...,...,...,...,...
78316,2933.750000,2918.666667,972.400000,1253.000000,1621.333333,1959.000000,2026.000,2106.166667,1894.000000,2159.750000,3,Built-up,Built up
78317,3748.500000,3796.857143,1192.666667,1412.000000,1753.000000,2074.333333,2197.000,2339.500000,2432.000000,2625.000000,3,Built-up,Built up
78318,3832.000000,3842.883333,1321.200000,1558.500000,1933.800000,2081.000000,2198.000,2375.333333,2643.000000,2629.500000,3,Built-up,Built up
78319,3834.500000,3540.500000,1269.000000,1395.000000,1776.000000,1950.500000,2181.000,2431.500000,2635.000000,2786.500000,3,Built-up,Built up


## Compute indices

In [None]:
!pip install spyndex

Collecting spyndex
  Downloading spyndex-0.5.0.tar.gz (727 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m727.5/727.5 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting eemont>=0.3.6 (from spyndex)
  Downloading eemont-0.3.6.tar.gz (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.7/134.7 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pandas>=2.0.3 (from spyndex)
  Downloading pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
Collecting ee_extra>=0.0.15 (from eemont>=0.3.6->spyndex)
  Downloading ee_extra-0.0.15.tar.gz (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25h  

In [None]:
import spyndex

In [None]:
# Spectral indices added to the train set as extra feature columns.
index_names = ['NDVI', 'NDWI', 'SAVI', 'EVI']

# NOTE(review): the band columns hold values in the thousands, so the additive
# constant L=0.5 is negligible and SAVI comes out at roughly 1.5 * NDVI. If these are
# scaled reflectances, consider rescaling to 0-1 first - for train and test together.
# NOTE(review): EVI is commonly defined with L=1; here it shares L=0.5 with SAVI - confirm intent.
idx = spyndex.computeIndex(
    index=index_names,
    params={
        'R': train['B4'],   # red
        'G': train['B3'],   # green
        'B': train['B2'],   # blue
        'N': train['B8'],   # near infrared
        'g': 2.5,           # EVI gain factor
        'L': 0.5,           # canopy background adjustment (SAVI, EVI)
        'C1': 6,            # EVI aerosol coefficient 1
        'C2': 7.5,          # EVI aerosol coefficient 2
    }
)

# Remove index columns left over from an earlier run of this cell, so executing it
# again replaces them instead of appending duplicate NDVI/NDWI/SAVI/EVI columns.
train = pd.concat([train.drop(columns=index_names, errors='ignore'), idx], axis=1)
train

Unnamed: 0,B11,B12,B2,B3,B4,B5,B6,B7,B8,B8A,LC_Nr,LC_Out,Landcover,NDVI,NDWI,SAVI,EVI
0,2006.000000,1215.000000,522.100000,701.937500,810.666667,1098.000000,1695.750000,1901.000000,2257.333333,2226.000000,7,Forest,Forest,0.471534,-0.525601,0.707186,1.128064
1,1903.000000,1237.500000,404.000000,529.000000,551.000000,952.666667,1618.800000,1868.500000,1861.500000,2135.000000,7,Forest,Forest,0.543212,-0.557415,0.814650,1.532390
2,2244.500000,1337.666667,476.500000,696.375000,735.000000,1156.250000,1856.625000,2145.750000,2619.000000,2523.500000,7,Forest,Forest,0.561717,-0.579912,0.842450,1.362946
3,2029.000000,1238.000000,450.000000,622.000000,693.000000,1056.000000,1666.000000,1925.000000,2076.000000,2252.000000,7,Forest,Forest,0.499458,-0.538918,0.749052,1.209127
4,2249.000000,1340.750000,498.000000,707.000000,797.500000,1241.500000,1904.416667,2156.500000,2371.600000,2507.500000,7,Forest,Forest,0.496703,-0.540700,0.744936,1.149952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3300,3092.666667,2532.000000,866.500000,1101.500000,1475.000000,1689.750000,1899.000000,2054.833333,2287.000000,2368.250000,3,Built-up,Built up,0.215843,-0.349860,0.323721,0.437618
3301,3588.000000,3187.500000,1151.000000,1325.000000,2030.700000,2052.250000,2444.166667,2571.083333,2662.666667,2812.333333,3,Built-up,Built up,0.134651,-0.335451,0.201955,0.254216
3302,3474.400000,2907.500000,1121.000000,1458.833333,1844.000000,2013.000000,2354.500000,2526.928571,2717.166667,2771.857143,3,Built-up,Built up,0.191435,-0.301325,0.287121,0.406187
3303,3429.250000,3303.000000,1203.333333,1398.000000,1782.000000,1736.000000,1926.000000,2068.000000,2502.000000,2315.000000,3,Built-up,Built up,0.168067,-0.283077,0.252071,0.431706


In [None]:
# Spectral indices added to the test set as extra feature columns.
index_names = ['NDVI', 'NDWI', 'SAVI', 'EVI']

# NOTE(review): the band columns hold values in the thousands, so the additive
# constant L=0.5 is negligible and SAVI comes out at roughly 1.5 * NDVI. If these are
# scaled reflectances, consider rescaling to 0-1 first - for train and test together.
# NOTE(review): EVI is commonly defined with L=1; here it shares L=0.5 with SAVI - confirm intent.
idx = spyndex.computeIndex(
    index=index_names,
    params={
        'R': test['B4'],    # red
        'G': test['B3'],    # green
        'B': test['B2'],    # blue
        'N': test['B8'],    # near infrared
        'g': 2.5,           # EVI gain factor
        'L': 0.5,           # canopy background adjustment (SAVI, EVI)
        'C1': 6,            # EVI aerosol coefficient 1
        'C2': 7.5,          # EVI aerosol coefficient 2
    }
)

# Remove index columns left over from an earlier run of this cell, so executing it
# again replaces them instead of appending duplicate NDVI/NDWI/SAVI/EVI columns.
test = pd.concat([test.drop(columns=index_names, errors='ignore'), idx], axis=1)
test

Unnamed: 0,B11,B12,B2,B3,B4,B5,B6,B7,B8,B8A,LC_Nr,LC_Out,Landcover,NDVI,NDWI,SAVI,EVI
0,1492.200000,754.666667,326.363636,454.545455,359.500000,741.000000,1498.500,1754.666667,1896.500000,1956.000000,7,Forest,Forest,0.681294,-0.613325,1.021715,2.392184
1,1393.000000,654.000000,286.000000,404.000000,324.000000,656.000000,1472.000,1695.000000,1758.000000,1890.000000,7,Forest,Forest,0.688761,-0.626272,1.032893,2.301766
2,1879.500000,1112.333333,387.750000,539.750000,563.666667,967.500000,1588.750,1797.000000,1792.000000,2065.250000,7,Forest,Forest,0.521438,-0.537043,0.781991,1.354954
3,1854.666667,1055.000000,358.000000,486.600000,515.000000,882.333333,1581.000,1811.250000,1846.000000,2100.250000,7,Forest,Forest,0.563744,-0.582783,0.845437,1.477904
4,2185.600000,1432.500000,491.800000,686.384615,810.200000,1173.285714,1643.125,1868.714286,2091.500000,2164.222222,7,Forest,Forest,0.441569,-0.505822,0.662239,0.981177
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78316,2933.750000,2918.666667,972.400000,1253.000000,1621.333333,1959.000000,2026.000,2106.166667,1894.000000,2159.750000,3,Built-up,Built up,0.077565,-0.203686,0.116331,0.157447
78317,3748.500000,3796.857143,1192.666667,1412.000000,1753.000000,2074.333333,2197.000,2339.500000,2432.000000,2625.000000,3,Built-up,Built up,0.162246,-0.265349,0.243340,0.423792
78318,3832.000000,3842.883333,1321.200000,1558.500000,1933.800000,2081.000000,2198.000,2375.333333,2643.000000,2629.500000,3,Built-up,Built up,0.154955,-0.258122,0.232408,0.408780
78319,3834.500000,3540.500000,1269.000000,1395.000000,1776.000000,1950.500000,2181.000,2431.500000,2635.000000,2786.500000,3,Built-up,Built up,0.194740,-0.307692,0.292078,0.569025


## Save train and test set

In [None]:
# Write both sets as CSV without the row index.
# NOTE(review): the recorded run of this cell ended in an ImportError from pandas._typing,
# presumably because pandas was upgraded on disk by the pip install while the old
# version was still loaded; restart the runtime and re-run from the top if it recurs.
for frame, csv_path in ((train, 'data/train.csv'), (test, 'data/test.csv')):
    frame.to_csv(csv_path, index=False)

ImportError: cannot import name 'SequenceNotStr' from 'pandas._typing' (/usr/local/lib/python3.10/dist-packages/pandas/_typing.py)