# Machine Learning using Sentinel-2 Data

This example uses training data from the
[Coast Train](https://github.com/nick-murray/coastTrain) dataset
along with Sentinel-2 data to demonstrate how to use a
machine learning classifier, in this case, Random Forest, to
assign a class to each pixel.

This notebook combines lessons from previous notebooks into
a comprehensive worked example.

## Getting started

First we load the required Python libraries and tools.

In [None]:
# Development aid: load IPython's autoreload extension and select mode 2,
# which re-imports modules before each cell executes, so edits to imported
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import xarray as xr
from sklearn.ensemble import RandomForestClassifier
from ldn.typology import colors, classes as classes_values
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


## Training data

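Load the training points from `training_data.geojson`, remove the records flagged as outliers, and preview the remaining points on an interactive map coloured by class.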

In [None]:
# Name of the column that holds the class label of each training point.
class_attr = "lulc"

# Load the training points, keep only the rows not flagged as outliers and
# drop the flag column. The steps are chained so that no frame is mutated in
# place, and re-running the cell always starts again from the file on disk.
training_raw = gpd.read_file("training_data.geojson")
training_data = training_raw[~training_raw["outlier"]].drop(columns=["outlier"])

# Classes actually present in the data, in sorted order. The colour list
# passed to the map is built in the same order, so category i gets colour i.
present_classes = sorted(training_data[class_attr].unique())

# Interactive map of the training points, coloured by class.
training_data.explore(
    column=class_attr,
    categorical=True,
    categories=present_classes,
    cmap=[colors[c] for c in present_classes],
    legend=True,
    style_kwds={"radius": 6, "fillOpacity": 0.8, "weight": 0.5},
)

In [None]:
# Only the attribute table is needed for the split, so drop the geometry
# column. errors="ignore" keeps the cell re-runnable: on a second run the
# column is already gone and a plain drop would raise KeyError.
training_data = training_data.drop(columns="geometry", errors="ignore")

print(len(training_data))

# Split 70/30 into train/test. Stratifying on the class column keeps the class
# proportions representative in both subsets; the fixed seed makes the split
# repeatable.
train_gdf, test_gdf = train_test_split(
    training_data,
    test_size=0.3,
    stratify=training_data[class_attr],
    random_state=42,
)

print(f"Training set class distribution:\n{train_gdf[class_attr].value_counts()}")
print(f"Test set class distribution:\n{test_gdf[class_attr].value_counts()}")

# Preview a few rows instead of printing the whole table.
train_gdf.head()

## Create a classifier and fit a model

We pass two simple NumPy arrays to the classifier: one holds the
observations (the values of the red, green, blue bands and so on),
and the other holds the class label of each observation.

In [None]:
# Select the class labels by column name rather than by position, so the cell
# does not depend on the class column being the first one in the table and
# the labels keep their own dtype instead of being coerced with the features.
classes = train_gdf[class_attr].to_numpy()
print(f"Classes: {classes}")

# The observation data is every remaining column
observations = train_gdf.drop(columns=class_attr).to_numpy()

# Create a model. class_weight="balanced" weights classes inversely to their
# frequency; the fixed random_state makes the fitted forest repeatable.
classifier = RandomForestClassifier(class_weight="balanced", random_state=42)

# ...and fit it to the data
model = classifier.fit(observations, classes)

In [None]:
# Candidate features: every column except the class label.
feature_cols = [c for c in train_gdf.columns if c != class_attr]

# Bands left out of the model for now.
# TODO: Add MAD bands back in.
low_importance = ["smad", "bcmad", "emad", "blue"]
feature_cols_reduced = [f for f in feature_cols if f not in low_importance]

# Feature matrices and label vectors for the train and test subsets.
X_train = train_gdf[feature_cols_reduced].values
y_train = train_gdf[class_attr].values
X_test = test_gdf[feature_cols_reduced].values
y_test = test_gdf[class_attr].values

# Fit on the training subset and predict the held-out test subset.
classifier = RandomForestClassifier(n_estimators=500, class_weight="balanced", random_state=42)
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Feature importances, highest first: shows which bands the fitted forest
# relies on and which contribute little.
importances = pd.Series(model.feature_importances_, index=feature_cols_reduced).sort_values(ascending=False)
print("Feature importances:")
print(importances)

# Build the report labels explicitly. Without `labels=`, classification_report
# raises if the number of target names differs from the number of classes that
# occur in y_test/y_pred (e.g. a class missing from the test split).
# NOTE(review): assumes the label values in the class column are the values of
# `classes_values` (name -> value) -- confirm against ldn.typology.
present_labels = set(np.union1d(y_test, y_pred).tolist())
report_classes = [
    (name, value)
    for name, value in sorted(classes_values.items(), key=lambda item: item[1])
    if value != 0 and value in present_labels
]
assert report_classes, "no label in y_test/y_pred matches a non-zero typology value"
target_names = [name for name, _ in report_classes]
report_labels = [value for _, value in report_classes]

print(classification_report(y_test, y_pred, labels=report_labels, target_names=target_names))

In [None]:
# Open the dataset that holds one variable per feature.
geomad_dem = xr.open_dataset("geomad_dem.nc")

# Build a (pixels, features) matrix from exactly the features the model was
# fitted on, in the same order. (A stack over all of `feature_cols` used to be
# built first and immediately overwritten; that wasted pass is removed.)
stack = np.stack([geomad_dem[f].values.ravel() for f in feature_cols_reduced], axis=1)

# Replace NaN and +/-inf with 0 so every pixel can be passed to the classifier.
# NOTE(review): pixels without valid data therefore still receive a predicted
# class -- consider masking them in the output map.
stack = np.nan_to_num(stack, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)

predictions = model.predict(stack)

# Reshape the flat predictions back to the shape of the input variables
prediction_map = predictions.reshape(geomad_dem[feature_cols_reduced[0]].shape)

# Wrap in a DataArray that reuses the y/x coordinates of the input dataset
predicted_da = xr.DataArray(
    prediction_map,
    coords={"y": geomad_dem.y, "x": geomad_dem.x},
    dims=["y", "x"],
    name="lulc",
)

## Visualise our results


In [None]:
from ipyleaflet import basemaps
from matplotlib.colors import ListedColormap
from odc.geo.xr import assign_crs

# Attach the coordinate reference system to the prediction so it can be
# placed on the web map.
predicted_da = assign_crs(predicted_da, crs="EPSG:6933")

# One colour per class key, listed in the same order as the categories.
class_indexes = list(colors)
cmap = ListedColormap([colors[idx] for idx in class_indexes])

# Interactive map of the predicted classes over satellite imagery.
predicted_da.odc.explore(
    tiles=basemaps.Esri.WorldImagery,
    categories=class_indexes,
    cmap=cmap,
    legend=True,
)


### Aim for >80% accuracy. Don't just look at the confusion matrix; also look at the output map.

Use an existing reference product for validation.
Use one validation method for tuning the model and a separate one for the final accuracy measure.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Pair each display name with its label value and keep only the classes that
# occur in y_test/y_pred. Passing the same explicit `labels` to both the report
# and the confusion matrix keeps the names aligned with the rows/columns even
# when a class is missing from the test split.
# NOTE(review): assumes the label values in y_test are the values of
# `classes_values` (name -> value) -- confirm against ldn.typology.
present_labels = set(np.union1d(y_test, y_pred).tolist())
report_classes = [
    (name, value)
    for name, value in sorted(classes_values.items(), key=lambda item: item[1])
    if value != 0 and value in present_labels
]
assert report_classes, "no label in y_test/y_pred matches a non-zero typology value"
target_names = [name for name, _ in report_classes]
report_labels = [value for _, value in report_classes]

# Classification report
print(classification_report(y_test, y_pred, labels=report_labels, target_names=target_names))

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred, labels=report_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp.plot(xticks_rotation=45, cmap="Blues")
disp.ax_.set_title("Confusion matrix (test set)");