# Notebook - v3

Notebook that incorporates Kevin work on preprocessing data and for testing models

In [1]:
%load_ext autoreload
%autoreload 2


### Model baseline testing

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

from urban_watch.ml_logic.data import load_data
from urban_watch.ml_logic.package import preprocess_image

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load raw data

X_raw, meta = load_data()
print("Raw X shape:", X_raw.shape)

Raw X shape: (10, 300, 300, 10)


In [4]:
#Preprocess images

X_processed = np.array([preprocess_image(img) for img in X_raw])
print("Processed X shape:", X_processed.shape)

Processed X shape: (10, 300, 300, 13)


In [5]:
# Flatten images 

n_tiles = X_processed.shape[0]
flat_dim = np.prod(X_processed.shape[1:])
X_flat = X_processed.reshape(n_tiles, flat_dim)

In [6]:
print("NaN before imputer:", np.isnan(X_flat).sum())

NaN before imputer: 5857137


In [7]:
# Remplacer les Nan du cloud masking

imputer = SimpleImputer(strategy="constant", fill_value=0, keep_empty_features=True)
X_no_nan = imputer.fit_transform(X_flat)

In [8]:
print("NaN after imputer:", np.isnan(X_no_nan).sum())
print("Shape after imputation:", X_no_nan.shape)

NaN after imputer: 0
Shape after imputation: (10, 1170000)


In [9]:
# Generate Fake y for testing with 2 classes (0 - Non Urban and 1 Urban)

y_fake = np.random.randint(0, 2, size=len(X_processed))
print("y:", y_fake)

y: [0 0 0 1 1 0 1 0 1 1]


In [10]:
print("Shape X:", X_flat.shape)
print("Shape y:", y_fake.shape)

Shape X: (10, 1170000)
Shape y: (10,)


In [11]:
# Train/test/Split

X_train, X_test, y_train, y_test = train_test_split( X_no_nan, y_fake, test_size=0.30, random_state=42)

In [12]:
# Model 1 : Logistic Regression 

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

logreg = LogisticRegression(max_iter=500)
scores = cross_val_score(logreg, X_no_nan, y_fake, cv=3, scoring="accuracy")

print("Logistic Regression mean accuracy:", scores.mean())

Logistic Regression mean accuracy: 0.6666666666666666


In [13]:
# Model 2 : RandomForest Classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    n_jobs=-1
)

scores = cross_val_score(rf, X_no_nan, y_fake, cv=3, scoring="accuracy")
print("Random Forest mean accuracy:", scores.mean())

Random Forest mean accuracy: 0.49999999999999994


In [14]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

gb = GradientBoostingClassifier()

scores = cross_val_score(gb, X_no_nan, y_fake, cv=3, scoring="accuracy")
print("Gradient Boosting mean accuracy:", scores.mean())

Gradient Boosting mean accuracy: 0.49999999999999994


### Test 2 - Baseline finetuné

In [15]:
import numpy as np
import pandas as pd
from urban_watch.ml_logic.data import load_data
from urban_watch.ml_logic.package import preprocess_image, CloudMasker

# 1. Charger les données brutes
X_raw, meta = load_data()

# 2. Preprocessing Kevin → 13 bandes standardisées avec NaN sur nuages
X_processed = np.array([preprocess_image(img) for img in X_raw])
print("X_processed:", X_processed.shape)  # (n_tiles, H, W, 13)

def extract_features(img):
    """
    img : (H, W, 13) après preprocess_image
    Retourne un dict de features agrégées pour 1 tuile.
    """
    feats = {}

    # Stats simples par bande
    for b in range(img.shape[-1]):
        band = img[:, :, b]
        feats[f"b{b}_mean"] = np.nanmean(band)
        feats[f"b{b}_std"]  = np.nanstd(band)
        feats[f"b{b}_min"]  = np.nanmin(band)
        feats[f"b{b}_max"]  = np.nanmax(band)

    # On suppose que ndvi, ndbi, mndwi sont les 3 dernières bandes
    ndvi  = img[:, :, -3]
    ndbi  = img[:, :, -2]
    mndwi = img[:, :, -1]

    # Pourcentage de pixels forts en végétation / urbain / eau
    feats["ndvi_strong_pct"] = np.mean(ndvi > 0.4)
    feats["ndbi_urban_pct"]  = np.mean(ndbi > 0.0)
    feats["water_pct"]       = np.mean(mndwi > 0.0)

    return feats

# 3. Appliquer à toutes les tuiles
features_list = [extract_features(img) for img in X_processed]
X_tab = pd.DataFrame(features_list)

print("X_tab shape:", X_tab.shape)
X_tab.head()

X_processed: (10, 300, 300, 13)
X_tab shape: (10, 55)


Unnamed: 0,b0_mean,b0_std,b0_min,b0_max,b1_mean,b1_std,b1_min,b1_max,b2_mean,b2_std,...,b11_std,b11_min,b11_max,b12_mean,b12_std,b12_min,b12_max,ndvi_strong_pct,ndbi_urban_pct,water_pct
0,-1.042644e-16,0.999982,-3.364899,1.685737,-1.0136820000000001e-17,0.999986,-0.501124,13.250412,5.1408150000000006e-17,0.999989,...,0.999995,-5.864994,3.595883,-9.383798e-16,0.999997,-0.829292,4.290277,0.001689,0.004033,0.001022
1,3.163554e-17,0.999974,-1.362895,8.741292,3.6154900000000005e-17,0.999979,-1.053829,20.316168,6.101140000000001e-17,0.999979,...,0.999994,-5.377937,5.893429,-5.549778e-15,0.999995,-3.089769,7.081466,0.230189,0.283844,0.245656
2,-7.692338000000001e-17,0.999986,-1.385541,3.848679,3.55031e-17,0.999989,-1.092803,5.832785,0.0,0.999989,...,0.999993,-5.837613,7.433826,-8.158316e-16,0.999995,-2.924445,5.935637,0.0599,0.115244,0.112311
3,3.7709160000000004e-17,0.999986,-1.806246,3.542543,5.0278880000000005e-17,0.999988,-1.569526,5.627428,-1.319821e-16,0.999988,...,0.999994,-5.904089,7.019022,5.360986e-15,0.999993,-4.33712,8.352238,0.111978,0.247811,0.225511
4,-1.7462410000000002e-17,0.999983,-1.862046,4.166743,-7.421524000000001e-17,0.99997,-1.404635,21.40144,-7.858085000000001e-17,0.999973,...,0.999995,-4.587105,6.036141,-4.714851e-15,0.999998,-1.19102,3.306906,0.2655,0.288467,0.132511


In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

y = y_fake  # en attendant le vrai y

logreg = LogisticRegression(
    max_iter=1000,
    penalty="l2",
    solver="lbfgs"
)

scores = cross_val_score(logreg, X_tab, y, cv=3, scoring="accuracy")
print("LogReg accuracy:", scores.mean())

LogReg accuracy: 0.7777777777777777


In [17]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=42
)

scores = cross_val_score(rf, X_tab, y, cv=3, scoring="accuracy")
print("RandomForest accuracy:", scores.mean())


RandomForest accuracy: 0.49999999999999994


In [18]:
from sklearn.ensemble import HistGradientBoostingClassifier

hgb = HistGradientBoostingClassifier(random_state=42)

scores = cross_val_score(hgb, X_tab, y, cv=3, scoring="accuracy")
print("HistGradientBoosting accuracy:", scores.mean())

HistGradientBoosting accuracy: 0.38888888888888884


### Rafinement des features

In [17]:
import numpy as np
from scipy.ndimage import uniform_filter

def extract_features_numpy(image_13):
    """
    image_13: (300,300,13)
    Retourne un vecteur 1D de features tabulaires
    """

    H, W, C = image_13.shape
    feats = []

    # ---- 1) Stats globales par bande ----
    for c in range(C):
        band = image_13[:,:,c]

        feats.append(np.nanmean(band))
        feats.append(np.nanstd(band))
        feats.append(np.nanmin(band))
        feats.append(np.nanmax(band))
        feats.append(np.nanpercentile(band, 10))
        feats.append(np.nanpercentile(band, 90))

    # ---- 2) Texture simple : variance locale ----
    for c in range(C):
        band = image_13[:,:,c]
        mean_3 = uniform_filter(band, size=3)
        mean_sq_3 = uniform_filter(band**2, size=3)
        local_var = mean_sq_3 - mean_3**2

        feats.append(np.nanmean(local_var))
        feats.append(np.nanmax(local_var))

    # ---- 3) Global NDVI / NDBI / MNDWI stats ----
    NDVI = image_13[:,:,10]   # si indices ajoutés en 11,12,13
    NDBI = image_13[:,:,11]
    MNDWI = image_13[:,:,12]

    for ind in [NDVI, NDBI, MNDWI]:
        feats.append(np.nanmean(ind))
        feats.append(np.nanstd(ind))
        feats.append(np.nanmean(ind > 0))  # fraction positive

    return np.array(feats)
