In [None]:
import os
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression

In [None]:
class PASTISSegmentation:
    """
    Here we use a subset of the PASTIS dataset: https://github.com/VSainteuf/pastis-benchmark

    All images and annotations of one split are loaded eagerly into two stacked
    numpy arrays:

    * ``X`` -- the images. Each ``S2_*.npy`` file is expected to hold a
      ``(T, C, H, W)`` array (time steps, features, height, width), so ``X`` is
      ``(N, T, C, H, W)``, or ``(N, C, H, W)`` when ``median_of_days`` is set.
    * ``y`` -- the labels: channel 0 of every ``TARGET_*.npy`` file, stacked
      along a new first axis.
    """
    def __init__(
        self,
        image_dir: str,
        annotation_dir: str,
        split:str = "train",
        median_of_days: bool = False,
        Xmean = None,
        Xstd = None,
        binary_labels: bool = False
    ) -> None:
        """
        Load every image of ``split`` together with its annotation.

        Args:
            image_dir: Folder containing ``<split>/S2_<name>.npy`` image files.
            annotation_dir: Folder containing ``<split>/TARGET_<name>.npy`` files.
            split: Name of the sub-folder to read (e.g. ``"train"`` or ``"test"``).
            median_of_days: If True, collapse axis 1 of the stacked images
                (the time axis) by taking the median.
            Xmean: Optional array subtracted from ``X``; must broadcast against it.
            Xstd: Optional array ``X`` is divided by; must broadcast against it.
                Normalization is applied only when both ``Xmean`` and ``Xstd``
                are given.
            binary_labels: If True, every label greater than 0 is set to 1.

        Raises:
            ValueError: If no image file matches ``<image_dir>/<split>/S2_*.npy``.
        """
        prefix = "S2_"
        pattern = os.path.join(image_dir, split, f"{prefix}*.npy")
        # glob yields files in arbitrary, filesystem-dependent order; sorting
        # makes the sample order (and everything derived from it) reproducible.
        images = sorted(glob(pattern))
        if not images:
            raise ValueError(f"No image files found matching {pattern!r}")

        annotations = []
        for im in images:
            stem = os.path.splitext(os.path.basename(im))[0]
            # The glob pattern guarantees the prefix; strip only that prefix
            # so an "S2_" occurring later in the name is left intact.
            name = stem[len(prefix):]
            annotations.append(os.path.join(annotation_dir, split, f"TARGET_{name}.npy"))

        # Store in the class for future reference
        self.median_of_days = median_of_days
        self.binary_labels = binary_labels

        # Load data
        self.X = self.read_data(images)
        if median_of_days:
            self.X = np.median(self.X, axis=1)  # Take median value across the time axis

        ### Normalization
        ### Using the provided values (Xmean, Xstd) we normalize the input X to have zero mean and unit variance
        if Xmean is not None and Xstd is not None:
            self.X = (self.X - Xmean) / Xstd

        self.y = self.read_data(annotations)
        self.y = self.y[:, 0]  # We are only interested in the 20 classes for now
        if binary_labels:
            self.y[self.y > 0] = 1  # Convert to binary labels

    def __len__(self):
        """Number of images in the split."""
        return self.X.shape[0]

    def read_data(self, files):
        """
        Reads and stacks our data: every file in ``files`` is loaded with
        ``np.load`` and the arrays are stacked along a new first axis.
        """
        return np.stack([np.load(path) for path in files], axis=0)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair(s) selected by ``idx``."""
        return self.X[idx], self.y[idx]

    def pixelwise(self):
        """
        This method flattens our images to individual pixels, so we can treat
        each pixel as a sample and train our favorite classifier on it.

        Returns:
            A ``(pixels, labels)`` tuple. ``pixels`` has one row per pixel and
            one column per feature (``C`` columns with ``median_of_days``,
            ``T * C`` otherwise); ``labels`` is ``y`` flattened to 1-D.
        """
        if self.median_of_days:
            # (N, C, H, W) -> (N, H, W, C): the feature axis goes last
            channels_last = np.transpose(self.X, (0, 2, 3, 1))
            n_features = self.X.shape[1]
        else:
            # (N, T, C, H, W) -> (N, H, W, T, C): time and feature axes go last
            channels_last = np.transpose(self.X, (0, 3, 4, 1, 2))
            n_features = self.X.shape[1] * self.X.shape[2]
        # The feature count comes from the data rather than being hard-coded,
        # so any number of time steps / features is flattened correctly.
        return channels_last.reshape(-1, n_features), self.y.reshape(-1)

    def pixelwise_test(self):
        """
        Short test for the above method: undoes the flattening and asserts
        that the original ``X`` and ``y`` are recovered exactly.
        """
        pX, py = self.pixelwise()
        if self.median_of_days:
            N, C, H, W = self.X.shape
            tX = np.transpose(pX.reshape(N, H, W, C), (0, 3, 1, 2))
        else:
            N, T, C, H, W = self.X.shape
            tX = np.transpose(pX.reshape(N, H, W, T, C), (0, 3, 4, 1, 2))
        tY = py.reshape(self.y.shape)
        assert np.all(tX == self.X) and np.all(tY == self.y)
        print("All test passed!")

In [None]:
# Root folder that contains the "data" directory -- adjust it to your setup.
base_path = "../"

# Un-normalized datasets: median over the time axis, full multi-class labels.
p_train = PASTISSegmentation(
    image_dir=os.path.join(base_path, "data", "images"),
    annotation_dir=os.path.join(base_path, "data", "annotations"),
    split="train",
    median_of_days=True,
    binary_labels=False,
)
p_test = PASTISSegmentation(
    image_dir=os.path.join(base_path, "data", "images"),
    annotation_dir=os.path.join(base_path, "data", "annotations"),
    split="test",
    median_of_days=True,
    binary_labels=False,
)

# Check that flattening to pixels can be undone without loss.
p_train.pixelwise_test()
p_test.pixelwise_test()

# One row per pixel (features) and one label per pixel.
X_train, y_train = p_train.pixelwise()
X_test, y_test = p_test.pixelwise()

In [None]:
# Root folder that contains the "data" directory -- adjust it to your setup.
base_path = "../"

# Normalization statistics, one value per feature. They are shaped
# (1, 10, 1, 1) so they broadcast along axis 1 of the (N, 10, H, W) images.
# NOTE(review): presumably computed on the training split -- confirm the source.
Xmean = np.array([
    596.57817383, 878.493514, 969.89764811, 1324.39628906, 2368.21767578,
    2715.68257243, 2886.70323486, 2977.03915609, 2158.25386556, 1462.10965169,
]).reshape((1, 10, 1, 1))
Xstd = np.array([
    251.33337853, 289.95055489, 438.725014, 398.7289996, 706.53781626,
    832.72503267, 898.14189979, 909.04165075, 661.66078257, 529.15340992,
]).reshape((1, 10, 1, 1))

# Same datasets as the un-normalized ones, but standardized with Xmean / Xstd.
p_train_norm = PASTISSegmentation(
    image_dir=os.path.join(base_path, "data", "images"),
    annotation_dir=os.path.join(base_path, "data", "annotations"),
    split="train",
    median_of_days=True,
    Xmean=Xmean,
    Xstd=Xstd,
    binary_labels=False,
)
p_test_norm = PASTISSegmentation(
    image_dir=os.path.join(base_path, "data", "images"),
    annotation_dir=os.path.join(base_path, "data", "annotations"),
    split="test",
    median_of_days=True,
    Xmean=Xmean,
    Xstd=Xstd,
    binary_labels=False,
)

# Check that flattening to pixels can be undone without loss.
p_train_norm.pixelwise_test()
p_test_norm.pixelwise_test()

# One row per pixel (features) and one label per pixel.
X_train_norm, y_train_norm = p_train_norm.pixelwise()
X_test_norm, y_test_norm = p_test_norm.pixelwise()


In [None]:
# We now work with individual pixels rather than images: every pixel is one
# sample with 10 features (rows of X) and one class label (entry of y).
tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))

In [None]:
# Normalization only rescales values, so the shapes must match the un-normalized ones
tuple(arr.shape for arr in (X_train_norm, y_train_norm, X_test_norm, y_test_norm))

In [None]:
# But the difference in values is stark!
print(X_train.mean(), X_train_norm.mean())

In [None]:
imd = 5   # index of the training image to display
band = 3  # index (0-based) of the feature channel to display

# Show the selected band of the selected (un-normalized) image
plt.imshow(p_train.X[imd, band])
plt.title(f"Un-normalized training image {imd}, band index {band}")
plt.colorbar()
plt.show()

# Show the labels for the same image
plt.imshow(p_train.y[imd])
plt.title(f"Labels of training image {imd}")
plt.colorbar()
plt.show()

In [None]:
imd = 5   # index of the training image to display
band = 3  # index (0-based) of the feature channel to display

# Show the selected band of the selected (normalized) image
plt.imshow(p_train_norm.X[imd, band])
plt.title(f"Normalized training image {imd}, band index {band}")
plt.colorbar()
plt.show()

# Show the labels for the same image (labels are not affected by normalization)
plt.imshow(p_train_norm.y[imd])
plt.title(f"Labels of training image {imd}")
plt.colorbar()
plt.show()

# Feature importance and normalization in Logistic Regression

In [None]:
# Train a logistic regressor on the unnormalized data.
# penalty=None turns regularization off. It replaces the string 'none', which
# scikit-learn deprecated in 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
logistic_regressor = LogisticRegression(n_jobs=-1, penalty=None, max_iter=100).fit(X_train, y_train)
print("Score:", logistic_regressor.score(X_test, y_test))

# Compute feature importance; the seed makes the random permutations reproducible
model_fi = permutation_importance(logistic_regressor, X_train, y_train, random_state=0)
print("Importance of the 10 features:", model_fi['importances_mean'])

In [None]:
# Train a logistic regressor on the normalized data.
# penalty=None turns regularization off. It replaces the string 'none', which
# scikit-learn deprecated in 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
logistic_regressor_norm = LogisticRegression(n_jobs=-1, penalty=None, max_iter=100).fit(X_train_norm, y_train_norm)
# Score on the held-out test split, like every other model in this notebook,
# so the normalized and un-normalized accuracies are directly comparable.
print("Score:", logistic_regressor_norm.score(X_test_norm, y_test_norm))

# Compute feature importance; the seed makes the random permutations reproducible
model_fi_norm = permutation_importance(logistic_regressor_norm, X_train_norm, y_train_norm, random_state=0)
print("Importance of the 10 features:", model_fi_norm['importances_mean'])

In [None]:
# Train a logistic regressor on the unnormalized data for 1000 iterations
# This will take a long time!!!
# penalty=None turns regularization off. It replaces the string 'none', which
# scikit-learn deprecated in 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
logistic_regressor = LogisticRegression(n_jobs=-1, penalty=None, max_iter=1000).fit(X_train, y_train)
print("Score:", logistic_regressor.score(X_test, y_test))

# Compute feature importance; the seed makes the random permutations reproducible
model_fi = permutation_importance(logistic_regressor, X_train, y_train, random_state=0)
print("Importance of the 10 features:", model_fi['importances_mean'])

# Random Forest

In [None]:
# Fit a random forest on the unnormalized data.
# RandomForestClassifier is imported explicitly at the top of the notebook
# instead of relying on sklearn.ensemble having been loaded as a side effect.
random_forest = RandomForestClassifier(n_jobs=-1, random_state=0).fit(X_train, y_train)
print("Score:", random_forest.score(X_test, y_test))

# Impurity-based importances computed by the forest during training
print("Importance of the 10 features:", random_forest.feature_importances_)

In [None]:
# Fit a random forest on the normalized data.
# RandomForestClassifier is imported explicitly at the top of the notebook
# instead of relying on sklearn.ensemble having been loaded as a side effect.
random_forest_norm = RandomForestClassifier(n_jobs=-1, random_state=0).fit(X_train_norm, y_train_norm)
print("Score:", random_forest_norm.score(X_test_norm, y_test_norm))
# Impurity-based importances computed by the forest during training
print("Importance of the 10 features:", random_forest_norm.feature_importances_)