This notebook includes step-by-step process of running and saving the baseline models.
- Clustering Methods
    - k-means
    - k-medoids
    - agglomerative
- Statistical features
    - Histogram
    - PCA

## Imports

In [30]:
%load_ext autoreload
%autoreload 2

import os, sys
from pathlib import Path
sys.path.append('/home/k64835/Master-Thesis-SITS') # change it according to system path
scripts_path = Path("../Data-Preprocessing/").resolve()
sys.path.append(str(scripts_path))
scripts_path = Path("../Evaluation/").resolve()
sys.path.append(str(scripts_path))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
import pickle
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestCentroid
from scripts.data_visualiser import *
from scripts.data_loader import *
from scripts.data_preprocessor import *
from scripts.temporal_data_preprocessor import *
from scripts.temporal_data_loader import *
from scripts.temporal_visualiser import *
from scripts.temporal_chanel_refinement import *
from model_scripts.model_helper import *
from model_scripts.dataset_creation import *
from model_scripts.train_model_ae import *
from model_scripts.model_visualiser import *
from model_scripts.subpatch_extraction import *
from model_scripts.feature_extraction import *
from model_scripts.clustering import *
from evaluation_scripts.result_visualiser import *
from evaluation_scripts.evaluation_helper import *
from Pipeline.temporal_preprocessing_pipeline import *
from Pipeline.temporal_preprocessing_pipeline import *
from Pipeline.preprocess_script import *
import numpy as np
import config as config
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
import skimage.measure
import torch
import torch.nn as nn
import torch.optim as optim

## Prepare dataset: B10

### Loading the pre-processed data

Data: Extracted and Pre-processed sub-patches

Dimensions: (N, T, C, H, W) = (N, 7, 10, 4, 4)

In [3]:
train_subpatches, eval_subpatches, train_coord_fn, eval_coord_fn = get_model_ready_data(model_type='baseline', tensor_type='b10', encoding_method='sin-cos', visualisation_images=False)
train_subpatches.shape, eval_subpatches.shape

(torch.Size([33128, 7, 10, 4, 4]), torch.Size([1197, 7, 10, 4, 4]))

## 1. K-means Clustering 

Clustering the sub-patches 

In [None]:
kmeans_b10 = kmeans_function(train_subpatches, n_clusters=2, random_state=1)    

train_subpatch_predictions = kmeans_b10.predict(train_subpatches.reshape(train_subpatches.size(0), -1).numpy())
eval_subpatch_predictions = kmeans_b10.predict(eval_subpatches.reshape(eval_subpatches.size(0), -1).numpy())

Clustering Accuracy: Convert sub-patch level labels to patch-level labels and compare with ground truth

In [12]:
# disease, acc, precision, recall, f1_score, f2_score = evaluate_clustering_metrics(eval_coord_fn, eval_subpatch_predictions, config.labels_path, config.subpatch_to_patch_threshold, 'Flattened Data', True) #for saving predictions
disease, acc, precision, recall, f1_score, f2_score = evaluate_clustering_metrics(eval_coord_fn, eval_subpatch_predictions, config.labels_path, config.subpatch_to_patch_threshold)
print("Disease cluster:", disease)
print("Accuracy:",acc)
print("Precision:",precision)
print("Recall:",recall)
print("F1-score:",f1_score)
print("F2-score:", f2_score)

Disease cluster: 1
Accuracy: 63.93
Precision: 62.75
Recall: 91.43
F1-score: 74.42
F2-score: 83.77


Save Model

In [14]:
# with open(config.kmeans_b10_path, 'wb') as file:
#     pickle.dump(kmeans_b10, file)

## 2. Agglomerative Clustering 

Clustering the sub-patches 

In [None]:
agg_b10 = agg_clustering_function(train_subpatches, n_clusters=2)

train_subpatch_predictions = agg_b10.fit_predict(train_subpatches.reshape(train_subpatches.size(0), -1).numpy())
test_subpatch_predictions = agg_b10.fit_predict(eval_subpatches.reshape(eval_subpatches.size(0), -1).numpy())

Clustering Accuracy: Convert sub-patch level labels to patch-level labels and compare with ground truth

In [7]:
disease, acc, precision, recall, f1_score, f2_score = evaluate_clustering_metrics(eval_coord_fn, test_subpatch_predictions, config.labels_path, config.subpatch_to_patch_threshold)
print("Disease cluster:", disease)
print("Accuracy:",acc)
print("Precision:",precision)
print("Recall:",recall)
print("F1-score:",f1_score)
print("F2-score:", f2_score)

Disease cluster: 0
Accuracy: 42.62
Precision: 50.0
Recall: 54.29
F1-score: 52.05
F2-score: 53.37


Save Model

In [None]:
# with open(config.agg_path, 'wb') as file:
#     pickle.dump(agg_b10, file)

## 3. K-medoids Clustering 

Clustering the sub-patches 

In [None]:
kmedoids_b10 = kmedoids_function(train_subpatches, n_clusters=2, random_state=4, metric='manhattan')

train_subpatch_predictions = kmedoids_b10.predict(train_subpatches.reshape(train_subpatches.size(0), -1).numpy())
test_subpatch_predictions = kmedoids_b10.predict(eval_subpatches.reshape(eval_subpatches.size(0), -1).numpy())

Clustering Accuracy: Convert sub-patch level labels to patch-level labels and compare with ground truth

In [24]:
disease, acc, precision, recall, f1_score, f2_score = evaluate_clustering_metrics(eval_coord_fn, test_subpatch_predictions, config.labels_path, config.subpatch_to_patch_threshold)
print("Disease cluster:", disease)
print("Accuracy:",acc)
print("Precision:",precision)
print("Recall:",recall)
print("F1-score:",f1_score)
print("F2-score:", f2_score)

Disease cluster: 1
Accuracy: 67.21
Precision: 85.71
Recall: 51.43
F1-score: 64.29
F2-score: 55.9


Save Model

In [25]:
# with open(config.kmedoids_path, 'wb') as file:
#     pickle.dump(kmedoids_b10, file)

## 4. Feature Extraction using Channel-wise Histrograms

### Feature extraction 

In [33]:
histogram_features_train = extract_global_histogram(train_subpatches, bins=34)
histogram_features_eval = extract_global_histogram(eval_subpatches, bins=34)
histogram_features_train.shape

(33128, 34)

### Modeling: k-means

In [34]:
kmeans_b10 = kmeans_function(histogram_features_train, n_clusters=2, random_state=31)

train_subpatch_predictions = kmeans_b10.predict(histogram_features_train.reshape(histogram_features_train.shape[0],-1))
eval_subpatch_predictions = kmeans_b10.predict(histogram_features_eval.reshape(histogram_features_eval.shape[0],-1))

### Evaluation

In [35]:
disease, acc, precision, recall, f1_score, f2_score = evaluate_clustering_metrics(eval_coord_fn, eval_subpatch_predictions, config.labels_path, config.subpatch_to_patch_threshold)
print("Disease cluster:", disease)
print("Accuracy:",acc)
print("Precision:",precision)
print("Recall:",recall)
print("F1-score:",f1_score)
print("F2-score:", f2_score)

Disease cluster: 0
Accuracy: 52.46
Precision: 55.77
Recall: 82.86
F1-score: 66.67
F2-score: 75.52


Save Model

In [None]:
# with open(config.kmeans_hist_path, 'wb') as file:
#     pickle.dump(kmeans_b10, file)

## 5. Feature Reduction using PCA

### Feature extraction 

Here, we apply PCA to channel dimension of every data sample, and get top 3 channels per sample.
The resulting list of features has a size of (H * W * T * 3) per sample. 
We also print the overall top 3 channels for train and test data that was retained by PCA.

Channels retained in decreasing order of importance: 6, 8 and 5, corresponding to bands 8 (NIR), 11 (SWIR 1), and 7 (Red-edge 3).

In [None]:
features_train, top_channel_indices_train = pca_feature_extraction_channel(train_subpatches, n_components=config.pca_components) 
features_eval, top_channel_indices_eval = pca_feature_extraction_channel(eval_subpatches, n_components=config.pca_components)

top_channel_indices_train, top_channel_indices_eval

([np.int64(6), np.int64(8), np.int64(5)],
 [np.int64(6), np.int64(8), np.int64(5)])

### Modeling: k-means

In [None]:
kmeans_b10 = kmeans_function(np.array(features_train), n_clusters=2, random_state=32)

train_subpatch_predictions = kmeans_b10.predict(np.array(features_train))
eval_subpatch_predictions = kmeans_b10.predict(np.array(features_eval))

### Evaluation

In [None]:
disease, acc, precision, recall, f1_score, f2_score = evaluate_clustering_metrics(eval_coord_fn, eval_subpatch_predictions, config.labels_path, config.subpatch_to_patch_threshold)
print("Disease cluster:", disease)
print("Accuracy:",acc)
print("Precision:",precision)
print("Recall:",recall)
print("F1-score:",f1_score)
print("F2-score:", f2_score)

Disease cluster: 0
Accuracy: 57.38
Precision: 58.82
Recall: 85.71
F1-score: 69.77
F2-score: 78.53


Save Model

In [None]:
# with open(config.kmeans_pca_path, 'wb') as file:
#     pickle.dump(kmeans_b10, file)