In [None]:
### Autoreload all modules ###
%load_ext autoreload
%autoreload 2


### Import necessary libraries ###
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


### Set base path ###
# Search the current working directory and then each of its ancestors for a
# `.git` entry; the first directory that has one is used as the repository root.
# `Path.parents` is a finite sequence, so the search stops at the filesystem
# root instead of looping forever (the parent of the root is the root itself).
start_dir = Path(os.getcwd())
base_path = next(
    (candidate for candidate in (start_dir, *start_dir.parents) if (candidate / '.git').exists()),
    None,
)
if base_path is None:
    # Fail loudly: every path used further down is built from base_path.
    raise FileNotFoundError(
        f"No '.git' entry found in {start_dir} or any of its parent directories; "
        "start the notebook from inside the repository."
    )
print('Base path: ', base_path)


### Import custom functions ###
# Make the project's helper modules importable by appending their folders
# (given relative to the repository root) to the module search path.
for module_subdir in ('src/data/preprocessing', 'src/visualization'):
    sys.path.append(str(base_path / module_subdir))

from utils import get_files, parse_filename, load_mask
from preprocessing_plots import plot_images
from create_dataset import run_clustering, copy_to_final_directory, compute_stats_for_bands, create_datasets
from histogram_eval import compute_histograms, compute_average_histograms, calculate_wasserstein_all

## Geographical clustering

In [None]:
# Directory with the difference masks, built from the repository root so the
# notebook does not depend on one machine's absolute path (same location that
# the copy step further down uses).
diff_mask_dir = str(base_path / 'data/UKR/filtered/diff_masks')

# Request 8 clusters from the project helper. random_state is fixed, presumably
# to make the clustering reproducible — confirm in run_clustering.
clusters, features, data = run_clustering(diff_mask_dir, nclusters=8, mask_type='diff_mask', random_state=30)

# Show every cluster label together with the number of samples assigned to it.
print("Unique clusters: ", np.unique(clusters, return_counts=True))

In [None]:
# Scatter plot of the clustered samples: one panel per tile, points coloured by
# cluster, and a legend that shows which split each cluster is assigned to.
fig, axes = plt.subplots(1, 2, figsize=(15, 6), sharey=True, sharex=True)
unique_clusters = np.unique(clusters)
# One more colour is sampled than there are clusters; presumably because the
# hsv colormap is cyclic, so the last sample would repeat the first colour.
colors = plt.cm.hsv(np.linspace(0, 1, len(unique_clusters) + 1))
# Cluster label -> colour, built once instead of searching a list per sample.
cluster_color = dict(zip(unique_clusters.tolist(), colors))

# Split membership of every cluster; also consumed by the dataset-creation cell.
cluster_assignments = {
    'training': [1, 3, 5, 7],  # 398
    'validation': [0, 4],      # 161
    'test': [2, 6]             # 134
}

y_min = np.min(features[:, 1])
y_max = np.max(features[:, 1])

titles = ['36UXA', '36UYA']

for i, ax in enumerate(axes):
    # Column 2 of `features` is compared with the panel index, i.e. it is used
    # here as the tile index of each sample.
    mask = features[:, 2] == i
    cluster_colors = [cluster_color[cl] for cl in clusters[mask]]
    ax.scatter(features[mask, 0], features[mask, 1], c=cluster_colors, alpha=0.5)
    ax.set_title(f'Tile {titles[i]}')
    ax.grid(True)
    # The maximum is passed as the lower limit, which inverts the y-axis.
    ax.set_ylim(y_max, y_min)

# Empty scatter calls only create the legend handles, one per cluster.
for uc in unique_clusters:
    # First split that lists this cluster; a cluster missing from
    # cluster_assignments is labelled instead of raising IndexError.
    split_name = next(
        (split for split, members in cluster_assignments.items() if uc in members),
        'unassigned',
    )
    axes[0].scatter([], [], color=cluster_color[uc], label=f'{uc}: {split_name}')

axes[0].legend(title='Clusters')

plt.tight_layout()
plt.show()


## Create training/validation/test set assignment 

In [None]:
# Date strings (apparently in YYYYMMDD form) handed to create_datasets: one list
# for the validation/test splits and one for the training split.
# NOTE(review): "20190606" is listed twice in valtest_dates — confirm whether one
# occurrence was meant to be a different date.
valtest_dates = ["20170909", "20180601", "20180616", "20180815", "20190606", "20160731", "20190407", "20190606", "20190701", "20190904"]
# NOTE(review): "20171909" is not a valid YYYYMMDD date (month 19); possibly a typo
# for "20170919" — confirm against the mask filenames before changing it.
train_dates = ["20170825", "20171909", "20180422", "20180508", "20180731", "20180830", "20180919", "20190427", "20160522", "20160830", "20190517", "20190611", "20190616", "20190820", "20190909"]

# Build the three splits from the cluster assignment and the date lists.
# NOTE(review): the result is unpacked as (train, test, val, df) — confirm this
# matches the return order of create_datasets.
train_data, test_data, val_data, df = create_datasets(diff_mask_dir, cluster_assignments, clusters, data, train_dates, valtest_dates, mask_type='diff_mask')


## Visualize the val/test/train distribution

Deforestation change distribution of train, val and test data

In [None]:
# Overlay the "Deforestation" distribution of the three splits.
# Left panel: every row. Right panel: only rows whose value is greater than 0.
fig, axs = plt.subplots(1, 2, figsize=(15, 6))

split_styles = (
    (train_data, 'b', 'Train'),
    (val_data, 'r', 'Validation'),
    (test_data, 'g', 'Test'),
)
panel_titles = (
    'Deforestation change (# of pixels per mask, all locations)',
    'Deforestation change (# of pixels per mask, excl. locations with no change)',
)

for ax, only_changed, panel_title in zip(axs, (False, True), panel_titles):
    for split_frame, colour, split_label in split_styles:
        if only_changed:
            values = split_frame[split_frame['Deforestation'] > 0]["Deforestation"]
        else:
            values = split_frame["Deforestation"]
        # Each split is binned independently into 20 bins and normalised to a density.
        ax.hist(values, bins=20, alpha=0.5, color=colour, label=split_label, density=True)
    ax.set_xlabel('Deforestation change (in pixels)')
    ax.set_ylabel('Density')
    ax.set_title(panel_title)
    ax.legend()

plt.tight_layout()
plt.show()

Cloud cover distribution of train, val and test data

In [None]:
# Overlay the summed cloud cover (Cloud1 + Cloud2, scaled by 100) of the three splits.
# Left panel: every row. Right panel: rows where at least one of the two values
# is greater than 0.
fig, axs = plt.subplots(1, 2, figsize=(15, 6))

split_styles = (
    (train_data, 'b', 'Train'),
    (val_data, 'r', 'Validation'),
    (test_data, 'g', 'Test'),
)
panel_titles = (
    'Cloud cover (date A + date B per location, all locations)',
    'Cloud cover (date A + date B per location, excl. locations with no clouds)',
)

for ax, drop_cloud_free, panel_title in zip(axs, (False, True), panel_titles):
    for split_frame, colour, split_label in split_styles:
        total_cloud = split_frame["Cloud1"]*100 + split_frame["Cloud2"]*100
        if drop_cloud_free:
            # Filter the already-summed series with one mask. Adding two series
            # that were filtered separately makes pandas align them on the
            # index, which turns every row with clouds on only one date into
            # NaN and so drops it from the histogram.
            has_clouds = (split_frame['Cloud1'] > 0) | (split_frame['Cloud2'] > 0)
            total_cloud = total_cloud[has_clouds]
        ax.hist(total_cloud, bins=20, alpha=0.5, color=colour, label=split_label, density=True)
    ax.set_xlabel('Cloud cover in %')
    ax.set_ylabel('Density')
    ax.set_title(panel_title)
    ax.legend()

plt.tight_layout()
plt.show()

Season / year-type distribution

In [None]:
# Grouped bar charts comparing, per split, the percentage of rows in each
# category of three categorical columns.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
cats = ['Season1', 'Season2', 'Year Type']
titles = ['Season distribution of Image A', 'Season distribution of Image B', 'Year type distribution between A and B']
width = 0.25  # bar width; three bars sit side by side per category

for ax, cat, title in zip(axs, cats, titles):
    train_percent = train_data[cat].value_counts(normalize=True) * 100
    val_percent = val_data[cat].value_counts(normalize=True) * 100
    test_percent = test_data[cat].value_counts(normalize=True) * 100

    # Union of the categories seen in any split, sorted so the bar order is the
    # same on every run (set iteration order is not stable across sessions).
    # key=str keeps the sort usable even if the labels are not all one type.
    all_categories = sorted(
        set(train_percent.index) | set(val_percent.index) | set(test_percent.index),
        key=str,
    )

    # Give all three series the same category order; absent categories count as 0 %.
    train_percent = train_percent.reindex(all_categories, fill_value=0)
    val_percent = val_percent.reindex(all_categories, fill_value=0)
    test_percent = test_percent.reindex(all_categories, fill_value=0)

    x = range(len(all_categories))

    ax.bar(x, train_percent, width=width, color='b', alpha=0.5, label='Train')
    ax.bar([p + width for p in x], val_percent, width=width, color='r', alpha=0.5, label='Validation')
    ax.bar([p + width*2 for p in x], test_percent, width=width, color='g', alpha=0.5, label='Test')
    ax.set_xlabel(cat)
    ax.set_ylabel('Percentage')
    ax.set_title(title)
    # Ticks are placed under the middle (validation) bar of each group.
    ax.set_xticks([p + width for p in x])
    ax.set_xticklabels(all_categories)
    ax.legend()

plt.tight_layout()
plt.show()


## Copy the dataset to the final destination

In [None]:
# Source masks and destination root, both built from the repository root.
diff_mask_dir = str(base_path / 'data/UKR/filtered/diff_masks')
final_dir = str(base_path / 'data/UKR/final_datasets/change_test')

# One sub-directory per split underneath the destination root.
train_final_dir, val_final_dir, test_final_dir = (
    os.path.join(final_dir, split_name) for split_name in ('train', 'val', 'test')
)

# Hand each split to the project helper, in the order train, val, test.
for split_frame, split_dir, split_name in (
    (train_data, train_final_dir, 'train'),
    (val_data, val_final_dir, 'val'),
    (test_data, test_final_dir, 'test'),
):
    copy_to_final_directory(split_frame, diff_mask_dir, split_dir, split_type=split_name, save_format='tif')

## Percentile calculation

In [None]:
# Folder with the training images and the location passed on as save_path.
# NOTE(review): these paths point at `change_new`, whereas the copy step of this
# notebook writes to `change_test` — confirm that this is intended.
images_files_dir = str(base_path / 'data/UKR/final_datasets/change_new/train')
save_path = str(base_path / 'data/UKR/final_datasets/change_new/percentiles/train')

# Ask the project helper for the 0th and 100th percentile and keep the result.
# save=False is passed, so presumably nothing is written to save_path — confirm
# in compute_stats_for_bands.
stats = compute_stats_for_bands(
    images_files_dir,
    save=False,
    save_path=save_path,
    percentiles=[0, 100],
    return_values=True,
    file_type='tif',
)