In [1]:
from osgeo import gdal
import numpy as np
import pandas as pd

In [2]:
cities = ["darmstadt", "frankfurt_am_main", "freiburg_am_breisgau", "heidelberg",
          "heilbronn", "karlsruhe", "mainz", "munchen", "stuttgart", "wurzburg",
          "tubingen"]

classes = ["artificial", "agricultural", "forest", "wetlands", "water"]

# Pixel counts per city (rows) and raster label value (columns): one column for
# every entry in `classes`, plus a leading column for label value 0.
# NOTE(review): label 0 presumably marks unannotated pixels -- confirm.
distr = np.zeros((len(cities), len(classes) + 1))

for city_ix, city in enumerate(cities):
    annotation_path = f"annotations/{city}_anno.tif"
    label_data = gdal.Open(annotation_path)
    if label_data is None:
        # Unless gdal.UseExceptions() is active, gdal.Open reports failure by
        # returning None; fail here with the offending path instead of an
        # AttributeError on the next line.
        raise FileNotFoundError(f"Could not open annotation raster: {annotation_path}")
    labels = label_data.GetRasterBand(1).ReadAsArray()

    # A single pass yields every label value together with its pixel count,
    # instead of rescanning the whole raster once per label value.
    label_values, pixel_counts = np.unique(labels, return_counts=True)
    distr[city_ix, label_values] = pixel_counts

    # Drop the references before moving on to the next city
    # (presumably so GDAL can release the dataset and the raster memory).
    labels = None
    label_data = None

In [3]:
# Skip the first column (label value 0); only the remaining columns are
# counted as annotated pixels.
distr_valid = distr[:, 1:]
print(
    f"We are working with {int(distr_valid.sum())} annotated pixels in total."
)

We are working with 243469334 annotated pixels in total.


In [4]:
# Per-city pixel counts for every class, with the row sum kept in a "total" column.
df = pd.DataFrame(distr_valid, index=cities, columns=classes).assign(
    total=lambda counts: counts.sum(axis=1)
)

# Show each city's class shares: class counts divided by that city's total.
display(df.loc[:, classes].div(df["total"], axis=0))

Unnamed: 0,artificial,agricultural,forest,wetlands,water
darmstadt,0.194228,0.411061,0.39051,0.000986,0.003216
frankfurt_am_main,0.201085,0.404456,0.38614,0.000187,0.008133
freiburg_am_breisgau,0.114357,0.395892,0.481119,1.7e-05,0.008615
heidelberg,0.190998,0.401007,0.397539,5.1e-05,0.010405
heilbronn,0.163132,0.561319,0.269775,6e-05,0.005715
karlsruhe,0.213525,0.409267,0.353717,0.000662,0.02283
mainz,0.201067,0.626139,0.146821,4.1e-05,0.025933
munchen,0.177855,0.517892,0.274555,0.000287,0.029411
stuttgart,0.227683,0.438433,0.330316,4e-06,0.003564
wurzburg,0.101241,0.509589,0.381665,6.4e-05,0.007441


In [5]:
def compute_distribution(test_cities, all_cities, index, data=None):
    """Compare the class distribution of a held-out city subset with the rest.

    Parameters
    ----------
    test_cities : iterable of str
        Cities to hold out (second row of the result).
    all_cities : iterable of str
        Pool of cities being split; everything in it that is not in
        ``test_cities`` forms the first row of the result.
    index : sequence of two labels
        Row labels for the result, in the order (remaining, held out).
    data : pandas.DataFrame, optional
        Per-city counts indexed by city name, with a ``"total"`` column.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Two rows. Every column of ``data`` is summed over the subset and
        divided by the subset's summed ``total``; ``dataset_percentage`` is the
        subset's share of the pool's summed ``total``.

    Raises
    ------
    ValueError
        If ``test_cities`` contains a city that is not in ``all_cities``.
    """
    counts = df if data is None else data

    pool = set(all_cities)
    held_out = set(test_cities)
    unknown = held_out - pool
    if unknown:
        # Such cities would be counted in the held-out row and in the grand
        # total without belonging to the pool being split.
        raise ValueError(f"test_cities not contained in all_cities: {sorted(unknown)}")

    # Sorting makes the row selection independent of set iteration order.
    remaining_sums = counts.loc[sorted(pool - held_out)].sum(axis=0)
    held_out_sums = counts.loc[sorted(held_out)].sum(axis=0)
    pool_total = remaining_sums["total"] + held_out_sums["total"]

    df_distr = pd.DataFrame(
        data=[
            remaining_sums / remaining_sums["total"],
            held_out_sums / held_out_sums["total"],
        ],
        index=index,
    )
    df_distr["dataset_percentage"] = [
        remaining_sums["total"] / pool_total,
        held_out_sums["total"] / pool_total,
    ]

    return df_distr

In [6]:
# Candidate split: hold out two cities for testing, keep the rest as train-val.
df_distr = compute_distribution(
    test_cities={"heidelberg", "frankfurt_am_main"},
    all_cities=set(cities),
    index=["train-val", "test"],
)

display(df_distr)

Unnamed: 0,artificial,agricultural,forest,wetlands,water,total,dataset_percentage
train-val,0.169743,0.480738,0.334751,0.000187,0.01458,1.0,0.775391
test,0.198929,0.403718,0.388577,0.000158,0.008619,1.0,0.224609


Other promising test-set options: ["heidelberg", "freiburg_am_breisgau", "mainz", "darmstadt"]

Suppose we use Heidelberg and Frankfurt am Main as our test set. Let's now split the remaining train-val cities into training and validation sets.

In [7]:
# Cities held out as the test split.
test_cities = [
    "heidelberg",
    "frankfurt_am_main",
]

In [8]:
# Candidate split of the train-val pool: the first argument is the validation
# subset, the second is every city that is not reserved for testing.
df_distr = compute_distribution(
    {"freiburg_am_breisgau", "darmstadt", "mainz"},
    set(cities).difference(test_cities),
    ["train", "val"],
)

display(df_distr)

Unnamed: 0,artificial,agricultural,forest,wetlands,water,total,dataset_percentage
train,0.175095,0.48994,0.319281,0.000177,0.015507,1.0,0.804358
val,0.147742,0.442907,0.398357,0.000226,0.010768,1.0,0.195642


Other promising validation-set options: ["freiburg_am_breisgau", "karlsruhe"]

Let's choose Freiburg am Breisgau, Darmstadt, and Mainz as the validation set, such that we have one bigger city in each subset.

In [9]:
# Cities held out as the validation split.
validation_cities = [
    "freiburg_am_breisgau",
    "darmstadt",
    "mainz",
]

The remaining cities form the training set.

In [10]:
# Every city that is not held out for testing or validation is used for training.
# Filtering `cities` (rather than subtracting sets) keeps the order of `cities`,
# so the list is identical on every kernel run instead of following set
# iteration order.
held_out_cities = set(test_cities) | set(validation_cities)
train_cities = [city for city in cities if city not in held_out_cities]
display(train_cities)

['karlsruhe', 'stuttgart', 'tubingen', 'munchen', 'wurzburg', 'heilbronn']

Final distribution:
* Train cities (62.37%): Karlsruhe, Munich, Stuttgart, Würzburg, Heilbronn, Tübingen
* Validation cities (15.17%): Freiburg am Breisgau, Darmstadt, Mainz
* Test cities (22.46%): Heidelberg, Frankfurt am Main