In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from omegaconf import OmegaConf
import pandas as pd

from src.constants import AOIS_TEST
from src.data import UNOSAT_S1TS_Dataset
from src.classification.model_factory import load_model
from src.classification.trainer import S1TSDD_Trainer

def extract_features(df, start, end, prefix=""):
    """Summarise every time series over a date window, separately per band.

    Args:
        df: DataFrame whose columns can be label-sliced with ``start:end``
            (the columns are datetimes) and whose row index has a level
            named ``band`` containing the values ``"VV"`` and ``"VH"``.
        start: first column label of the window.
        end: last column label of the window.
        prefix: tag inserted into the output column names
            (eg pre/post/pre_3x3, ...).

    Returns:
        DataFrame indexed like ``df`` without the ``band`` level, holding the
        columns ``VV_{prefix}_{stat}`` followed by ``VH_{prefix}_{stat}`` for
        each statistic computed below.
    """
    # Restrict the series to the requested window (label-based column slice).
    window = df.loc[:, start:end]

    # Row-wise summary statistics; each name is also a DataFrame method.
    stat_names = ("mean", "std", "median", "min", "max", "skew", "kurt")
    stats = pd.DataFrame(index=window.index)
    for stat in stat_names:
        stats[stat] = getattr(window, stat)(axis=1)

    # One block of columns per band, named <band>_<prefix>_<stat>.
    per_band = []
    for band in ("VV", "VH"):
        band_stats = stats.xs(band, level="band")
        band_stats.columns = [f"{band}_{prefix}_{col}" for col in band_stats.columns]
        per_band.append(band_stats)
    return pd.concat(per_band, axis=1)

# Experiment configuration: model choice, data selection and reproducibility seed.
cfg = OmegaConf.create(
    {
        "aggregation_method": "mean",
        "model_name": "random_forest",
        "model_kwargs": {
            "n_estimators": 100,
            "n_jobs": 12,
        },
        "data": {
            "aois_test": AOIS_TEST,
            "damages_to_keep": [1, 2, 3],
            "extract_winds": ["3x3"],  # ['1x1', '3x3', '5x5']
            # percentage of negative labels to add in training set (eg 0.1 for 10%)
            "random_neg_labels": 0.1,
            # (start, end) dates of the pre/post windows used for positive samples
            "time_periods_pos": {
                "pre": ("2021-04-01", "2021-11-01"),
                "post": ("2022-04-01", "2022-11-01"),
            },
            # (start, end) dates of the pre/post windows used for negative samples
            "time_periods_neg": {
                "pre": ("2020-04-01", "2020-11-01"),
                "post": ("2021-04-01", "2021-11-01"),
            },
        },
        "seed": 123,
        "run_name": None,
    }
)

# Build the dataset (features come from `extract_features`), the model and the trainer.
ds = UNOSAT_S1TS_Dataset(cfg.data, extract_features=extract_features)
model = load_model(cfg)
trainer = S1TSDD_Trainer(
    ds,
    model,
    aggregation=cfg.aggregation_method,
    seed=cfg.seed,
    verbose=1,
)


In [None]:
# Run the trainer's cross-validation; the per-fold prediction tables are read
# back from `trainer.df_preds_cv` / `trainer.df_preds_agg_cv` in the next cell.
trainer.train_cv()

In [None]:
def concat_df_preds_cv(df_preds):
    """Stack per-fold prediction tables into one DataFrame with a ``fold`` column.

    Args:
        df_preds: iterable of DataFrames, one per cross-validation fold, given
            in fold order.

    Returns:
        The row-wise concatenation of all tables, each row tagged with the
        1-based number of the fold it came from in a ``fold`` column.

    Raises:
        ValueError: if ``df_preds`` is empty (raised by ``pd.concat``).
    """
    # `assign` returns a tagged copy, so the caller's DataFrames (here the
    # trainer's own attributes) are no longer modified as a side effect.
    return pd.concat(
        [df.assign(fold=fold) for fold, df in enumerate(df_preds, start=1)]
    )

# Merge the per-fold prediction tables stored on the trainer into single
# DataFrames carrying a `fold` column (raw and aggregated predictions).
df_preds_cv = concat_df_preds_cv(trainer.df_preds_cv)
df_preds_agg_cv = concat_df_preds_cv(trainer.df_preds_agg_cv)

In [None]:
# Preview the first rows of the concatenated cross-validation predictions.
df_preds_cv.head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from src.classification.utils import compute_metrics
from collections import defaultdict

def compute_metrics_across_folds_for_various_thresholds(
        df,
        metrics=['recall', 'precision'],
        thresholds=np.arange(0.5, 1, 0.05)
):
    """Average classification metrics over folds for a range of thresholds.

    Args:
        df: predictions with the columns ``fold``, ``label`` and ``preds_proba``.
        metrics: keys to read from the dict returned by ``compute_metrics``.
        thresholds: cut-offs; a sample is predicted positive when its
            ``preds_proba`` is greater than or equal to the cut-off.

    Returns:
        DataFrame indexed by ``thresholds`` with, for every metric ``m``, a
        column ``m`` (mean over folds) and ``m_std`` (standard deviation over
        folds, as computed by ``np.std``).
    """
    summary = {}
    for threshold in thresholds:
        # One score dict per fold, with predictions binarised at `threshold`.
        fold_scores = [
            compute_metrics(
                fold_df['label'],
                (fold_df['preds_proba'] >= threshold).astype(int),
                verbose=0,
            )
            for _, fold_df in df.groupby('fold')
        ]
        if not fold_scores:
            # No fold at all: nothing to aggregate for this threshold.
            continue
        # dict.fromkeys keeps the requested order and visits each metric once.
        for m in dict.fromkeys(metrics):
            values = [scores[m] for scores in fold_scores]
            summary.setdefault(m, []).append(np.mean(values))
            summary.setdefault(m + "_std", []).append(np.std(values))
    return pd.DataFrame(summary, index=thresholds)



def plot_metrics_curves(df_preds, metrics = ['recall', 'precision', 'f1'], agg=False):
    """Plot fold-averaged metrics (with a +/- 2 std band) against the threshold.

    Args:
        df_preds: predictions with the columns ``fold``, ``label`` and
            ``preds_proba``, as expected by
            ``compute_metrics_across_folds_for_various_thresholds``.
        metrics: names of the metrics to draw, one curve and one band each.
        agg: when True, " (aggregated)" is appended to the figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    thresholds = np.arange(0.5, 1, 0.05)
    df_metrics = compute_metrics_across_folds_for_various_thresholds(
        df_preds, metrics=metrics, thresholds=thresholds
    )

    # Written as an escape so the legend cannot be corrupted by a file
    # re-encoding (the label previously rendered as a garbled character pair).
    plus_minus = "\u00b1"

    _, ax = plt.subplots(figsize=(10,6))
    for m in metrics:
        mean = df_metrics[m]
        spread = 2 * df_metrics[m + "_std"]
        ax.plot(thresholds, mean, label=m)
        ax.fill_between(
            thresholds,
            mean - spread,
            mean + spread,
            alpha=0.2,
            label=f"{m}{plus_minus}2*std",
        )

    ax.set_xlabel("Threshold")
    ax.set_ylabel("Score")
    ax.legend(loc='lower left')
    # Report the real number of folds instead of assuming there are five.
    n_folds = df_preds['fold'].nunique()
    title = f'Precision vs Recall for different thresholds across {n_folds} folds'
    if agg:
        title += " (aggregated)"
    ax.set_title(title)
    plt.show()

In [None]:
# Threshold curves for the aggregated cross-validation predictions.
plot_metrics_curves(df_preds_agg_cv, metrics = ['recall', 'precision', 'f1'], agg=True)

In [None]:
# Run the trainer's train-and-test routine; the trailing `;` hides the cell's
# return value. The aggregated predictions are read from the trainer below.
trainer.train_and_test();

In [None]:
# Aggregated predictions stored on the trainer (presumably filled by the
# `train_and_test()` call above — confirm against the trainer implementation).
df_preds_agg = trainer.df_preds_agg

In [None]:
# Alternative kept for reference: split the cross-validation predictions instead.
# df_neg = df_preds_agg_cv[df_preds_agg_cv['label'] == 0].copy()
# df_pos = df_preds_agg_cv[df_preds_agg_cv['label'] == 1].copy()

# Independent copies of the aggregated predictions, split by label (0 vs 1).
df_neg = df_preds_agg.loc[df_preds_agg['label'].eq(0)].copy()
df_pos = df_preds_agg.loc[df_preds_agg['label'].eq(1)].copy()

In [None]:
# plot number of false positive per threshold
# Thresholds 0.50, 0.55, ..., 0.90: built on a x10 scale and divided afterwards
# to overcome floating point precision issues.
thresholds = np.arange(5,9.5,0.5)/10

# Number of label-0 (resp. label-1) samples predicted positive at each threshold.
fps = [len(df_neg[df_neg['preds_proba'] >= t]) for t in thresholds]
tps = [len(df_pos[df_pos['preds_proba'] >= t]) for t in thresholds]


def _plot_counts_per_threshold(ax, thresholds, counts, total, name):
    """Draw `counts` against `thresholds` on a log-y axis and annotate each point.

    Args:
        ax: matplotlib Axes to draw on.
        thresholds: x values, one per count.
        counts: number of samples at or above each threshold.
        total: size of the population the counts are drawn from; used for the
            percentage shown next to every point.
        name: quantity being plotted; used for the legend entry, the y label
            and the panel title.
    """
    ax.semilogy(thresholds, counts, label=name)
    ax.set_xlabel("Threshold")
    ax.set_ylabel(name)
    ax.legend(loc='lower left')
    ax.set_title(f"Number of {name} per threshold")
    # Add a text box at every threshold showing the count and its share of `total`.
    for t, count in zip(thresholds, counts):
        # An empty population would otherwise raise ZeroDivisionError.
        pct = 100 * count / total if total else 0.0
        ax.text(t, count, f'{count} ({pct:.2f}%)', ha='left', va='bottom')
    ax.grid(linestyle='--', linewidth=0.5)


_, axs = plt.subplots(2,1,figsize=(10,10))
# Top panel: false positives; bottom panel: true positives.
_plot_counts_per_threshold(axs[0], thresholds, fps, len(df_neg), "False Positives")
_plot_counts_per_threshold(axs[1], thresholds, tps, len(df_pos), "True Positives")
plt.show()

In [None]:
# histogram of preds_proba
# Distribution (50 bins) of the predicted probabilities of the label-0 samples.
df_neg['preds_proba'].hist(bins=50)

In [None]:
# The 10 label-0 samples with the highest predicted probability.
df_neg.sort_values('preds_proba', ascending=False).head(10)

In [None]:
# NOTE(review): scratch arithmetic with unexplained constants — document what
# these numbers represent or remove the cell.
5*8.2*5 + 61.57

In [None]:
from src.data import load_unosat_labels

# Load the labels for 'UKR1' and open an interactive map of the geometry of the
# row with index 10951 (presumably a GeoDataFrame, given `.explore()` — confirm).
labels = load_unosat_labels('UKR1')
labels[['geometry']].loc[[10951]].explore()
# Alternative selection of several rows, kept for reference:
# labels[['geometry']].loc[[22173, 22165, 22189, 22164, 22175]].explore()

In [None]:
from src.visualization.time_series import plot_ts_from_id, plot_all_ts_from_id

# Plot the time series for id 10951 in 'UKR1' (same id as explored on the map above).
plot_all_ts_from_id('UKR1', 10951)

# Find best train/test split

In [None]:
from src.data.utils import aoi_to_city
# Look up the city associated with the AOI code 'UKR15'.
aoi_to_city('UKR15')

In [None]:
from src.data import load_unosat_labels

# Candidate held-out AOIs and the damage values taken into account.
aois_test = ["UKR6", "UKR7", "UKR8", "UKR10", "UKR12", "UKR14"]
labels = [1,2,3]
df = load_unosat_labels(labels_to_keep=labels)

# Number of labels per (aoi, damage), then one row per AOI / one column per damage value.
grouped = (
    df.groupby(['aoi', 'damage'])
    .size()
    .reset_index(name='counts')
)
df_count = (
    grouped
    .pivot(index='aoi', columns='damage', values='counts')
    .fillna(0)
    .astype(int)
)

# Per-damage totals inside the test AOIs, outside of them, and overall.
n_test = df_count.loc[aois_test].sum()
n_train = df_count.drop(aois_test).sum()
n_tot = df_count.sum()

# Report the size and share of each split, per damage value and in total.
for d in labels:
    print(f"Damage {d}:")
    for split_name, split_counts in (("Train", n_train), ("Test", n_test)):
        print(f"  - {split_name}: {split_counts[d]} ({100*split_counts[d]/n_tot[d]:.2f}%)")
print('Total')
for split_name, split_counts in (("Train", n_train), ("Test", n_test)):
    print(f"  - {split_name}: {split_counts.sum()} ({100*split_counts.sum()/n_tot.sum():.2f}%)")

In [None]:
# Total number of labels per AOI (summed over the damage columns).
df_count.sum(axis=1)

In [None]:
# Grand total of labels over all AOIs and damage values.
df_count.sum().sum()

In [None]:
from src.data.utils import aoi_orbit_iterator

# Print the number of labels for every (aoi, orbit) pair and accumulate the total.
# NOTE(review): the count is filtered by AOI only, so an AOI yielded with several
# orbits contributes its labels once per orbit — confirm this is intended.
labels = [1,2,3]
df = load_unosat_labels(labels_to_keep=labels)
count = 0
for aoi, orbit in aoi_orbit_iterator():
    n = df[df.aoi==aoi].shape[0]
    print(aoi ,orbit, n)
    count+=n
# Display the accumulated total as the cell output.
count

In [None]:
# Reload the labels (damage values 1-3) and count how many share each `date` value.
# Note that `labels` is rebound here from a list to the loaded table.
labels = load_unosat_labels(labels_to_keep=[1,2,3])
labels.date.value_counts()

In [None]:
# Same date counts, broken down per AOI.
labels.groupby('aoi').date.value_counts()

In [None]:
# Print the city associated with each AOI code of interest.
aois = ["UKR6", "UKR7", "UKR8", "UKR12", "UKR15", "UKR16"]
from src.data.utils import aoi_to_city
for aoi in aois:
    print(aoi, aoi_to_city(aoi))

In [None]:
# Bar chart of the number of labels per `date` value.
fig, ax = plt.subplots(figsize=(10,6))
# Count once and reuse the result for both the bar positions and their heights.
date_counts = labels.date.value_counts()
dates = date_counts.index
values = date_counts.values
ax.bar(dates, values, width=2)
ax.set_xlabel('Date')
ax.set_ylabel('Count')
# Title previously read 'HDate of analysis' (stray leading character).
ax.set_title('Date of analysis')
plt.tight_layout()
plt.show()

In [None]:
import geopandas as gpd
from src.constants import PROCESSED_PATH
from src.data.unosat import assign_bins_to_labels

# Processed label file -> fresh 0..n-1 index -> bin assignment -> rows of AOI 'UKR7' only.
labels_fp = PROCESSED_PATH / "unosat_labels.feather"
gdf = (
    gpd.read_feather(labels_fp)
    .reset_index(drop=True)
    .pipe(assign_bins_to_labels)
    .loc[lambda frame: frame.aoi == 'UKR7']
)

In [None]:
# Rows whose `damage` value is numerically greater than their `prev_damage`,
# ignoring rows where `prev_damage` is -99 (presumably a "no previous value"
# sentinel — confirm against `assign_bins_to_labels`).
gdf[(gdf.prev_damage!=-99)&(gdf.damage>gdf.prev_damage)][['geometry', 'prev_damage', 'damage']]

In [None]:
# Same selection (prev_damage != -99 and damage > prev_damage), shown on an
# interactive map coloured by the `damage` column.
gdf[(gdf.prev_damage!=-99)&(gdf.damage>gdf.prev_damage)][['geometry', 'prev_damage', 'damage']].explore('damage')

In [None]:
from src.constants import PROCESSED_PATH

# Inspect the stacked time-series CSV of the '1x1' folder for UKR1, orbit 43.
folder = PROCESSED_PATH / 'stacked_ts' / '1x1'
import pandas as pd
pd.read_csv(folder / 'UKR1_orbit_43.csv')

In [None]:
from src.data import load_unosat_labels
from src.data.utils import aoi_orbit_iterator

# Count labels falling in the test AOIs versus all other AOIs, iterating over
# every (aoi, orbit) pair.
# NOTE(review): labels are filtered by AOI only, so an AOI yielded with several
# orbits is counted once per orbit — confirm this is intended.
labels = load_unosat_labels(labels_to_keep=[1,2,3])
aois_test = ["UKR6", "UKR7", "UKR8", "UKR10", "UKR12", "UKR14"]

count_train = 0
count_test = 0
for aoi, orbit in aoi_orbit_iterator():
    n = labels[labels.aoi==aoi].shape[0]
    if aoi in aois_test:
        count_test += n
    else:
        count_train += n
# Display both totals as the cell output.
count_test, count_train

In [None]:
# Cities of the test AOIs (`aoi_to_city` is imported in an earlier cell).
[aoi_to_city(aoi) for aoi in aois_test]

In [None]:
# Share of the test split in the per-(aoi, orbit) counts computed above.
count_test/(count_train+count_test)

In [None]:
# Share of the test split when every label is counted exactly once.
labels[labels.aoi.isin(aois_test)].shape[0]/len(labels)

In [None]:
# Number of labels located outside the test AOIs.
labels[~labels.aoi.isin(aois_test)].shape[0]

In [None]:
# NOTE(review): hard-coded numbers, presumably copied from earlier cell outputs —
# derive them from the variables above so the ratio stays in sync with the data.
5518/42435