In [1]:
import geopandas as gpd
import numpy as np
from pylr2 import regress2

import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [2]:
from src.data import spark_postgis
from src import constants

In [None]:
# Session handle used by every Spark cell below (spark.read / spark.sql).
# NOTE(review): presumably get_spark() returns a SparkSession with the spatial
# SQL functions (ST_GeomFromWKB, ST_Transform, ST_DISTANCE) registered, since
# later queries call them -- confirm in src.data.spark_postgis.
spark = spark_postgis.get_spark()

## A. Setup: Intact control

In [None]:
# Read the coincident shot pairs and swap the two WKB geometry columns for
# parsed geometries (t1_geom / t2_geom).
neighbors_path = constants.RESULTS_PATH / "gedi_neighbors_nau_test"
raw_pairs_sdf = spark.read.parquet(neighbors_path.as_posix())

# The SQL below reads from this temp view; the name is reused by a later cell.
raw_pairs_sdf.createOrReplaceTempView("shots_table")

sdf = spark.sql(
    "SELECT *, ST_GeomFromWKB(t1_geometry) AS t1_geom, ST_GeomFromWKB(t2_geometry) AS t2_geom FROM shots_table"
).drop("t1_geometry", "t2_geometry")

# Quick sanity check: number of pairs and a peek at the first rows.
print(sdf.count())
sdf.show(3)

In [None]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf, col

@udf(returnType=IntegerType())
def get_days(time_delta):
    """Return the whole-day component (`.days`) of a time difference.

    Spark hands SQL NULLs to Python UDFs as ``None``; in that case the result
    is NULL as well instead of failing the job with an AttributeError.
    """
    if time_delta is None:
        return None
    return time_delta.days


# time_diff = whole days between the first (t1) and second (t2) shot of a pair.
sdf = sdf.withColumn(
    "time_diff",
    get_days(col("t2_absolute_time") - col("t1_absolute_time")),
)

In [None]:
from pyspark.sql.functions import mean
# Fraction of pairs pulled to the driver for plotting, and the seed that makes
# that draw repeatable across runs.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 42

# Keep only pairs whose whole-day time difference is non-zero.
sdf_filtered = sdf.filter(sdf.time_diff != 0)

# Draw ONE seeded sample and slice it per metric. Sampling twice without a
# seed gave two unrelated row sets of different lengths, which are later
# index-aligned when the percent-difference table is built.
sample_df = (sdf_filtered
                .sample(withReplacement=False, fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED)
                .select("t1_agbd_a0", "t2_agbd_a0", "t1_rh_98_a0", "t2_rh_98_a0", "time_diff")
                .toPandas())
# .copy() so later cells can add columns without SettingWithCopyWarning.
agbd_df = sample_df[["t1_agbd_a0", "t2_agbd_a0", "time_diff"]].copy()
rh98_df = sample_df[["t1_rh_98_a0", "t2_rh_98_a0", "time_diff"]].copy()

# Summary statistics are computed on the full filtered set, not the sample.
n = sdf_filtered.count()
print(n)
corr_agbd = sdf_filtered.corr('t1_agbd_a0', 't2_agbd_a0')
corr_rh98 = sdf_filtered.corr('t1_rh_98_a0', 't2_rh_98_a0')

# Bias = mean(t2 - t1). Both means come from a single aggregation, and the
# columns are referenced by name so they resolve against sdf_filtered itself.
bias_row = sdf_filtered.agg(
    mean(col("t2_agbd_a0") - col("t1_agbd_a0")).alias("bias_agbd"),
    mean(col("t2_rh_98_a0") - col("t1_rh_98_a0")).alias("bias_rh98"),
).first()
bias_agbd = bias_row["bias_agbd"]
bias_rh98 = bias_row["bias_rh98"]
print(bias_agbd)
print(bias_rh98)


In [None]:
# Count the pairs whose two footprints lie closer than the threshold once both
# geometries are reprojected from EPSG:4326 into the per-row "proj" CRS.
# NOTE(review): the result is named sdf_30m but the threshold in the query is 5
# (presumably metres in the UTM projection) -- confirm which one is intended.
pairs_with_proj = sdf_filtered.withColumn(
    "proj", spark_postgis.get_utm_projection(col("t1_geom"))
)
# Replaces the "shots_table" view registered by the loading cell.
pairs_with_proj.createOrReplaceTempView("shots_table")

close_pairs_query = """
    SELECT * 
    FROM shots_table 
    WHERE ST_DISTANCE(
        ST_Transform(t1_geom, "epsg:4326", proj),
        ST_Transform(t2_geom, "epsg:4326", proj)) < 5"""
sdf_30m = spark.sql(close_pairs_query)
print(sdf_30m.count())

## B. Setup: Disturbance (RADD)

In [None]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf, col
degrade_sdf = spark.read.parquet((constants.RESULTS_PATH / "gedi_degradation_radd_singlelayer").as_posix())


# NOTE(review): get_days is also defined in sections A and C; consider moving
# it into a shared module and importing it once.
@udf(returnType=IntegerType())
def get_days(time_delta):
    """Return the whole-day component (`.days`) of a time difference.

    Spark hands SQL NULLs to Python UDFs as ``None``; in that case the result
    is NULL as well instead of failing the job with an AttributeError.
    """
    if time_delta is None:
        return None
    return time_delta.days


# time_diff = whole days between the first (t1) and second (t2) shot of a pair.
degrade_sdf = degrade_sdf.withColumn(
    "time_diff",
    get_days(col("t2_absolute_time") - col("t1_absolute_time")),
)

# Pull everything to the driver as a GeoDataFrame keyed on the t2 geometry.
radd_df = gpd.GeoDataFrame(degrade_sdf.toPandas(), geometry="t2_geom").copy()

# Assign groups. Order matters: "treatment" is written last so it wins.
radd_df.loc[radd_df.control_disturbance > 0, "sample_grp"] = "control"
# Note: points may have a control disturbance as well as a measured disturbance.
# In that case we include them in the treatment group; we don't care that they
# were also disturbed at another, unmeasured time.
radd_df.loc[radd_df.measured_disturbance > 0, "sample_grp"] = "treatment"

# Sizes: all pairs, treatment pairs, control pairs.
print(len(radd_df))
print(len(radd_df[radd_df["sample_grp"] == "treatment"]))
print(len(radd_df[radd_df["sample_grp"] == "control"]))

In [None]:
# Agreement statistics for the RADD control group: Pearson correlation between
# the t1 and t2 values and bias = mean(t2 - t1), for AGBD and for RH98.
control_df = radd_df.loc[radd_df["sample_grp"] == "control"]
control_n = len(control_df)

control_agbd_corr = control_df["t1_agbd_a0"].corr(control_df["t2_agbd_a0"])
control_agbd_bias = control_df["t2_agbd_a0"].sub(control_df["t1_agbd_a0"]).mean()

control_rh98_corr = control_df["t1_rh_98_a0"].corr(control_df["t2_rh_98_a0"])
control_rh98_bias = control_df["t2_rh_98_a0"].sub(control_df["t1_rh_98_a0"]).mean()

# The subset is only needed for the numbers above; drop the reference.
del control_df

## C. Setup: Disturbance (AFC)

In [None]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf, col
degrade_sdf = spark.read.parquet((constants.RESULTS_PATH / "gedi_degradation_afc_2x2_control").as_posix())


# NOTE(review): get_days is also defined in sections A and B; consider moving
# it into a shared module and importing it once.
@udf(returnType=IntegerType())
def get_days(time_delta):
    """Return the whole-day component (`.days`) of a time difference.

    Spark hands SQL NULLs to Python UDFs as ``None``; in that case the result
    is NULL as well instead of failing the job with an AttributeError.
    """
    if time_delta is None:
        return None
    return time_delta.days


# time_diff = whole days between the first (t1) and second (t2) shot of a pair.
degrade_sdf = degrade_sdf.withColumn(
    "time_diff",
    get_days(col("t2_absolute_time") - col("t1_absolute_time")),
)

# Pull everything to the driver as a GeoDataFrame keyed on the t2 geometry.
afc_df = gpd.GeoDataFrame(degrade_sdf.toPandas(), geometry="t2_geom").copy()

# Assign groups. Order matters: "treatment" is written last so it wins.
afc_df.loc[afc_df.control_disturbance > 0, "sample_grp"] = "control"
# Note: points may have a control disturbance as well as a measured disturbance.
# In that case we include them in the treatment group; we don't care that they
# were also disturbed at another, unmeasured time.
afc_df.loc[afc_df.measured_disturbance > 0, "sample_grp"] = "treatment"

# Sizes: all pairs, treatment pairs, control pairs.
print(len(afc_df))
print(len(afc_df[afc_df["sample_grp"] == "treatment"]))
print(len(afc_df[afc_df["sample_grp"] == "control"]))


## 1. Control groups

In [None]:
def _plot_agreement_panel(ax, xs, ys, axmax, gridsize, vmax, mincnt,
                          xlabel, ylabel, corr, bias):
    """Draw one t1-vs-t2 agreement panel and return the hexbin artist.

    The panel contains a hexbin density of (xs, ys), the dashed 1:1 line, the
    fitted "reduced major axis" regression line (green) and a text box with
    the supplied correlation and bias plus the fitted intercept and slope.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    xs, ys : paired first / second measurements.
    axmax : upper limit of both axes (lower limit is 0).
    gridsize, vmax, mincnt : forwarded to ``Axes.hexbin``.
    xlabel, ylabel : axis labels.
    corr, bias : pre-computed statistics shown in the text box.
    """
    hexes = ax.hexbin(xs, ys, gridsize=gridsize, cmap='magma',
                      vmin=0, vmax=vmax, mincnt=mincnt)
    ax.axis([0, axmax, 0, axmax])
    # 1:1 reference line.
    ax.plot([0, axmax], [0, axmax], color='black', linestyle='dashed', linewidth=3)

    # Fitted line, evaluated over the visible x range.
    res = regress2(xs, ys, _method_type_2="reduced major axis")
    line_xs = np.arange(axmax)
    ax.plot(line_xs, res["intercept"] + res["slope"] * line_xs, color='green', linewidth=3)

    ax.set_xlabel(xlabel, fontsize=22)
    ax.set_ylabel(ylabel, fontsize=22)
    textstr = f"Correlation: {corr:.2f}\nBias: {bias:.2f}\nSMA: y = {res['intercept']:.2f} + {res['slope']:.2f}x"
    ax.text(0.05, 0.95, textstr, transform=ax.transAxes, fontsize=22,
            verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.7))
    return hexes


# Axis limits shared by the AGBD and RH98 columns.
AGBD_AXMAX = 500
RH98_AXMAX = 50
# Colour-scale ceiling and minimum bin count for the two rows.
ALL_PAIRS_VMAX, ALL_PAIRS_MINCNT = 1200, 100
CONTROL_VMAX, CONTROL_MINCNT = 80, 4

plt.rcParams['xtick.labelsize'] = 20
plt.rcParams['ytick.labelsize'] = 20
fig = plt.figure(layout='constrained', figsize=(20, 20))
subfigs = fig.subfigures(2, 1, hspace=0.05, height_ratios=[0.85, 1])
axs_top = subfigs[0].subplots(1, 2)
axs_bottom = subfigs[1].subplots(1, 2)

# --- Top row: all coincident footprints -------------------------------------
# The hexbins and fitted lines use the sampled frames (agbd_df / rh98_df); the
# correlation, bias and n shown were computed on the full filtered set.
_plot_agreement_panel(
    axs_top[0], agbd_df.t1_agbd_a0, agbd_df.t2_agbd_a0,
    axmax=AGBD_AXMAX, gridsize=300, vmax=ALL_PAIRS_VMAX, mincnt=ALL_PAIRS_MINCNT,
    xlabel="AGBD 1 (Mg/ha)", ylabel="AGBD 2 (Mg/ha)",
    corr=corr_agbd, bias=bias_agbd)
_plot_agreement_panel(
    axs_top[1], rh98_df.t1_rh_98_a0, rh98_df.t2_rh_98_a0,
    axmax=RH98_AXMAX, gridsize=100, vmax=ALL_PAIRS_VMAX, mincnt=ALL_PAIRS_MINCNT,
    xlabel="RH 98 1 (m)", ylabel="RH 98 2 (m)",
    corr=corr_rh98, bias=bias_rh98)
subfigs[0].suptitle(f"All coincident footprints ({n:,} shot pairs)", fontsize=24)

# --- Bottom row: RADD control group -----------------------------------------
radd_control = radd_df[radd_df["sample_grp"] == "control"]
_plot_agreement_panel(
    axs_bottom[0], radd_control.t1_agbd_a0, radd_control.t2_agbd_a0,
    axmax=AGBD_AXMAX, gridsize=175, vmax=CONTROL_VMAX, mincnt=CONTROL_MINCNT,
    xlabel="AGBD 1 (Mg/ha)", ylabel="AGBD 2 (Mg/ha)",
    corr=control_agbd_corr, bias=control_agbd_bias)
imi = _plot_agreement_panel(
    axs_bottom[1], radd_control.t1_rh_98_a0, radd_control.t2_rh_98_a0,
    axmax=RH98_AXMAX, gridsize=100, vmax=CONTROL_VMAX, mincnt=CONTROL_MINCNT,
    xlabel="RH 98 1 (m)", ylabel="RH 98 2 (m)",
    corr=control_rh98_corr, bias=control_rh98_bias)

# One qualitative colour bar for the bottom row, built from its last hexbin.
cb = fig.colorbar(imi, ax=axs_bottom.ravel().tolist(), orientation='horizontal',
                  ticks=[0, CONTROL_VMAX], shrink=0.5)
cb.ax.set_xticklabels(['Few pairs', 'Many pairs'])
subfigs[1].suptitle(f"Disturbed forest control ({control_n:,} shot pairs)", fontsize=24)

In [None]:
def _percent_change(before, after):
    """Percent change from `before` to `after`.

    0.01 is added to the denominator, which keeps it non-zero when `before`
    is exactly 0.
    """
    return (after - before) / (before + 0.01) * 100


# Sampled "all pairs" frames carry one metric each.
agbd_df["pct_diff"] = _percent_change(agbd_df.t1_agbd_a0, agbd_df.t2_agbd_a0)
rh98_df["pct_diff"] = _percent_change(rh98_df.t1_rh_98_a0, rh98_df.t2_rh_98_a0)

# Disturbance frames carry both metrics.
for pairs_df in (radd_df, afc_df):
    pairs_df["pct_diff_agbd"] = _percent_change(pairs_df.t1_agbd_a0, pairs_df.t2_agbd_a0)
    pairs_df["pct_diff_rh98"] = _percent_change(pairs_df.t1_rh_98_a0, pairs_df.t2_rh_98_a0)

import pandas as pd
# Wide table of percent differences (one column per metric) tagged with the
# group each row belongs to; the three groups are stacked into big_df.
radd_control_rows = radd_df[radd_df.sample_grp == "control"]
afc_control_rows = afc_df[afc_df.sample_grp == "control"]

intact_df = pd.DataFrame({
    "AGBD": agbd_df.pct_diff,
    "RH 98": rh98_df.pct_diff,
    "Group": "All pairs",
})
radd_pct_df = pd.DataFrame({
    "AGBD": radd_control_rows.pct_diff_agbd,
    "RH 98": radd_control_rows.pct_diff_rh98,
    "Group": "Disturbed forest\n(RADD)",
})
afc_pct_df = pd.DataFrame({
    "AGBD": afc_control_rows.pct_diff_agbd,
    "RH 98": afc_control_rows.pct_diff_rh98,
    "Group": "Disturbed forest\n(AFC)",
})
big_df = pd.concat([intact_df, radd_pct_df, afc_pct_df])

# Row counts of the inputs, then of the combined table and each of its groups.
for row_count in (len(agbd_df), len(rh98_df), len(radd_control_rows),
                  len(afc_control_rows), len(big_df)):
    print(row_count)
for group_label in ("All pairs", "Disturbed forest\n(AFC)", "Disturbed forest\n(RADD)"):
    print(len(big_df[big_df["Group"] == group_label]))

In [None]:
import seaborn as sns

# Long form: one row per (Group, Metric, value) for seaborn.
dd = pd.melt(big_df, id_vars=['Group'], value_vars=['AGBD', 'RH 98'], var_name='Metric')

# Pin the category orders explicitly (order of appearance, which is what the
# plot used implicitly) so the annotation positions below are derived from the
# same lists the boxes are drawn from.
group_order = dd['Group'].unique().tolist()
metric_order = ['AGBD', 'RH 98']
# Horizontal offset of each metric's box from its group's tick position.
hue_offsets = {'AGBD': -0.2, 'RH 98': 0.2}

fig, axs = plt.subplots(1, 1, figsize=(8, 8))
sns.boxplot(x='Group', y='value', data=dd, hue='Metric',
            order=group_order, hue_order=metric_order,
            showfliers=False, ax=axs)

iqrs = dd.groupby(['Group', 'Metric']).describe()['value'][['25%', '75%']]
print(iqrs)

# Label the lower and upper quartile of every box with its value.
for (group, metric), quartiles in iqrs.iterrows():
    x_pos = group_order.index(group) + hue_offsets[metric]
    for q_label in ('25%', '75%'):
        q_value = quartiles[q_label]  # label-based; positional [] is deprecated
        axs.text(x_pos, q_value, str(round(q_value, 1)), color='white',
                 bbox=dict(facecolor='black', alpha=0.5, edgecolor='black'), ha='center')

# Dashed zero line across all groups.
axs.plot([-0.5, len(group_order) - 0.5], [0, 0], color='black', linestyle='dashed', alpha=0.5, linewidth=3)
# Resize the tick labels without re-setting their text.
axs.tick_params(axis='x', labelsize=14)
axs.set_ylabel("Percent difference", fontsize=16)
axs.set_xlabel("Control group", fontsize=16)
plt.setp(axs.get_legend().get_texts(), fontsize='16') # for legend text
plt.setp(axs.get_legend().get_title(), fontsize='16') # for legend title