In [None]:
# This file contains the code necessary for the EDA

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import font_manager
from matplotlib.font_manager import FontProperties
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
from pyrealm import pmodel
from scipy import stats
from sklearn.covariance import EllipticEnvelope
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.seasonal import seasonal_decompose



In [None]:
### import the df from file 8.Data_imputation

df=pd.read_csv('/Users/abigailbase/PROJECT FILES/FINAL DFs/FINAL_FINAL_DF.csv',index_col=0)

In [None]:
df=df.drop(columns='NEE_VUT_REF') #drop NEE as not needed

In [None]:
df=df.reset_index() #reset index

In [None]:
df.shape #27210

In [None]:
#import sif data from file 7.SIF

sif=pd.read_csv('/Users/abigailbase/PROJECT FILES/SIF/sif_df.csv',index_col=0)

In [None]:
sif_rename={'Delta_Time':'date','site_id':'SITE_ID'} #dictionary for renaming

In [None]:
sif.rename(columns=sif_rename,inplace=True) #rename for easier merging 

In [None]:
# check the data types of date col to make sure they are the same before setting index

print(df['date'].dtypes)
print(sif['date'].dtypes)

In [None]:
#set index of df and sif to be 'date' and 'SITE_ID' for merging 

df=df.set_index(['date','SITE_ID']) 
sif=sif.set_index(['date','SITE_ID'])


In [None]:
sif.shape #27210,5

In [None]:
df.shape #27210,18 - same row dimensions, ready to merge

In [None]:
# import NDVI from 5.HDF process

In [None]:
ndvi=pd.read_csv('/Users/abigailbase/PROJECT FILES/FINAL DFs/NDVI.csv',index_col=0)

In [None]:
# scale for NDVI is 0.0001

In [None]:
ndvi.reset_index(inplace=True)

In [None]:
ndvi_rename={'index':'date','site_id':'SITE_ID'}

In [None]:
ndvi.rename(columns=ndvi_rename,inplace=True) #rename for easier merging 

In [None]:
# check datatype of date

print(ndvi['date'].dtypes)

In [None]:
ndvi=ndvi.set_index(['date','SITE_ID'])


In [None]:
ndvi['NDVI_point']=ndvi['NDVI_point']*0.0001

In [None]:
### Join the flux frame and the SIF frame on their shared (date, SITE_ID) index.
### merge() defaults to an inner join, so only keys present in both are kept.

df_merged=df.merge(sif,left_index=True,right_index=True)

In [None]:
# Add the NDVI columns to the merged frame, again matching on the shared index.

df_merged = df_merged.merge(ndvi, left_index=True, right_index=True)


In [None]:
df_merged.isna().sum() #FULL DF, missing values are 5442 for gpp

In [None]:
### There are 5442 missing GPP values. As previously established, these are the AR-Vir, RU-Cok and ZA-Kru GPP
### values, which will be calculated using the P-Model to maintain the temporal resolution of the full dataset.

In [None]:
sif_uncertainty=df_merged[['Daily_Averaged_SIF','SIF_Uncertainty','Cloud_Fraction']]

#will be used for further analysis later.


In [None]:
sif_drop=['SIF_Uncertainty','Cloud_Fraction'] #drop these from the analysis for now 

In [None]:
df_merged=df_merged.drop(columns=sif_drop)

In [None]:
### drop the additional latitude and longitude

df_merged=df_merged.drop(columns=['Latitude','Longitude'])

In [None]:
# fAPAR calculation


# fAPAR = 1.24 * NDVI - 0.168, limited to the interval [0, 1]

df_merged['fapar'] = df_merged['NDVI_point'].mul(1.24).sub(0.168).clip(lower=0, upper=1)


In [None]:
min_value = df_merged['fapar'].min() 
max_value = df_merged['fapar'].max()

print(f"Minimum value of fAPAR: {min_value}") #0
print(f"Maximum value of fAPAR: {max_value}") #1

In [None]:
heatmap_df=df_merged.drop(columns=['IGBP','hemisphere',
                                   'DAY','MONTH','YEAR',
                                   'LAT','LONG',
                                   'NDVI_point','NDVI'])

In [None]:
heatmap_labs = {
    'TA_F': 'Temperature (°C)',
    'PA_F': 'Surface Pressure (kPa)',
    'VPD_F': 'VPD (hPa)',
    'P_F': 'Precipitation (mm)',
    'WS_F': 'WS (m s$^{-1}$)',
    'PPFD_IN': 'PPFD (µmol Photon m$^{-2}$ s$^{-1}$)',
    'CO2_F_MDS': 'CO$_2$ MF (µmol CO$_2$ mol$^{-1}$)',
    'TS_F_MDS_1': 'Soil Temperature (°C)',
    'SWC_F_MDS_1': 'SWC (%)',
    'GPP_DT_VUT_REF': 'GPP (µmol CO$_2$ m$^{-2}$ s$^{-1}$)',
    'Daily_Averaged_SIF': 'SIF (mW m$^{-2}$ nm$^{-1}$ sr$^{-1}$)',
    'fapar': 'fAPAR',
    'NIRv': 'NIRv'
}


In [None]:
### Clustered correlation heatmap

# Pairwise correlations of the heatmap columns, with rows and columns
# relabelled using the display names in heatmap_labs
corr_matrix = heatmap_df.corr().rename(columns=heatmap_labs, index=heatmap_labs)

# Annotated clustermap of the relabelled correlation matrix
sns.clustermap(
    corr_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    linewidths=0.5,
    dendrogram_ratio=(0.1, 0.2),
    figsize=(12, 8),
)

plt.show()

In [None]:
df_merged=df_merged.reset_index() #reset index to extract site_id and date

In [None]:
sites=df_merged['SITE_ID'].unique() #unique site ids

In [None]:
# Separate the merged frame into one DataFrame per site ID.
# Each selection is copied so that the column assignments made to some of
# these frames further down (GPP gap-filling) act on an independent frame
# rather than on a slice of df_merged, which would trigger pandas'
# SettingWithCopyWarning.

AR_Vir=df_merged[df_merged['SITE_ID']=='AR-Vir'].copy()#1
AU_Dry=df_merged[df_merged['SITE_ID']=='AU-Dry'].copy()#2
BE_Vie=df_merged[df_merged['SITE_ID']=='BE-Vie'].copy()#3
CA_TP1=df_merged[df_merged['SITE_ID']=='CA-TP1'].copy()#4
CH_Cha=df_merged[df_merged['SITE_ID']=='CH-Cha'].copy()#5
DE_Gri=df_merged[df_merged['SITE_ID']=='DE-Gri'].copy()#6
FR_Pue=df_merged[df_merged['SITE_ID']=='FR-Pue'].copy()#7
GF_Guy=df_merged[df_merged['SITE_ID']=='GF-Guy'].copy()#8
IT_Col=df_merged[df_merged['SITE_ID']=='IT-Col'].copy()#9
NL_Loo=df_merged[df_merged['SITE_ID']=='NL-Loo'].copy()#10
RU_Cok=df_merged[df_merged['SITE_ID']=='RU-Cok'].copy()#11
RU_Fyo=df_merged[df_merged['SITE_ID']=='RU-Fyo'].copy()#12
US_PFa=df_merged[df_merged['SITE_ID']=='US-PFa'].copy()#13
US_Var=df_merged[df_merged['SITE_ID']=='US-Var'].copy()#14
ZA_Kru=df_merged[df_merged['SITE_ID']=='ZA-Kru'].copy()#15


In [None]:
### FLUXNET GPP were reintroduced as they seemed to have been lost for some sites during processing

### RU_Cok

ru_gpp=pd.read_csv('/Users/abigailbase/PROJECT FILES/FINAL DFs/additional gpps/ru.csv',index_col=0)

In [None]:
gpp_rename={'TIMESTAMP':'date'}

In [None]:
ru_gpp.rename(columns=gpp_rename,inplace=True) 

In [None]:
ru_gpp['date'].dtypes

In [None]:
ru_gpp.set_index('date',inplace=True)

In [None]:
RU_Cok.set_index('date',inplace=True)

In [None]:
# combine

RU_Cok['GPP_DT_VUT_REF'] = RU_Cok['GPP_DT_VUT_REF'].combine_first(ru_gpp['GPP_DT_VUT_REF'])


In [None]:
RU_Cok=RU_Cok.reset_index()

In [None]:
RU_Cok.isna().sum() #353

In [None]:
ru_na=(353/1814)*100
ru_na #19.25

In [None]:
ru_na_df=RU_Cok[RU_Cok['GPP_DT_VUT_REF'].isna()]

In [None]:
### AR_Vir

In [None]:
ar_gpp=pd.read_csv('/Users/abigailbase/PROJECT FILES/FINAL DFs/additional gpps/ar.csv')

In [None]:
ar_gpp.rename(columns=gpp_rename,inplace=True) 

In [None]:
ar_gpp['date'].dtypes

In [None]:
ar_gpp.set_index('date',inplace=True)

In [None]:
AR_Vir.set_index('date',inplace=True)

In [None]:
AR_Vir['GPP_DT_VUT_REF'] = AR_Vir['GPP_DT_VUT_REF'].combine_first(ar_gpp['GPP_DT_VUT_REF'])


In [None]:
AR_Vir.isna().sum() #718

In [None]:
# DROP MISSING

AR_Vir_cleaned = AR_Vir.dropna()


In [None]:
AR_Vir_cleaned.isna().sum()

In [None]:
AR_Vir_cleaned=AR_Vir_cleaned.reset_index()

In [None]:
ar_na=(718/1814)*100
ar_na #39.58

In [None]:
#ZA_Kru

za_gpp=pd.read_csv('/Users/abigailbase/PROJECT FILES/FINAL DFs/additional gpps/za.csv')

In [None]:
za_gpp.rename(columns=gpp_rename,inplace=True) 

In [None]:
za_gpp.set_index('date',inplace=True)

In [None]:
ZA_Kru.set_index('date',inplace=True)

In [None]:
ZA_Kru['GPP_DT_VUT_REF'] = ZA_Kru['GPP_DT_VUT_REF'].combine_first(za_gpp['GPP_DT_VUT_REF'])


In [None]:
ZA_Kru.isna().sum() #353

In [None]:
ZA_Kru.shape

In [None]:
za_na=(353/1814)*100
za_na #19%

In [None]:
ZA_Kru=ZA_Kru.reset_index()

In [None]:
# AR-Vir rows that still have no GPP after the gap-filling above, selected the
# same way as missing_RU / missing_ZA further down.
# The original line filled AR_Vir from a `missing_AR` frame that is not defined
# anywhere in the visible notebook, so it raised NameError on a fresh kernel.
# NOTE(review): if `missing_AR` was meant to hold P-Model GPP estimates from a
# removed cell, restore that cell instead. The combined dataset below is built
# from AR_Vir_cleaned, not AR_Vir, so this change does not alter it.
missing_AR = AR_Vir[AR_Vir['GPP_DT_VUT_REF'].isnull()]


In [None]:
AR_Vir.isna().sum()

In [None]:
ZA_Kru.isna().sum()

In [None]:
# RU_Cok

missing_RU = RU_Cok[RU_Cok['GPP_DT_VUT_REF'].isnull()]

In [None]:
# DROP MISSING

RU_Cok_cleaned = RU_Cok.dropna()


In [None]:
RU_Cok_cleaned.isna().sum()

In [None]:
# ZA-Kru


In [None]:
missing_ZA = ZA_Kru[ZA_Kru['GPP_DT_VUT_REF'].isnull()]

In [None]:
# DROP MISSING

ZA_Kru_cleaned = ZA_Kru.dropna()

In [None]:
ZA_Kru_cleaned.isna().sum()

In [None]:
### combine all the individual sites 

gpp_imputed=pd.concat([AR_Vir_cleaned,AU_Dry,BE_Vie,CA_TP1,CH_Cha,DE_Gri,FR_Pue,GF_Guy,IT_Col,NL_Loo,
                   RU_Cok_cleaned,RU_Fyo,US_PFa,US_Var,ZA_Kru_cleaned])

In [None]:
gpp_imputed.shape 

In [None]:
gpp_imputed.isna().sum() #now no missing values

In [None]:
gpp_imputed.shape

In [None]:
### create a copy of the gpp df ###

eda_df=gpp_imputed.copy()

In [None]:
eda_df.shape #27210

In [None]:
### OUTLIERS ###

In [None]:
# pair plot

In [None]:
var_ax_labs={'SITE_ID':'Site ID','TA_F':'Temperature (°C)','PA_F':'Surface Pressure (kPa)',
             'VPD_F':'Vapor Pressure Deficit (hPa)','P_F':'Precipitation (mm)',
            'WS_F':'Wind Speed (ms-1)','PPFD_IN':'PPFD (µmolPhoton m-2 s-1)',
            'CO2_F_MDS':'CO2 Mole Fraction (µmolCO2 mol-1)','TS_F_MDS_1':'Soil Temperature (°C)',
            'SWC_F_MDS_1':'Soil Water Content (%)',
            'GPP_DT_VUT_REF':'GPP (µmolCO2 m-2 s-1)'}


In [None]:
meteorological=eda_df[['TA_F','PA_F','VPD_F','P_F','WS_F','TS_F_MDS_1','SWC_F_MDS_1','CO2_F_MDS']]

In [None]:
meteorological.shape

In [None]:
radiometric=eda_df[['NIRv','PPFD_IN','Daily_Averaged_SIF','fapar']]

In [None]:
target=eda_df[['GPP_DT_VUT_REF']]

In [None]:
colors = sns.color_palette("husl", len(meteorological.columns))

# box plots for each variable 
plt.figure(figsize=(15, 10))
for i, (col, color) in enumerate(zip(meteorological.columns, colors), 1):
    plt.subplot(3, 3, i)
    sns.boxplot(y=meteorological[col], width=0.3, color=color, linewidth=1.5, notch=True, showfliers=True,
                flierprops=dict(marker='o', color='red', markersize=5))  # Adjust the width and set the color here
    plt.title(var_ax_labs[col], fontsize=14, fontweight='bold')  # Use the dictionary to set labels
    plt.ylabel(var_ax_labs[col], fontsize=12)  # Set y-axis label
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(15, 10))

for i, (col, color) in enumerate(zip(meteorological.columns, colors), 1):
    plt.subplot(3, 3, i)
    
    # plot KDE for each variable
    sns.kdeplot(data=meteorological[col], color=color, linewidth=2, fill=True)
    
    plt.title(var_ax_labs[col], fontsize=14, fontweight='bold')
    plt.xlabel(var_ax_labs[col], fontsize=12)
    plt.ylabel('Density', fontsize=12)
    
plt.tight_layout()
plt.show()



In [None]:
# identify outliers based on iqr

In [None]:
# Temperature, pressure, soil temperature and SWC: outliers in these columns
# are set to NaN in the following cells and then filled by KNN imputation.
# .copy() makes this an independent frame, so those .loc assignments modify
# knn_eda_df itself instead of a slice of eda_df (SettingWithCopyWarning).

knn_eda_df=eda_df[['TA_F','PA_F','TS_F_MDS_1','SWC_F_MDS_1']].copy()

In [None]:
knn_eda_df.shape #25786

In [None]:
# plot of kde for all variables to compare 

plt.figure(figsize=(10, 6))

sns.kdeplot(knn_eda_df['TA_F'], label='TA_F', fill=True)
sns.kdeplot(knn_eda_df['PA_F'], label='PA_F', fill=True)
sns.kdeplot(knn_eda_df['TS_F_MDS_1'], label='TS_F_MDS_1', fill=True)
sns.kdeplot(knn_eda_df['SWC_F_MDS_1'], label='SWC_F_MDS_1', fill=True)

plt.xlabel('Value', fontsize=12)
plt.ylabel('Density', fontsize=12)

plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# temperature: mask IQR outliers

# first and third quartile, and the inter-quartile range between them
Q1, Q3 = knn_eda_df['TA_F'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# outlier thresholds: 1.5 * IQR below Q1 / above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# values outside the thresholds become NaN
is_outlier = knn_eda_df['TA_F'].lt(lower_bound) | knn_eda_df['TA_F'].gt(upper_bound)
knn_eda_df.loc[is_outlier, 'TA_F'] = np.nan


In [None]:
knn_eda_df.isna().sum() #579 outliers

In [None]:
### surface pressure: mask IQR outliers

# first and third quartile, and the inter-quartile range between them
Q1, Q3 = knn_eda_df['PA_F'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# outlier thresholds: 1.5 * IQR below Q1 / above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# values outside the thresholds become NaN
is_outlier = knn_eda_df['PA_F'].lt(lower_bound) | knn_eda_df['PA_F'].gt(upper_bound)
knn_eda_df.loc[is_outlier, 'PA_F'] = np.nan



In [None]:
knn_eda_df.isna().sum() #1857 outliers

In [None]:
### soil temperature: mask IQR outliers

# first and third quartile, and the inter-quartile range between them
Q1, Q3 = knn_eda_df['TS_F_MDS_1'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# outlier thresholds: 1.5 * IQR below Q1 / above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# values outside the thresholds become NaN
is_outlier = knn_eda_df['TS_F_MDS_1'].lt(lower_bound) | knn_eda_df['TS_F_MDS_1'].gt(upper_bound)
knn_eda_df.loc[is_outlier, 'TS_F_MDS_1'] = np.nan


In [None]:
knn_eda_df.isna().sum() #562

In [None]:
# soil water content: mask IQR outliers

# first and third quartile, and the inter-quartile range between them
Q1, Q3 = knn_eda_df['SWC_F_MDS_1'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# outlier thresholds: 1.5 * IQR below Q1 / above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# values outside the thresholds become NaN
is_outlier = knn_eda_df['SWC_F_MDS_1'].lt(lower_bound) | knn_eda_df['SWC_F_MDS_1'].gt(upper_bound)
knn_eda_df.loc[is_outlier, 'SWC_F_MDS_1'] = np.nan



In [None]:
knn_eda_df.isna().sum() #11

In [None]:
### apply KNN imputer algorithm

In [None]:
knn_imputer = KNNImputer(n_neighbors=5)

In [None]:
# Fill the NaN-masked outliers with the KNN imputer created above
# (`knn_imputer`; the original referenced an undefined name `imputer`).
# fit_transform returns a plain array, so the column names are re-attached;
# the new frame gets a fresh 0..n-1 index.
knn_imputed_df=pd.DataFrame(knn_imputer.fit_transform(knn_eda_df), columns=knn_eda_df.columns)

In [None]:
knn_imputed_df.isna().sum() #all zero imputation successful 

In [None]:
# KDE of the four KNN columns on one axis, for comparison with the same plot
# drawn before the outliers were masked.
# NOTE(review): this plots knn_eda_df (outliers masked to NaN), not
# knn_imputed_df — confirm whether the imputed values were meant to be shown.

plt.figure(figsize=(10, 6))

sns.kdeplot(knn_eda_df['TA_F'], label='TA_F', fill=True)
sns.kdeplot(knn_eda_df['PA_F'], label='PA_F', fill=True)
sns.kdeplot(knn_eda_df['TS_F_MDS_1'], label='TS_F_MDS_1', fill=True)
sns.kdeplot(knn_eda_df['SWC_F_MDS_1'], label='SWC_F_MDS_1', fill=True)

plt.xlabel('Value', fontsize=12)
plt.ylabel('Density', fontsize=12)

plt.legend()

plt.tight_layout()
plt.show()

In [None]:
### create a copy of the eda df

post_knn=eda_df.copy()

In [None]:
### drop the columns from post_knn that have had outliers removed

post_knn=post_knn.drop(columns=['TA_F','PA_F','TS_F_MDS_1','SWC_F_MDS_1'])

In [None]:
# reset index of both dfs to prepare for merging

post_knn= post_knn.reset_index(drop=True)
knn_imputed_df= knn_imputed_df.reset_index(drop=True)

In [None]:
# concat the post_knn (with vars removed) and knn_imputed_df with the outliers removed

post_knn=pd.concat([post_knn,knn_imputed_df],axis=1)

In [None]:
post_knn.shape #check shape = 25789

In [None]:
# log transform VPD and precipitation 

In [None]:
post_knn['VPD_log']=np.log(post_knn['VPD_F']+1) #add 1 to avoid log(0) error

In [None]:
post_knn=post_knn.drop(columns=['VPD_F']) #drop non-log col

In [None]:
post_knn['PPT_log']=np.log(post_knn['P_F']+1) #add 1 to avoid log(0) error

In [None]:
post_knn=post_knn.drop(columns=['P_F']) #drop non-log col

In [None]:
post_knn['date']=pd.to_datetime(post_knn['date'])

In [None]:
post_knn.dtypes

In [None]:
# Wind speed: mask IQR outliers and fill them with the median value

# median of the column, taken before the outliers are masked
median_ws=post_knn['WS_F'].median()

# calculate Q1 and Q3
Q1 = post_knn['WS_F'].quantile(0.25)
Q3 = post_knn['WS_F'].quantile(0.75)

# calculate IQR 
IQR = Q3 - Q1

# outlier thresholds 
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# replace values outside the lower and upper bounds with NaN
post_knn.loc[(post_knn['WS_F'] < lower_bound) | (post_knn['WS_F'] > upper_bound), 'WS_F'] = np.nan

# Fill the NaN values with the median. The result is assigned back to the
# column: calling fillna(inplace=True) on a column selection is chained
# assignment, which pandas warns about and which does not update post_knn
# under copy-on-write.
post_knn['WS_F'] = post_knn['WS_F'].fillna(median_ws)



In [None]:
# CO2: mask IQR outliers and fill them with the mean value

# mean of the column, taken before the outliers are masked
mean_co2=post_knn['CO2_F_MDS'].mean()

# calculate Q1 and Q3
Q1 = post_knn['CO2_F_MDS'].quantile(0.25)
Q3 = post_knn['CO2_F_MDS'].quantile(0.75)

# calculate IQR 
IQR = Q3 - Q1

# outlier thresholds 
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# replace values outside the lower and upper bounds with NaN
post_knn.loc[(post_knn['CO2_F_MDS'] < lower_bound) | (post_knn['CO2_F_MDS'] > upper_bound), 'CO2_F_MDS'] = np.nan

# Fill the NaN values with the mean. The result is assigned back to the
# column: calling fillna(inplace=True) on a column selection is chained
# assignment, which pandas warns about and which does not update post_knn
# under copy-on-write.
post_knn['CO2_F_MDS'] = post_knn['CO2_F_MDS'].fillna(mean_co2)


In [None]:
###################################################################

In [None]:
# Radiometric (light-based) outliers 

In [None]:
# Light-based variables plus the (date, SITE_ID) keys.
# .copy() makes this an independent frame: the date conversion and the
# outlier masking in the following cells assign into it, which on a plain
# slice of eda_df would trigger SettingWithCopyWarning.
radiometric_df=eda_df[['date','SITE_ID','NIRv','PPFD_IN','Daily_Averaged_SIF','fapar']].copy()

In [None]:
radiometric_df['date']=pd.to_datetime(radiometric_df['date'])

In [None]:
radiometric_df.shape #25786

In [None]:
radiometric_df = radiometric_df.sort_values(by='date')


In [None]:
radiometric_df_box=radiometric_df[['NIRv','PPFD_IN','Daily_Averaged_SIF','fapar']]

In [None]:
radiometric_labs={'NIRv':'NIRv','PPFD_IN':'PPFD','Daily_Averaged_SIF':'SIF','fapar':'fAPAR'}

In [None]:
### box plots for radiometric

In [None]:
colors = sns.color_palette("husl", len(radiometric_df_box.columns))

# Create box plots for each variable with different colors and various customizations
plt.figure(figsize=(15, 10))
for i, (col, color) in enumerate(zip(radiometric_df_box.columns, colors), 1):
    plt.subplot(3, 3, i)
    sns.boxplot(y=radiometric_df[col], width=0.3, color=color, linewidth=1.5, notch=True, showfliers=True,
                flierprops=dict(marker='o', color='red', markersize=5))  # Adjust the width and set the color here
    plt.title(radiometric_labs[col], fontsize=14, fontweight='bold')  # Use the dictionary to set labels
    plt.ylabel(radiometric_labs[col], fontsize=12)  # Set y-axis label
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(15, 10))

# Iterate over each column and color
for i, (col, color) in enumerate(zip(radiometric_df_box.columns, colors), 1):
    plt.subplot(3, 3, i)
    
    # Plot KDE for each variable
    sns.kdeplot(data=radiometric_df_box[col], color=color, linewidth=2, fill=True)
    
    # Set the title and labels using the dictionary
    plt.title(radiometric_labs[col], fontsize=14, fontweight='bold')
    plt.xlabel(radiometric_labs[col], fontsize=12)
    plt.ylabel('Density', fontsize=12)
    
plt.tight_layout()
plt.show()



In [None]:
## kNN outlier imputation on NIRv and PPFD

In [None]:
radiometric_df.dtypes

In [None]:
# NIRv: mask IQR outliers

# first and third quartile, and the inter-quartile range between them
Q1, Q3 = radiometric_df['NIRv'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# outlier thresholds: 1.5 * IQR below Q1 / above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# values outside the thresholds become NaN
is_outlier = radiometric_df['NIRv'].lt(lower_bound) | radiometric_df['NIRv'].gt(upper_bound)
radiometric_df.loc[is_outlier, 'NIRv'] = np.nan




In [None]:
radiometric_df.isna().sum() #580

In [None]:
# PPFD: mask IQR outliers

# first and third quartile, and the inter-quartile range between them
Q1, Q3 = radiometric_df['PPFD_IN'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# outlier thresholds: 1.5 * IQR below Q1 / above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# values outside the thresholds become NaN
is_outlier = radiometric_df['PPFD_IN'].lt(lower_bound) | radiometric_df['PPFD_IN'].gt(upper_bound)
radiometric_df.loc[is_outlier, 'PPFD_IN'] = np.nan



In [None]:
radiometric_df.isna().sum() #45

In [None]:
# SIF: mask IQR outliers

# first and third quartile, and the inter-quartile range between them
Q1, Q3 = radiometric_df['Daily_Averaged_SIF'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# outlier thresholds: 1.5 * IQR below Q1 / above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# values outside the thresholds become NaN
is_outlier = radiometric_df['Daily_Averaged_SIF'].lt(lower_bound) | radiometric_df['Daily_Averaged_SIF'].gt(upper_bound)
radiometric_df.loc[is_outlier, 'Daily_Averaged_SIF'] = np.nan


In [None]:
radiometric_df.isna().sum() #1505

In [None]:
# fAPAR: mask IQR outliers

# first and third quartile, and the inter-quartile range between them
Q1, Q3 = radiometric_df['fapar'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# outlier thresholds: 1.5 * IQR below Q1 / above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# values outside the thresholds become NaN
is_outlier = radiometric_df['fapar'].lt(lower_bound) | radiometric_df['fapar'].gt(upper_bound)
radiometric_df.loc[is_outlier, 'fapar'] = np.nan


In [None]:
# apply knn imputation 

In [None]:
radiometric_df.set_index(['date','SITE_ID'],inplace=True)

In [None]:
# Fill the NaN-masked radiometric outliers with the KNN imputer created
# earlier (`knn_imputer`; the original referenced an undefined name `imputer`).
# The imputer is re-fitted on the radiometric columns here. The result is a
# plain array, so the column names are re-attached and the index is 0..n-1.
radiometric_imputed=pd.DataFrame(knn_imputer.fit_transform(radiometric_df), columns=radiometric_df.columns)

In [None]:
radiometric_imputed.isna().sum() #no NA values so was successful 

In [None]:
radiometric_imputed_rename={'NIRv':'NIRv_imp','PPFD_IN':'PPFD_imp'}

In [None]:
radiometric_imputed.rename(columns=radiometric_imputed_rename,inplace=True)

In [None]:
#replace with knn imputed values
radiometric_df.loc[:, :] = radiometric_imputed.values


In [None]:
### Merge post_knn (meteorological) with the radiometric frame for
### local outlier analysis; inner join on the site and date keys

local_eda = post_knn.merge(radiometric_df, how='inner', on=['SITE_ID', 'date'])


In [None]:
local_eda.isna().sum()

In [None]:
local_eda=local_eda.drop(columns=['PPFD_IN_x','NIRv_x','NDVI','Daily_Averaged_SIF_x','fapar_x'])

In [None]:
local_eda.shape #25786

In [None]:
# export the final dfs

In [None]:
local_eda_2014_emit=local_eda[local_eda['YEAR']!=2014]

In [None]:
### remove US-Var from the EDA to avoid data leakage

local_eda_2014_emit=local_eda_2014_emit[local_eda_2014_emit['SITE_ID']!='US-Var']

In [None]:
only_2014=local_eda[local_eda['YEAR']==2014]

In [None]:
only_2014=only_2014[only_2014['SITE_ID']!='US-Var']

In [None]:
US_Var_validation=local_eda[local_eda['SITE_ID']=='US-Var']

In [None]:
# model training ds

local_eda_2014_emit.to_csv('/Users/abigailbase/PROJECT FILES/FINAL DFs/Modelling dfs/training.csv')

In [None]:
# model testing ds

only_2014.to_csv('/Users/abigailbase/PROJECT FILES/FINAL DFs/Modelling dfs/test.csv')

In [None]:
# generalisation test ds

US_Var_validation.to_csv('/Users/abigailbase/PROJECT FILES/FINAL DFs/Modelling dfs/US_Var_Validation.csv')

In [None]:
## target variable analysis


In [None]:
cat_eda=local_eda_2014_emit.copy()

In [None]:
cat_eda['SITE_ID'].unique()

In [None]:
# histogram of gpp

In [None]:
# Summary statistics of the GPP target in the training subset
mean_value =cat_eda['GPP_DT_VUT_REF'].mean()
median_value = cat_eda['GPP_DT_VUT_REF'].median()
mode_value = cat_eda['GPP_DT_VUT_REF'].mode()[0]  # mode() returns a Series; keep its first entry
std_dev = cat_eda['GPP_DT_VUT_REF'].std()
variance = cat_eda['GPP_DT_VUT_REF'].var()
skewness = cat_eda['GPP_DT_VUT_REF'].skew()
kurtosis =cat_eda['GPP_DT_VUT_REF'].kurtosis()

print('Mean:',mean_value)
print('Median:',median_value)
print('Mode:',mode_value)  # label corrected from 'Model:'


In [None]:
# optimal number of bins determined using the Freedman-Diaconis rule

q75, q25 = np.percentile(cat_eda['GPP_DT_VUT_REF'], [75, 25])
iqr = q75 - q25
bin_width = 2 * iqr * len(cat_eda['GPP_DT_VUT_REF']) ** (-1/3)
num_bins = int(np.ptp(cat_eda['GPP_DT_VUT_REF']) / bin_width)
num_bins

In [None]:
# histogram of GPP distribution 
plt.figure(figsize=(10, 6))
ax=sns.histplot(data=cat_eda,x='GPP_DT_VUT_REF', 
             bins=45,color='lightgray',
             edgecolor='dimgrey',
             alpha=0.6,
             kde=True)


ax.lines[0].set_color('red')
ax.lines[0].set_lw(1)

plt.xlabel('GPP (gC m$^{-2}$ d$^{-1}$)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.show()


In [None]:
cat_eda['GPP_DT_VUT_REF'].mean()

In [None]:
### investigating the prevalence of zeros

zero_gpp = cat_eda[cat_eda['GPP_DT_VUT_REF'] == 0]


In [None]:
total_counts = cat_eda.groupby(['hemisphere', 'MONTH']).size().reset_index(name='total_count')


In [None]:
# split by hemisphere due to the opposite seasons

zero_gpp_count = zero_gpp.groupby(['hemisphere', 'MONTH','IGBP']).size().reset_index(name='zero_count')
zero_gpp_count_nh=zero_gpp_count[zero_gpp_count['hemisphere']=='NH']
zero_gpp_count_sh=zero_gpp_count[zero_gpp_count['hemisphere']=='SH']

In [None]:
# merge the total and zero counts

merged_counts = pd.merge(zero_gpp_count, total_counts, on=['hemisphere', 'MONTH'])


In [None]:
# calculate percentage

merged_counts['fraction_zero'] = merged_counts['zero_count'] / merged_counts['total_count']


In [None]:
zero_gpp_perc = zero_gpp.groupby(['hemisphere']).size().reset_index(name='zero_count')

In [None]:
total_perc = cat_eda.groupby(['hemisphere']).size().reset_index(name='total_count')


In [None]:
total_perc=pd.merge(zero_gpp_perc, total_perc, on=['hemisphere'])

In [None]:
total_perc['percentage']=(total_perc['zero_count']/total_perc['total_count'])*100

In [None]:
merged_nh=merged_counts[merged_counts['hemisphere']=='NH']

In [None]:
merged_sh=merged_counts[merged_counts['hemisphere']=='SH']

In [None]:
# function to assign season based on hemisphere and month

def assign_season(row):
    """Return the season name for a row's hemisphere and month.

    `row` must provide 'hemisphere' ('NH' or 'SH') and 'MONTH' (1-12).
    Months are grouped as Dec-Feb, Mar-May, Jun-Aug and Sep-Nov; the
    southern-hemisphere names are the northern ones shifted by half a year.
    Returns None when the hemisphere or the month is not recognised.
    """
    hemisphere = row['hemisphere']
    month = row['MONTH']

    # The four three-month blocks, in the same order as the names below
    quarters = ([12, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11])
    calendars = (
        ('NH', ('Winter', 'Spring', 'Summer', 'Autumn')),
        ('SH', ('Summer', 'Autumn', 'Winter', 'Spring')),
    )

    for code, season_names in calendars:
        if hemisphere != code:
            continue
        for months, season_name in zip(quarters, season_names):
            if month in months:
                return season_name
        # known hemisphere, unknown month
        return None

    return None  


In [None]:
# apply the function 
merged_counts['season'] = merged_counts.apply(assign_season, axis=1)

In [None]:
summer_rows = pd.DataFrame({
    'season': ['Summer', 'Summer'],
    'hemisphere': ['NH', 'SH'],
    'zero_count': [0, 0]
})

In [None]:
merged_counts_full = pd.concat([merged_counts, summer_rows], ignore_index=True)


In [None]:
# Bar plot of zero-GPP counts by season, one bar colour per hemisphere

plt.figure(figsize=(15, 8))

sns.barplot(x='season', y='zero_count', edgecolor='black', 
            data=merged_counts_full, hue='hemisphere', 
            palette={'NH': 'red', 'SH': 'deepskyblue'},
            errorbar=None,
            alpha=0.8)

plt.xlabel('Season', fontsize=14)
# Raw string: '\m' is not a valid Python escape sequence, so the plain literal
# triggers an invalid-escape warning. The text passed to matplotlib is unchanged.
plt.ylabel(r'Count of GPP = 0 $\mu$mol CO$_2$ m$^{-2}$ s$^{-1}$', fontsize=14)
plt.xticks(fontsize=12)
plt.legend(title='Hemisphere', fontsize=12)

plt.show()

In [None]:
### zero counts by IGBP 

In [None]:
zero_gpp_IGBP = zero_gpp.groupby(['hemisphere', 'IGBP']).size().reset_index(name='zero_count')

In [None]:
zero_gpp_IGBP_nh=zero_gpp_IGBP[zero_gpp_IGBP['hemisphere']=='NH']

In [None]:
total_counts_IGBP = cat_eda.groupby(['hemisphere', 'IGBP']).size().reset_index(name='total_count')

In [None]:
merged_counts_igbp = pd.merge(zero_gpp_IGBP_nh, total_counts_IGBP, on=['hemisphere', 'IGBP'])


In [None]:
merged_counts_igbp['fraction_zero'] = merged_counts_igbp['zero_count'] / merged_counts_igbp['total_count']


In [None]:
# Bar plot of zero-GPP counts per IGBP class (Northern Hemisphere), with the
# share of zero values written above each bar
plt.figure(figsize=(15, 8))
sns.barplot(x='IGBP', y='zero_count',edgecolor='black' ,color='lightsteelblue', data=zero_gpp_IGBP_nh)

# Label bar i with the i-th row of merged_counts_igbp.
# The original used `i % 12`, apparently left over from a monthly plot, which
# would wrap labels back onto the first bars if there were more than 12 classes.
# NOTE(review): this assumes the bars are drawn in the row order of
# merged_counts_igbp — confirm.
label_values = zip(merged_counts_igbp['zero_count'], merged_counts_igbp['fraction_zero'])
for i, (zero_count, fraction_zero) in enumerate(label_values):
    plt.text(
        x=i,  # horizontal position: index of the IGBP bar
        y=zero_count + 6,  # slightly above the bar
        s=f"{fraction_zero:.2%}",  # fraction shown as a percentage
        ha='center',
        fontweight='bold'
    )

plt.xlabel('IGBP', fontsize=14)
plt.ylabel('Count of zero GPP values in the Northern Hemisphere', fontsize=14)

plt.show()

In [None]:
### the only OSH site is RU-Cok which is in a unique location and may not represent
### OSH types in reality. 

In [None]:
## Categorical variables ##

In [None]:
# hemisphere and GPP

In [None]:
cat_eda=cat_eda[cat_eda['YEAR']!=2014]

In [None]:
## decompose time series components 

In [None]:
dec=cat_eda.copy()

In [None]:
dec['season'] = dec.apply(assign_season, axis=1)


In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
dec.set_index('date',inplace=True)

In [None]:
dec_NH=dec[dec['hemisphere']=='NH']
dec_SH=dec[dec['hemisphere']=='SH']

In [None]:
result_NH=seasonal_decompose(dec_NH['GPP_DT_VUT_REF'], model='additive',period=int(365),extrapolate_trend='freq')

In [None]:
result_NH.plot()
plt.show()

In [None]:
trend_NH=result_NH.trend
seasonal_NH=result_NH.seasonal
residua_NH=result_NH.resid

In [None]:

# Residual component of the GPP decomposition, one panel per hemisphere

# The Southern Hemisphere decomposition is computed here, with the same
# settings as the Northern Hemisphere one, because `residua_SH` was otherwise
# never defined and `result_SH` is only created in a later cell.
result_SH = seasonal_decompose(dec_SH['GPP_DT_VUT_REF'], model='additive', period=int(365), extrapolate_trend='freq')
residua_SH = result_SH.resid

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7), sharey=True)

#plot NH
axes[0].plot(residua_NH, color='gray', linewidth=1.5)
axes[0].set_title('Northern Hemisphere', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Time', fontsize=12)
axes[0].set_ylabel('GPP (gC m$^{-2}$ d$^{-1}$)', fontsize=12)

#plot SH
axes[1].plot(residua_SH, color='gray', linewidth=1.5)
axes[1].set_title('Southern Hemisphere', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Time', fontsize=12)

# Adjust layout to make space for the titles and labels
plt.tight_layout()

# Show the plots
plt.show()


In [None]:
result_SH=seasonal_decompose(dec_SH['GPP_DT_VUT_REF'], model='additive',period=int(365),extrapolate_trend='freq')

In [None]:
# plot of GPP timeseries per site NH

plt.figure(figsize=(16,8))

# one palette colour per NH site, in order of first appearance
palette = sns.color_palette("Set2", len(dec_NH['SITE_ID'].unique()))

sns.lineplot(x=dec_NH.index, y=dec_NH["GPP_DT_VUT_REF"], hue=dec_NH['SITE_ID'], palette=palette)

plt.xlabel('Date', fontsize=15)
plt.ylabel('GPP (gC m$^{-2}$ d$^{-1}$)', fontsize=15)

# Custom legend entries: one round marker per site in its palette colour.
# Uses Line2D from the top import cell; the original `lines.Line2D` referenced
# a `lines` module that is never imported.
handles = [Line2D([0], [0], color=palette[i], marker='o', linestyle='', markersize=10, label=site_id) 
           for i, site_id in enumerate(dec_NH['SITE_ID'].unique())]

plt.legend(handles=handles, title='Site ID', fontsize=12, title_fontsize='13', 
           loc='center left', bbox_to_anchor=(1, 0.5))

plt.tight_layout()  
plt.show()


In [None]:
# plot of GPP timeseries per site SH

plt.figure(figsize=(16,8))

# fixed colour for each SH site
palette = {'AR-Vir': 'orange',
           'AU-Dry': 'seagreen',
           'ZA-Kru': 'mediumorchid'} 

sns.lineplot(x=dec_SH.index, y=dec_SH["GPP_DT_VUT_REF"], 
             hue=dec_SH['SITE_ID'], palette=palette,alpha=0.7)

plt.xlabel('Date', fontsize=15)
plt.ylabel('GPP (gC m$^{-2}$ d$^{-1}$)', fontsize=15)

# Custom legend entries: one round marker per site that has a palette colour.
# Uses Line2D from the top import cell; the original `lines.Line2D` referenced
# a `lines` module that is never imported.
handles = [Line2D([0], [0], color=palette[site_id], marker='o', linestyle='', markersize=10, label=site_id) 
           for site_id in dec_SH['SITE_ID'].unique() if site_id in palette]

plt.legend(handles=handles, title='Site ID', fontsize=12, title_fontsize='13', 
           loc='center left', bbox_to_anchor=(1, 0.5))

plt.tight_layout()  
plt.show()

In [None]:
### investigating seasonality 

In [None]:
# NOTE(review): `eda_hem` is not assigned anywhere in the visible notebook, so
# this cell and the following ones that use it raise NameError on a fresh
# kernel. They also read a lowercase 'month' column, whereas the frames above
# use 'MONTH' — confirm where eda_hem should come from.
eda_hem

In [None]:
### NH ### 

# winter = DJF (1,2,12)
# spring = MAM (3,4,5)
# summer = JJA (6,7,8)
# autumn = SON (9,10,11)

In [None]:
eda_hem_NH=eda_hem[eda_hem['hemisphere']=='NH']
eda_hem_SH=eda_hem[eda_hem['hemisphere']=='SH']

In [None]:
def assign_season(month):
    """Return the three-letter season code for a month number.

    12, 1, 2 -> 'DJF'; 3-5 -> 'MAM'; 6-8 -> 'JJA'; 9-11 -> 'SON'.
    Any other value gives None.

    Note: this definition reuses the name of the row-based `assign_season`
    defined earlier in the notebook and replaces it from this cell onward.
    """
    season_codes = (
        ('DJF', [12,1,2]),
        ('MAM', [3,4,5]),
        ('JJA', [6,7,8]),
        ('SON', [9,10,11]),
    )
    for code, months in season_codes:
        if month in months:
            return code
    return None

In [None]:
eda_hem_NH['season']=eda_hem_NH['month'].apply(assign_season)

In [None]:
mon_average_gpp=eda_hem.groupby(['hemisphere', 'month'])['GPP_DT_VUT_REF'].mean().reset_index()


In [None]:
mon_average_gpp_NH=mon_average_gpp[mon_average_gpp['hemisphere']=='NH']
mon_average_gpp_SH=mon_average_gpp[mon_average_gpp['hemisphere']=='SH']

In [None]:
mon_average_gpp_NH['season']=mon_average_gpp_NH['month'].apply(assign_season)
mon_average_gpp_SH['season']=mon_average_gpp_SH['month'].apply(assign_season)

In [None]:
# Turn 'month' into an ordered categorical using the order given by month_order.
# NOTE(review): `month_order` is not defined anywhere in the visible notebook —
# confirm its definition; without it this cell raises NameError.
mon_average_gpp_NH['month'] = pd.Categorical(mon_average_gpp_NH['month'], categories=month_order, ordered=True)
mon_average_gpp_SH['month'] = pd.Categorical(mon_average_gpp_SH['month'], categories=month_order, ordered=True)


In [None]:
# GPP and IGBP 

# timeseries of mean gpp by IGBP

#group by IBP and Month and calculate the mean GPP 

time_df=cat_eda.copy()


time_df['DATE_combined'] = pd.to_datetime(time_df[['YEAR', 'MONTH']].assign(DAY=1))

mean_gpp_per_month = time_df.groupby(['IGBP', 'DATE_combined'])['GPP_DT_VUT_REF'].mean().reset_index()



In [None]:
plt.figure(figsize=(16, 6))

for igbp_type in mean_gpp_per_month['IGBP'].unique():
    subset = mean_gpp_per_month[mean_gpp_per_month['IGBP'] == igbp_type]
    plt.plot(subset['DATE_combined'], subset['GPP_DT_VUT_REF'], marker='o', label=igbp_type)

plt.xlabel('Date',fontsize=16)
plt.ylabel('Average GPP (gC m$^{-2}$ d$^{-1}$)',fontsize=16)
plt.legend(title='IGBP Type')
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)

plt.legend(title='IGBP Type', bbox_to_anchor=(1.05, 1), 
           loc='upper left',fontsize=16,title_fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
# pair plot of met vars

In [None]:
par_plot_labs={'hemisphere':'Hemisphere',
               'TA_F':'Temperature (°C)',
               'VPD_log':'VPD (hPa)',
               'GPP_DT_VUT_REF': 'GPP (gC m$^{-2}$ d$^{-1}$)',
               'SWC_F_MDS_1':'SWC (%)',
               'NIRv_y':'NIRv',
               'PPFD_IN_y':'PPFD (µmol Photon m$^{-2}$ s$^{-1}$)',
               'Daily_Averaged_SIF_y':'SIF (mW m$^{-2}$ nm$^{-1}$ sr$^{-1}$)'
}

In [None]:
pair_vars_renamed = pair_vars.rename(columns=par_plot_labs)


In [None]:
pair_cols={'NH':'darkgray','SH':'darkorange'}

In [None]:
from matplotlib.lines import Line2D


g=sns.pairplot(pair_vars_renamed, 
               hue='Hemisphere',
               markers=['8','D'],
               palette=pair_cols,
               plot_kws={'s': 10,'alpha':0.5}
              )

                
for ax in g.axes.flatten():
    ax.set_xlabel(ax.get_xlabel(), fontsize=12)  
    ax.set_ylabel(ax.get_ylabel(), fontsize=12)  

legend = g._legend
legend.set_title('Hemisphere', prop={'size': 13})  
legend.set_bbox_to_anchor((1, 1))  
for text in legend.get_texts():
    text.set_fontsize(14)  

# Add a box to the legend
legend.get_frame().set_edgecolor('black')  
legend.get_frame().set_linewidth(1.5)     

            
plt.show()

In [None]:
# isolate SAV IGBP data

sav=cat_eda[cat_eda['IGBP']=='SAV']

In [None]:
# histogram of temp in Savanna ecosystem
plt.figure(figsize=(10, 6))

sns.histplot(data=sav, x='TA_F', bins=30, kde=True, color='blue')

plt.title('Savanna temperature distribution', fontsize=16)
plt.xlabel('Temperature (°C)', fontsize=14)
plt.ylabel('Frequency', fontsize=14)

plt.show()


In [None]:
# Bin air temperature into 15 °C classes for the per-IGBP average-GPP plot

# Bin edges and labels are defined in this cell because the original read them
# one cell before they were created, which raised NameError on a fresh kernel.
# The values are the same as in the following cell.
bins = [-45, -30, -15, 0, 15, 30, 45, 60]
labels = ['<-30°C', '-30°C to -15°C', '-15°C to 0°C', '0°C to 15°C', '15°C to 30°C', '30°C to 45°C', '>45°C']

# right=False: each bin includes its lower edge and excludes its upper edge
cat_eda['Temp_Bin'] = pd.cut(cat_eda['TA_F'], bins=bins, labels=labels, right=False)


In [None]:
bins = [-45, -30, -15, 0, 15, 30, 45, 60]  
labels = ['<-30°C', '-30°C to -15°C', '-15°C to 0°C', '0°C to 15°C', '15°C to 30°C', '30°C to 45°C', '>45°C']  # Corresponding labels for each bin
colors = ['blue', 'lightcyan', 'turquoise', 'gold', 'limegreen', 'darkorange', 'red']



In [None]:
print(cat_eda['TA_F'].max()) #42.75
print(cat_eda['TA_F'].min()) #-23.96

In [None]:
plt.figure(figsize=(14, 7))

barplot=sns.barplot(x='IGBP', y='GPP_DT_VUT_REF', hue='Temp_Bin', data=cat_eda, edgecolor='black',
            palette=colors, errorbar=None)  # Use the custom color palette

plt.xlabel('IGBP', fontsize=14)
plt.ylabel('Average GPP (gC m$^{-2}$ d$^{-1}$)', fontsize=14)
plt.legend(title='Temperature Range', loc='upper right')


mean_temp = cat_eda.groupby(['IGBP', 'Temp_Bin'])['TA_F'].mean().reset_index()


# Show the plot
plt.show()

In [None]:
## SIF Uncertainty


uncertainty=sif_uncertainty.copy()

In [None]:
uncertainty.reset_index(inplace=True)

In [None]:
uncertainty['date']=pd.to_datetime(uncertainty['date'])

In [None]:
uncertainty['year']=uncertainty['date'].dt.year

In [None]:
uncertainty=uncertainty[uncertainty['year']==2014]

In [None]:
uncertainty.dtypes

In [None]:
uncertainty.set_index('date', inplace=True)


In [None]:
numeric_columns = [ 'SIF_Uncertainty', 'Cloud_Fraction']


In [None]:
daily_avg = uncertainty[numeric_columns].groupby('date').mean()


In [None]:
# Plot of the daily mean SIF uncertainty and cloud fraction (from daily_avg)

plt.figure(figsize=(12, 8))

# (Daily averaged SIF itself is not plotted here: daily_avg only holds the
# uncertainty and cloud-fraction columns.)

# Plot SIF Uncertainty
plt.plot(daily_avg.index, daily_avg['SIF_Uncertainty'], label='SIF Uncertainty', color='orange')

# Plot Cloud Fraction
plt.plot(daily_avg.index, daily_avg['Cloud_Fraction'], label='Cloud Fraction', color='skyblue')

# Formatting the plot
plt.xlabel('Date',fontsize=14)
plt.ylabel('Average Values',fontsize=14)
plt.legend(fontsize=12)
plt.grid(True)
plt.xticks(rotation=0)  # rotation=0 keeps the x-axis tick labels horizontal
plt.tight_layout()

# Show plot
plt.show()