In [None]:
### This file was used to add the downloaded ERA5 data to impute the missing FLUXNET data

# ERA5 data was sourced in file 3 and processed in file 4

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import CubicSpline
 


In [None]:
### fill in the missing data with ERA5 land ###

In [None]:
# Load the pre-imputation FLUXNET frame written by 2.missing_data_exploration.
# The first CSV column becomes the index and 'date' is parsed as datetime.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.

df=pd.read_csv('/Users/abigailbase/PROJECT FILES/Pre imputation data/pre_imp.csv',index_col=0,parse_dates=['date'])

In [None]:
# Load one ERA5 CSV per FLUXNET site; 'valid_time' is parsed as datetime so it
# can later be used as the index the site frames are matched against.

ERA5_AR_Vir=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/AR-Vir.csv',parse_dates=['valid_time'])
ERA5_AU_Dry=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/AU-Dry.csv',parse_dates=['valid_time'])
ERA5_BE_Vie=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/BE-Vie.csv',parse_dates=['valid_time'])
ERA5_CA_TP1=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/CA-TP1.csv',parse_dates=['valid_time'])
ERA5_CH_Cha=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/CH-Cha.csv',parse_dates=['valid_time'])
ERA5_DE_Gri=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/DE-Gri.csv',parse_dates=['valid_time'])
ERA5_FR_Pue=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/FR-Pue.csv',parse_dates=['valid_time'])
ERA5_GF_Guy=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/GF-Guy.csv',parse_dates=['valid_time'])
ERA5_IT_Col=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/IT-Col.csv',parse_dates=['valid_time'])
ERA5_NL_Loo=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/NL-Loo.csv',parse_dates=['valid_time'])
ERA5_RU_Cok=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/RU-Cok.csv',parse_dates=['valid_time'])
ERA5_RU_Fyo=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/RU-Fyo.csv',parse_dates=['valid_time'])
ERA5_US_PFa=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/US-PFa.csv',parse_dates=['valid_time'])
ERA5_US_Var=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/US-Var.csv',parse_dates=['valid_time'])
ERA5_ZA_Kru=pd.read_csv('/Users/abigailbase/PROJECT FILES/ERA5 CSVs/ZA-Kru.csv',parse_dates=['valid_time'])


In [None]:
# prepare the ERA5 data for imputing by converting the units 
# and creating required variables

In [None]:
def process_era5_data(df):
    """Convert every ERA5 column used for imputation and derive VPD, PPFD and wind speed.

    The frame is modified in place and the same object is returned, so the
    function must be applied exactly once per frame (a second call would
    convert already-converted values again).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns t2m, d2m, sp, ssrd, tp, u10, v10, swvl1, tsn.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the converted columns plus the new columns
        ``VPD``, ``PPFD`` and ``wind_speed``.
    """

    # Convert temperatures to Celsius from Kelvin
    df['t2m'] = df['t2m'] - 273.15  # air temperature
    df['d2m'] = df['d2m'] - 273.15  # dew point temperature

    # Convert pressure to kPa from Pa
    df['sp'] = df['sp'] / 1000

    # Saturation vapour pressure in kPa for a temperature in deg C.
    # np.exp works element-wise, so this accepts whole columns.
    def svp(T):
        return 0.6112 * np.exp((17.67 * T) / (T + 243.5))

    # VPD = svp(air temp) - svp(dew point), converted from kPa to hPa.
    # Computed on whole columns rather than with a row-wise apply: same values,
    # no per-row Python call, and it also works on an empty frame.
    df['VPD'] = (svp(df['t2m']) - svp(df['d2m'])) * 10

    # Convert SSRD from J/m² to W/m²
    # assumes ssrd is accumulated over 24 h (86400 s) — TODO confirm against the download
    df['ssrd'] = df['ssrd'] / 86400

    # Convert SSRD to PPFD
    kEC = 2.04  # conversion factor
    df['PPFD'] = df['ssrd'] * kEC

    # Convert precipitation from m to mm
    df['tp'] = df['tp'] * 1000

    # Wind speed is the magnitude of the (u10, v10) vector
    df['wind_speed'] = np.sqrt(df['u10']**2 + df['v10']**2)

    # Convert soil water content to percentage
    df['swvl1'] = df['swvl1'] * 100

    # Convert tsn from Kelvin to Celsius (used as soil temperature downstream)
    # NOTE(review): in the ERA5 catalogue `tsn` is the snow-layer temperature and
    # `stl1` is soil temperature level 1 — confirm this is the intended variable.
    df['tsn'] = df['tsn'] - 273.15

    return df


In [None]:
def ppfd_process(df):
    """Convert `ssrd` to W/m² and derive `PPFD` from it.

    The frame is modified in place and the same object is returned, so apply
    it exactly once per frame.

    Previously the conversion factor was defined but never applied, so no PPFD
    column was produced; it is now added exactly as process_era5_data does.
    `ssrd` itself is still only divided by 86400.

    NOTE(review): the rename mapping used later in this notebook sends `ssrd`
    (W/m²), not `PPFD`, to 'PPFD_IN' — confirm which column should fill PPFD_IN.
    """

    # Convert SSRD from J/m² to W/m²
    # assumes ssrd is accumulated over 24 h (86400 s) — TODO confirm
    df['ssrd'] = df['ssrd'] / 86400

    # Convert SSRD to PPFD
    kEC = 2.04  # conversion factor
    df['PPFD'] = df['ssrd'] * kEC

    return df

In [None]:
def swc_process(df):
    """Scale the `swvl1` column by 100 (fraction -> percentage).

    Modifies `df` in place and returns the same object.
    """
    df['swvl1'] = df['swvl1'].mul(100)
    return df

In [None]:
def st_process(df):
    """Shift the `tsn` column from Kelvin to degrees Celsius.

    Modifies `df` in place and returns the same object.
    """
    df['tsn'] = df['tsn'].sub(273.15)
    return df
    

In [None]:
# apply the required processing (See report appendix)

In [None]:
ERA5_AR_Vir = process_era5_data(ERA5_AR_Vir) #all variables

In [None]:
ERA5_AU_Dry=ppfd_process(ERA5_AU_Dry) #ppfd only

In [None]:
ERA5_BE_Vie=ppfd_process(ERA5_BE_Vie) #ppfd only

In [None]:
ERA5_CA_TP1=ppfd_process(ERA5_CA_TP1) #ppfd only

In [None]:
ERA5_CH_Cha=ppfd_process(ERA5_CH_Cha) #ppfd only

In [None]:
ERA5_DE_Gri=ppfd_process(ERA5_DE_Gri) #ppfd only

In [None]:
ERA5_FR_Pue=ppfd_process(ERA5_FR_Pue) 
ERA5_FR_Pue=swc_process(ERA5_FR_Pue) #ppfd and swc

In [None]:
ERA5_GF_Guy=ppfd_process(ERA5_GF_Guy)
ERA5_GF_Guy=st_process(ERA5_GF_Guy) #soil temp and ppfd

In [None]:
ERA5_IT_Col=ppfd_process(ERA5_IT_Col)
ERA5_IT_Col=swc_process(ERA5_IT_Col) #ppfd and swc

In [None]:
ERA5_NL_Loo=ppfd_process(ERA5_NL_Loo) #ppfd

In [None]:
ERA5_RU_Cok=ppfd_process(ERA5_RU_Cok)
ERA5_RU_Cok=swc_process(ERA5_RU_Cok)
ERA5_RU_Cok=st_process(ERA5_RU_Cok) #ppfd, swc st

In [None]:
ERA5_RU_Fyo=ppfd_process(ERA5_RU_Fyo)

In [None]:
ERA5_US_PFa=ppfd_process(ERA5_US_PFa) 
ERA5_US_PFa=st_process(ERA5_US_PFa) #ppfd, st

In [None]:
ERA5_US_Var=ppfd_process(ERA5_US_Var) #ppfd only


In [None]:
ERA5_ZA_Kru = process_era5_data(ERA5_ZA_Kru) #all variables

In [None]:
# Separate out one data frame per site_id.
# .copy() makes each site frame independent of df: the cells below set the
# index in place and write imputed values / new columns into these frames,
# which on a bare filtered slice triggers SettingWithCopyWarning.

AR_Vir=df[df['SITE_ID']=='AR-Vir'].copy()#1
AU_Dry=df[df['SITE_ID']=='AU-Dry'].copy()#2
BE_Vie=df[df['SITE_ID']=='BE-Vie'].copy()#3
CA_TP1=df[df['SITE_ID']=='CA-TP1'].copy()#4
CH_Cha=df[df['SITE_ID']=='CH-Cha'].copy()#5
DE_Gri=df[df['SITE_ID']=='DE-Gri'].copy()#6
FR_Pue=df[df['SITE_ID']=='FR-Pue'].copy()#7
GF_Guy=df[df['SITE_ID']=='GF-Guy'].copy()#8
IT_Col=df[df['SITE_ID']=='IT-Col'].copy()#9
NL_Loo=df[df['SITE_ID']=='NL-Loo'].copy()#10
RU_Cok=df[df['SITE_ID']=='RU-Cok'].copy()#11
RU_Fyo=df[df['SITE_ID']=='RU-Fyo'].copy()#12
US_PFa=df[df['SITE_ID']=='US-PFa'].copy()#13
US_Var=df[df['SITE_ID']=='US-Var'].copy()#14
ZA_Kru=df[df['SITE_ID']=='ZA-Kru'].copy()#15



In [None]:
### now fill in the missing data from each site ###

In [None]:
# AR-Vir

In [None]:
# want to impute, TA_F, PA_F,VPD_F,P_F,WS_F,TS_F_MDS_1,SWC_F_MDS_1


In [None]:
ERA5_AR_Vir=ERA5_AR_Vir.sort_values(by='valid_time') # sort date

In [None]:
ERA5_AR_Vir.set_index('valid_time',inplace=True) # set index 

In [None]:
AR_Vir.set_index('date',inplace=True)# set index 

In [None]:
# dictionary for renaming the columns to have same names

ERA5_renames={'t2m':'TA_F','sp':'PA_F','ssrd':'PPFD_IN','tp':'P_F',
              'tsn':'TS_F_MDS_1','swvl1':'SWC_F_MDS_1','wind_speed':'WS_F',
             'VPD':'VPD_F'}

In [None]:
ERA5_AR_Vir.rename(columns=ERA5_renames,inplace=True)

In [None]:
## from this need to drop Co2 GPP, NEE as this data needs to come from
## somewhere else from AR_Vir

In [None]:
AR_Vir_drop=['CO2_F_MDS','CO2_F_MDS','GPP_DT_VUT_REF','NEE_VUT_REF']

In [None]:
AR_Vir=AR_Vir.drop(columns=AR_Vir_drop)

In [None]:
# now impute with ERA5 land data

for column in AR_Vir.columns:
    
    # iterate  rows
    
    for index, value in AR_Vir[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            # Impute the missing value with the corresponding value from ERA_AR_Vir
            AR_Vir.at[index, column] = ERA5_AR_Vir.at[index, column]




In [None]:
AR_Vir.shape

In [None]:
AR_Vir.isna().sum()

In [None]:
### AU-Dry (PPFD) ###

In [None]:
ERA5_AU_Dry=ERA5_AU_Dry.sort_values(by='valid_time') # sort date

In [None]:
ERA5_AU_Dry.set_index('valid_time',inplace=True) # set index 

In [None]:
AU_Dry.set_index('date',inplace=True) # set index 

In [None]:
ERA5_AU_Dry.rename(columns=ERA5_renames,inplace=True) # rename cols

In [None]:
for column in AU_Dry.columns:
    
    # iterate  rows
    
    for index, value in AU_Dry[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            AU_Dry.at[index, column] = ERA5_AU_Dry.at[index, column]



In [None]:
AU_Dry.isna().sum()

In [None]:
### BE-Vie (PPFD) ###

ERA5_BE_Vie=ERA5_BE_Vie.sort_values(by='valid_time') # sort date

In [None]:
ERA5_BE_Vie.set_index('valid_time',inplace=True) # set index 

In [None]:
BE_Vie.set_index('date',inplace=True) # set index 

In [None]:
ERA5_BE_Vie.rename(columns=ERA5_renames,inplace=True) # rename cols

In [None]:
for column in BE_Vie.columns:
    
    # iterate  rows
    
    for index, value in BE_Vie[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            BE_Vie.at[index, column] = ERA5_BE_Vie.at[index, column]


In [None]:
BE_Vie.isna().sum()

In [None]:
BE_Vie.shape

In [None]:
### CA-TP1 (PPFD) ###

ERA5_CA_TP1=ERA5_CA_TP1.sort_values(by='valid_time') # sort date

In [None]:
ERA5_CA_TP1.set_index('valid_time',inplace=True) # set index 

In [None]:
CA_TP1.set_index('date',inplace=True) # set index 

In [None]:
ERA5_CA_TP1.rename(columns=ERA5_renames,inplace=True) # rename cols

In [None]:
for column in CA_TP1.columns:
    
    # iterate  rows
    
    for index, value in CA_TP1[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            CA_TP1.at[index, column] = ERA5_CA_TP1.at[index, column]


In [None]:
CA_TP1.isna().sum()

In [None]:
### CH-Cha (PPFD) ###

ERA5_CH_Cha=ERA5_CH_Cha.sort_values(by='valid_time') # sort date


In [None]:
ERA5_CH_Cha.set_index('valid_time',inplace=True)

In [None]:
CH_Cha.set_index('date',inplace=True) # set index 

In [None]:
ERA5_CH_Cha.rename(columns=ERA5_renames,inplace=True) # rename cols


In [None]:
for column in CH_Cha.columns:
    
    # iterate  rows
    
    for index, value in CH_Cha[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            CH_Cha.at[index, column] = ERA5_CH_Cha.at[index, column]


In [None]:
CH_Cha.isna().sum()

In [None]:
### DE-Gri (PPFD) ###

ERA5_DE_Gri=ERA5_DE_Gri.sort_values(by='valid_time') # sort date


In [None]:
ERA5_DE_Gri.set_index('valid_time',inplace=True) #set index

In [None]:
DE_Gri.set_index('date',inplace=True) #set index

In [None]:
ERA5_DE_Gri.rename(columns=ERA5_renames,inplace=True) # rename cols


In [None]:
for column in DE_Gri.columns:
    
    # iterate  rows
    
    for index, value in DE_Gri[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            DE_Gri.at[index, column] = ERA5_DE_Gri.at[index, column]


In [None]:
DE_Gri.isna().sum()

In [None]:
### FR-Pue (PPFD, SWC)

In [None]:
ERA5_FR_Pue=ERA5_FR_Pue.sort_values(by='valid_time') # sort date

In [None]:
ERA5_FR_Pue.set_index('valid_time',inplace=True) #set index

In [None]:
FR_Pue.set_index('date',inplace=True) #set index

In [None]:
ERA5_FR_Pue.rename(columns=ERA5_renames,inplace=True) # rename cols

In [None]:
for column in FR_Pue.columns:
    
    # iterate  rows
    
    for index, value in FR_Pue[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            FR_Pue.at[index, column] = ERA5_FR_Pue.at[index, column]


In [None]:
FR_Pue.isna().sum()

In [None]:
### GF-Guy ###

In [None]:
ERA5_GF_Guy=ERA5_GF_Guy.sort_values(by='valid_time') # sort date

In [None]:
ERA5_GF_Guy.set_index('valid_time',inplace=True) #set index

In [None]:
GF_Guy.set_index('date',inplace=True) #set index

In [None]:
ERA5_GF_Guy.rename(columns=ERA5_renames,inplace=True) # rename cols

In [None]:
for column in GF_Guy.columns:
    
    # iterate  rows
    
    for index, value in GF_Guy[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            GF_Guy.at[index, column] = ERA5_GF_Guy.at[index, column]


In [None]:
### IT-Col ###

In [None]:
ERA5_IT_Col=ERA5_GF_Guy.sort_values(by='valid_time') # sort date

In [None]:
ERA5_IT_Col.set_index('valid_time',inplace=True) #set index

In [None]:
IT_Col.set_index('date',inplace=True) #set index

In [None]:
ERA5_IT_Col.rename(columns=ERA5_renames,inplace=True) # rename cols

In [None]:
for column in IT_Col.columns:
    
    # iterate  rows
    
    for index, value in IT_Col[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            IT_Col.at[index, column] = ERA5_IT_Col.at[index, column]


In [None]:
IT_Col.isna().sum()

In [None]:
### NL-Loo ###

In [None]:
ERA5_NL_Loo=ERA5_GF_Guy.sort_values(by='valid_time') # sort date

In [None]:
NL_Loo.set_index('date',inplace=True) #set index

In [None]:
ERA5_NL_Loo.rename(columns=ERA5_renames,inplace=True) # rename cols

In [None]:
NL_Loo.isna().sum()

In [None]:
NL_Loo=NL_Loo.drop(columns='CO2_F_MDS')

In [None]:
for column in NL_Loo.columns:
    
    # iterate  rows
    
    for index, value in NL_Loo[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            NL_Loo.at[index, column] = ERA5_NL_Loo.at[index, column]


In [None]:
NL_Loo.isna().sum()

In [None]:
### RU-Cok ###

In [None]:
ERA5_RU_Cok=ERA5_RU_Cok.sort_values(by='valid_time') # sort date

In [None]:
ERA5_RU_Cok.set_index('valid_time',inplace=True) # set index 

In [None]:
RU_Cok.set_index('date',inplace=True)# set index 

In [None]:
ERA5_RU_Cok.rename(columns=ERA5_renames,inplace=True)

In [None]:
## from this need to drop Co2 GPP, NEE as this data needs to come from
## somewhere else from AR_Vir

In [None]:
AR_Vir_drop=['CO2_F_MDS','CO2_F_MDS','GPP_DT_VUT_REF','NEE_VUT_REF']

In [None]:
RU_Cok=RU_Cok.drop(columns=AR_Vir_drop)

In [None]:
# now impute with ERA5 land data

for column in RU_Cok.columns:
    
    # iterate  rows
    
    for index, value in RU_Cok[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            # Impute the missing value with the corresponding value from ERA_AR_Vir
            RU_Cok.at[index, column] = ERA5_RU_Cok.at[index, column]




In [None]:
RU_Cok.isna().sum()

In [None]:
## RU-Fyo ###

In [None]:
ERA5_RU_Fyo=ERA5_RU_Fyo.sort_values(by='valid_time') # sort date

In [None]:
ERA5_RU_Fyo.set_index('valid_time',inplace=True) #set index

In [None]:
RU_Fyo.set_index('date',inplace=True) #set index

In [None]:
ERA5_RU_Fyo.rename(columns=ERA5_renames,inplace=True) # rename cols

In [None]:
for column in RU_Fyo.columns:
    
    # iterate  rows
    
    for index, value in RU_Fyo[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            RU_Fyo.at[index, column] = ERA5_RU_Fyo.at[index, column]


In [None]:
RU_Fyo.isna().sum()

In [None]:
### US-PFa ###

In [None]:
ERA5_US_PFa

In [None]:
ERA5_US_PFa=ERA5_US_PFa.sort_values(by='valid_time') # sort date

In [None]:
ERA5_US_PFa.set_index('valid_time',inplace=True) #set index

In [None]:
US_PFa.set_index('date',inplace=True) #set index

In [None]:
ERA5_US_PFa.rename(columns=ERA5_renames,inplace=True) # rename cols

In [None]:
for column in US_PFa.columns:
    
    # iterate  rows
    
    for index, value in US_PFa[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            US_PFa.at[index, column] = ERA5_US_PFa.at[index, column]


In [None]:
US_PFa.isna()

In [None]:
### US-Var ###

In [None]:
ERA5_US_Var=ERA5_US_Var.sort_values(by='valid_time') # sort date

In [None]:
ERA5_US_Var.set_index('valid_time',inplace=True) #set index

In [None]:
US_Var.set_index('date',inplace=True) #set index

In [None]:
ERA5_US_Var.rename(columns=ERA5_renames,inplace=True) # rename cols

In [None]:
for column in US_Var.columns:
    
    # iterate  rows
    
    for index, value in US_Var[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            US_Var.at[index, column] = ERA5_US_Var.at[index, column]


In [None]:
US_Var.isna().sum()

In [None]:
### ZA-Kru ###

In [None]:
ERA5_ZA_Kru=ERA5_ZA_Kru.sort_values(by='valid_time') # sort date

In [None]:
ERA5_ZA_Kru.set_index('valid_time',inplace=True) # set index 

In [None]:
ZA_Kru.set_index('date',inplace=True)# set index 

In [None]:
ERA5_ZA_Kru.rename(columns=ERA5_renames,inplace=True)

In [None]:
## from this need to drop Co2 GPP, NEE as this data needs to come from
## somewhere else from AR_Vir

In [None]:
ZA_Kru=ZA_Kru.drop(columns=AR_Vir_drop)

In [None]:
# now impute with ERA5 land data

for column in ZA_Kru.columns:
    
    # iterate  rows
    
    for index, value in ZA_Kru[column].items():
        
        # check for NA values
        
        if pd.isna(value):
            
            # Impute the missing value with the corresponding value from ERA_AR_Vir
            ZA_Kru.at[index, column] = ERA5_ZA_Kru.at[index, column]




In [None]:
ZA_Kru.isna().sum()

In [None]:
### For the dropped columns, I think I will re-separate the site DataFrames,
### keeping the date so the columns can be imputed, and then join them back
### on the date column right at the end.

In [None]:
### import co2 data

In [None]:
co2_2010=pd.read_csv('/Users/abigailbase/PROJECT FILES/CO2 CSVs/co2_2010.csv',index_col=0)
co2_2011=pd.read_csv('/Users/abigailbase/PROJECT FILES/CO2 CSVs/co2_2011.csv',index_col=0)
co2_2012=pd.read_csv('/Users/abigailbase/PROJECT FILES/CO2 CSVs/co2_2012.csv',index_col=0)
co2_2013=pd.read_csv('/Users/abigailbase/PROJECT FILES/CO2 CSVs/co2_2013.csv',index_col=0)
co2_2014=pd.read_csv('/Users/abigailbase/PROJECT FILES/CO2 CSVs/co2_2014.csv',index_col=0)



In [None]:
co2_2010['time']=pd.to_datetime(co2_2010['time']).dt.date
co2_2011['time']=pd.to_datetime(co2_2011['time']).dt.date
co2_2012['time']=pd.to_datetime(co2_2012['time']).dt.date
co2_2013['time']=pd.to_datetime(co2_2013['time']).dt.date
co2_2014['time']=pd.to_datetime(co2_2014['time']).dt.date

In [None]:
co2_drop=['level','LAT','LON']

In [None]:
co2_2010=co2_2010.drop(columns=co2_drop)
co2_2011=co2_2011.drop(columns=co2_drop)
co2_2012=co2_2012.drop(columns=co2_drop)
co2_2013=co2_2013.drop(columns=co2_drop)
co2_2014=co2_2014.drop(columns=co2_drop)

In [None]:
#combine the years into one


co2=pd.concat([co2_2010,co2_2011,co2_2012,co2_2013,co2_2014],ignore_index=True)

In [None]:
AR_Vir_co2=co2[co2['site_id']=='AR-Vir']
RU_Cok_co2=co2[co2['site_id']=='RU-Cok']
NL_Loo_co2=co2[co2['site_id']=='NL-Loo']
ZA_Kru_co2=co2[co2['site_id']=='ZA-Kru']

In [None]:
NL_Loo_co2=co2[co2['site_id']=='NL-Loo']

In [None]:
AR_Vir_co2

In [None]:
co2_rename={'co2':'CO2_F_MDS'}

In [None]:
AR_Vir_co2=AR_Vir_co2.set_index('time')

In [None]:
AR_Vir_co2.rename(columns=co2_rename,inplace=True)

In [None]:
AR_Vir_co2

In [None]:
### need to impute co2 for RU-Cok, AR-Vir and ZA_Kru

AR_Vir

In [None]:
### AR-Vir ###

In [None]:
# want to join AR_Vir_co2 and AR_Vir

In [None]:
AR_Vir_imp=AR_Vir.join(AR_Vir_co2)

In [None]:
AR_Vir_imp=AR_Vir_imp.drop(columns='site_id')

In [None]:
AR_Vir_imp.isna().sum()

In [None]:
## fill missing 12 with mean DELETE THIS

In [None]:
AR_Vir_imp['CO2_F_MDS']=AR_Vir_imp['CO2_F_MDS'].fillna(AR_Vir_imp['CO2_F_MDS'].mean())

In [None]:
AR_Vir_imp.isna().sum()

In [None]:
### RU-Cok ###

In [None]:
RU_Cok_co2=RU_Cok_co2.set_index('time')

In [None]:
RU_Cok_co2.rename(columns=co2_rename,inplace=True)

In [None]:
RU_Cok_imp=RU_Cok.join(RU_Cok_co2)

In [None]:
RU_Cok_imp['CO2_F_MDS']=RU_Cok_imp['CO2_F_MDS'].fillna(RU_Cok_imp['CO2_F_MDS'].mean())

In [None]:
### NL_Loo ###

In [None]:
NL_Loo_original=df[df['SITE_ID']=='NL-Loo']

In [None]:
NL_Loo_original=NL_Loo_original[['CO2_F_MDS','date']]

In [None]:
NL_Loo_original=NL_Loo_original.set_index('date')

In [None]:
NL_Loo_original.isna().sum()

In [None]:
NL_Loo_original['CO2_F_MDS']=NL_Loo_original['CO2_F_MDS'].fillna(NL_Loo_original['CO2_F_MDS'].mean())

In [None]:
NL_Loo_original.isna().sum()

In [None]:
NL_Loo_imp=NL_Loo.join(NL_Loo_original)

In [None]:
NL_Loo_imp.isna().sum()

In [None]:
NL_Loo_imp.head()

In [None]:
NL_Loo_imp.isna().sum()

In [None]:
### ZA-Kru ###

In [None]:
ZA_Kru_co2=ZA_Kru_co2.set_index('time')

In [None]:
ZA_Kru_co2.rename(columns=co2_rename,inplace=True)

In [None]:
ZA_Kru_imp=ZA_Kru.join(AR_Vir_co2)

In [None]:
ZA_Kru_imp.isna().sum()

In [None]:
ZA_Kru_imp['CO2_F_MDS']=ZA_Kru_imp['CO2_F_MDS'].fillna(ZA_Kru_imp['CO2_F_MDS'].mean())

In [None]:
ZA_Kru_imp=ZA_Kru_imp.drop(columns='site_id')

---

In [None]:
### NIRv data from file 5 (hdf process) was also added here

In [None]:
#import 

In [None]:
AR_Vir_NIRv=pd.read_csv('/Users/abigailbase/PROJECT FILES/NIRv DFs/AR_Vir_NIR.csv')
AU_Dry_NIRv=pd.read_csv('/Users/abigailbase/PROJECT FILES/NIRv DFs/AU_Dry_NIRv.csv')
BE_Vie_NIRv=pd.read_csv('/Users/abigailbase/PROJECT FILES/NIRv DFs/BE_Vie_NIR.csv')
CA_TP1_NIRv=pd.read_csv('/Users/abigailbase/PROJECT FILES/NIRv DFs/CA_TP1_NIR.csv')
CH_Cha_NIRv=pd.read_csv('/Users/abigailbase/PROJECT FILES/NIRv DFs/CH_Cha_NIR.csv')
DE_Gri_NIRv=pd.read_csv('/Users/abigailbase/PROJECT FILES/NIRv DFs/DE_Gri_NIR.csv')
FR_Pue_NIRv=pd.read_csv('/Users/abigailbase/PROJECT FILES/NIRv DFs/FR_Pue_NIR.csv')
GF_Guy_NIRv=pd.read_csv('/Users/abigailbase/PROJECT FILES/NIRv DFs/GF_Guy_NIR.csv')

NL_Loo_NIRv=pd.read_csv('/Users/abigailbase/PROJECT FILES/NIRv DFs/NL_Loo_NIR.csv')
RU_Cok_NIRv=pd.read_csv('/Users/abigailbase/PROJECT FILES/NIRv DFs/RU_CokNIR.csv')
RU_Fyo_NIRv=pd.read_csv('/Users/abigailbase/PROJECT FILES/NIRv DFs/RU_Fyo_NIR.csv')
US_PFa_NIRv=pd.read_csv('/Users/abigailbase/PROJECT FILES/NIRv DFs/US_PFa_NIR.csv')
US_Var_NIRv=pd.read_csv('/Users/abigailbase/PROJECT FILES/NIRv DFs/US_Var_NIR.csv')
ZA_Kru_NIRv=pd.read_csv('/Users/abigailbase/PROJECT FILES/NIRv DFs/ZA_Kru_NIR.csv')



In [None]:
file_paths = [
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/AR_Vir_NIR.csv',
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/AU_Dry_NIRv.csv',
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/BE_Vie_NIR.csv',
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/CA_TP1_NIR.csv',
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/CH_Cha_NIR.csv',
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/DE_Gri_NIR.csv',
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/FR_Pue_NIR.csv',
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/GF_Guy_NIR.csv',
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/IT_Col_NIR.csv',
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/NL_Loo_NIR.csv',
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/RU_CokNIR.csv',
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/RU_Fyo_NIR.csv',
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/US_PFa_NIR.csv',
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/US_Var_NIR.csv',
    '/Users/abigailbase/PROJECT FILES/NIRv DFs/ZA_Kru_NIR.csv'
]


In [None]:
keys = [
    'AR_Vir_NIRv', 'AU_Dry_NIRv', 'BE_Vie_NIRv', 'CA_TP1_NIRv', 'CH_Cha_NIRv',
    'DE_Gri_NIRv', 'FR_Pue_NIRv', 'GF_Guy_NIRv','IT_Col_NIRv', 'NL_Loo_NIRv', 'RU_Cok_NIRv',
    'RU_Fyo_NIRv', 'US_PFa_NIRv', 'US_Var_NIRv', 'ZA_Kru_NIRv'
]


In [None]:
nirv_dict = {}

In [None]:
for key, file_path in zip(keys, file_paths):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    # Add the site_id column
    df['site_id'] = key
    # Store the DataFrame in the dictionary
    nirv_dict[key] = df


In [None]:
### analyse trend to see which interpolation most ideal

dfs_to_concat = [nirv_dict[key] for key in keys]

In [None]:
combined_nirv_df = pd.concat(dfs_to_concat, ignore_index=True)


In [None]:
combined_nirv_df['date'] = pd.to_datetime(combined_nirv_df['date'])


In [None]:
combined_nirv_df = combined_nirv_df.sort_values(by='date')


In [None]:
sites = combined_nirv_df['site_id'].unique()


In [None]:
#  time series of each site revealed seasonality so a cubic spline interpolator was appropriate

plt.figure(figsize=(14, 7))

for site in combined_nirv_df['site_id'].unique():
    site_data = combined_nirv_df[combined_nirv_df['site_id'] == site]
    plt.plot(site_data['date'], site_data['NIRv'], marker='o', linestyle='-', label=site)

plt.xlabel('Date')
plt.ylabel('NIRv')
plt.title('NIRv Time Series for All Sites')
plt.legend(title='Site ID')
plt.grid(True)
plt.tight_layout()

plt.show()

In [None]:
# Step-by-step version of the interpolation for AR-Vir; the same steps are
# wrapped in interpolate_nirv below.
AR_Vir_NIRv=nirv_dict['AR_Vir_NIRv']

In [None]:
AR_Vir_NIRv

In [None]:
# Columns removed before interpolation; this list is also passed to
# interpolate_nirv for the other sites.
# NOTE(review): 'day_of_year' is listed twice.
NIRV_drop=['year','day_of_year','day_of_year','NIR_point','NDVI_point','site_id']

In [None]:
# drop() returns a new frame, so the frame stored in nirv_dict is untouched
AR_Vir_NIRv=AR_Vir_NIRv.drop(columns=NIRV_drop)

In [None]:
AR_Vir_NIRv['date']=pd.to_datetime(AR_Vir_NIRv['date'])

In [None]:
AR_Vir_NIRv.set_index('date', inplace=True)

In [None]:
# create the missing days: one row per calendar day, NaN where no observation exists

AR_Vir_NIRv_daily = AR_Vir_NIRv.resample('D').asfreq()


In [None]:
## apply the spline interpolator ###

# rows that actually carry an NIRv observation
valid_data = AR_Vir_NIRv_daily.dropna(subset=['NIRv'])

# convert dates to ordinal (integer day numbers) for interpolation
dates = valid_data.index.map(pd.Timestamp.toordinal).values
nirv_values = valid_data['NIRv'].values

# fit the cubic spline through the observed points
cs = CubicSpline(dates, nirv_values, extrapolate=True)

# evaluate the spline on every day; this overwrites the observed values as well
full_dates = AR_Vir_NIRv_daily.index.map(pd.Timestamp.toordinal)
AR_Vir_NIRv_daily['NIRv'] = cs(full_dates)





In [None]:

def interpolate_nirv(nirv_dict, key, NIRV_drop):
    """Build a daily NIRv series for one site by cubic-spline interpolation.

    Parameters
    ----------
    nirv_dict : dict
        Maps a key to a DataFrame with at least the columns 'date' and 'NIRv'
        plus every column named in ``NIRV_drop``. The stored frame is not modified.
    key : str
        Entry of ``nirv_dict`` to process.
    NIRV_drop : list
        Column names removed before interpolation.

    Returns
    -------
    pandas.DataFrame
        Indexed by day from the first to the last retained date on or after
        2010-01-01. 'NIRv' holds the spline evaluated on every day; any other
        remaining column is NaN on days without an observation.

    Raises
    ------
    ValueError
        If fewer than two NIRv observations remain (a spline cannot be fitted).
    """
    # drop() returns a new frame, so the caller's dictionary entry is untouched
    site_nirv = nirv_dict[key].drop(columns=NIRV_drop)

    # to datetime, set as index, chronological order (stable sort keeps the
    # original order of rows that share a date)
    site_nirv['date'] = pd.to_datetime(site_nirv['date'])
    site_nirv = site_nirv.set_index('date').sort_index(kind='mergesort')

    # drop data from before 2010
    site_nirv = site_nirv[site_nirv.index >= '2010-01-01']

    # Repeated dates would make resample(...).asfreq() fail. For a repeated
    # date, discard the rows without an NIRv value first, then keep the last
    # remaining row. Frames with unique dates pass through unchanged.
    repeated = site_nirv.index.duplicated(keep=False)
    site_nirv = site_nirv[~(repeated & site_nirv['NIRv'].isna().to_numpy())]
    site_nirv = site_nirv[~site_nirv.index.duplicated(keep='last')]

    # daily frequency: one row per calendar day, NaN where nothing was observed
    daily = site_nirv.resample('D').asfreq()

    observed = daily.dropna(subset=['NIRv'])
    if len(observed) < 2:
        raise ValueError(
            f"{key}: need at least 2 NIRv observations on or after 2010-01-01 "
            f"to fit a spline, found {len(observed)}"
        )

    # convert dates to ordinal (integer day numbers) for interpolation
    observed_days = observed.index.map(pd.Timestamp.toordinal).values

    # cubic spline through the observations, evaluated on every day
    cs = CubicSpline(observed_days, observed['NIRv'].values, extrapolate=True)
    daily['NIRv'] = cs(daily.index.map(pd.Timestamp.toordinal))

    return daily



In [None]:
#AR-Vir

AR_Vir_NIRv=nirv_dict['AR_Vir_NIRv']

In [None]:
AR_Vir_NIRv_daily=interpolate_nirv(nirv_dict,'AR_Vir_NIRv',NIRV_drop)

In [None]:
#AU-Dry

AU_Dry_NIRv=nirv_dict['AU_Dry_NIRv']

In [None]:
AU_Dry_NIRv_daily=interpolate_nirv(nirv_dict,'AU_Dry_NIRv',NIRV_drop)

In [None]:
#BE-Vie

BE_Vie_NIRv=nirv_dict['BE_Vie_NIRv']

In [None]:
BE_Vie_NIRv_daily=interpolate_nirv(nirv_dict,'BE_Vie_NIRv',NIRV_drop)

In [None]:
# CA-TP1

CA_TP1_NIRv=nirv_dict['CA_TP1_NIRv']

In [None]:
CA_TP1_NIRv_daily=interpolate_nirv(nirv_dict,'CA_TP1_NIRv',NIRV_drop)

In [None]:
# CH-Cha

CH_Cha_NIRv=nirv_dict['CH_Cha_NIRv']

In [None]:
CH_Cha_NIRv_daily=interpolate_nirv(nirv_dict,'CH_Cha_NIRv',NIRV_drop)

In [None]:
#DE-Gri

DE_Gri_NIRv=nirv_dict['DE_Gri_NIRv']

In [None]:
DE_Gri_NIRv_daily=interpolate_nirv(nirv_dict,'DE_Gri_NIRv',NIRV_drop)

In [None]:
#FR-Pue

FR_Pue_NIRv=nirv_dict['FR_Pue_NIRv']

In [None]:
FR_Pue_NIRv_daily=interpolate_nirv(nirv_dict,'FR_Pue_NIRv',NIRV_drop)

In [None]:
#GF-Guy

GF_Guy_NIRv=nirv_dict['GF_Guy_NIRv']

In [None]:
GF_Guy_NIRv_daily=interpolate_nirv(nirv_dict,'GF_Guy_NIRv',NIRV_drop)

In [None]:
#IT-Col

IT_Col_NIRv=nirv_dict['IT_Col_NIRv']

In [None]:
IT_Col_NIRv_daily=interpolate_nirv(nirv_dict,'IT_Col_NIRv',NIRV_drop)

In [None]:
# NL-Loo

In [None]:
NL_Loo_NIRv=nirv_dict['NL_Loo_NIRv']

In [None]:
NL_Loo_NIRv_daily=interpolate_nirv(nirv_dict,'NL_Loo_NIRv',NIRV_drop)

In [None]:
NL_Loo_NIRv_daily.isna().sum()

In [None]:
#RU-Cok

RU_Cok_NIRv=nirv_dict['RU_Cok_NIRv']

In [None]:
RU_Cok_NIRv

In [None]:
RU_Cok_NIRv['date']=pd.to_datetime(RU_Cok_NIRv['date'])

In [None]:
RU_Cok_NIRv= RU_Cok_NIRv.sort_values(by='date')


In [None]:
RU_Cok_NIRv=RU_Cok_NIRv.dropna()

In [None]:
RU_Cok_NIRv=RU_Cok_NIRv.drop_duplicates(subset='date',keep='last')

In [None]:
RU_Cok_NIRv.set_index('date', inplace=True)


In [None]:
RU_Cok_NIRv = RU_Cok_NIRv[RU_Cok_NIRv.index >= '2010-01-01']
    

In [None]:
RU_Cok_NIRv_daily = RU_Cok_NIRv.resample('D').asfreq()

In [None]:
valid_data = RU_Cok_NIRv_daily.dropna(subset=['NIRv'])

dates = valid_data.index.map(pd.Timestamp.toordinal).values
nirv_values = valid_data['NIRv'].values

cs = CubicSpline(dates, nirv_values, extrapolate=True)
    
full_dates = RU_Cok_NIRv_daily.index.map(pd.Timestamp.toordinal)
RU_Cok_NIRv_daily['NIRv'] = cs(full_dates)
    
RU_Cok_NIRv_daily


In [None]:
RU_Cok_imp

In [None]:
RU_Cok_NIRv_daily.isna().sum()

In [None]:
### RU-Fyo ###

In [None]:
RU_Fyo_NIRv=nirv_dict['RU_Fyo_NIRv']

In [None]:
RU_Fyo_NIRv_daily=interpolate_nirv(nirv_dict,'RU_Fyo_NIRv',NIRV_drop)

In [None]:
### US-PFa ###

In [None]:
US_PFa_NIRv=nirv_dict['US_PFa_NIRv']

In [None]:
US_PFa_NIRv_daily=interpolate_nirv(nirv_dict,'RU_Fyo_NIRv',NIRV_drop)

In [None]:
### US-Var ###

In [None]:
US_Var_NIRv=nirv_dict['US_Var_NIRv']

In [None]:
US_Var_NIRv_daily=interpolate_nirv(nirv_dict,'US_Var_NIRv',NIRV_drop)

In [None]:
### ZA-Kru ###

In [None]:
ZA_Kru_NIRv=nirv_dict['ZA_Kru_NIRv']

In [None]:
ZA_Kru_NIRv_daily=interpolate_nirv(nirv_dict,'ZA_Kru_NIRv',NIRV_drop)

In [None]:
### now add the NIRv data to the final df

In [None]:
test=AR_Vir_imp.copy()

In [None]:
test

In [None]:
AR_Vir_imp['NIRv']=AR_Vir_NIRv_daily['NIRv']

In [None]:
cutoff=pd.to_datetime('2014-12-19')

In [None]:
# AR-Vir

AR_Vir_imp=AR_Vir[AR_Vir.index<=cutoff]

In [None]:
AR_Vir_imp

In [None]:
# AU_Dry

AU_Dry['NIRv']=AU_Dry_NIRv_daily['NIRv']

In [None]:
AU_Dry=AU_Dry[AU_Dry.index<=cutoff]

In [None]:
# BE-Vie

BE_Vie['NIRv']=BE_Vie_NIRv_daily['NIRv']

In [None]:
BE_Vie=BE_Vie[BE_Vie.index<=cutoff]

In [None]:
# CA-TP1

In [None]:
CA_TP1['NIRv']=CA_TP1_NIRv_daily['NIRv']

In [None]:
CA_TP1=CA_TP1[CA_TP1.index<=cutoff]

In [None]:
# CH-CHa

In [None]:
CH_Cha['NIRv']=CH_Cha_NIRv_daily['NIRv']

In [None]:
CH_Cha=CH_Cha[CH_Cha.index<=cutoff]

In [None]:
# DE-Gri

In [None]:
DE_Gri['NIRv']=DE_Gri_NIRv_daily['NIRv']

In [None]:
DE_Gri=DE_Gri[DE_Gri.index<=cutoff]

In [None]:
# FR_Pue

In [None]:
FR_Pue['NIRv']=FR_Pue_NIRv_daily['NIRv']

In [None]:
FR_Pue=FR_Pue[FR_Pue.index<=cutoff]

In [None]:
#GF_Guy

GF_Guy['NIRv']=GF_Guy_NIRv_daily['NIRv']

In [None]:
GF_Guy=GF_Guy[GF_Guy.index<=cutoff]

In [None]:
GF_Guy.isna().sum()

In [None]:
# IT-Col

In [None]:
IT_Col['NIRv']=IT_Col_NIRv_daily['NIRv']

In [None]:
IT_Col=IT_Col[IT_Col.index<=cutoff]

In [None]:
IT_Col.isna().sum()

In [None]:
# NL-Loo

In [None]:
NL_Loo

In [None]:
NL_Loo_imp['NIRv']=NL_Loo_NIRv_daily['NIRv']

In [None]:
NL_Loo_imp=NL_Loo[NL_Loo.index<=cutoff]

In [None]:
NL_Loo_imp.isna().sum()

In [None]:
# RU-Cok

In [None]:
RU_Cok_imp['NIRv']=RU_Cok_NIRv_daily['NIRv']

In [None]:
RU_Cok_imp=RU_Cok_imp[RU_Cok_imp.index<=cutoff]

In [None]:
RU_Cok_imp.isna().sum()

In [None]:
# RU_Fyo

In [None]:
RU_Fyo['NIRv']=RU_Fyo_NIRv_daily['NIRv']

In [None]:
RU_Fyo=RU_Fyo[RU_Fyo.index<=cutoff]

In [None]:
# US-PFA

In [None]:
US_PFa['NIRv']=US_PFa_NIRv_daily['NIRv']

In [None]:
US_PFa=US_PFa[US_PFa.index<=cutoff]

In [None]:
# US-Var

In [None]:
US_Var['NIRv']=US_Var_NIRv_daily['NIRv']

In [None]:
US_Var=US_Var[US_Var.index<=cutoff]

In [None]:
# ZA-Kru

In [None]:
ZA_Kru_imp['NIRv']=ZA_Kru_NIRv_daily['NIRv']

In [None]:
ZA_Kru_imp=ZA_Kru[ZA_Kru.index<=cutoff]

In [None]:
RU_Cok_imp=RU_Cok_imp.drop(columns='site_id')

In [None]:
US_Var.shape

In [None]:
# COMBINE INTO THE FINAL DF


final_df=pd.concat([AR_Vir_imp,AU_Dry,BE_Vie,CA_TP1,CH_Cha,DE_Gri,FR_Pue,GF_Guy,IT_Col,NL_Loo_imp,
                   RU_Cok_imp,RU_Fyo,US_PFa,US_Var,ZA_Kru_imp])

In [None]:
RU_Fyo['NIRv']=RU_Fyo['NIRv'].fillna(RU_Fyo['NIRv'].mean())

In [None]:
final_df.to_csv('/Users/abigailbase/PROJECT FILES/FINAL DFs/FINAL_FINAL_DF.csv',index=True)