In [1]:
import glob
import os
import warnings

import numpy as np
import pandas as pd
import xarray as xr
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

warnings.simplefilter(action='ignore', category=FutureWarning) # FutureWarning removal
warnings.simplefilter(action='ignore', category=RuntimeWarning) # RuntimeWarning removal

In [2]:
def sort_season(var1,yrs,yre,season):
    """
    Select the records of one meteorological season for every year in a range.

    Parameters:
    - var1 (pd.Series): Series indexed by a sorted DatetimeIndex; it is sliced
      with date strings, so every record falling on a boundary day is kept.
    - yrs, yre (int or str): First and last calendar year (both inclusive).
    - season (str): 'MOM' (March-May; 'MAM' is accepted as an alias), 'JJA',
      'SON' or 'DJF'. For 'DJF' the January-February and December records of
      the SAME calendar year are returned, in that order.

    Returns:
    - pd.DataFrame: One column (named after ``var1``) holding the selected
      records, year after year, with the original time index preserved.

    Raises:
    - ValueError: If ``season`` is not one of the codes above.
    """
    # (start, end) month-day windows per season; None marks the end of
    # February, which is resolved per year so that Feb 29 is not lost.
    season_windows = {
        'MOM': [('03-01', '05-31')],   # Spring (N), Fall (S)
        'MAM': [('03-01', '05-31')],   # alias of 'MOM'
        'JJA': [('06-01', '08-31')],   # Summer (N), Winter (S)
        'SON': [('09-01', '11-30')],   # Fall (N), Spring (S)
        'DJF': [('01-01', None), ('12-01', '12-31')],  # Winter (N), Summer (S)
    }
    if season not in season_windows:
        raise ValueError("unknown season code: " + repr(season))

    pieces = []
    for yr in range(int(yrs), int(yre) + 1):
        for start, end in season_windows[season]:
            if end is None:
                # 28 or 29 depending on whether yr is a leap year
                feb_last = pd.Timestamp(year=yr, month=2, day=1).days_in_month
                end = '02-' + str(feb_last)
            chunk = var1[str(yr) + '-' + start:str(yr) + '-' + end]
            pieces.append(pd.DataFrame(chunk))

    if not pieces:
        # Empty year range: return an empty frame with the expected column.
        return pd.DataFrame(var1.iloc[0:0])

    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(pieces, axis=0)

In [3]:
# Root folder of the FLUXNET FULLSET csv files. Paths are built by plain string
# concatenation below, so the trailing '/' is required.
dir = '/glade/derecho/scratch/sungyoon/data/FLUXNET/FULLSET/'

# All hourly site files found in that folder, in sorted path order.
site_list = glob.glob(dir+'*_HR_CI_QC.csv')
site_list.sort()

In [6]:
# Season labels in processing order; each label is translated to a calendar
# month code (see sort_season) according to the hemisphere of the site.
ind_seasons = ['spring','summer','fall','winter']
north_seasons = {'spring': 'MOM', 'summer': 'JJA', 'fall': 'SON', 'winter': 'DJF'}
south_seasons = {'spring': 'SON', 'summer': 'DJF', 'fall': 'MOM', 'winter': 'JJA'}

# PCA input name -> column of the hourly site file. The dict order fixes the
# column order of the PCA input matrix, so do not reorder it.
flux_columns = {
    'SWC':  'SWC_F_MDS_1',   # land
    'LE':   'LE_F_MDS',      # moist
    'H':    'H_F_MDS',       # thermal
    'RADN': 'NETRAD_DV',     # radiation
}

# A site/season is analysed only when it has MORE than this many days on which
# all four variables are available.
min_valid_days = 90

# Output folder; created up front so the first to_csv call cannot fail on a
# fresh checkout.
dirout = './data/'
os.makedirs(dirout, exist_ok=True)


def smooth_5day(values):
    """
    Centered 5-sample running mean that keeps the length of the input.

    Interior samples come from a width-5 boxcar (np.convolve, mode='valid').
    The two leading samples are the means of the first 3 and first 4 values,
    the two trailing samples the means of the last 4 and last 3 values.

    Parameters:
    - values (array): 1-D array with at least 5 samples.

    Returns:
    - list: Smoothed samples, same length as ``values``.
    """
    window = 5
    core = np.convolve(values, np.ones(window)/window, mode='valid')
    head = [np.mean(values[:3]), np.mean(values[:4])]
    tail = [np.mean(values[-4:]), np.mean(values[-3:])]
    return head + list(core) + tail


# Function to perform PCA after standardization
def perform_pca(data_dict, n_components=2):
    """
    Standardizes the dataset and applies PCA.

    Parameters:
    - data_dict (dict): Dictionary of selected features (keys = column names, values = data arrays).
    - n_components (int): Number of PCA components to retain.

    Returns:
    - pca_result (array): Transformed PCA array.
    - explained_variance (array): Explained variance ratio.
    """
    df = pd.DataFrame(data_dict)  # Convert to DataFrame
    scaled_data = StandardScaler().fit_transform(df.values)  # Standardize
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(scaled_data)
    return pca_result, pca.explained_variance_ratio_


for ind in range(0,4):
    for site in site_list:
        # Site id = file name up to the first underscore. Taken from the base
        # name so it does not depend on how deep the data directory is.
        f_siteid = os.path.basename(site).split("_")[0]

        # open hourly flux site observations
        df1 = pd.read_csv(site,na_values=-9999)
        tm  = df1['time']
        # Positional access (.iloc): integer keys stop meaning "position" once
        # the frame carries a DatetimeIndex.
        time1 = pd.date_range(tm.iloc[0], tm.iloc[-1], freq='1H')
        df1.index = pd.to_datetime(time1)
        yrs = str(tm.iloc[0])[:4]    # first year of the record
        yre = str(tm.iloc[-1])[:4]   # last year of the record

        # create daily means
        daily = {name: df1[col].resample('1D').mean()
                 for name, col in flux_columns.items()}

        df_site = pd.read_csv((dir+f_siteid+'_site.csv'),sep=',')
        lat     = df_site['Latitude'].item()

        # lat > 0 is Northern Hemisphere; everything else is treated as Southern
        seasons = north_seasons if lat > 0 else south_seasons
        season  = seasons[ind_seasons[ind]]

        seasonal = {name: sort_season(daily[name],yrs,yre,season)[col]
                    for name, col in flux_columns.items()}

        # keep only the days on which every variable is available
        nan_sort = ~np.logical_or.reduce([np.isnan(series) for series in seasonal.values()])
        tm_indx  = seasonal['SWC'].index[nan_sort]

        if len(tm_indx) <= min_valid_days:
            continue

        print(yrs,yre,lat, f_siteid)

        # 5-day smoothing of each variable on the retained days
        components = {name: smooth_5day(series[nan_sort].values)
                      for name, series in seasonal.items()}

        # Define datasets for different PCA variations
        datasets = {
            "ALL": components,                        # All components included
            "NoSWC": {k: v for k, v in components.items() if k != "SWC"},
            "NoLE":  {k: v for k, v in components.items() if k != "LE"},
            "NoH":   {k: v for k, v in components.items() if k != "H"},
            "NoRAD": {k: v for k, v in components.items() if k != "RADN"}
        }

        pca_results     = {key: perform_pca(data) for key, data in datasets.items()}
        pca_transformed = {key: result[0] for key, result in pca_results.items()}
        pca_ratio       = {key: result[1] for key, result in pca_results.items()}

        suffix = '_'+ind_seasons[ind]+'_5dys.csv'
        for key in datasets:
            # PC scores, one row per retained day
            dfpc  = pd.DataFrame(data=pca_transformed[key], columns = ['PC1','PC2'], index=tm_indx)
            # explained variance ratio, a single row
            dfpct = pd.DataFrame(data=[pca_ratio[key]], columns = ['PC1','PC2'])

            dfpc.to_csv(dirout+f_siteid+'_PCA_'+key+suffix,float_format='%.4g')
            dfpct.to_csv(dirout+f_siteid+'_PCApct_'+key+suffix,float_format='%.4g',index=False)


print('done')


2002 2012 47.11667 AT-Neu
2010 2014 -22.283 AU-ASM
2007 2009 -13.0769 AU-Ade
2010 2014 -34.0021 AU-Cpr
2007 2013 -14.0633 AU-DaP
2008 2014 -14.1593 AU-DaS
2008 2014 -15.2588 AU-Dry
2011 2013 -23.8587 AU-Emr
2006 2008 -12.5452 AU-Fog
2013 2014 -30.1913 AU-GWW
2011 2014 -31.3764 AU-Gin
2001 2014 -12.4943 AU-How
2008 2009 -34.4704 AU-Lox
2011 2013 -14.5636 AU-RDF
2011 2014 -36.6499 AU-Rig
2014 2014 -17.1175 AU-Rob
2008 2014 -17.1507 AU-Stp
2012 2014 -22.287 AU-TTE
2001 2014 -35.6566 AU-Tum
2005 2008 -37.4259 AU-Wac
2011 2014 -36.6732 AU-Whr
2010 2014 -37.4222 AU-Wom
2012 2014 -34.9893 AU-Ync
2004 2018 50.55162 BE-Lon
1996 2018 50.30493 BE-Vie
2000 2004 -3.01803 BR-Sa3
2011 2015 52.695 CA-ARB
2011 2015 52.7008 CA-ARF
1996 2010 49.8673 CA-Ca1
1999 2010 49.8705 CA-Ca2
2000 2016 49.5346 CA-Ca3
1994 2018 44.3167 CA-Cbo
2003 2014 48.2167 CA-Gro
1998 2008 49.7093 CA-Let
2001 2005 55.87917 CA-NS1
2001 2005 55.90583 CA-NS2
2001 2005 55.91167 CA-NS3
2001 2005 55.91437 CA-NS4
2001 2005 55.91667 CA-N