In [None]:
# This file contains the code for extracting and processing the SIF data 


# SOURCE: https://daac.ornl.gov/SIF-ESDR/guides/MetOpA_GOME2_SIF.html

In [None]:
import os
import pandas as pd
import numpy as np
import xarray as xr
from tqdm import tqdm
from sklearn.impute import KNNImputer


In [None]:
# Path of the single daily granule that the following cells open and inspect.
file = '/Users/abigailbase/Downloads/2010/NSIFv2.6.2.GOME-2A.20100105_all.nc'

In [None]:
# Open the NetCDF file as an xarray Dataset.
dataset = xr.open_dataset(file)

In [None]:
# Dataset variables that are carried through the rest of the notebook.
variables_of_interest = [
    'Daily_Averaged_SIF',
    'SIF_Uncertainty',
    'Cloud_Fraction',
    'Latitude',
    'Longitude',
]

In [None]:
# Subset the dataset to the variables listed in variables_of_interest.
selected_data = dataset[variables_of_interest]


In [None]:
# Flatten the selection into a table; reset_index() turns the resulting index
# into ordinary columns.
df = (
    selected_data
    .to_dataframe()
    .reset_index()
)

In [None]:
# Load the site table; the first CSV column becomes the DataFrame index.
sites = pd.read_csv(
    '/Users/abigailbase/PROJECT FILES/selected_sites.csv',
    index_col=0,
)

In [None]:
# Coordinates only: one (LAT, LONG) row per site.
lat_lon = sites.loc[:, ['LAT', 'LONG']]

In [None]:
# create bounding box to filter df further
# NOTE(review): the box is the exact min/max of the site coordinates, so for a
# site lying on an edge any observation just outside the box is excluded from
# the nearest-point search -- consider padding the limits.

min_lat, max_lat = lat_lon['LAT'].agg(['min', 'max'])
min_lon, max_lon = lat_lon['LONG'].agg(['min', 'max'])

bounding_box = dict(
    min_lat=min_lat,
    max_lat=max_lat,
    min_lon=min_lon,
    max_lon=max_lon,
)

In [None]:
# Keep only the observations inside the bounding box (both edges inclusive).
filtered_df = df[
    df['Latitude'].between(bounding_box['min_lat'], bounding_box['max_lat'])
    & df['Longitude'].between(bounding_box['min_lon'], bounding_box['max_lon'])
]


In [None]:
# functions to find points in SIF df that are closest to FLUXNET tower locations

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between points given in degrees.

    The arguments may be scalars or array-likes (NumPy arrays, pandas Series);
    the computation uses NumPy ufuncs only, so array inputs are handled
    element-wise.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point(s), decimal degrees.
    lat2, lon2 : latitude and longitude of the second point(s), decimal degrees.

    Returns
    -------
    Distance(s) in kilometres on a sphere of radius 6371 km.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt/arcsin into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # radius of Earth in kilometers
    return c * r

def find_closest(df, sites):
    """Return, for each site, the row of ``df`` nearest to that site.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations with ``'Latitude'`` and ``'Longitude'`` columns (degrees).
    sites : pandas.DataFrame
        One row per site with ``'LAT'`` and ``'LONG'`` columns (degrees).

    Returns
    -------
    pandas.DataFrame
        One row per site, in the same order as ``sites``, holding the columns
        of the closest observation. The index is reset to 0..n-1.

    Raises
    ------
    ValueError
        If there are sites to match but ``df`` has no rows.
    """
    if df.empty and not sites.empty:
        raise ValueError("find_closest: no observations to search (df is empty)")

    obs_lat = df['Latitude']
    obs_lon = df['Longitude']

    closest_points = []
    for site_lat, site_lon in zip(sites['LAT'], sites['LONG']):
        # haversine is built from NumPy ufuncs, so a single call on the whole
        # columns replaces one Python-level call per observation row.
        distances = haversine(site_lat, site_lon, obs_lat, obs_lon)
        closest_points.append(df.loc[distances.idxmin()])

    return pd.DataFrame(closest_points).reset_index(drop=True)


In [None]:
# Nearest observation in the filtered table for every site coordinate.
closest_points_df = find_closest(df=filtered_df, sites=lat_lon)

In [None]:
# Site identifiers are the index labels of the site table.
site_ids = sites.index.to_numpy()

In [None]:
# Label each matched row with its site (find_closest keeps the order of the sites).
closest_points_df['site_id'] = site_ids

In [None]:
# Remove the 'obs' column (presumably the observation index exposed by
# reset_index() -- TODO confirm). errors='ignore' matches the yearly loops
# further down and lets this cell be re-run once the column is already gone.
closest_points_df = closest_points_df.drop(columns='obs', errors='ignore')

In [None]:
# process each year of data and export to CSV

In [None]:
# Build the 2011 table: for every daily file keep the observation nearest to each site.
final_df = pd.DataFrame()  # reset first so a failed run cannot leave another year's rows behind

directory = '/Users/abigailbase/Downloads/2011'

# Get the list of all .nc files in the directory and its subdirectories.
# Sorted so the processing (and output row) order is reproducible; os.walk
# yields entries in arbitrary order.
files_to_process = sorted(os.path.join(root, file)
                          for root, dirs, files in os.walk(directory)
                          for file in files if file.endswith('.nc'))

# Select the variables of interest (same list for every file, so set once)
variables_of_interest = ['Daily_Averaged_SIF', 'SIF_Uncertainty', 'Cloud_Fraction', 'Latitude', 'Longitude']

# One frame per file, concatenated once after the loop instead of re-copying
# the growing result on every iteration.
per_file_frames = []

# Iterate over the files with a progress bar
for file_path in tqdm(files_to_process, desc="Processing files"):
    # Load the NetCDF file; the with-block closes it once the values have been
    # copied into a DataFrame, so open file handles do not accumulate.
    with xr.open_dataset(file_path) as ds:
        selected_data = ds[variables_of_interest]
        # Convert to a pandas DataFrame
        df = selected_data.to_dataframe().reset_index()

    # Filter the DataFrame using the bounding box
    filtered_df = df[
        (df['Latitude'] >= bounding_box['min_lat']) &
        (df['Latitude'] <= bounding_box['max_lat']) &
        (df['Longitude'] >= bounding_box['min_lon']) &
        (df['Longitude'] <= bounding_box['max_lon'])
    ]

    # Find the closest points
    closest_points_df = find_closest(filtered_df, sites)

    # Add site IDs (find_closest returns rows in the same order as `sites`)
    closest_points_df['site_id'] = sites.index.to_numpy()

    # Drop the 'obs' column if it exists
    closest_points_df = closest_points_df.drop(columns='obs', errors='ignore')

    per_file_frames.append(closest_points_df)

# Concatenate to the final DataFrame
if per_file_frames:
    final_df = pd.concat(per_file_frames, ignore_index=True)

In [None]:
# Write final_df to the 2011 CSV (the DataFrame index is written as the first column).
final_df.to_csv(path_or_buf='/Users/abigailbase/PROJECT FILES/SIF/SIF_2011.csv')

In [None]:
# Build the 2012 table: for every daily file keep the observation nearest to each site.
final_df = pd.DataFrame()  # reset first so a failed run cannot leave another year's rows behind

directory = '/Users/abigailbase/Downloads/2012'

# Get the list of all .nc files in the directory and its subdirectories.
# Sorted so the processing (and output row) order is reproducible; os.walk
# yields entries in arbitrary order.
files_to_process = sorted(os.path.join(root, file)
                          for root, dirs, files in os.walk(directory)
                          for file in files if file.endswith('.nc'))

# Select the variables of interest (same list for every file, so set once)
variables_of_interest = ['Daily_Averaged_SIF', 'SIF_Uncertainty', 'Cloud_Fraction', 'Latitude', 'Longitude']

# One frame per file, concatenated once after the loop instead of re-copying
# the growing result on every iteration.
per_file_frames = []

# Iterate over the files with a progress bar
for file_path in tqdm(files_to_process, desc="Processing files"):
    # Load the NetCDF file; the with-block closes it once the values have been
    # copied into a DataFrame, so open file handles do not accumulate.
    with xr.open_dataset(file_path) as ds:
        selected_data = ds[variables_of_interest]
        # Convert to a pandas DataFrame
        df = selected_data.to_dataframe().reset_index()

    # Filter the DataFrame using the bounding box
    filtered_df = df[
        (df['Latitude'] >= bounding_box['min_lat']) &
        (df['Latitude'] <= bounding_box['max_lat']) &
        (df['Longitude'] >= bounding_box['min_lon']) &
        (df['Longitude'] <= bounding_box['max_lon'])
    ]

    # Find the closest points
    closest_points_df = find_closest(filtered_df, sites)

    # Add site IDs (find_closest returns rows in the same order as `sites`)
    closest_points_df['site_id'] = sites.index.to_numpy()

    # Drop the 'obs' column if it exists
    closest_points_df = closest_points_df.drop(columns='obs', errors='ignore')

    per_file_frames.append(closest_points_df)

# Concatenate to the final DataFrame
if per_file_frames:
    final_df = pd.concat(per_file_frames, ignore_index=True)

In [None]:
# Write final_df to the 2012 CSV (the DataFrame index is written as the first column).
final_df.to_csv(path_or_buf='/Users/abigailbase/PROJECT FILES/SIF/SIF_2012.csv')

In [None]:
# Build the 2013 table: for every daily file keep the observation nearest to each site.
final_df = pd.DataFrame()  # reset first so a failed run cannot leave another year's rows behind

# Was '.../Downloads/201': a truncated year. The result of this cell is written
# to SIF_2013.csv, and os.walk on a missing directory silently yields nothing.
directory = '/Users/abigailbase/Downloads/2013'

# Get the list of all .nc files in the directory and its subdirectories.
# Sorted so the processing (and output row) order is reproducible; os.walk
# yields entries in arbitrary order.
files_to_process = sorted(os.path.join(root, file)
                          for root, dirs, files in os.walk(directory)
                          for file in files if file.endswith('.nc'))

# Select the variables of interest (same list for every file, so set once)
variables_of_interest = ['Daily_Averaged_SIF', 'SIF_Uncertainty', 'Cloud_Fraction', 'Latitude', 'Longitude']

# One frame per file, concatenated once after the loop instead of re-copying
# the growing result on every iteration.
per_file_frames = []

# Iterate over the files with a progress bar
for file_path in tqdm(files_to_process, desc="Processing files"):
    # Load the NetCDF file; the with-block closes it once the values have been
    # copied into a DataFrame, so open file handles do not accumulate.
    with xr.open_dataset(file_path) as ds:
        selected_data = ds[variables_of_interest]
        # Convert to a pandas DataFrame
        df = selected_data.to_dataframe().reset_index()

    # Filter the DataFrame using the bounding box
    filtered_df = df[
        (df['Latitude'] >= bounding_box['min_lat']) &
        (df['Latitude'] <= bounding_box['max_lat']) &
        (df['Longitude'] >= bounding_box['min_lon']) &
        (df['Longitude'] <= bounding_box['max_lon'])
    ]

    # Find the closest points
    closest_points_df = find_closest(filtered_df, sites)

    # Add site IDs (find_closest returns rows in the same order as `sites`)
    closest_points_df['site_id'] = sites.index.to_numpy()

    # Drop the 'obs' column if it exists
    closest_points_df = closest_points_df.drop(columns='obs', errors='ignore')

    per_file_frames.append(closest_points_df)

# Concatenate to the final DataFrame
if per_file_frames:
    final_df = pd.concat(per_file_frames, ignore_index=True)

In [None]:
# Write final_df to the 2013 CSV (the DataFrame index is written as the first column).
final_df.to_csv(path_or_buf='/Users/abigailbase/PROJECT FILES/SIF/SIF_2013.csv')

In [None]:
import os
import pandas as pd
import xarray as xr
from tqdm import tqdm

# Build the 2014 table: for every daily file keep the observation nearest to each site.
final_df = pd.DataFrame()  # reset first so a failed run cannot leave another year's rows behind

directory = '/Users/abigailbase/Downloads/2014'

# Get the list of all .nc files in the directory and its subdirectories.
# Sorted so the processing (and output row) order is reproducible; os.walk
# yields entries in arbitrary order.
files_to_process = sorted(os.path.join(root, file)
                          for root, dirs, files in os.walk(directory)
                          for file in files if file.endswith('.nc'))

# Select the variables of interest (same list for every file, so set once)
variables_of_interest = ['Daily_Averaged_SIF', 'SIF_Uncertainty', 'Cloud_Fraction', 'Latitude', 'Longitude']

# One frame per file, concatenated once after the loop instead of re-copying
# the growing result on every iteration.
per_file_frames = []

# Iterate over the files with a progress bar
for file_path in tqdm(files_to_process, desc="Processing files"):
    # Print the current file being processed
    print(f"Processing file: {file_path}")

    # Load the NetCDF file; the with-block closes it once the values have been
    # copied into a DataFrame, so open file handles do not accumulate.
    with xr.open_dataset(file_path) as ds:
        selected_data = ds[variables_of_interest]
        # Convert to a pandas DataFrame
        df = selected_data.to_dataframe().reset_index()

    # Filter the DataFrame using the bounding box
    filtered_df = df[
        (df['Latitude'] >= bounding_box['min_lat']) &
        (df['Latitude'] <= bounding_box['max_lat']) &
        (df['Longitude'] >= bounding_box['min_lon']) &
        (df['Longitude'] <= bounding_box['max_lon'])
    ]

    # Find the closest points
    closest_points_df = find_closest(filtered_df, sites)

    # Add site IDs (find_closest returns rows in the same order as `sites`)
    closest_points_df['site_id'] = sites.index.to_numpy()

    # Drop the 'obs' column if it exists
    closest_points_df = closest_points_df.drop(columns='obs', errors='ignore')

    per_file_frames.append(closest_points_df)

# Concatenate to the final DataFrame
if per_file_frames:
    final_df = pd.concat(per_file_frames, ignore_index=True)


In [None]:
# Write final_df to the 2014 CSV (the DataFrame index is written as the first column).
final_df.to_csv(path_or_buf='/Users/abigailbase/PROJECT FILES/SIF/SIF_2014.csv')

In [None]:
### import the csvs

In [None]:
# Load the yearly CSVs; column 0 is the index that to_csv wrote out.
# NOTE(review): no cell visible in this notebook writes SIF_2010.csv -- confirm
# where that file is produced.
df_2010, df_2011, df_2012, df_2013, df_2014 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '/Users/abigailbase/PROJECT FILES/SIF/SIF_2010.csv',
        '/Users/abigailbase/PROJECT FILES/SIF/SIF_2011.csv',
        '/Users/abigailbase/PROJECT FILES/SIF/SIF_2012.csv',
        '/Users/abigailbase/PROJECT FILES/SIF/SIF_2013.csv',
        '/Users/abigailbase/PROJECT FILES/SIF/SIF_2014.csv',
    )
)

In [None]:
# convert dates to datetime

# read_csv leaves Delta_Time as text, so it has to be parsed before the .dt
# accessor can be used (calling .dt on the unparsed column raises
# AttributeError). normalize() floors every timestamp to midnight, i.e. keeps
# only the calendar date while the column stays datetime64 for all years.
for yearly_df in (df_2010, df_2011, df_2012, df_2013, df_2014):
    yearly_df['Delta_Time'] = pd.to_datetime(yearly_df['Delta_Time']).dt.normalize()


In [None]:
# Stack the yearly tables row-wise into one frame; each table keeps its own
# index labels.
df = pd.concat(
    [df_2010, df_2011, df_2012, df_2013, df_2014],
    axis='index',
)

In [None]:
# convert date: make Delta_Time a datetime column
df['Delta_Time'] = df['Delta_Time'].pipe(pd.to_datetime)

In [None]:
# The SIF record has to end on 2014-12-19.
cutoff = pd.Timestamp('2014-12-19')

In [None]:
# Repeats the earlier conversion of Delta_Time to datetime.
df['Delta_Time'] = pd.to_datetime(arg=df['Delta_Time'])

In [None]:
# cut off data at 2014-12-19 (rows dated on the cutoff day are kept)
df = df.loc[df['Delta_Time'].le(cutoff)]

In [None]:
# Every calendar day of the study period, both end dates included.
all_dates = pd.date_range('2010-01-01', '2014-12-19', freq='D')

In [None]:
# Distinct site identifiers, in order of first appearance.
all_sites = pd.unique(df['site_id'])


In [None]:
# identify any dates with missing values

# Per site: the days of all_dates for which that site has no row.
gaps_per_site = (
    (site, all_dates.difference(df.loc[df['site_id'] == site, 'Delta_Time']))
    for site in df['site_id'].unique()
)
# Only sites with at least one missing day are recorded.
missing_dates = {site: gaps for site, gaps in gaps_per_site if len(gaps) > 0}

missing_dates

In [None]:
#identify any duplicate rows

# `merged_df` is only created by the later merge cell, so on a fresh kernel
# this cell would raise NameError. Seed it from the concatenated frame `df`.
# The index is reset because the concatenated yearly tables repeat index
# labels, and the following cells drop rows by label.
merged_df = df.reset_index(drop=True)

# .copy() so that columns added to `duplicates` later are written to an
# independent frame rather than to a slice of merged_df.
duplicates = merged_df[merged_df.duplicated(['site_id', 'Delta_Time'], keep=False)].copy()

In [None]:
# Reference latitude used to rank duplicate rows (presumably the tower
# latitude of the affected site -- TODO confirm).
lat_target = -15.2588

In [None]:
# Reference longitude used to rank duplicate rows (presumably the tower
# longitude of the affected site -- TODO confirm).
lon_target = 132.3706

In [None]:
# Absolute offsets of each duplicate row from the target coordinates.
# assign() builds a new frame, so nothing is written into what may be a slice
# of another DataFrame (plain column assignment on such a slice triggers
# pandas' SettingWithCopyWarning).
duplicates = duplicates.assign(
    lat_diff=(duplicates['Latitude'] - lat_target).abs(),
    lon_diff=(duplicates['Longitude'] - lon_target).abs(),
)


In [None]:
# Combined offset (sum of the latitude and longitude differences) used to rank
# the duplicates. assign() returns a new frame instead of writing into what
# may be a slice of another DataFrame.
duplicates = duplicates.assign(
    total_diff=duplicates['lat_diff'] + duplicates['lon_diff'],
)


In [None]:
# Order the duplicate rows chronologically.
dup_sorted = duplicates.sort_values('Delta_Time')

In [None]:
# Within each date, put the row with the smallest total_diff first.
dup_sorted = dup_sorted.sort_values(['Delta_Time', 'total_diff'])


In [None]:
# Keep the first (smallest total_diff) row of every duplicated key. Duplicates
# were identified per (site_id, Delta_Time), so de-duplicate on the same pair:
# keying on Delta_Time alone would discard rows of other sites that share a
# date. When only one site has duplicates the result is unchanged.
df_closest = dup_sorted.drop_duplicates(subset=['site_id', 'Delta_Time'], keep='first')


In [None]:
# Index labels of the rows in df_closest, i.e. the duplicate rows to keep.
indices_to_keep = df_closest.index


In [None]:
# Labels of the duplicate rows that are not being kept.
indices_to_drop = dup_sorted.index.difference(other=indices_to_keep)

In [None]:
# Trial run: drop only the first flagged label.
df_cleaned_test = merged_df.drop(labels=indices_to_drop[0], axis=0)


In [None]:
## drop the duplicates: remove every flagged row label in one call

df_cleaned = merged_df.drop(labels=indices_to_drop, axis='index')


In [None]:
# Start again from an independent copy of merged_df.
df_cleaned = merged_df.copy(deep=True)


In [None]:
# Drop all flagged labels in a single call; removing them one at a time in a
# loop copies the whole frame once per label for the same result.
df_cleaned = df_cleaned.drop(index=indices_to_drop)


In [None]:
# Full (date, site) grid: every day paired with every site.
all_combinations = pd.MultiIndex.from_product(
    [all_dates, all_sites],
    names=['Delta_Time', 'site_id'],
)


In [None]:
# The grid as a two-column table (Delta_Time, site_id) with a default index.
all_combinations_df = all_combinations.to_frame(index=False)


In [None]:
# Left-join the cleaned data onto the full grid; (date, site) pairs without
# data remain as rows of missing values.
merged_df = all_combinations_df.merge(
    df_cleaned,
    how='left',
    on=['site_id', 'Delta_Time'],
)


In [None]:
# Display the (rows, columns) shape of the merged frame.
merged_df.shape

In [None]:
# The coordinates are not needed beyond this point.
merged_df = merged_df.drop(['Latitude', 'Longitude'], axis='columns')

In [None]:
# impute missing with knn imputer
# n_neighbors=10: each missing value is estimated from the 10 nearest rows.

knn_imputer = KNNImputer(n_neighbors=10)

In [None]:
# Columns at positions 2, 3 and 4 of merged_df are the ones to impute.
columns_to_impute = merged_df.columns.take([2, 3, 4])


In [None]:
# Fit the imputer on the selected columns and wrap the resulting array in a
# frame that carries merged_df's index and the original column labels.
imputed_columns = pd.DataFrame(
    knn_imputer.fit_transform(merged_df[columns_to_impute]),
    index=merged_df.index,
    columns=columns_to_impute,
)


In [None]:
# Chained assignment written out: merged_df_imp is bound to the imputed
# columns themselves (not to the whole frame), then those columns overwrite
# the originals inside merged_df.
merged_df_imp = imputed_columns
merged_df[columns_to_impute] = imputed_columns


In [None]:
# Move the current index into a regular column before splitting by site.
merged_df_split = merged_df.reset_index(drop=False)

In [None]:
# One frame per site: the rows of merged_df_split whose site_id equals the code.
(
    AR_Vir,  # 1
    AU_Dry,  # 2
    BE_Vie,  # 3
    CA_TP1,  # 4
    CH_Cha,  # 5
    DE_Gri,  # 6
    FR_Pue,  # 7
    GF_Guy,  # 8
    IT_Col,  # 9
    NL_Loo,  # 10
    RU_Cok,  # 11
    RU_Fyo,  # 12
    US_PFa,  # 13
    US_Var,  # 14
    ZA_Kru,  # 15
) = (
    merged_df_split[merged_df_split['site_id'] == site_code]
    for site_code in (
        'AR-Vir', 'AU-Dry', 'BE-Vie', 'CA-TP1', 'CH-Cha',
        'DE-Gri', 'FR-Pue', 'GF-Guy', 'IT-Col', 'NL-Loo',
        'RU-Cok', 'RU-Fyo', 'US-PFa', 'US-Var', 'ZA-Kru',
    )
)


In [None]:
# Move the key columns into the index so only the remaining columns are imputed.
AR_Vir_knn = AR_Vir.set_index(keys=['Delta_Time', 'site_id'])

In [None]:
# Keep the (Delta_Time, site_id) index for later re-use.
AR_Vir_knn_index = AR_Vir_knn.index

In [None]:
# Impute this site's columns; the result gets a fresh default index.
AR_Vir_knn = pd.DataFrame(
    data=knn_imputer.fit_transform(AR_Vir_knn),
    columns=AR_Vir_knn.columns,
)

In [None]:
# Re-attach the saved (Delta_Time, site_id) index to the imputed frame.
# set_index(..., inplace=True) returns None, so assigning its result replaced
# the frame with None; it also passed the frame's own index instead of the
# saved one.
AR_Vir_knn.index = AR_Vir_knn_index

In [None]:
# KNNImputer only accepts numeric input, and merged_df also carries the
# Delta_Time and site_id key columns, so impute the numeric columns only and
# leave the rest of the frame as it is.
numeric_columns = merged_df.select_dtypes(include='number').columns
knn_imputed_df = merged_df.copy()
knn_imputed_df[numeric_columns] = knn_imputer.fit_transform(merged_df[numeric_columns])

In [None]:
# export final to df
# NOTE(review): this writes df_cleaned (the de-duplicated rows), not the
# imputed merged_df -- confirm which frame is meant to be the final output.

df_cleaned.to_csv(path_or_buf='/Users/abigailbase/PROJECT FILES/SIF/sif_df.csv')