In [None]:
# The first file will import the key data required for the study. 
# FLUXNET data is imported along with additional information on
# the sites. 

### Sources ###

# FLUXNET data: https://fluxnet.org
# Additional site data: https://fluxnet.org/sites/site-list-and-pages/



In [None]:
import os
import re
import pandas as pd
import seaborn as sns
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import calendar
from shapely.geometry import Point
from mpl_toolkits.basemap import Basemap
from pyrealm import pmodel
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


In [None]:
### Import the additional site information
### Columns (per the original note): SITE_ID, SITE_NAME, LAT, LONG

# NOTE(review): absolute, user-specific path; this cell only runs on a machine
# that has this exact directory layout.
site_key=pd.read_csv('/Users/abigailbase/PROJECT FILES/site_key.csv')

In [None]:
# Import the FLUXNET daily data for all sites
# NOTE(review): absolute, user-specific path; adjust before running elsewhere.

df=pd.read_csv('/Users/abigailbase/PROJECT FILES/FINAL DFs/final_df.csv')

In [None]:
### Assign hemisphere to the points in the df. Above the equator (0) 
### is Northern Hemisphere (NH) 


def assign_hemisphere(latitude):
    """Classify a latitude as Northern Hemisphere, Southern Hemisphere or Equator.

    Parameters
    ----------
    latitude : float
        Latitude value; positive means north of the equator, negative south.

    Returns
    -------
    str
        "Northern Hemisphere" when latitude > 0, "Southern Hemisphere" when
        latitude < 0, and "Equator" otherwise.
    """
    # Compare the argument itself. The previous body referenced the undefined
    # names LAT and LONG, so every call raised NameError (and the southern
    # branch would have tested a longitude instead of a latitude).
    if latitude > 0:
        return "Northern Hemisphere"
    elif latitude < 0:
        return "Southern Hemisphere"
    else:
        # Reached when latitude is exactly 0. A NaN latitude also ends up here,
        # because NaN compares False in both tests above; drop missing
        # latitudes before calling if that label would be misleading.
        return "Equator"

In [None]:
### Label every site in the site table with a hemisphere code

# 'NH' where LAT >= 0 (so a site exactly on the equator counts as NH), 'SH' otherwise
site_key['hemisphere']=np.where(site_key['LAT']>=0, 'NH', 'SH')

In [None]:
### Map of the sites ###


fig, ax = plt.subplots(figsize=(22, 14))

# World map in the 'cyl' projection spanning all latitudes and longitudes.
# The instance is called site_map, not `map`, so Python's builtin map() stays
# usable in every cell that runs after this one.
site_map = Basemap(projection='cyl', llcrnrlat=-90, urcrnrlat=90,
                   llcrnrlon=-180, urcrnrlon=180, resolution='c', ax=ax)

site_map.drawcoastlines()

# assign colors based on the hemisphere code stored in site_key['hemisphere']
colors = site_key['hemisphere'].map({
    'NH': 'red',
    'SH': 'blue'
})

# plot the sites
site_map.scatter(site_key['LONG'], site_key['LAT'], marker='v', c=colors, edgecolor='black', s=140)


# draw parallels (every 30 degrees) and meridians (every 60 degrees)
parallels = np.arange(-90., 91., 30.)
meridians = np.arange(-180., 181., 60.)
site_map.drawparallels(parallels, labels=[1, 0, 0, 0], linewidth=0.5, color='grey')
site_map.drawmeridians(meridians, labels=[0, 0, 0, 1], linewidth=0.5, color='grey')


# legend: empty scatter artists, created on ax explicitly rather than through
# the pyplot "current axes", exist only to supply the two legend handles
dummy_scatter_north = ax.scatter([], [], color='red', marker='v', s=200, edgecolors='black', label='Northern Hemisphere')
dummy_scatter_south = ax.scatter([], [], color='blue', marker='v', s=200, edgecolors='black', label='Southern Hemisphere')


legend = ax.legend(handles=[dummy_scatter_north, dummy_scatter_south], loc='lower left', fontsize=18)
legend.set_title("FLUXNET Sites", prop={'size': 20, 'weight': 'bold'})


# opaque legend box with a black outline
frame = legend.get_frame()
frame.set_edgecolor('black')
frame.set_linewidth(1.5)
frame.set_alpha(1)


plt.show()

In [None]:
### select the variables of interest ###

# Keep every row, but only the timestamp, the site identifier and the
# measurement columns listed below.
final_df=df.loc[:, [
    'TIMESTAMP', 'SITE_ID',
    'TA_F', 'PA_F', 'VPD_F', 'P_F', 'WS_F',
    'PPFD_IN', 'PPFD_OUT', 'CO2_F_MDS', 'TS_F_MDS_1',
    'SWC_F_MDS_1', 'NEE_VUT_REF', 'GPP_DT_VUT_REF',
]]

In [None]:
# preview the first rows of the selected columns
final_df.head()

In [None]:
# check how pandas parsed TIMESTAMP before any date handling
final_df['TIMESTAMP'].dtype #check data type = int64 (value noted by the author)

In [None]:
### investigate the dates to see the format

# Printing every unique timestamp floods the cell output, so summarise instead:
# first count the timestamps by number of characters (this exposes mixed
# formats directly), then show a small sample of the raw values.
print(final_df['TIMESTAMP'].astype(str).str.len().value_counts())
print(final_df['TIMESTAMP'].unique()[:20].tolist())


In [None]:
### Some data points have a timestamp in the format YYYYMM rather than
### YYYYMMDD, so these are separated out.

### Separate the data into 2 dfs, one for each datetime format.

def identify_format(date_str):
    """Classify a timestamp by its number of digits.

    Parameters
    ----------
    date_str : str or int
        Timestamp such as '20150131' (YYYYMMDD) or '201501' (YYYYMM).
        Non-string values are converted with ``str()`` first.

    Returns
    -------
    str
        'full_date' for 8 digits, 'month_year' for 6 digits, and 'invalid'
        for anything else (other lengths or non-digit characters).
    """
    # Accept the raw integer timestamps as well as their string form; calling
    # len() directly on an int raised TypeError.
    text = str(date_str).strip()

    # A value that is not purely ASCII digits cannot be a YYYYMMDD / YYYYMM
    # stamp, whatever its length.
    if not (text.isascii() and text.isdigit()):
        return 'invalid'

    if len(text) == 8:  # YYYYMMDD
        return 'full_date'
    elif len(text) == 6:  # YYYYMM
        return 'month_year'
    else:
        return 'invalid'

In [None]:
# work on an independent copy so final_df keeps its TIMESTAMP column unchanged
date_df=final_df.copy()

In [None]:
# convert the timestamp to string so identify_format can measure its length
date_df['TIMESTAMP'] = date_df['TIMESTAMP'].astype(str)

In [None]:
# display the converted column to check the string conversion
date_df['TIMESTAMP']

In [None]:
# helper column: tag each row with the timestamp format reported by
# identify_format ('full_date', 'month_year' or 'invalid')

date_df['date_type']=date_df['TIMESTAMP'].map(identify_format)

In [None]:
# keep only the rows flagged as a full date (YYYYMMDD); rows flagged
# 'month_year' or 'invalid' are not part of full_date
full_date=date_df[date_df['date_type']=='full_date']

In [None]:
# rows whose timestamp holds only year and month (YYYYMM)
# NOTE(review): month_year is not referenced again in the cells shown here.
month_year=date_df[date_df['date_type']=='month_year']

In [None]:
# preview the full-date rows
full_date.head()

In [None]:
# the format flag is no longer needed once the rows are split; remove it
full_date=full_date.drop('date_type', axis=1)

In [None]:
# cast the 8-character timestamp strings back to integers
full_date['TIMESTAMP']=full_date['TIMESTAMP'].astype(int)

In [None]:
# check the dtype after the integer cast
full_date['TIMESTAMP'].dtype

In [None]:
### convert the date to pandas datetime
# the explicit format tells pandas to read each value as YYYYMMDD

full_date['TIMESTAMP'] = pd.to_datetime(full_date['TIMESTAMP'], format='%Y%m%d')


In [None]:
# check the dtype after the datetime conversion
full_date['TIMESTAMP'].dtype

In [None]:
### split the parsed timestamp into year, month and day columns

# calendar year of each row
full_date['YEAR'] = full_date['TIMESTAMP'].dt.year


In [None]:
# month number (1-12) of each row
full_date['MONTH'] = full_date['TIMESTAMP'].dt.month

In [None]:
# day of the month of each row
full_date['DAY'] = full_date['TIMESTAMP'].dt.day

In [None]:
### the date now lives in YEAR / MONTH / DAY, so remove the timestamp column

full_date=full_date.drop('TIMESTAMP', axis=1)

In [None]:
# count missing values per column (author's note from their run: none found)
full_date.isna().sum()

In [None]:
# report (rows, columns) of the frame that is about to be saved
print(full_date.shape)

In [None]:
# now the final df with the timestamp separated out as year, month, day is saved
# and will be imported into the next file.
# NOTE(review): absolute, user-specific output path; index=False keeps the
# DataFrame index out of the CSV.


full_date.to_csv('/Users/abigailbase/PROJECT FILES/FINAL DFs/full_date.csv',index=False)