In [None]:
# This notebook processes the downloaded CO2 data so it can be used for imputation

# Source: https://gml.noaa.gov/aftp//products/carbontracker/co2/CT2022/molefractions/

In [None]:
import netCDF4 as nc
import pandas as pd
import xarray as xr
import os
import numpy as np

In [None]:
# Load the table of selected sites; the first CSV column becomes the index.

sites = pd.read_csv(
    '/Users/abigailbase/PROJECT FILES/selected_sites.csv',
    index_col=0,
)

In [None]:
# Extract the latitude/longitude of every site.
#
# The sites table stores longitude under 'LONG', while the rest of the notebook
# reads lat_lon['LON'].  The rename mapping used to be defined but never
# applied, which made every later lat_lon['LON'] lookup raise KeyError.

rename = {'LONG': 'LON'}
lat_lon = sites[['LAT', 'LONG']].rename(columns=rename)

In [None]:
# Create a bounding box around the sites, used to shrink the CO2 table before
# the nearest-point search.
#
# The box is padded by one grid cell on every side.  Without padding, the grid
# point nearest to a site lying on the edge of the box can fall just outside
# the box and be filtered out, and a single site would give a zero-size box
# that matches no grid point at all.
# NOTE(review): the padding assumes the 3 deg (lon) x 2 deg (lat) grid suggested
# by "glb3x2" in the CarbonTracker file names -- confirm against the files.
GRID_STEP_LAT = 2.0
GRID_STEP_LON = 3.0

min_lat = lat_lon['LAT'].min() - GRID_STEP_LAT
max_lat = lat_lon['LAT'].max() + GRID_STEP_LAT
min_lon = lat_lon['LON'].min() - GRID_STEP_LON
max_lon = lat_lon['LON'].max() + GRID_STEP_LON

bounding_box = {
    'min_lat': min_lat,
    'max_lat': max_lat,
    'min_lon': min_lon,
    'max_lon': max_lon
}

In [None]:
# Site identifiers (the index of the sites table) as a NumPy array.
site_ids = sites.index.to_numpy()

In [None]:
# Single sample file, used to try the steps out before running them over the
# whole directory.

file_path = '/Users/abigailbase/PROJECT FILES/CO2/2014/CT2022.molefrac_components_glb3x2_2014-01-14.nc'

In [None]:
# Open the sample file read-only; it stays open because the following cells
# read from it.
ds = xr.open_dataset(file_path, mode='r')

In [None]:
# Show which variables the file contains (the 'co2' variable is extracted next).
print("Variables in the dataset:", ds.variables.keys())

In [None]:
# Pull the 'co2' variable out of the opened dataset.

co2 = ds['co2']

In [None]:
# Flatten the CO2 variable into a table and turn its index levels into
# ordinary columns.
df = (
    co2
    .to_dataframe()
    .reset_index()
)

In [None]:
# Keep only the rows whose 'level' value is 2.
df_lvl2 = df.loc[df['level'] == 2]

In [None]:
# Inspect the distinct timestamps in the level-2 rows (used to choose the time filter below).
df_lvl2['time'].unique()

In [None]:
# Keep only the 13:30 rows (the time step closest to midday).

filtered_df = df_lvl2.loc[
    (df_lvl2['time'].dt.hour == 13)
    & (df_lvl2['time'].dt.minute == 30)
]


In [None]:
# Column mapping from the dataset's coordinate names to the names used for the sites.
df_rename = {'latitude': 'LAT', 'longitude': 'LON'}

In [None]:
# Rename the coordinate columns on the frame that is filtered next.
# Renaming `df` here had no effect on `filtered_df`, which was derived from it
# earlier and therefore still carried 'latitude'/'longitude', so the
# 'LAT'/'LON' bounding-box filter raised KeyError.  rename() returns a new
# frame, which also keeps this cell safe to re-run.
filtered_df = filtered_df.rename(columns=df_rename)

In [None]:
## Restrict the CO2 table to the bounding box (both edges inclusive).

filtered_df = filtered_df[
    filtered_df['LAT'].between(bounding_box['min_lat'], bounding_box['max_lat'])
    & filtered_df['LON'].between(bounding_box['min_lon'], bounding_box['max_lon'])
]



In [None]:
### function for finding closest point

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Inputs may be scalars or array-likes; NumPy broadcasting applies, so a
    single point can be compared against whole arrays of coordinates.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : latitude / longitude of the second point(s), in degrees.

    Returns
    -------
    Distance(s) in kilometres, using an Earth radius of 6371 km.
    """
    earth_radius_km = 6371

    # Work in radians from here on.
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))

    half_dphi = (phi2 - phi1) / 2.0
    half_dlam = (lam2 - lam1) / 2.0

    # Haversine of the central angle between the two points.
    hav = np.sin(half_dphi)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return central_angle * earth_radius_km

# function to find closest lat/lon
def find_closest(df, sites):
    """Return, for every site, the row of ``df`` that lies nearest to it.

    Distance is the great-circle distance computed by ``haversine``.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate points; must have 'LAT' and 'LON' columns (degrees).
    sites : pandas.DataFrame
        Sites to match; must have 'LAT' and 'LON' columns (degrees).

    Returns
    -------
    pandas.DataFrame
        One row per site, in the order of ``sites``, holding the full ``df``
        row of the nearest point.  The index is reset to 0..n-1.

    Raises
    ------
    ValueError
        If there are sites to match but ``df`` has no rows.
    """
    if len(sites) == 0:
        return pd.DataFrame([]).reset_index(drop=True)
    if df.empty:
        raise ValueError("find_closest: `df` has no rows to match the sites against")

    # Loop-invariant column lookups.
    grid_lat = df['LAT']
    grid_lon = df['LON']

    closest_points = []
    for site_lat, site_lon in zip(sites['LAT'], sites['LON']):
        # haversine is built from NumPy ufuncs, so one call yields the distance
        # to every candidate point (a Series sharing df's index) instead of
        # one Python-level call per row.
        distances = haversine(site_lat, site_lon, grid_lat, grid_lon)

        # Label of the nearest point; the first one wins on ties.
        min_index = distances.idxmin()
        closest_points.append(df.loc[min_index])

    return pd.DataFrame(closest_points).reset_index(drop=True)



In [None]:
# Trial run on the single test file: nearest CO2 row for each site.
closest_points_df = find_closest(filtered_df, lat_lon)

In [None]:
# Label each matched row with its site; find_closest returns the rows in the
# same order as the sites it was given.
closest_points_df['site_id'] = site_ids

In [None]:
# Root folder that is searched (recursively) for .nc files.
directory = '/Users/abigailbase/PROJECT FILES/CO2/2014/'

In [None]:
# Collects one DataFrame of matched rows per processed file.
all_data = []

In [None]:
# Walk through the directory and, for every .nc file, extract the level-2 CO2
# value at 13:30 for the grid point nearest to each site.

# Start from an empty list on every run of this cell, so that re-running it
# does not append a second copy of every file's rows.
all_data = []

# Loop-invariant: coordinate names in the files -> names used for the sites.
df_rename = {'latitude': 'LAT', 'longitude': 'LON'}

for root, dirs, files in os.walk(directory):
    for file in files:
        if not file.endswith('.nc'):
            continue
        file_path = os.path.join(root, file)

        # The context manager closes the file as soon as the data has been
        # copied into a DataFrame; otherwise one handle per file stays open
        # for the rest of the session.
        with xr.open_dataset(file_path, mode='r') as ds:
            co2 = ds['co2']
            df = co2.to_dataframe().reset_index()

        df_lvl2 = df[df['level'] == 2]
        filtered_df = df_lvl2[(df_lvl2['time'].dt.hour == 13) & (df_lvl2['time'].dt.minute == 30)]

        # rename() returns a new frame; inplace=True on a filtered slice
        # triggers pandas' SettingWithCopyWarning.
        filtered_df = filtered_df.rename(columns=df_rename)
        filtered_df = filtered_df[
            (filtered_df['LAT'] >= bounding_box['min_lat']) &
            (filtered_df['LAT'] <= bounding_box['max_lat']) &
            (filtered_df['LON'] >= bounding_box['min_lon']) &
            (filtered_df['LON'] <= bounding_box['max_lon'])
        ]

        closest_points_df = find_closest(filtered_df, lat_lon)
        closest_points_df['site_id'] = site_ids
        all_data.append(closest_points_df)

# pd.concat([]) fails with an unhelpful message; say what actually went wrong.
if not all_data:
    raise ValueError(f"No .nc files found under {directory!r}")

# concat all dfs
final_df = pd.concat(all_data).reset_index(drop=True)


In [None]:
# Order the combined rows chronologically.
final_df = final_df.sort_values(by='time')

In [None]:
# Write the combined table to CSV (with to_csv defaults, the DataFrame index is
# written as the first column).
final_df.to_csv('/Users/abigailbase/PROJECT FILES/CO2 CSVs/co2_2014.csv')