# OCO-2 Data Product Merging

### Author: Jon Hobbs
### Date Authored: 05-01-24

This notebook merges key variables from multiple product streams for the OCO-2 mission. The source products include

* Level 2 lite products
* Level 2 standard products





## Import Libraries

In [1]:
import earthaccess

## Search CMR Catalogs using earthaccess and Obtain S3 URLs

### Search for OCO-2 Products

Request the desired OCO-2 products for the specified date range and product version. Here the search is by the product's short name. The [OCO-2 "lite" CO2 product](https://doi.org/10.5067/8E4VLCK16O6Q) is used for this analysis.

In [2]:
# Collection identifiers and the time window used for the Lite search
short_name, version = 'OCO2_L2_Lite_FP', '11.1r'
start_time, end_time = '2020-07-05', '2020-07-07'

# Gather the search criteria in one place, then query for cloud-hosted granules
lite_query = dict(
    short_name=short_name,
    version=version,
    cloud_hosted=True,
    temporal=(start_time, end_time),
)
results = earthaccess.search_data(**lite_query)

Granules found: 4


In [3]:
# Keep the first direct-access link reported for each granule in the search results
s3_urls_v11 = [hit.data_links(access="direct")[0] for hit in results]
s3_urls_v11

['s3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Lite_FP.11.1r/2020/oco2_LtCO2_200704_B11100Ar_230603215457s.nc4',
 's3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Lite_FP.11.1r/2020/oco2_LtCO2_200705_B11100Ar_230603215543s.nc4',
 's3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Lite_FP.11.1r/2020/oco2_LtCO2_200706_B11100Ar_230603215547s.nc4',
 's3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Lite_FP.11.1r/2020/oco2_LtCO2_200707_B11100Ar_230603215704s.nc4']

## Open files and aggregate

In [4]:
from netrc import netrc
from subprocess import Popen
from platform import system
from getpass import getpass
import os
import requests
import xarray as xr
import s3fs
import boto3

### Get S3 Token

This step requires a valid `.netrc` file that stores your Earthdata Login credentials. The cell below uses that file to log in.

In [5]:
# Log in to Earthdata with the "netrc" strategy (credentials come from the .netrc file)
auth = earthaccess.login(strategy="netrc")

In [6]:
daac = 'GES_DISC'
# NOTE(review): temp_s3_credentials is not read by any cell visible here; kept in
# case later cells rely on it.
temp_s3_credentials = earthaccess.get_s3_credentials(daac)

# Define a function for S3 access credentials (uses earthaccess function)

def begin_s3_direct_access(url: str=daac):
    """Build an S3 file system object from temporary earthaccess credentials.

    Parameters
    ----------
    url : str
        Identifier passed to ``earthaccess.get_s3_credentials``. Defaults to
        the module-level ``daac`` value, so calling without arguments behaves
        as before.

    Returns
    -------
    s3fs.S3FileSystem
        File system configured with the access key, secret key and session
        token found in the credential response.
    """
    # Use the argument rather than the global so a caller can request another provider
    response = earthaccess.get_s3_credentials(url)
    return s3fs.S3FileSystem(key=response['accessKeyId'],
                             secret=response['secretAccessKey'],
                             token=response['sessionToken'],
                             # presumably the region that hosts the bucket -- confirm
                             client_kwargs={'region_name':'us-west-2'})

fs = begin_s3_direct_access()

In [7]:
type(fs)

s3fs.core.S3FileSystem

By calling `fs.info()` with S3FS, we can see some of the metadata assigned to files uploaded to S3 buckets.

In [8]:
fs.info(s3_urls_v11[1])

{'ETag': '"9aa0b86e018a6c7c8e9785d41cdd5ed5-1"',
 'LastModified': datetime.datetime(2023, 6, 29, 20, 21, 23, tzinfo=tzutc()),
 'size': 64686781,
 'name': 'gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Lite_FP.11.1r/2020/oco2_LtCO2_200705_B11100Ar_230603215543s.nc4',
 'type': 'file',
 'StorageClass': 'STANDARD',
 'VersionId': None,
 'ContentType': 'binary/octet-stream'}

In [9]:
### Access dataset

import pandas
import numpy

# Every read below comes from the same Lite granule (second entry of the URL list)
lite_url = s3_urls_v11[1]

# Root group: sounding IDs, geolocation, XCO2 and its quality flag.
# Each variable is pulled into memory with .values while the file is still open,
# and the context managers close the dataset and the S3 handle even on error.
with fs.open(lite_url) as ltfile, xr.open_dataset(ltfile, decode_cf=True) as ltnc:
    ltsdg = ltnc.sounding_id.values
    ltlat = ltnc.latitude.values
    ltlon = ltnc.longitude.values
    ltxco2 = ltnc.xco2.values
    # .values added: the flag was previously kept as a lazy DataArray and only
    # read after the dataset had been closed
    ltflg = ltnc.xco2_quality_flag.values

# Surface type: 0=water, 1=land
with fs.open(lite_url) as ltfile, \
        xr.open_dataset(ltfile, decode_cf=True, group="Retrieval") as ltncrtr:
    sfctp = ltncrtr.surface_type.values

# Orbit info
with fs.open(lite_url) as ltfile, \
        xr.open_dataset(ltfile, decode_cf=True, group="Sounding") as ltncsdg:
    orbit = ltncsdg.orbit.values

ltfrm = pandas.DataFrame({'SoundingID': ltsdg, 'Orbit': orbit, 'SfcType':sfctp,
                          'Latitude': ltlat, 'Longitude': ltlon, 'XCO2': ltxco2, 'V11QFlag': ltflg})
# Sounding ID with its last three digits dropped (integer part of ID / 1000)
ltfrm['Sdg10s'] = numpy.floor(ltfrm['SoundingID'] / 1.0e3)


In [10]:
# Store the flag, surface-type and orbit columns with compact integer dtypes
ltfrm = ltfrm.astype({
    'SfcType': numpy.int16,
    'Orbit': numpy.int32,
    'V11QFlag': numpy.int16,
})
ltfrm.iloc[1580:1590]

Unnamed: 0,SoundingID,Orbit,SfcType,Latitude,Longitude,XCO2,V11QFlag,Sdg10s
1580,2020071000000000.0,31961,0,-16.293089,-167.911102,410.25238,0,2020071000000.0
1581,2020071000000000.0,31961,0,-16.290752,-167.926483,412.038422,0,2020071000000.0
1582,2020071000000000.0,31961,0,-16.288286,-167.941956,411.092804,0,2020071000000.0
1583,2020071000000000.0,31961,0,-16.285707,-167.957321,410.759308,0,2020071000000.0
1584,2020071000000000.0,31961,0,-16.283003,-167.972763,410.616028,0,2020071000000.0
1585,2020071000000000.0,31961,0,-16.280186,-167.98822,411.209045,0,2020071000000.0
1586,2020071000000000.0,31961,0,-16.27726,-168.003693,411.726562,0,2020071000000.0
1587,2020071000000000.0,31961,0,-16.277039,-167.899597,410.083405,1,2020071000000.0
1588,2020071000000000.0,31961,0,-16.274851,-167.914871,409.861877,1,2020071000000.0
1589,2020071000000000.0,31961,0,-16.272524,-167.930252,410.692932,0,2020071000000.0


### Group by Orbit

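Restrict the soundings to water surfaces, group them by orbit, and summarize each orbit with its sample count and the medians of XCO2, latitude, and longitude.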

In [11]:
def qsummary(df,grpvr,vrlst):
    """Summarize a data frame with a row count and per-variable medians.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to summarize (one group when called through ``groupby().apply``).
    grpvr : str
        Name of the grouping variable. Accepted for the caller's convenience;
        the body does not read it.
    vrlst : list of str
        Columns of ``df`` for which a median is computed.

    Returns
    -------
    pandas.DataFrame
        Single-row frame with ``NSmp`` (number of rows in ``df``) and one
        ``<name>_Med`` column per entry of ``vrlst``.
    """
    summary = pandas.DataFrame({'NSmp' : df.shape[0]}, index=[0])
    for name in vrlst:
        column = df[name]
        # The median uses only finite values that are not exactly zero
        usable = numpy.isfinite(column) & (column != 0.0)
        summary['%s_Med' % name] = numpy.median(column[usable])

    return summary

# Keep only the soundings whose surface type is 0 (water)
wtrfrm = ltfrm.loc[ltfrm['SfcType'] == 0]

# One summary row per orbit; resetting the index turns the group keys back into columns
grpwtr = wtrfrm.groupby(['Orbit'])
wtrqs = grpwtr.apply(
    qsummary,
    include_groups=False,
    grpvr='Orbit',
    vrlst=['XCO2','Latitude','Longitude'],
).reset_index(drop=False)
print(wtrqs.shape)

(11, 6)


In [12]:
wtrqs

Unnamed: 0,Orbit,level_1,NSmp,XCO2_Med,Latitude_Med,Longitude_Med
0,31961,0,14187,412.680939,6.771222,-172.616287
1,31962,0,13063,412.958282,16.250383,160.425903
2,31963,0,7776,413.273743,13.496302,136.316086
3,31965,0,4101,410.993103,-16.43572,93.233505
4,31967,0,8351,411.602783,-5.221122,41.438992
5,31969,0,6035,411.628632,-7.692656,-7.485498
6,31970,0,13569,411.9711,-1.588485,-33.470554
7,31971,0,7442,412.549988,29.825089,-65.565956
8,31973,0,5599,411.83847,-5.944365,-106.699272
9,31974,0,11157,411.782623,-6.617579,-131.318207


## Level 2 Standard Products

Access Level 2 standard products for the same date range. The orbits found in the Lite collection above are then used to identify the matching L2 standard granules.

In [13]:
# Collection identifiers and the time window used for the L2 standard search
short_name, version = 'OCO2_L2_Standard', '11r'
start_time, end_time = '2020-07-05', '2020-07-07'

# Query for cloud-hosted granules that match the collection and dates
results = earthaccess.search_data(
    short_name=short_name,
    version=version,
    cloud_hosted=True,
    temporal=(start_time, end_time),
)

# Keep the first direct-access link of every granule and report how many were found
s3_urls_l2std = [hit.data_links(access="direct")[0] for hit in results]
print(len(s3_urls_l2std))

Granules found: 33
33


In [14]:
s3_urls_l2std[0:10]

['s3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdGL_31961a_200705_B11006r_220728091845.h5',
 's3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdGL_31962a_200705_B11006r_220728093344.h5',
 's3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdGL_31963a_200705_B11006r_220728093721.h5',
 's3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdND_31964a_200705_B11006r_220728095527.h5',
 's3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdGL_31965a_200705_B11006r_220728100742.h5',
 's3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdND_31966a_200705_B11006r_220728101546.h5',
 's3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdGL_31967a_200705_B11006r_220728104727.h5',
 's3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_

In [15]:
# Create a data frame with L2Std info

# Built as one chain: tag each file with its glint mode/orbit string, drop the
# files that are not glint granules, then derive the orbit string and number.
l2sfrm = (
    pandas.DataFrame({'S3File': s3_urls_l2std})
    # Match glint only; files without the pattern get a missing value here
    .assign(ModeOrbStr=lambda frm: frm['S3File'].str.extract(
        r'(L2StdGL_[0-9]{5}[a-z]{1})', expand=False))
    .dropna(subset=['ModeOrbStr'])
    .assign(
        # Orbit string keeps the trailing letter, e.g. '31961a'
        OrbStr=lambda frm: frm['ModeOrbStr'].str.replace('L2StdGL_', '', regex=False),
        # Numeric orbit: strip the letter and store as a 32-bit integer
        Orbit=lambda frm: frm['OrbStr'].str.replace('[a-z]{1}','',regex=True).astype(numpy.int32),
    )
)

In [16]:
l2sfrm

Unnamed: 0,S3File,ModeOrbStr,OrbStr,Orbit
0,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31961a,31961a,31961
1,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31962a,31962a,31962
2,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31963a,31963a,31963
4,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31965a,31965a,31965
6,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31967a,31967a,31967
8,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31969a,31969a,31969
9,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31970a,31970a,31970
10,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31971a,31971a,31971
12,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31973a,31973a,31973
16,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31973b,31973b,31973


### Merge Lite and L2Std Info

Merge the per-orbit Lite summary with the list of L2Std granules on the orbit number. Then open each matched L2Std granule and collect its soundings.

In [17]:
# Keep only the orbits present in both the Lite summary and the L2Std file list
mrgorb = wtrqs.merge(l2sfrm, how='inner', on='Orbit')
mrgorb

Unnamed: 0,Orbit,level_1,NSmp,XCO2_Med,Latitude_Med,Longitude_Med,S3File,ModeOrbStr,OrbStr
0,31961,0,14187,412.680939,6.771222,-172.616287,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31961a,31961a
1,31962,0,13063,412.958282,16.250383,160.425903,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31962a,31962a
2,31963,0,7776,413.273743,13.496302,136.316086,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31963a,31963a
3,31965,0,4101,410.993103,-16.43572,93.233505,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31965a,31965a
4,31967,0,8351,411.602783,-5.221122,41.438992,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31967a,31967a
5,31969,0,6035,411.628632,-7.692656,-7.485498,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31969a,31969a
6,31970,0,13569,411.9711,-1.588485,-33.470554,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31970a,31970a
7,31971,0,7442,412.549988,29.825089,-65.565956,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31971a,31971a
8,31973,0,5599,411.83847,-5.944365,-106.699272,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31973a,31973a
9,31973,0,5599,411.83847,-5.944365,-106.699272,s3://gesdisc-cumulus-prod-protected/OCO2_DATA/...,L2StdGL_31973b,31973b


In [26]:
# Loop L2Std collection
import h5py

nl2std = mrgorb.shape[0]

# One frame per granule is collected here and concatenated once after the loop,
# instead of re-copying the growing result on every iteration.
l2frames = []

for s3nmcr, l2orbit in zip(mrgorb['S3File'].values, mrgorb['Orbit'].values):
    print(s3nmcr)
    # Context managers close both the S3 handle and the HDF5 file even if a read fails
    with fs.open(s3nmcr) as l2file, h5py.File(l2file, 'r') as l2h5:
        l2sdg = l2h5['/RetrievalHeader/sounding_id'][:]
        l2oflg = l2h5['/RetrievalResults/outcome_flag'][:]
        l2smth = l2h5['/RetrievalResults/xco2_uncert_smooth'][:]
        l2unc = l2h5['/RetrievalResults/xco2_uncert'][:]
        l2intrf = l2h5['/RetrievalResults/xco2_uncert_interf'][:]
        l2eps = l2h5['/RetrievalResults/xco2_uncert_noise'][:]

    # Uncertainty terms are scaled by 1e6 (presumably mole fraction to ppm -- confirm)
    l2frm = pandas.DataFrame({'SoundingID': l2sdg, 'OFlag': l2oflg, 'XCO2Unc': 1.0e6 * l2unc,
                              'XCO2Smooth': 1.0e6 * l2smth, 'XCO2Interf': 1.0e6 * l2intrf,
                              'XCO2Noise': 1.0e6 * l2eps})
    l2frm['OFlag'] = l2frm['OFlag'].astype(numpy.int16)
    # Tag every sounding with the orbit number of the granule it came from
    l2frm['Orbit'] = l2orbit
    l2frames.append(l2frm)

if l2frames:
    oco_all = pandas.concat(l2frames, ignore_index=True)
else:
    # No granule matched: keep oco_all defined (and empty) so later cells still run
    oco_all = pandas.DataFrame(columns=['SoundingID', 'OFlag', 'XCO2Unc', 'XCO2Smooth',
                                        'XCO2Interf', 'XCO2Noise', 'Orbit'])

# Total number of L2Std soundings collected
nbtch = oco_all.shape[0]

s3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdGL_31961a_200705_B11006r_220728091845.h5
s3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdGL_31962a_200705_B11006r_220728093344.h5
s3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdGL_31963a_200705_B11006r_220728093721.h5
s3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdGL_31965a_200705_B11006r_220728100742.h5
s3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdGL_31967a_200705_B11006r_220728104727.h5
s3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdGL_31969a_200705_B11006r_220728114152.h5
s3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdGL_31970a_200705_B11006r_220728114801.h5
s3://gesdisc-cumulus-prod-protected/OCO2_DATA/OCO2_L2_Standard.11r/2020/187/oco2_L2StdGL_31971a_200705_B11006r_

In [22]:
oco_all['OFlag'].describe()

count    131386.000000
mean          1.286256
std           0.577356
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           4.000000
Name: OFlag, dtype: float64

In [23]:
oco_all['XCO2Smooth'].describe()

count    131386.000000
mean          0.162198
std           0.159400
min           0.061692
25%           0.144449
50%           0.156917
75%           0.170604
max          13.912113
Name: XCO2Smooth, dtype: float64

## Combine Full Data Frames

Combine the LtCO2 and L2Std full data frames and tabulate


In [27]:
# Outer join keeps soundings that appear in only one of the two products
mrgoco = wtrfrm.merge(oco_all, how='outer', on=['SoundingID','Orbit'])
mrgoco.iloc[1500:1510]

Unnamed: 0,SoundingID,Orbit,SfcType,Latitude,Longitude,XCO2,V11QFlag,Sdg10s,OFlag,XCO2Unc,XCO2Smooth,XCO2Interf,XCO2Noise
1500,2020071000000000.0,31961,0.0,-16.493748,-167.869522,412.059479,1.0,2020071000000.0,1,0.496486,0.183159,0.134821,0.441332
1501,2020071000000000.0,31961,0.0,-16.491383,-167.884949,409.728119,1.0,2020071000000.0,1,0.498769,0.179331,0.177106,0.430401
1502,2020071000000000.0,31961,0.0,-16.48888,-167.900452,411.294861,0.0,2020071000000.0,1,0.480798,0.177011,0.140947,0.424226
1503,2020071000000000.0,31961,0.0,-16.486263,-167.915878,411.191742,0.0,2020071000000.0,1,0.500497,0.183963,0.188791,0.425457
1504,2020071000000000.0,31961,0.0,-16.483521,-167.931366,411.908997,0.0,2020071000000.0,1,0.485264,0.180051,0.156339,0.422636
1505,2020071000000000.0,31961,0.0,-16.480667,-167.946854,411.655518,0.0,2020071000000.0,1,0.494945,0.182533,0.139866,0.43828
1506,2020071000000000.0,31961,0.0,-16.477705,-167.962372,411.135223,0.0,2020071000000.0,1,0.488295,0.180102,0.152292,0.427554
1507,2020071000000000.0,31961,0.0,-16.477695,-167.857925,410.439545,0.0,2020071000000.0,1,0.492378,0.17625,0.147079,0.435591
1508,2020071000000000.0,31961,0.0,-16.475473,-167.873245,410.872803,0.0,2020071000000.0,1,0.49078,0.178959,0.149611,0.431805
1509,2020071000000000.0,31961,0.0,-16.47311,-167.888672,409.948853,1.0,2020071000000.0,1,0.499236,0.186399,0.200402,0.417529


In [28]:
# Tabulate the Lite quality flag (rows) against the L2Std outcome flag (columns).
# NOTE(review): dropna=False appears intended to keep soundings with no Lite flag
# (the unlabeled row in the output) -- confirm against the pandas version in use.
pandas.crosstab(mrgoco['V11QFlag'],mrgoco['OFlag'],dropna=False)

OFlag,1,2,3,4
V11QFlag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,58903,4052,0,0
1.0,23164,11217,0,0
,17558,13270,595,2627
