# Download selected variables from the INGRID CMIP6 database
* Organize data by variable/model and save each individual year into a separate file. This is, I think, the easiest way to handle things more uniformly, given the lack of standardization in the data files.
* Database here: http://mary.ldeo.columbia.edu:81/CMIP6/
* For master list of everything available: wget http://mary.ldeo.columbia.edu/master_collection/mary_cmip6.csv

## Start Clean

In [2]:
# Reset the environment (start clean)
%reset -f

# Import Modules and define functions
import calendar
import datetime
import os
import numpy as np
import numpy.ma as ma
import netCDF4
import matplotlib
import copy
from matplotlib import pyplot as plt
import scipy
import scipy.signal
import scipy.io as sio
import seaborn as sns
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
from IPython.display import display
import requests
import pandas as pd
import os
#from mpl_toolkits.basemap import Basemap, cm, maskoceans

# cartopy stuff
import cartopy
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from shapely.geometry.polygon import LinearRing
from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter

# Natural Earth overlays reused by the mapping code.
# All three are drawn as outlines only (facecolor='none').

# Cultural borders: state / province boundary lines, 50m resolution
states_provinces = cfeature.NaturalEarthFeature(
    'cultural', 'admin_1_states_provinces_lines', '50m', facecolor='none')

# Coastline, 10m resolution, black edge
newcoast = cfeature.NaturalEarthFeature(
    category='physical', name='coastline', scale='10m',
    edgecolor='k', facecolor='none')

# Lakes, 10m resolution, black edge
newlake = cfeature.NaturalEarthFeature(
    category='physical', name='lakes', scale='10m',
    edgecolor='k', facecolor='none')

# Embeds plots inside the notebook (use in iPython Notebook)
%matplotlib inline

# Draw a dashed longitude/latitude box on a map axis
def plot_rectangle(ax, lonmin,lonmax,latmin,latmax):
    """Outline a lon/lat rectangle on `ax` with a thick black dashed line.

    Parameters
    ----------
    ax : axis object exposing ``plot`` (the coordinates are passed with a
        PlateCarree transform).
    lonmin, lonmax : western and eastern edges of the box.
    latmin, latmax : southern and northern edges of the box.
    """
    # Walk the four corners and return to the start so the outline is closed
    corners = [(lonmin, latmin), (lonmax, latmin), (lonmax, latmax),
               (lonmin, latmax), (lonmin, latmin)]
    box_lons = [corner[0] for corner in corners]
    box_lats = [corner[1] for corner in corners]
    ax.plot(box_lons, box_lats,
            color='k', linestyle='--', linewidth=3,
            transform=ccrs.PlateCarree())

# Mark a single location on a map axis
def plot_point(ax,lon,lat):
    """Scatter one grey star marker at (`lon`, `lat`) on `ax`.

    Parameters
    ----------
    ax : axis object exposing ``scatter`` (the coordinates are passed with a
        PlateCarree transform).
    lon, lat : longitude and latitude of the point.
    """
    mid_grey = np.array([0.5, 0.5, 0.5])
    ax.scatter([lon], [lat],
               color=mid_grey, marker='*', linewidth=3,
               transform=ccrs.PlateCarree())

# Ocean fill colour: RGB triplet rescaled from 0-255 to 0-1
ocean_color = np.array([209, 230, 241], dtype=np.float64) / 255

# Month vector (1..12)
mons = np.arange(1, 13)

# Plot styles
# Formatting for titles
fontdict_title = {'fontsize': 36}
fig_size = np.array([10, 10])

# Formatting for figures
style_new = {
    'xtick.direction': 'in',
    'ytick.direction': 'in',
    'font.sans-serif': 'Arial',
}

# Model names (trailing notes record download status)
model_names = (
    'AWI-CM-1-1-MR',
    'BCC-CSM2-MR',
    'BCC-ESM1',
    'CAMS-CSM1-0',       # NEED TO REDO
    'CanESM5',
    'CESM2',
    'CESM2-WACCM',
    'CNRM-CM6-1',
    'CNRM-ESM2-1',
    'E3SM-1-0',
    'EC-Earth3',         # NEED TO REDO
    'EC-Earth3-LR',      # NEED TO REDO
    'EC-Earth3-Veg',     # NEED TO REDO
    'FGOALS-f3-L',       # NEED TO REDO
    'GFDL-AM4',
    'GFDL-CM4',          # PI CONTROL PROBLEMS
    'GFDL-ESM4',
    'GISS-E2-1-G',
    'GISS-E2-1-H',
    'HadGEM3-GC31-LL',
    'IPSL-CM6A-LR',
    'MIROC6',            # mrsos and tas done
    'MIROC-ES2L',
    'MPI-ESM1-2-HR',
    'MRI-ESM2-0',        # mrsos and tas done
    'NESM3',             # mrsos and tas done
    'SAM0-UNICON',       # mrsos and tas done
    'UKESM1-0-LL',       # mrsos and tas done
)




## Setup: Choose models/ensemble members/variables/etc

In [3]:
# Root URL of the INGRID CMIP6 collection; the download links are built by
# appending '/.'-separated path components to it
base_dir = 'http://mary.ldeo.columbia.edu:81/CMIP6/.'

# Directory that receives the per-year output files
out_dir = '/Volumes/Apollo/CMIP6/cmip6rawyr'

# Experiment (simulation) to download -- keep exactly one line active
experiment_id = 'piControl'      # pre-industrial control
#experiment_id = 'historical'    # historical
#experiment_id = 'ssp245'        # RCP 4.5 equivalent
#experiment_id = 'ssp585'        # RCP 8.5 equivalent
#experiment_id = 'ssp370'        # RCP 7.0 equivalent

# Variables to download, one row per variable:
#   [variable_id, original units, new units, scale factor (original -> new), description]
# np.array stores every entry as a string, so the scale factor is converted
# back to a number where it is used.
var_info = np.array([
    ['pr',      'kg m-2 s-1', 'mm day-1', 86400, 'precipitation rate, all phases'],
    ['evspsbl', 'kg m-2 s-1', 'mm day-1', 86400, 'evaporation rate, including transpiration and sublimation'],
    ['prsn',    'kg m-2 s-1', 'mm day-1', 86400, 'snowfall rate'],
    ['lai',     'unitless',   'unitless', 1,     'leaf area index'],
    ['mrros',   'kg m-2 s-1', 'mm day-1', 86400, 'total surface run off leaving the land portion of the grid cell (excluding drainage through the base of the soil model)'],
    ['mrro',    'kg m-2 s-1', 'mm day-1', 86400, 'total run-off (including drainage through the base of the soil model)'],
    ['mrso',    'kg m-2',     'kg m-2',   1,     'total soil moisture content (all phases, summed over all layers)'],
    ['mrsos',   'kg m-2',     'kg m-2',   1,     'surface (top 10cm) soil moisture content (all phases)'],
    ['tas',     'K',          'K',        1,     'near surface (2-m) air temperature'],
])

# Single-variable alternatives (uncomment one to restrict the download):
#var_info = np.array([   ['mrsos','kg m-2','kg m-2',1,'surface (top 10cm) soil moisture content (all phases)'],
#            ])

#var_info = np.array([  ['evspsbl', 'kg m-2 s-1', 'mm day-1',86400, 'evaporation rate, including transpiration and sublimation'],
#
#            ])

# Echo the variable table so the selection is recorded in the cell output
print(var_info)

# Catalogue of every file available on the server (one row per file)
ingrid_cmip6 = pd.read_csv('../filelists/mary_cmip6_092419.csv')
# Older catalogue snapshots:
#ingrid_cmip6 = pd.read_csv('../filelists/mary_cmip6_092319.csv')
#ingrid_cmip6 = pd.read_csv('../filelists/mary_cmip6_082619.csv')
#ingrid_cmip6 = pd.read_csv('../filelists/mary_cmip6_073119.csv')
#ingrid_cmip6 = pd.read_csv('../filelists/mary_cmip6_073019.csv')

# Empty master list (model / sim / ensemble / variable) to use for post-processing
df_proclist = pd.DataFrame(columns='model sim ensemble variable'.split())

# Debugging / testing queries kept for reference:
#df1 = ingrid_cmip6[(ingrid_cmip6.source_id==curr_mod) & (ingrid_cmip6.variable_id==curr_var) & (ingrid_cmip6.experiment_id == experiment_id)]
#df1 = ingrid_cmip6[(ingrid_cmip6.source_id=='BCC-CSM2-MR') & (ingrid_cmip6.variable_id=='lai')]
#df1

# Last expression of the cell: display the (still empty) master list
df_proclist

[['pr' 'kg m-2 s-1' 'mm day-1' '86400' 'precipitation rate, all phases']
 ['evspsbl' 'kg m-2 s-1' 'mm day-1' '86400'
  'evaporation rate, including transpiration and sublimation']
 ['mrros' 'kg m-2 s-1' 'mm day-1' '86400'
  'total surface run off leaving the land portion of the grid cell (excluding drainage through the base of the soil model)']
 ['mrro' 'kg m-2 s-1' 'mm day-1' '86400'
  'total run-off (including drainage through the base of the soil model)']
 ['mrso' 'kg m-2' 'kg m-2' '1'
  'total soil moisture content (all phases, summed over all layers)']
 ['mrsos' 'kg m-2' 'kg m-2' '1'
  'surface (top 10cm) soil moisture content (all phases)']
 ['tas' 'K' 'K' '1' 'near surface (2-m) air temperature']]


Unnamed: 0,model,sim,ensemble,variable


## For Accessing Directory CMIP6 (NOT CMIP6i)
* Grab everything available. Put the years in the filenames.
* Also grab the fixed (time-invariant) variables, if available: sftlf, areacella

In [4]:
# Nested model / variable / ensemble-member loop.
#
# For every catalogue entry matching (model, variable, experiment_id) the remote
# OPeNDAP dataset is opened, split by calendar year, scaled to the output units
# and written to
#     <out_dir>/<variable>/<model>/<variable>.<experiment>.<model>.<member>.<year>.nc
# The time-invariant grid-cell area and land fraction are stored alongside the
# data whenever they are catalogued and match the grid of the variable.


def _opendap_link(row, model, variable):
    """Build the OPeNDAP ('dods') URL for one row of the catalogue.

    Parameters
    ----------
    row : pandas.Series
        One row of `ingrid_cmip6`; the activity_id, institution_id, member_id,
        table_id, grid_label, version and file_basename columns are read.
    model : str
        Model name (source_id).
    variable : str
        Variable name (variable_id).

    Returns
    -------
    str
        `base_dir` followed by the '/.'-separated path components and '/dods'.
    """
    parts = [row['activity_id'], row['institution_id'], model, experiment_id,
             row['member_id'], row['table_id'], variable, row['grid_label'],
             row['version'], row['file_basename'], variable]
    return base_dir + '/.'.join(parts) + '/dods'


def _load_fixed_field(model, variable):
    """Read a time-invariant field for `model` from the first matching catalogue row.

    Returns the data of `variable`, or None when the catalogue has no entry for
    this model / variable / experiment_id combination.
    """
    df_fixed = ingrid_cmip6[(ingrid_cmip6.source_id == model) &
                            (ingrid_cmip6.variable_id == variable) &
                            (ingrid_cmip6.experiment_id == experiment_id)]
    if df_fixed.empty:
        return None

    # The context manager guarantees the remote dataset is closed again
    with netCDF4.Dataset(_opendap_link(df_fixed.iloc[0], model, variable)) as nc_fixed:
        return nc_fixed.variables[variable][:]


for curr_mod in model_names:

    # Time-invariant fields (None when not available for this model)
    areacell = _load_fixed_field(curr_mod, 'areacella')   # grid-cell area (units, m2)
    landfrac = _load_fixed_field(curr_mod, 'sftlf')       # land fraction of grid cell (percent)

    # Variables Loop
    for var_row in var_info:

        curr_var   = var_row[0]
        curr_units = var_row[2]
        curr_scale = float(var_row[3])   # var_info holds strings; builtin float replaces the removed np.float
        curr_name  = var_row[4]

        # All catalogue entries for this model / variable / experiment (one row per file)
        df1 = ingrid_cmip6[(ingrid_cmip6.source_id == curr_mod) &
                           (ingrid_cmip6.variable_id == curr_var) &
                           (ingrid_cmip6.experiment_id == experiment_id)]

        # Ensemble Loop
        for i_ens in range(df1.shape[0]):

            row      = df1.iloc[i_ens]
            curr_ens = row['member_id']
            nc_link  = _opendap_link(row, curr_mod, curr_var)

            print(curr_mod+':'+experiment_id+':'+curr_ens+':'+curr_var)

            # Record the entry in the master list. Label-based enlargement modifies
            # df_proclist in place (DataFrame.append returned a new frame that was
            # previously discarded, and no longer exists in pandas 2).
            df_proclist.loc[len(df_proclist)] = [curr_mod, experiment_id, curr_ens, curr_var]

            # Skip entries whose link does not resolve on the server
            request = requests.get(nc_link)
            if request.status_code != 200:
                continue

            # Make model directory if it does not exist
            model_dir = out_dir+'/'+curr_var+'/'+curr_mod
            os.makedirs(model_dir, exist_ok=True)

            # Open the remote file; it is closed when the block is left
            with netCDF4.Dataset(nc_link) as nc_file:

                var_nc = nc_file.variables[curr_var]
                lat    = nc_file.variables['lat'][:]
                lon    = nc_file.variables['lon'][:]

                # Convert the whole time axis to dates in one call, then pull out
                # the year and month of every time step
                time_nc  = nc_file.variables['time']
                cal_time = getattr(time_nc, 'calendar', 'standard')
                dates    = np.atleast_1d(netCDF4.num2date(np.ma.getdata(time_nc[:]),
                                                          units=time_nc.units,
                                                          calendar=cal_time))
                yr_vect  = np.array([date.year for date in dates], dtype=int)
                mon_vect = np.array([date.month for date in dates], dtype=int)

                # Fixed fields are only written when they are on the same grid as
                # the variable (e.g. the GFDL land mask has a different resolution)
                grid_shape     = (np.size(lat), np.size(lon))
                write_areacell = (areacell is not None) and (areacell.shape == grid_shape)
                write_landfrac = (landfrac is not None) and (landfrac.shape == grid_shape)

                # Year Loop: one output file per calendar year
                for curr_yr in np.unique(yr_vect):

                    # Locate all time steps of the current year
                    loc_yr  = np.where(yr_vect == curr_yr)[0]
                    yr_mons = mon_vect[loc_yr]

                    # Pull out the current year and convert to the output units
                    var_data = var_nc[loc_yr, :, :] * curr_scale

                    # Output name
                    fname_out = (model_dir+'/'+curr_var+'.'+experiment_id+'.'+curr_mod+'.'
                                 +curr_ens+'.'+str(int(curr_yr))+'.nc')

                    # If this file exists, delete it and overwrite
                    if os.path.isfile(fname_out):
                        os.remove(fname_out)

                    # Every yearly file is closed as soon as it has been written
                    with netCDF4.Dataset(fname_out, 'w', clobber=True, format="NETCDF4_CLASSIC") as ncout:

                        # Dimensions. 'mon' is sized by the months actually present,
                        # so a partial first/last year is written as-is instead of
                        # being forced into 12 slots.
                        ncout.createDimension('lat', np.size(lat))
                        ncout.createDimension('lon', np.size(lon))
                        ncout.createDimension('mon', np.size(yr_mons))

                        # Coordinate variables
                        lat_nc = ncout.createVariable('lat', float, ('lat',), zlib=True)
                        lon_nc = ncout.createVariable('lon', float, ('lon',), zlib=True)
                        mon_nc = ncout.createVariable('mon', float, ('mon',), zlib=True)

                        lat_nc[:] = lat
                        lat_nc.long_name = 'latitude'
                        lat_nc.units = 'degrees_north'

                        lon_nc[:] = lon
                        lon_nc.long_name = 'longitude'
                        lon_nc.units = 'degrees_east'

                        mon_nc[:] = yr_mons
                        mon_nc.long_name = 'month'
                        mon_nc.units = 'unitless'

                        # Data variable
                        varout_nc = ncout.createVariable(curr_var, float, ('mon', 'lat', 'lon'), zlib=True)
                        varout_nc[:] = var_data
                        varout_nc.long_name = curr_name
                        varout_nc.units = curr_units

                        # Time-invariant information, if available on this grid
                        if write_areacell:
                            areacell_nc = ncout.createVariable('areacell', float, ('lat', 'lon'), zlib=True)
                            areacell_nc[:] = areacell
                            areacell_nc.long_name = 'grid cell area for atmospheric grid variables'
                            areacell_nc.units = 'm2'

                        if write_landfrac:
                            landfrac_nc = ncout.createVariable('landfrac', float, ('lat', 'lon'), zlib=True)
                            landfrac_nc[:] = landfrac
                            landfrac_nc.long_name = 'percentage of the grid cell occupied by land (including lakes)'
                            landfrac_nc.units = 'percent'

                        # File Information
                        ncout.comment = 'CMIP6, '+experiment_id+' run. '+curr_mod+': '+curr_ens

MRI-ESM2-0:ssp245:r1i1p1f1:pr
MRI-ESM2-0:ssp245:r3i1p1f1:pr
MRI-ESM2-0:ssp245:r4i1p1f1:pr
MRI-ESM2-0:ssp245:r2i1p1f1:pr
MRI-ESM2-0:ssp245:r5i1p1f1:pr
MRI-ESM2-0:ssp245:r1i1p1f1:evspsbl
MRI-ESM2-0:ssp245:r4i1p1f1:evspsbl
MRI-ESM2-0:ssp245:r2i1p1f1:evspsbl
MRI-ESM2-0:ssp245:r3i1p1f1:evspsbl
MRI-ESM2-0:ssp245:r5i1p1f1:evspsbl
MRI-ESM2-0:ssp245:r3i1p1f1:mrros
MRI-ESM2-0:ssp245:r5i1p1f1:mrros
MRI-ESM2-0:ssp245:r4i1p1f1:mrros
MRI-ESM2-0:ssp245:r2i1p1f1:mrros
MRI-ESM2-0:ssp245:r1i1p1f1:mrros
MRI-ESM2-0:ssp245:r5i1p1f1:mrro
MRI-ESM2-0:ssp245:r2i1p1f1:mrro
MRI-ESM2-0:ssp245:r4i1p1f1:mrro
MRI-ESM2-0:ssp245:r3i1p1f1:mrro
MRI-ESM2-0:ssp245:r1i1p1f1:mrro
MRI-ESM2-0:ssp245:r5i1p1f1:mrso
MRI-ESM2-0:ssp245:r4i1p1f1:mrso
MRI-ESM2-0:ssp245:r2i1p1f1:mrso
MRI-ESM2-0:ssp245:r3i1p1f1:mrso
MRI-ESM2-0:ssp245:r1i1p1f1:mrso
MRI-ESM2-0:ssp245:r5i1p1f1:mrsos
MRI-ESM2-0:ssp245:r4i1p1f1:mrsos
MRI-ESM2-0:ssp245:r2i1p1f1:mrsos
MRI-ESM2-0:ssp245:r3i1p1f1:mrsos
MRI-ESM2-0:ssp245:r1i1p1f1:mrsos
MRI-ESM2-0:ssp245:r1i1p1f

In [83]:
# Scratch cell: point curr_mod at a single model for the manual catalogue checks
curr_mod='CESM2-WACCM'

In [69]:
# Scratch cell: list every catalogue row for the model in curr_mod and the
# variable 'huss', across all experiments (no experiment_id filter here).
# The commented line is the same query driven by curr_var instead.
#df1 = ingrid_cmip6[(ingrid_cmip6.source_id==curr_mod) & (ingrid_cmip6.variable_id==curr_var)]
df1 = ingrid_cmip6[(ingrid_cmip6.source_id==curr_mod) & (ingrid_cmip6.variable_id=='huss')]

# Display the matching rows
df1

Unnamed: 0,activity_id,experiment_id,file_basename,file_dirname,file_fullpath,grid_label,institution_id,member_id,mip_era,source_id,table_id,time_range,variable_id,tracking_id,version
74532,CMIP,historical,huss_Amon_BCC-ESM1_historical_r2i1p1f1_gn_1850...,/m5/haibo/CMIP6mon/CMIP/BCC/BCC-ESM1/historica...,/m5/haibo/CMIP6mon/CMIP/BCC/BCC-ESM1/historica...,gn,BCC,r2i1p1f1,CMIP6,BCC-ESM1,Amon,185001-201412,huss,,v20181227
74534,CMIP,historical,huss_Amon_BCC-ESM1_historical_r3i1p1f1_gn_1850...,/m5/haibo/CMIP6mon/CMIP/BCC/BCC-ESM1/historica...,/m5/haibo/CMIP6mon/CMIP/BCC/BCC-ESM1/historica...,gn,BCC,r3i1p1f1,CMIP6,BCC-ESM1,Amon,185001-201412,huss,,v20181227
74535,CMIP,historical,huss_Amon_BCC-ESM1_historical_r1i1p1f1_gn_1850...,/m5/haibo/CMIP6mon/CMIP/BCC/BCC-ESM1/historica...,/m5/haibo/CMIP6mon/CMIP/BCC/BCC-ESM1/historica...,gn,BCC,r1i1p1f1,CMIP6,BCC-ESM1,Amon,185001-201412,huss,,v20181227


In [27]:
# Scratch cell: show which model name curr_mod currently holds
curr_mod

'MPI-ESM1-2-HR'

In [21]:
# Scratch cell: display the file catalogue loaded from the CSV
ingrid_cmip6

Unnamed: 0,activity_id,experiment_id,file_basename,file_dirname,file_fullpath,grid_label,institution_id,member_id,mip_era,source_id,table_id,time_range,variable_id,tracking_id,version
0,CMIP,historical,va_Amon_GFDL-CM4_historical_r1i1p1f1_gr1_18500...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,gr1,NOAA-GFDL,r1i1p1f1,CMIP6,GFDL-CM4,Amon,185001-185412,va,,v20180301
1,CMIP,historical,vas_Amon_GFDL-CM4_historical_r1i1p1f1_gr1_1850...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,gr1,NOAA-GFDL,r1i1p1f1,CMIP6,GFDL-CM4,Amon,185001-185412,vas,,v20180301
2,CMIP,historical,ta_Amon_GFDL-CM4_historical_r1i1p1f1_gr1_18500...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,gr1,NOAA-GFDL,r1i1p1f1,CMIP6,GFDL-CM4,Amon,185001-185412,ta,,v20180301
3,CMIP,historical,zg_Amon_GFDL-CM4_historical_r1i1p1f1_gr1_18500...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,gr1,NOAA-GFDL,r1i1p1f1,CMIP6,GFDL-CM4,Amon,185001-185412,zg,,v20180301
4,CMIP,historical,sfcWind_Amon_GFDL-CM4_historical_r1i1p1f1_gr1_...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,gr1,NOAA-GFDL,r1i1p1f1,CMIP6,GFDL-CM4,Amon,185001-185412,sfcWind,,v20180301
5,CMIP,historical,wap_Amon_GFDL-CM4_historical_r1i1p1f1_gr1_1850...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,gr1,NOAA-GFDL,r1i1p1f1,CMIP6,GFDL-CM4,Amon,185001-185412,wap,,v20180301
6,CMIP,historical,pr_Amon_GFDL-CM4_historical_r1i1p1f1_gr1_18500...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,gr1,NOAA-GFDL,r1i1p1f1,CMIP6,GFDL-CM4,Amon,185001-185412,pr,,v20180301
7,CMIP,historical,rsds_Amon_GFDL-CM4_historical_r1i1p1f1_gr1_185...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,gr1,NOAA-GFDL,r1i1p1f1,CMIP6,GFDL-CM4,Amon,185001-185412,rsds,,v20180301
8,CMIP,historical,uas_Amon_GFDL-CM4_historical_r1i1p1f1_gr1_1850...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,gr1,NOAA-GFDL,r1i1p1f1,CMIP6,GFDL-CM4,Amon,185001-185412,uas,,v20180301
9,CMIP,historical,prsn_Amon_GFDL-CM4_historical_r1i1p1f1_gr1_185...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,/m3/haibo/CMIP6mon/CMIP/NOAA-GFDL/GFDL-CM4/his...,gr1,NOAA-GFDL,r1i1p1f1,CMIP6,GFDL-CM4,Amon,185001-185412,prsn,,v20180301


## Save dataframe containing all file information

In [71]:
# Scratch cell: only the last expression of a cell is echoed, so the bare
# nc_link on the first line displays nothing; the output is the activity_id
# of the first row of df1.
nc_link
df1.activity_id.iloc[0]

'ScenarioMIP'

In [74]:
# Scratch cell: show the most recently constructed OPeNDAP link
nc_link

'http://mary.ldeo.columbia.edu:81/CMIP6/.CMIP/.ScenarioMIP/.BCC/.BCC-CSM2-MR/.ssp245/.r1i1p1f1/.Lmon/.mrsos/.gn/.v20190308/.mrsos_Lmon_BCC-CSM2-MR_ssp245_r1i1p1f1_gn_201501-210012.nc/.mrsos/dods'

In [69]:
http://mary.ldeo.columbia.edu:81/CMIP6/.ScenarioMIP/.BCC/.BCC-CSM2-MR/.ssp245/.r1i1p1f1/.Lmon/.mrsos/.gn/.v20190308/.mrsos_Lmon_BCC-CSM2-MR_ssp245_r1i1p1f1_gn_201501-210012.nc/.mrsos/dods 

0

In [60]:
# Scratch cell: request the link by hand and show the HTTP response
# (the download loop only proceeds when the status code is 200)
requests.get(nc_link)  

<Response [404]>

In [61]:
# Scratch cell: display the catalogue rows currently selected in df1
df1

Unnamed: 0,activity_id,experiment_id,file_basename,file_dirname,file_fullpath,grid_label,institution_id,member_id,mip_era,source_id,table_id,time_range,variable_id,tracking_id,version
74647,ScenarioMIP,ssp245,mrsos_Lmon_BCC-CSM2-MR_ssp245_r1i1p1f1_gn_2015...,/m5/haibo/CMIP6mon/ScenarioMIP/BCC/BCC-CSM2-MR...,/m5/haibo/CMIP6mon/ScenarioMIP/BCC/BCC-CSM2-MR...,gn,BCC,r1i1p1f1,CMIP6,BCC-CSM2-MR,Lmon,201501-210012,mrsos,,v20190308


KeyError: 0