<a href="https://colab.research.google.com/github/addinar/permafrost-modeling-convlstm/blob/main/data/notebooks/CMIP5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Download and Import Packages**

In [1]:
!pip install xarray netCDF4

Collecting netCDF4
  Downloading netCDF4-1.7.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting cftime
  Downloading cftime-1.6.4.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.7 kB)
Downloading netCDF4-1.7.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cftime-1.6.4.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cftime, netCDF4
Successfully installed cftime-1.6.4.post1 netCDF4-1.7.2


In [71]:
import xarray as xr
import pandas as pd
import os

In [3]:
# if using colab
from google.colab import drive, userdata
drive.mount('/content/drive')

Mounted at /content/drive


# **Set Up Functions and Variables**

In [117]:
latitudes = {
    '0': [59.66292135, 61.68539326], # band 1: [min_lat, max_lat]
    '1': [61.68539326, 63.70786517], # band 2
    '2': [63.70786517, 65.73033708], # band 3
    '3': [65.73033708, 67.75280899], # band 4
    '4': [67.75280899, 69.7752809], # band 5
    '5': [69.7752809, 71.79775281] # band 6
}

In [118]:
rcp26_path = userdata.get('RCP26_path')
rcp45_path = userdata.get('RCP45_path')
rcp60_path = userdata.get('RCP60_path')
rcp85_path = userdata.get('RCP85_path')

In [119]:
# inspect one folder to see variable names
os.listdir(rcp45_path)

['rsds_Amon_GFDL-ESM2G_rcp45_r1i1p1_207101-207512.nc',
 'rlds_Amon_GFDL-ESM2G_rcp45_r1i1p1_208101-208512.nc',
 'rsds_Amon_GFDL-ESM2G_rcp45_r1i1p1_209601-210012.nc',
 'pr_Amon_GFDL-ESM2G_rcp45_r1i1p1_208101-208512.nc',
 'hfss_Amon_GFDL-ESM2G_rcp45_r1i1p1_205101-205512.nc',
 'pr_Amon_GFDL-ESM2G_rcp45_r1i1p1_208601-209012.nc',
 'rlds_Amon_GFDL-ESM2G_rcp45_r1i1p1_208601-209012.nc',
 'hfss_Amon_GFDL-ESM2G_rcp45_r1i1p1_207601-208012.nc',
 'prsn_Amon_GFDL-ESM2G_rcp45_r1i1p1_204601-205012.nc',
 'snw_LImon_GFDL-ESM2G_rcp45_r1i1p1_207101-207512.nc',
 'hfss_Amon_GFDL-ESM2G_rcp45_r1i1p1_205601-206012.nc',
 'prsn_Amon_GFDL-ESM2G_rcp45_r1i1p1_204101-204512.nc',
 'prsn_Amon_GFDL-ESM2G_rcp45_r1i1p1_202101-202512.nc',
 'ts_Amon_GFDL-ESM2G_rcp45_r1i1p1_203601-204012.nc',
 'snw_LImon_GFDL-ESM2G_rcp45_r1i1p1_209601-210012.nc',
 'prsn_Amon_GFDL-ESM2G_rcp45_r1i1p1_202601-203012.nc',
 'tas_Amon_GFDL-ESM2G_rcp45_r1i1p1_202101-202512.nc',
 'ts_Amon_GFDL-ESM2G_rcp45_r1i1p1_203101-203512.nc',
 'snw_LImon_GFDL-ES

In [120]:
feature_mapping = {
    'hfss': 'surface_sensible_heat_flux_sum',
    'pr': 'total_precipitation_sum',
    'prsn': 'snowfall_sum',
    'rlds': 'surface_thermal_radiation_downwards_sum',
    'tas': 'temperature_2m',
    'ts': 'skin_temperature',
    'snw': 'surface_snow_amount',
    'tos': 'average_lake_temperature', # actually 'sea_surface_temperature' but will be used as a proxy
    'snd': 'snow_depth',
    'mrsos': 'avg_volumetric_water_content',
    'hfls': 'surface_latent_heat_flux_sum',
    'rsds': 'surface_solar_radiation_downwards_sum'
}

Feature variables and corresponding standard names can be found [here](https://pcmdi.llnl.gov/mips/cmip5/docs/standard_output.pdf?id=79).

In [121]:
rcp26_data = {}
rcp45_data = {}
rcp60_data = {}
rcp85_data = {}

In [122]:
def get_mapped_attribute(file_name):
  attribute = file_name.split('_')[0]
  mapped_attribute = feature_mapping[attribute]
  return attribute, mapped_attribute

In [123]:
def create_df(file_name, rcp_folder):
  full_path = os.path.join(rcp_folder, file_name)
  data = xr.open_dataset(full_path)
  data = data.assign_coords(
    lon=(((data.lon + 180) % 360) - 180)
  )
  attribute, mapped_attribute = get_mapped_attribute(file_name)
  file_dfs = []
  for i in range(6):
    lats = latitudes[str(i)]
    min_lat = lats[0]
    max_lat = lats[1]
    try:
      region = data.sel(
        lat=slice(min_lat, max_lat),
        lon=slice(-168.75, -143.75)
      )
    except:
      region = data.sel(
        rlat=slice(min_lat, max_lat),
        rlon=slice(-168.75, -143.75)
      )
    df = region[attribute].to_dataframe().reset_index()
    df = pd.DataFrame(df.groupby('time')[attribute].max()).reset_index()
    df.rename(columns={attribute:mapped_attribute}, inplace=True)
    df['band'] = f'band_{i+1}'
    file_dfs.append(df)
  file_df = pd.concat(file_dfs, axis=0).reset_index()
  return file_df

# **Get RCP 2.6 Data**

In [61]:
for file in os.listdir(rcp26_path):
  attribute, mapped_attribute = get_mapped_attribute(file)
  file_df = create_df(file, rcp26_path)
  if mapped_attribute not in rcp26_data:
    rcp26_data[mapped_attribute] = []
  rcp26_data[mapped_attribute].append(file_df)

  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(ful

In [62]:
for key, value in rcp26_data.items():
  rcp26_data[key] = pd.concat(value, axis=0).reset_index(drop=True)

In [63]:
merged_dfs = []
for key, value in rcp26_data.items():
  df = value[['time', key, 'band']]
  merged_dfs.append(df)

final_df = merged_dfs[0]

for df in merged_dfs[1:]:
  final_df = pd.merge(final_df, df, on=['time', 'band'], how='outer')

In [64]:
print(final_df.shape, final_df.columns, final_df.isna().sum())

(5760, 14) Index(['time', 'snow_depth', 'band', 'skin_temperature', 'temperature_2m',
       'snowfall_sum', 'surface_thermal_radiation_downwards_sum',
       'avg_volumetric_water_content', 'average_lake_temperature',
       'total_precipitation_sum', 'surface_latent_heat_flux_sum',
       'surface_sensible_heat_flux_sum', 'surface_snow_amount',
       'surface_solar_radiation_downwards_sum'],
      dtype='object') time                                       0
snow_depth                                 0
band                                       0
skin_temperature                           0
temperature_2m                             0
snowfall_sum                               0
surface_thermal_radiation_downwards_sum    0
avg_volumetric_water_content               0
average_lake_temperature                   0
total_precipitation_sum                    0
surface_latent_heat_flux_sum               0
surface_sensible_heat_flux_sum             0
surface_snow_amount                     

In [69]:
output_path = userdata.get('rcp_output_path')
rcp26_file_path = os.path.join(output_path, 'RCP26_df.csv')

In [70]:
final_df.to_csv(rcp26_file_path)

# **Get RCP 4.5 Data**

There was an error with getting data for RCP 4.5. I'll need to get soil moisture data separately.

In [124]:
for file in os.listdir(rcp45_path):
  attribute, mapped_attribute = get_mapped_attribute(file)
  file_df = create_df(file, rcp45_path)
  if mapped_attribute not in rcp45_data:
    rcp45_data[mapped_attribute] = []
  rcp45_data[mapped_attribute].append(file_df)

  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(ful

In [125]:
# now get RCP 45 soil moisture content as a DF

rcp45_mrsos_path = userdata.get('RCP45_mrsos')
for file in os.listdir(rcp45_mrsos_path):
  attribute, mapped_attribute = get_mapped_attribute(file)
  file_df = create_df(file, rcp45_mrsos_path)
  if mapped_attribute not in rcp45_data:
    rcp45_data[mapped_attribute] = []
  rcp45_data[mapped_attribute].append(file_df)

  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)


In [126]:
for key, value in rcp45_data.items():
  rcp45_data[key] = pd.concat(value, axis=0).reset_index(drop=True)

In [127]:
merged_dfs = []
for key, value in rcp45_data.items():
  df = value[['time', key, 'band']]
  merged_dfs.append(df)

final_df = merged_dfs[0]

for df in merged_dfs[1:]:
  final_df = pd.merge(final_df, df, on=['time', 'band'], how='outer')

In [128]:
print(final_df.shape, final_df.columns, final_df.isna().sum())

(5760, 14) Index(['time', 'surface_solar_radiation_downwards_sum', 'band',
       'surface_thermal_radiation_downwards_sum', 'total_precipitation_sum',
       'surface_sensible_heat_flux_sum', 'snowfall_sum', 'surface_snow_amount',
       'skin_temperature', 'temperature_2m', 'surface_latent_heat_flux_sum',
       'snow_depth', 'average_lake_temperature',
       'avg_volumetric_water_content'],
      dtype='object') time                                       0
surface_solar_radiation_downwards_sum      0
band                                       0
surface_thermal_radiation_downwards_sum    0
total_precipitation_sum                    0
surface_sensible_heat_flux_sum             0
snowfall_sum                               0
surface_snow_amount                        0
skin_temperature                           0
temperature_2m                             0
surface_latent_heat_flux_sum               0
snow_depth                                 0
average_lake_temperature                

In [130]:
output_path = userdata.get('rcp_output_path')
rcp45_file_path = os.path.join(output_path, 'RCP45_df.csv')
final_df.to_csv(rcp45_file_path)

# **Get RCP 6.0 Data**

In [78]:
for file in os.listdir(rcp60_path):
  attribute, mapped_attribute = get_mapped_attribute(file)
  file_df = create_df(file, rcp60_path)
  if mapped_attribute not in rcp60_data:
    rcp60_data[mapped_attribute] = []
  rcp60_data[mapped_attribute].append(file_df)

  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(ful

In [79]:
for key, value in rcp60_data.items():
  rcp60_data[key] = pd.concat(value, axis=0).reset_index(drop=True)

In [81]:
merged_dfs = []
for key, value in rcp60_data.items():
  df = value[['time', key, 'band']]
  merged_dfs.append(df)

final_df = merged_dfs[0]

for df in merged_dfs[1:]:
  final_df = pd.merge(final_df, df, on=['time', 'band'], how='outer')

In [82]:
print(final_df.shape, final_df.columns, final_df.isna().sum())

(5760, 14) Index(['time', 'surface_latent_heat_flux_sum', 'band',
       'surface_solar_radiation_downwards_sum', 'avg_volumetric_water_content',
       'average_lake_temperature', 'temperature_2m', 'snow_depth',
       'surface_thermal_radiation_downwards_sum', 'total_precipitation_sum',
       'skin_temperature', 'snowfall_sum', 'surface_snow_amount',
       'surface_sensible_heat_flux_sum'],
      dtype='object') time                                       0
surface_latent_heat_flux_sum               0
band                                       0
surface_solar_radiation_downwards_sum      0
avg_volumetric_water_content               0
average_lake_temperature                   0
temperature_2m                             0
snow_depth                                 0
surface_thermal_radiation_downwards_sum    0
total_precipitation_sum                    0
skin_temperature                           0
snowfall_sum                               0
surface_snow_amount                     

In [83]:
output_path = userdata.get('rcp_output_path')
rcp60_file_path = os.path.join(output_path, 'RCP60_df.csv')
final_df.to_csv(rcp60_file_path)

# **Get RCP 8.5 Data**

In [89]:
for file in os.listdir(rcp85_path):
  attribute, mapped_attribute = get_mapped_attribute(file)
  file_df = create_df(file, rcp85_path)
  if mapped_attribute not in rcp85_data:
    rcp85_data[mapped_attribute] = []
  rcp85_data[mapped_attribute].append(file_df)

  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(full_path)
  data = xr.open_dataset(ful

In [90]:
for key, value in rcp85_data.items():
  rcp85_data[key] = pd.concat(value, axis=0).reset_index(drop=True)

In [91]:
merged_dfs = []
for key, value in rcp85_data.items():
  df = value[['time', key, 'band']]
  merged_dfs.append(df)

final_df = merged_dfs[0]

for df in merged_dfs[1:]:
  final_df = pd.merge(final_df, df, on=['time', 'band'], how='outer')

In [92]:
print(final_df.shape, final_df.columns, final_df.isna().sum())

(5760, 14) Index(['time', 'surface_sensible_heat_flux_sum', 'band', 'snowfall_sum',
       'surface_thermal_radiation_downwards_sum',
       'surface_solar_radiation_downwards_sum', 'total_precipitation_sum',
       'surface_latent_heat_flux_sum', 'snow_depth', 'skin_temperature',
       'surface_snow_amount', 'avg_volumetric_water_content',
       'average_lake_temperature', 'temperature_2m'],
      dtype='object') time                                       0
surface_sensible_heat_flux_sum             0
band                                       0
snowfall_sum                               0
surface_thermal_radiation_downwards_sum    0
surface_solar_radiation_downwards_sum      0
total_precipitation_sum                    0
surface_latent_heat_flux_sum               0
snow_depth                                 0
skin_temperature                           0
surface_snow_amount                        0
avg_volumetric_water_content               0
average_lake_temperature                

In [93]:
output_path = userdata.get('rcp_output_path')
rcp85_file_path = os.path.join(output_path, 'RCP85_df.csv')
final_df.to_csv(rcp85_file_path)