# Exploratory data analysis (EDA) plots for the paper

In [None]:
import glob
import multiprocessing
import os

import cartopy.crs as ccrs
import cmcrameri
import cmocean
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
import xarray as xr

import ccic
from ccic.data.cpcir import CPCIR_GRID
from ccic.data.gridsat import GRIDSAT_GRID

# Apply the shared matplotlib style sheet used by all figures below
plt.style.use('ccic.files.ccic')

# Root directory of the analysis files, taken from the ANALYSES environment
# variable (a KeyError is raised here if the variable is not set)
ANALYSES = os.environ['ANALYSES']

In [None]:
# Collocation files, relative to ANALYSES, keyed by input product name
_collocation_files = {
    'GridSat': 'EDA/collocations_gridsat_big.nc',
    'CPCIR': 'EDA/collocations_cpcir_big.nc',
}

# Opened collocation datasets, keyed by product name
ds_dict = {
    product: xr.open_dataset(os.path.join(ANALYSES, relative_path))
    for product, relative_path in _collocation_files.items()
}

# Grid object associated with each product (same keys as ds_dict)
grid_dict = dict(GridSat=GRIDSAT_GRID, CPCIR=CPCIR_GRID)

## Input data brightness temperatures

In [None]:
# Histogram bin edges: unit-width bins with edges 150, 151, ..., 349
bins = np.arange(150, 350)

fig, ax = plt.subplots(figsize=(6/1.25, 4/1.25))

# One normalised step histogram of the `ir_win` variable per input product
for title in ['CPCIR', 'GridSat']:
    ds = ds_dict[title]
    ax.hist(ds.ir_win, bins=bins, density=True, label=title, histtype='step', lw=2)

# Raw strings are required here: '\s', '\k' and '\p' are not valid Python
# escape sequences and trigger a SyntaxWarning in non-raw literals
ax.set_xlabel(r'Brightness temperature [$\si{\kelvin}$]')
ax.set_ylabel(r'PDF [$\si{\per\kelvin}$]')
ax.legend(loc='upper left')
ax.minorticks_on()
ax.grid(ls='dotted', alpha=0.3)

fig.tight_layout()
fig.savefig('brightness_distributions_training_set.pdf', bbox_inches='tight')
plt.show()

## Cloud class frequencies table

In [None]:
# Fraction of grid cells that contain at least one collocated profile,
# computed separately for each product
coverage_dict = {}
for product, dataset in ds_dict.items():
    product_grid = grid_dict[product]
    # Unit-width bins, one per column index and one per row index
    column_edges = range(0, product_grid.projection_x_coords.size + 1)
    row_edges = range(0, product_grid.projection_y_coords.size + 1)
    occupancy, _, _ = np.histogram2d(
        dataset.profile_column_inds,
        dataset.profile_row_inds,
        bins=(column_edges, row_edges),
    )
    n_occupied = (occupancy != 0).sum()
    coverage_dict[product] = n_occupied / occupancy.size

print(coverage_dict)

In [None]:
# Count, for each product, how often each cloud class value (0 to 8) occurs.
# The counts are summed over axis 0 of `cloud_class`, leaving one count per
# remaining entry (20 entries are assumed by the 'level' index below).
cloud_class_count_dict = {}
df_cloud_class_count_dict = {}
for product, dataset in ds_dict.items():
    class_counts = {}
    for class_id in tqdm.tqdm(range(9), ncols=40):
        is_class = dataset.cloud_class == class_id
        class_counts[class_id] = np.sum(is_class, axis=0).data.tolist()
    class_counts['level'] = np.arange(20)
    cloud_class_count_dict[product] = class_counts
    df_cloud_class_count_dict[product] = pd.DataFrame(class_counts).set_index('level')

# Share of each class, in percent of all counted entries, per product
class_columns = list(range(9))
for product, df in df_cloud_class_count_dict.items():
    print(product)
    print(df[class_columns].sum(axis=0) / df.values.sum() * 100)
    print()

# Sum of `cloud_mask` divided by its number of elements, per product
for product in ('GridSat', 'CPCIR'):
    cloud_mask = ds_dict[product].cloud_mask
    print(cloud_mask.sum() / cloud_mask.size)

## Proportion of data in training-validation-test sets

In [None]:
def return_profiles(f):
    """Return the size of the ``profiles`` entry of the dataset stored in *f*.

    The dataset is opened in a ``with`` block so that the underlying file is
    closed as soon as the size has been read; this function is called once per
    file from the worker pool below, so unclosed handles would accumulate.

    Args:
        f: Path of the file to open with ``xr.open_dataset``.

    Returns:
        The ``size`` attribute of ``profiles`` in the opened dataset.
    """
    with xr.open_dataset(f) as ds:
        return ds.profiles.size

# File lists for every (split, product) combination. Training and validation
# files are searched recursively; test files are only taken from the
# directories whose name starts with '2010_'.

# -- cpcir
training_data_cpcir = glob.glob('/mnt/data_sun/ccic/dataset/training_data/cpcir/**/*nc', recursive=True)
validation_data_cpcir = glob.glob('/mnt/data_sun/ccic/dataset/validation_data/cpcir/**/*nc', recursive=True)
test_data_cpcir = glob.glob('/mnt/data_sun/ccic/dataset/test_data/cpcir/2010_*/*nc', recursive=True)

# -- cpcir2
training_data_cpcir2 = glob.glob('/mnt/data_sun/ccic/dataset/training_data/cpcir2/**/*nc', recursive=True)
validation_data_cpcir2 = glob.glob('/mnt/data_sun/ccic/dataset/validation_data/cpcir2/**/*nc', recursive=True)
test_data_cpcir2 = glob.glob('/mnt/data_sun/ccic/dataset/test_data/cpcir2/2010_*/*nc', recursive=True)

# -- gridsat
training_data_gridsat = glob.glob('/mnt/data_sun/ccic/dataset/training_data/gridsat/**/*nc', recursive=True)
validation_data_gridsat = glob.glob('/mnt/data_sun/ccic/dataset/validation_data/gridsat/**/*nc', recursive=True)
test_data_gridsat = glob.glob('/mnt/data_sun/ccic/dataset/test_data/gridsat/2010_*/*nc', recursive=True)

# (label, file list) pairs, in the order in which the totals are printed
data = [
    ("test_data_gridsat", test_data_gridsat),
    ("training_data_gridsat", training_data_gridsat),
    ("validation_data_gridsat", validation_data_gridsat),
    ("training_data_cpcir", training_data_cpcir),
    ("validation_data_cpcir", validation_data_cpcir),
    ("test_data_cpcir", test_data_cpcir),
    ("training_data_cpcir2", training_data_cpcir2),
    ("validation_data_cpcir2", validation_data_cpcir2),
    ("test_data_cpcir2", test_data_cpcir2),
]

# Read the profile count of every file in parallel (one worker per CPU) and
# print the total per (split, product) combination
n_workers = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=n_workers) as pool:
    for name, files in data:
        per_file_counts = pool.imap(return_profiles, files)
        profiles = list(tqdm.tqdm(per_file_counts, ncols=80, total=len(files)))
        print(name, sum(profiles))

## IWP distributions

In [None]:
# Bin edges: a first bin [0, 1e-6) followed by 99 logarithmically spaced bins
# between 1e-6 and 1e2
bins = np.concatenate(([0], np.logspace(-6, 2, num=100, endpoint=True)))

fig, ax = plt.subplots(figsize=(6, 4/1.25))

# Secondary y-axis showing how much each bin contributes to the total TIWP
ax_mass = ax.twinx()

for i, title in enumerate(['CPCIR', 'GridSat']):
    ds = ds_dict[title]
    # Scaled once and reused for both histograms; the 1e-3 factor presumably
    # converts to the kg m^-2 of the axis label -- TODO confirm stored units
    tiwp = ds.tiwp * 1e-3
    h, _, _ = ax.hist(tiwp, bins=bins, density=True, label=title, histtype='step', lw=2, color=f'C{i}')
    # Probability mass of the first bin [0, 1e-6)
    print(f'{title} (random)', h[0] * np.diff(bins)[0])
    # Share of the total TIWP falling in each bin. The weights are scaled by
    # 100 so that the values are percentages, as stated by the axis label.
    h_mass, _ = np.histogram(tiwp, bins=bins, weights=100 * ds.tiwp / ds.tiwp.sum())
    ax_mass.stairs(h_mass, bins, color=f'C{i}', lw=1)

# Raw strings: the LaTeX macros contain backslash sequences ('\s', '\k', '\g',
# '\p', '\m') that are not valid Python escapes in non-raw literals
ax.set_xlabel(r'TIWP [$\si{\kilo\gram\per\square\meter}$]')
ax.set_ylabel(r'PDF [$(\si{\kilo\gram\per\square\meter})^{-1}$]')
# Start the x-axis at the first non-zero edge, since zero cannot be shown on
# a logarithmic axis
ax.set_xlim(bins[1], None)
ax.set_xscale('log')
ax.axvline(1e-3, c='C2', ls='dashed')
ax.minorticks_on()
ax.grid(ls='dotted', alpha=0.3)

ax.set_ylim(1e-5, 3e2)
ax.set_yscale('log')

ax_mass.set_ylabel(r'Contribution to total TIWP [\si{\percent}]')

# Hand-built legend: colour encodes the product, line width the quantity
legend_elements = [
    matplotlib.lines.Line2D([0], [0], lw=0, marker='s', color='C0', label='CPCIR'),
    matplotlib.lines.Line2D([0], [0], lw=0, marker='s', color='C1', label='GridSat'),
    matplotlib.lines.Line2D([0], [0], lw=2, color='k', label='PDF'),
    matplotlib.lines.Line2D([0], [0], lw=1, color='k', label='Contribution')
]
# The legend anchor is given in data coordinates (bbox_transform=ax.transData)
ax.legend(handles=legend_elements, ncols=1,
          handlelength=0.5, columnspacing=1, handletextpad=0.5,
          bbox_transform=ax.transData,
          loc='lower left', bbox_to_anchor=(1e-3, 1e-4))
ax.text(.9e-3, 1e-3, 'clear-sky\nthreshold', color='C2', ha='right', weight='bold')

fig.tight_layout()

fig.savefig('iwp_distributions_training_set_log.pdf', bbox_inches='tight')

plt.show()

## IWC distributions

In [None]:
# Bin edges: a first bin [0, 1e-8) followed by 99 logarithmically spaced bins
# between 1e-8 and 15
log_edges = np.logspace(start=-8, stop=np.log10(15), num=100, base=10, endpoint=True)
bins = np.concatenate(([0], log_edges))

# Pre-compute histograms here to adjust the plot later
y_lim = 0
hist_data = {}
for title, ds in ds_dict.items():
    tiwc = ds.tiwc.values
    # Keys 0..19 index axis 1 of `tiwc`; key -1 holds the histogram of all values
    per_level = dict.fromkeys(range(-1, 20))
    for level in tqdm.tqdm(range(20), ncols=80):
        h, _ = np.histogram(tiwc[:, level].ravel(), bins=bins, density=True)
        per_level[level] = h
        # Track the largest density outside the first bin
        y_lim = max(y_lim, h[1:].max())
    h, _ = np.histogram(tiwc.ravel(), bins=bins, density=True)
    per_level[-1] = h
    y_lim = max(y_lim, h[1:].max())
    hist_data[title] = per_level
    print('Probability contained in first bin:', title, h[0] * np.diff(bins)[0])

In [None]:
# Discrete colormap: 20 colours sampled from batlow (one per level index)
# followed by grey for the histogram over all levels
level_colors = [list(cmcrameri.cm.batlow(x)) for x in np.linspace(0., 1., 20)]
color = matplotlib.colors.ListedColormap(
    level_colors + matplotlib.colors.to_rgba_array('grey').tolist()
)

fig, axs_ = plt.subplots(nrows=2, ncols=2, gridspec_kw={'width_ratios': [0.975, 0.025]})

# Replace the two narrow right-hand axes by a single one spanning both rows;
# it is switched off and only used as the anchor of the colorbar
gs = axs_[0,1].get_gridspec()
for ax in axs_[:,1]:
    ax.remove()
ax_legend = fig.add_subplot(gs[:,1])
ax_legend.set_axis_off()

axs = axs_[:,0]

# NOTE(review): these twin axes are created but never drawn on -- confirm
# whether they are still needed
axs_twins = [ax.twinx() for ax in axs]

for i, title in enumerate(['CPCIR', 'GridSat']):
    levels = hist_data[title]
    # Read the level values from the dataset of the product being plotted
    # instead of relying on a `ds` variable left over from a previous cell
    level_values = ds_dict[title].levels.values
    ax = axs[i]
    for level_idx, level_hist in tqdm.tqdm(levels.items(), ncols=80):
        if level_idx == -1:
            # The all-level histogram is drawn separately below
            continue
        ax.stairs(level_hist, edges=bins, label='{:} km'.format(str(level_values[level_idx] / 1e3)), zorder=level_idx, color=color(level_idx))
    ax.stairs(levels[-1], edges=bins, label='all altitudes', fill=True, zorder=0, color='grey')

    ax.set_title(f'({chr(97+i)}) {title}', loc='left', fontsize=14)
    ax.axvline(1e-7, c='C2', ls='dashed', label='clear-sky\nthreshold')
    ax.text(1.2e-7, 1e-2, 'clear-sky\nthreshold', c='C2', weight='bold')

for i, ax in enumerate(axs):
    # Raw strings: '\s', '\g', '\p', '\c' and '\m' are not valid Python escapes
    ax.set_xlabel(r'TIWC [$\si{\gram\per\cubic\meter}$]')
    ax.set_ylabel(r'PDF [$(\si{\gram\per\cubic\meter})^{-1}$]')
    # Start at the first non-zero edge: zero cannot be shown on a log axis
    ax.set_xlim(bins[1], None)
    ax.set_ylim(1e-8, 1e6)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.minorticks_on()
    ax.grid(ls='dotted', alpha=0.3)

cbar = fig.colorbar(plt.cm.ScalarMappable(cmap=color), ax=ax_legend, fraction=1)
# One tick at the centre of each of the 21 colour segments
cbar.set_ticks(np.linspace(0, 1, 21, endpoint=False) + 1/21/2 )
cbar.set_ticklabels([str(0.5 + k).rjust(4, '0') + r'$\,$\si{\kilo\meter}' for k in range(20)] + ['all'])
cbar.ax.set_title('Altitude')

fig.tight_layout()
fig.savefig('iwc_distributions_training_set.pdf', bbox_inches='tight')

# pyplot.show() only accepts the keyword argument `block`; it shows all open
# figures, so the figure must not be passed positionally
plt.show()

## Spatial distributions

In [None]:
# Typical call:
#   global_indices_to_coordinates(ds.profile_row_inds, ds.profile_column_inds, GRID)
# where `ds` is a collocation dataset and GRID the grid of the same product.
def global_indices_to_coordinates(row_idxs, col_idxs, grid):
    """Look up the grid coordinates of the given row and column indices.

    Args:
        row_idxs: Index (or indices) into ``grid.projection_y_coords``.
        col_idxs: Index (or indices) into ``grid.projection_x_coords``.
        grid: Object exposing ``projection_y_coords`` and
            ``projection_x_coords``.

    Returns:
        A tuple ``(lats, lons)``: the selected y coordinates followed by the
        selected x coordinates.
    """
    lats = grid.projection_y_coords[row_idxs]
    lons = grid.projection_x_coords[col_idxs]
    return lats, lons

# Reversed grey colormap; values below the colour range are drawn in white
cmap = plt.colormaps.get_cmap('gray_r').copy()
cmap.set_under('w')

# Two map panels plus a narrow third axes that only anchors the colorbar
fig, axs_ = plt.subplots(figsize=(8, 4.8), ncols=3, subplot_kw={'projection': ccrs.PlateCarree()}, gridspec_kw={'width_ratios': [1, 1, 0.1]})
axs = axs_[:2]

# Longitude/latitude bin edges of the density map, identical for both panels
delta = 2.5 # degrees
lon_edges = np.arange(-180, 180+delta, delta)
lat_edges = np.arange(-90, 90+delta, delta)

for i, grid_str in enumerate(['CPCIR', 'GridSat']):

    ax = axs[i]

    # Get grid
    grid = grid_dict[grid_str]

    # Make histogram computation easier: collapse the profiles to the unique
    # (row, column) index pairs and the number of profiles at each pair
    idx_coordinates, counts = np.unique(np.array([ds_dict[grid_str].profile_row_inds, ds_dict[grid_str].profile_column_inds]).T, return_counts=True, axis=0)
    lats, lons = global_indices_to_coordinates(idx_coordinates[:,0], idx_coordinates[:,1], grid)

    # density=True is not passed because the weights are not all equal.
    # Since all bins have the same size, the result is an unnormalized density.
    H, xedges, yedges = np.histogram2d(lons, lats, bins=[lon_edges, lat_edges], weights=counts)
    H = H.astype(int)
    # Transpose so that rows follow the y (latitude) edges, as pcolormesh expects
    H = H.T

    X, Y = np.meshgrid(xedges, yedges)
    # Empty cells are replaced by NaN so that they are not coloured
    im = ax.pcolormesh(X, Y, np.where(H == 0, np.nan, H), zorder=1, shading='flat', rasterized=True, cmap=cmap, transform=ccrs.PlateCarree())
    ax.gridlines(ls='dotted', alpha=0.75, draw_labels=False, zorder=0)
    ax.coastlines(color='C1', zorder=2, alpha=0.25)
    ax.set_global()
    ax.set_title(f'({chr(97+i)}) {grid_str}', loc='left', fontsize=14)
    if i == 1:
        # Site and region annotations are only added to the second panel
        kwargs_scatter = {
            'marker': '*',
            'transform': ccrs.PlateCarree(),
            'c':'C1',
            'edgecolors': 'k',
            'lw': 0.25,
            's': 8,
            'zorder': 100
        }
        kwargs_text = {
            'horizontalalignment': 'center',
            'fontsize': 'x-small',
            'bbox': {
                'boxstyle': 'round,pad=0.1',
                'fc': 'white',
                'alpha': 0.5
            },
            'zorder': 99,
            'color': 'C0',
            'weight': 'black',
            'transform': ccrs.PlateCarree()
        }
        # Palaiseau
        lon, lat = 2.212288546193281, 48.7157840155545
        ax.scatter(lon, lat, **kwargs_scatter)
        ax.text(lon, lat, r'\textbf{Palaiseau}', verticalalignment='bottom', **kwargs_text)

        # OLYMPEX
        lon, lat = -123, 47.5
        ax.text(lon, lat, r'\textbf{OLYMPEX}', verticalalignment='center', **kwargs_text)

        # Darwin
        lon, lat = 130.8444, -12.4637
        ax.text(lon, lat, '\n'.join([r'\textbf{Darwin}', r'\textbf{HAIC-HIWC}']), verticalalignment='center', **kwargs_text)

        # HAIC-HIWC
        rect = matplotlib.patches.Polygon(
            np.array([
              [-156.30, 2.93],
              [-156.30, 18.0],
              [-130, 18.0],
              [-130, 36],
              [-47, 36],
              [-47, 2.93],
              [-156.30, 2.93]
            ]),
            linewidth=0.25, alpha=0.5, facecolor='white', edgecolor='k', transform=ccrs.PlateCarree()
        )
        ax.add_patch(rect)
        # The polygon already provides a white background, so this label is
        # drawn without a text box. The box-less style is built as a new dict
        # so that the shared `kwargs_text` is not modified.
        kwargs_text_ = {k: v for k, v in kwargs_text.items() if k != 'bbox'}
        ax.text(-88.5, 20, r'\textbf{HAIC-HIWC}', verticalalignment='center', **kwargs_text_)

# The colorbar uses a mappable without an explicit norm, i.e. the 0-1 range
axs_[-1].set_axis_off()
cbar = fig.colorbar(plt.cm.ScalarMappable(cmap=cmap), ax=axs_[-1],
                    fraction=0.3, shrink=0.375)
cbar.set_label('collocations density normalized', fontsize='small', labelpad=-35)


fig.tight_layout()
fig.savefig('spatial_distributions_training_set_normalizeddensity.pdf', bbox_inches='tight', dpi=1200)
plt.close(fig)