In [None]:
# Install NetCDF and related packages
!pip install netCDF4 xarray numpy matplotlib cartopy pandas

# NetCDF Tutorial - Climate & Geoscience Data Standard

**NetCDF** is the standard for climate and geoscience data:
- **Self-describing**: Metadata (units, names, history) is stored alongside the data, typically following the CF conventions
- **Multidimensional**: Perfect for gridded data (time, lat, lon, level)
- **Interoperable**: Works with all major climate tools
- **Efficient**: Optimized for scientific array access patterns

Perfect for: weather data, oceanography, satellite imagery, climate models.

In [None]:
import os
import time
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import netCDF4 as nc
import numpy as np
import pandas as pd
import xarray as xr  # Modern NetCDF interface
# Silence all library warnings so the tutorial output stays readable.
warnings.filterwarnings(action='ignore')

# Report the versions of the two NetCDF interfaces used in this notebook.
print("\n".join(
    f"{label} version: {module.__version__}"
    for label, module in (("NetCDF4", nc), ("xarray", xr))
))

## 🌍 Creating Climate Data

In [None]:
# Generate a synthetic but physically plausible climate dataset.
# The global seed makes every random draw below reproducible.
np.random.seed(42)

# Coordinate dimensions
ntime = 365  # Daily data for 1 year
nlat = 180   # 1-degree latitude grid
nlon = 360   # 1-degree longitude grid
nlevel = 10  # Pressure levels

# Create coordinate arrays
time_base = datetime(2023, 1, 1)
times = [time_base + timedelta(days=i) for i in range(ntime)]
# Numeric time axis; the same units/calendar strings are written to the file later.
time_numeric = nc.date2num(times, 'days since 1900-01-01', calendar='gregorian')

latitudes = np.linspace(-89.5, 89.5, nlat)     # Cell centres, south to north
longitudes = np.linspace(-179.5, 179.5, nlon)  # Cell centres, west to east
pressure_levels = [1000, 925, 850, 700, 600, 500, 400, 300, 250, 200]  # hPa
assert len(pressure_levels) == nlevel, "nlevel must match the number of pressure levels"

# Generate realistic temperature data
# Base temperature with latitude gradient
lat_grid, lon_grid = np.meshgrid(latitudes, longitudes, indexing='ij')
base_temp = 30 - 0.6 * np.abs(lat_grid)  # Cooler at poles

# Add seasonal cycle
temp_data = np.zeros((ntime, nlat, nlon))
for t in range(ntime):
    day_of_year = t + 1
    # Cosine peaking at day 172 (around the June solstice)
    seasonal = 10 * np.cos(2 * np.pi * (day_of_year - 172) / 365)
    # Different seasonal amplitude by latitude
    seasonal_field = seasonal * (1 - 0.5 * np.abs(lat_grid) / 90)
    # Hemisphere phase shift: the southern hemisphere has the opposite season
    seasonal_field[lat_grid < 0] *= -1

    temp_data[t, :, :] = base_temp + seasonal_field + np.random.normal(0, 2, (nlat, nlon))

# Generate precipitation (more realistic pattern)
# Mid-latitude storm tracks do not depend on time, so compute them once.
storm_rain = 5 * (np.exp(-((lat_grid - 45) / 15)**2) + np.exp(-((lat_grid + 45) / 15)**2))

precip_data = np.zeros((ntime, nlat, nlon))
for t in range(ntime):
    # ITCZ pattern: a tropical rain band that migrates +/-10 degrees over the year
    itcz_lat = 10 * np.sin(2 * np.pi * t / 365)
    itcz_rain = 20 * np.exp(-((lat_grid - itcz_lat) / 10)**2)

    base_precip = itcz_rain + storm_rain
    precip_data[t, :, :] = np.maximum(0, base_precip + np.random.exponential(2, (nlat, nlon)))

# Generate 3D atmospheric data (temperature by pressure level)
# NOTE: this float64 array is ntime * nlevel * nlat * nlon * 8 bytes (about 1.9 GB).
temp_3d = np.zeros((ntime, nlevel, nlat, nlon))
for p, pres in enumerate(pressure_levels):
    # Pressure -> altitude with the International Standard Atmosphere barometric
    # formula (troposphere): h = 44.33 km * (1 - (p / p0) ** 0.1903), p0 = 1013.25 hPa.
    # This gives ~0.1 km at 1000 hPa and ~11.8 km at 200 hPa.
    altitude_km = 44.33 * (1 - (pres / 1013.25) ** 0.1903)
    # Temperature decreases with altitude at the standard lapse rate (-6.5 K/km)
    temp_offset = -6.5 * altitude_km
    temp_3d[:, p, :, :] = temp_data + temp_offset

print(f"Generated climate data:")
print(f"  2D Temperature: {temp_data.shape} (time, lat, lon)")
print(f"  2D Precipitation: {precip_data.shape}")
print(f"  3D Temperature: {temp_3d.shape} (time, level, lat, lon)")
print(f"  Total size: {(temp_data.nbytes + precip_data.nbytes + temp_3d.nbytes)/1024**2:.1f} MB")

## 📁 Creating NetCDF with CF Conventions

In [None]:

# Write the synthetic fields to a NetCDF4 file with CF-style metadata.
filename = 'climate_data.nc'

with nc.Dataset(filename, 'w', format='NETCDF4') as ncfile:
    # Global attributes (CF convention)
    ncfile.title = 'Synthetic Global Climate Data'
    ncfile.institution = 'Python Climate Workshop'
    ncfile.source = 'Synthetic data generated for demonstration'
    ncfile.history = f'Created on {datetime.now().isoformat()}'
    ncfile.Conventions = 'CF-1.8'
    ncfile.comment = 'Educational dataset demonstrating NetCDF structure'

    # Create dimensions (all fixed-size here)
    ncfile.createDimension('time', ntime)
    ncfile.createDimension('lat', nlat)
    ncfile.createDimension('lon', nlon)
    ncfile.createDimension('level', nlevel)

    # Coordinate variables: one 1-D variable per dimension, named after it
    times_var = ncfile.createVariable('time', 'f8', ('time',))
    times_var[:] = time_numeric
    # Must match the units/calendar used when time_numeric was encoded
    times_var.units = 'days since 1900-01-01'
    times_var.calendar = 'gregorian'
    times_var.long_name = 'time'
    times_var.standard_name = 'time'

    lats_var = ncfile.createVariable('lat', 'f4', ('lat',))
    lats_var[:] = latitudes
    lats_var.units = 'degrees_north'
    lats_var.long_name = 'latitude'
    lats_var.standard_name = 'latitude'

    lons_var = ncfile.createVariable('lon', 'f4', ('lon',))
    lons_var[:] = longitudes
    lons_var.units = 'degrees_east'
    lons_var.long_name = 'longitude'
    lons_var.standard_name = 'longitude'

    levels_var = ncfile.createVariable('level', 'f4', ('level',))
    levels_var[:] = pressure_levels
    levels_var.units = 'hPa'
    levels_var.long_name = 'pressure level'
    levels_var.standard_name = 'air_pressure'
    levels_var.positive = 'down'  # Pressure increases towards the surface

    # Data variables. The fill value can only be set when the variable is created.
    fv = np.float32(-999.0)

    temp_var = ncfile.createVariable(
        'temperature', 'f4', ('time', 'lat', 'lon'),
        zlib=True, complevel=6, shuffle=True, fill_value=fv
    )
    temp_var[:] = temp_data.astype(np.float32)
    temp_var.units = 'degrees_celsius'
    temp_var.long_name = '2-meter air temperature'
    temp_var.standard_name = 'air_temperature'
    temp_var.cell_methods = 'time: mean'

    precip_var = ncfile.createVariable(
        'precipitation', 'f4', ('time', 'lat', 'lon'),
        zlib=True, complevel=6, shuffle=True, fill_value=fv
    )
    precip_var[:] = precip_data.astype(np.float32)
    # 'mm/day' is a liquid-water-equivalent depth per unit time, so the matching CF
    # standard name is 'lwe_precipitation_rate' (canonical units m s-1).
    # 'precipitation_flux' would require mass-flux units (kg m-2 s-1).
    precip_var.units = 'mm/day'
    precip_var.long_name = 'daily precipitation'
    precip_var.standard_name = 'lwe_precipitation_rate'
    precip_var.cell_methods = 'time: mean'  # A daily rate is a time mean, not a sum

    temp_3d_var = ncfile.createVariable(
        'temperature_3d', 'f4', ('time', 'level', 'lat', 'lon'),
        zlib=True, complevel=6, shuffle=True,
        chunksizes=(10, 5, 45, 90), fill_value=fv
    )
    temp_3d_var[:] = temp_3d.astype(np.float32)
    temp_3d_var.units = 'degrees_celsius'
    temp_3d_var.long_name = 'air temperature at pressure levels'
    temp_3d_var.standard_name = 'air_temperature'
    temp_3d_var.coordinates = 'level lat lon'

    print(f"NetCDF file created: {filename}")
    print(f"Variables: {list(ncfile.variables.keys())}")

# Size on disk is only meaningful once the file has been closed and flushed.
print(f"File size: {os.path.getsize(filename)/1024**2:.1f} MB")


## 🔍 Reading NetCDF Data

In [None]:
# Reading with netCDF4 (low-level): inspect the file structure and load a few arrays.
print("Reading with netCDF4:")
print("=" * 30)

with nc.Dataset(filename, 'r') as ncfile:
    print(f"Global attributes:")
    for attr in ncfile.ncattrs():
        print(f"  {attr}: {getattr(ncfile, attr)}")

    print(f"\nDimensions:")
    for dim in ncfile.dimensions:
        print(f"  {dim}: {len(ncfile.dimensions[dim])}")

    print(f"\nVariables:")
    for var in ncfile.variables:
        v = ncfile.variables[var]
        print(f"  {var}: {v.shape} {v.dtype}")
        if hasattr(v, 'units'):
            print(f"    units: {v.units}")

    # Read sample data. Slicing with [:] loads the values into memory, so they
    # remain usable after the file is closed.
    time_var = ncfile.variables['time']
    temp = ncfile.variables['temperature'][:]
    lats = ncfile.variables['lat'][:]
    lons = ncfile.variables['lon'][:]
    # Use a distinct name so the list of datetimes called `times`, created during
    # data generation, is not overwritten with raw numeric values.
    time_values = time_var[:]

    # Decode with the calendar stored in the file rather than assuming the default.
    calendar = getattr(time_var, 'calendar', 'standard')
    first_date = nc.num2date(time_values[0], time_var.units, calendar=calendar)
    last_date = nc.num2date(time_values[-1], time_var.units, calendar=calendar)

    print(f"\nData ranges:")
    print(f"  Temperature: {temp.min():.1f} to {temp.max():.1f} °C")
    print(f"  Time range: {first_date} to {last_date}")

In [None]:
# Open the same file through xarray, which exposes it as a labelled Dataset.
print("\nReading with xarray:", "=" * 30, sep="\n")

# `ds` stays open: later cells read from it, and the final cell closes it.
ds = xr.open_dataset(filename)
print(ds)

# Short summary of what the dataset contains.
print(
    "\nDataset info:",
    f"  Coordinates: {list(ds.coords)}",
    f"  Data variables: {list(ds.data_vars)}",
    f"  Global attrs: {len(ds.attrs)} attributes",
    sep="\n",
)

## 🗺️ Visualizing Climate Data

In [None]:
# Visualize the climate data in a 2x2 overview figure.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Global temperature map (annual mean)
# Latitude is stored south-to-north, so row 0 is the southernmost band.
# origin='lower' puts that row at the bottom, matching the extent; the default
# origin='upper' would draw the map upside down.
temp_annual = ds['temperature'].mean(dim='time')
im1 = axes[0,0].imshow(temp_annual, extent=[-180, 180, -90, 90],
                      origin='lower', cmap='RdYlBu_r', aspect='equal')
axes[0,0].set_title('Annual Mean Temperature')
axes[0,0].set_xlabel('Longitude')
axes[0,0].set_ylabel('Latitude')
plt.colorbar(im1, ax=axes[0,0], label='°C')

# Annual precipitation
precip_annual = ds['precipitation'].mean(dim='time')
im2 = axes[0,1].imshow(precip_annual, extent=[-180, 180, -90, 90],
                      origin='lower', cmap='Blues', aspect='equal')
axes[0,1].set_title('Annual Mean Precipitation')
axes[0,1].set_xlabel('Longitude')
axes[0,1].set_ylabel('Latitude')
plt.colorbar(im2, ax=axes[0,1], label='mm/day')

# Time series at specific location (e.g., New York: 40.7°N, 74°W)
# method='nearest' snaps the request to the closest grid-cell centre.
ny_temp = ds['temperature'].sel(lat=40.5, lon=-74, method='nearest')
ny_precip = ds['precipitation'].sel(lat=40.5, lon=-74, method='nearest')

ax3 = axes[1,0]
ax3.plot(ny_temp.time, ny_temp, 'r-', linewidth=2, label='Temperature')
ax3.set_xlabel('Time')
ax3.set_ylabel('Temperature (°C)', color='r')
ax3.tick_params(axis='y', labelcolor='r')
ax3.set_title('Time Series near New York')

# Second y-axis sharing the same time axis for precipitation
ax3_twin = ax3.twinx()
ax3_twin.bar(ny_precip.time, ny_precip, alpha=0.6, color='blue', width=0.8, label='Precipitation')
ax3_twin.set_ylabel('Precipitation (mm/day)', color='b')
ax3_twin.tick_params(axis='y', labelcolor='b')

# Vertical temperature profile (first time step, Jan 1, at the grid cell nearest the equator)
temp_profile = ds['temperature_3d'].isel(time=0).sel(lat=0, lon=0, method='nearest')
axes[1,1].plot(temp_profile, temp_profile.level, 'g-', linewidth=2, marker='o')
axes[1,1].set_xlabel('Temperature (°C)')
axes[1,1].set_ylabel('Pressure (hPa)')
axes[1,1].set_title('Vertical Temperature Profile (Jan 1, Equator)')
axes[1,1].invert_yaxis()  # Higher altitude at top
axes[1,1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 📊 Advanced NetCDF Analysis with xarray

In [None]:
# Demonstrate xarray's powerful analysis capabilities
print("Advanced Climate Analysis with xarray:")
print("=" * 40)

# 1. Seasonal climatology (one mean field per season: DJF, MAM, JJA, SON)
seasonal_temp = ds['temperature'].groupby('time.season').mean()
print(f"Seasonal climatology shape: {seasonal_temp.shape}")

# 2. Global averages
# Weight by cosine of latitude for proper area averaging.
# .weighted().mean() normalises by the weights summed over BOTH lat and lon;
# dividing a lat/lon sum by the latitude-only weight sum would inflate the
# result by the number of longitudes.
weights = np.cos(np.deg2rad(ds.lat))
weights.name = 'weights'
global_temp = ds['temperature'].weighted(weights).mean(dim=['lat', 'lon'])

# 3. Monthly anomalies (each day minus the mean of its calendar month)
monthly_clim = ds['temperature'].groupby('time.month').mean()
monthly_anom = ds['temperature'].groupby('time.month') - monthly_clim

# 4. Regional statistics
# Select tropical region (30°S to 30°N)
tropical = ds.sel(lat=slice(-30, 30))
tropical_mean_temp = tropical['temperature'].mean(dim=['lat', 'lon'])

print(f"\nAnalysis results:")
print(f"  Global mean temperature: {float(global_temp.mean()):.1f} °C")
print(f"  Tropical mean temperature: {float(tropical_mean_temp.mean()):.1f} °C")
print(f"  Temperature range: {float(ds['temperature'].min()):.1f} to {float(ds['temperature'].max()):.1f} °C")

# Visualize analysis
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Seasonal temperature differences (JJA minus DJF)
# origin='lower' because latitude runs south-to-north in the array; without it
# the hemispheres would be drawn swapped.
season_diff = seasonal_temp.sel(season='JJA') - seasonal_temp.sel(season='DJF')
im1 = axes[0,0].imshow(season_diff, extent=[-180, 180, -90, 90],
                      origin='lower', cmap='RdBu_r', vmin=-20, vmax=20, aspect='equal')
axes[0,0].set_title('Summer - Winter Temperature Difference')
plt.colorbar(im1, ax=axes[0,0], label='°C')

# Global temperature time series
axes[0,1].plot(global_temp.time, global_temp, 'b-', linewidth=2)
axes[0,1].set_title('Global Mean Temperature')
axes[0,1].set_ylabel('Temperature (°C)')
axes[0,1].grid(True, alpha=0.3)

# Monthly temperature anomaly at a point
point_anom = monthly_anom.sel(lat=45, lon=0, method='nearest')
axes[1,0].plot(point_anom.time, point_anom, 'r-', linewidth=1)
axes[1,0].axhline(0, color='black', linestyle='--', alpha=0.5)
axes[1,0].set_title('Monthly Temperature Anomalies (45°N, 0°E)')
axes[1,0].set_ylabel('Anomaly (°C)')
axes[1,0].grid(True, alpha=0.3)

# Zonal mean temperature (averaged over time and longitude)
zonal_temp = ds['temperature'].mean(dim=['time', 'lon'])
axes[1,1].plot(zonal_temp, zonal_temp.lat, 'g-', linewidth=2)
axes[1,1].set_xlabel('Temperature (°C)')
axes[1,1].set_ylabel('Latitude')
axes[1,1].set_title('Zonal Mean Temperature')
axes[1,1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 💾 NetCDF Best Practices

In [None]:
# Demonstrate chunking and compression strategies.
# For each chunk layout: write a compressed test file, record its size, then time
# one map read (a single time step) and one point time-series read.
print("NetCDF Optimization Strategies:")
print("=" * 35)

# Test different chunking strategies for time series data
test_data = np.random.randn(365, 100, 100).astype(np.float32)

# Chunk shapes are (time, y, x). A read is fast when it touches few chunks.
strategies = {
    'time_chunks': (50, 100, 100),    # Split along time; whole maps per chunk -> fast map reads
    'spatial_chunks': (365, 25, 25),  # Split in space; full time axis per chunk -> fast point series
    'balanced': (30, 50, 50),         # Balanced approach
    'auto': None                      # Let NetCDF decide
}

for name, chunks in strategies.items():
    test_file = f'test_{name}.nc'

    try:
        with nc.Dataset(test_file, 'w', format='NETCDF4') as ncfile:
            ncfile.createDimension('time', 365)
            ncfile.createDimension('y', 100)
            ncfile.createDimension('x', 100)

            # chunksizes=None is the library default and lets NetCDF pick the layout.
            var = ncfile.createVariable('data', 'f4', ('time', 'y', 'x'),
                                        zlib=True, complevel=6,
                                        chunksizes=chunks)
            var[:] = test_data

        file_size = os.path.getsize(test_file) / 1024**2

        # Test read performance. perf_counter is a monotonic, high-resolution clock
        # intended for measuring short durations.
        with nc.Dataset(test_file, 'r') as ncfile:
            start = time.perf_counter()
            _ = ncfile.variables['data'][0, :, :]  # Read one time slice
            spatial_time = time.perf_counter() - start

            start = time.perf_counter()
            _ = ncfile.variables['data'][:, 50, 50]  # Read time series at point
            temporal_time = time.perf_counter() - start

        print(f"{name:15s}: {file_size:5.1f} MB | Spatial: {spatial_time:.3f}s | Temporal: {temporal_time:.3f}s")
    finally:
        # Remove the temporary file even if writing or reading failed.
        if os.path.exists(test_file):
            os.remove(test_file)

print(f"\n💡 Chunking Guidelines:")
print(f"   • Time-series analysis: Make chunks long in time, small in space")
print(f"   • Spatial (map) analysis: Make chunks wide in lat/lon, short in time")
print(f"   • Balanced: Use moderate chunks for mixed access")
print(f"   • File size: Compression effectiveness varies by chunk size")

In [None]:
# Demonstrate NetCDF data export/conversion
print("\nData Export Options:")
print("=" * 20)

# Export to different formats using xarray
sample_data = ds.isel(time=slice(0, 10))  # First 10 days

export_files = ['sample_data.nc', 'point_timeseries.csv', 'sample_classic.nc']

try:
    # 1. Save subset as new NetCDF
    sample_data.to_netcdf('sample_data.nc')
    print(f"NetCDF subset: {os.path.getsize('sample_data.nc')/1024:.1f} KB")

    # 2. Export to CSV (for specific location)
    # Only the surface (time, lat, lon) variables are exported. Including
    # 'temperature_3d' would add a 'level' index and repeat every surface value
    # once per pressure level.
    point_data = (
        ds[['temperature', 'precipitation']]
        .sel(lat=40, lon=-74, method='nearest')
        .to_dataframe()
    )
    point_data.to_csv('point_timeseries.csv')
    print(f"CSV export: {os.path.getsize('point_timeseries.csv')/1024:.1f} KB")

    # 3. Convert to different NetCDF format
    sample_data.to_netcdf('sample_classic.nc', format='NETCDF3_CLASSIC')
    print(f"NetCDF3 format: {os.path.getsize('sample_classic.nc')/1024:.1f} KB")
finally:
    # Clean up, even if one of the exports above failed part-way
    for f in export_files:
        if os.path.exists(f):
            os.remove(f)

print(f"\n🔄 Integration Tips:")
print(f"   • Use xarray for analysis, netCDF4 for low-level control")
print(f"   • Follow CF conventions for metadata")
print(f"   • Add comprehensive attributes and documentation")
print(f"   • Use appropriate chunking for your access patterns")
print(f"   • Enable compression for most scientific datasets")

## 📋 NetCDF Quick Reference

In [None]:
# Clean up: close the xarray handle first so the file is no longer in use,
# then delete the tutorial file.
ds.close()
if os.path.exists(filename):
    os.remove(filename)
    print(f"Cleaned up: {filename}")

# Printed cheat sheet. The snippets are meant to be copy-paste correct:
# every dimension used is created, the fill value is set at creation time
# (netCDF4 does not allow assigning _FillValue afterwards), and the chunking
# examples state which access pattern each shape favours.
reference = """
NETCDF QUICK REFERENCE:

Creating NetCDF (netCDF4):
  import netCDF4 as nc
  with nc.Dataset('file.nc', 'w') as f:
      f.createDimension('time', None)  # Unlimited
      f.createDimension('lat', 180)
      f.createDimension('lon', 360)
      var = f.createVariable('temp', 'f4', ('time', 'lat', 'lon'),
                            zlib=True, complevel=6,
                            fill_value=-999.0)  # Missing value, creation-time only
      var[:] = data
      var.units = 'degrees_celsius'

Reading NetCDF (xarray - recommended):
  import xarray as xr
  ds = xr.open_dataset('file.nc')
  temp = ds['temperature']                            # Access variable
  subset = ds.sel(lat=45, lon=0, method='nearest')    # Select by coordinate
  annual = ds.groupby('time.year').mean()             # Group operations

CF Convention Attributes:
  var.standard_name = 'air_temperature'     # CF standard name
  var.long_name = 'Air Temperature'         # Descriptive name
  var.units = 'degrees_celsius'             # Units string
  var.coordinates = 'lat lon'               # Coordinate variables
  createVariable(..., fill_value=-999.0)    # Missing value (_FillValue)

Performance Optimization:
  # Chunking for time-series access at a point (long in time, small in space)
  chunksizes=(time_size, 10, 10)
  
  # Chunking for map access (one time step, whole grid)
  chunksizes=(1, lat_size, lon_size)
  
  # Compression
  zlib=True, complevel=6, shuffle=True
  
  # Data types
  Use 'f4' instead of 'f8' when precision allows

xarray Analysis Patterns:
  ds.mean(dim='time')              # Temporal mean
  ds.sel(lat=slice(-30, 30))       # Regional selection
  ds.groupby('time.season').mean() # Seasonal climatology
  ds.resample(time='1MS').mean()   # Monthly resampling

Best Practices:
  • Follow CF conventions for interoperability
  • Use descriptive variable and attribute names
  • Include comprehensive metadata
  • Choose appropriate chunking for access patterns
  • Enable compression for most datasets
  • Use unlimited time dimension for extensibility
"""

print(reference)
print("\n🌍 NetCDF: The standard for climate and geoscience data! 📡")