In [13]:
from pathlib import Path
import csv

In [3]:
# Competition files live in a "DATA" folder that sits beside the current
# working directory; the folder is created if it is not there yet.
data_dir = Path.cwd().parent.joinpath('DATA')
data_dir.mkdir(exist_ok=True)

In [4]:
# Landing page of the competition and, derived from it, the page that
# hosts the downloadable data files.
datathon_url = "https://www.kaggle.com/competitions/widsdatathon2023"
data_page = f"{datathon_url}/data"

## Note:
The data files (.csv) are not fetched by this notebook; download them manually from the competition data page:  
<https://www.kaggle.com/competitions/widsdatathon2023/data>

In [9]:
# List every CSV in the data directory with its size, so that a missing or
# truncated download is easy to spot.
# Path.glob makes no promise about the order of its results, so the matches
# are sorted to keep the listing stable between runs and machines.
csv_files = sorted(data_dir.glob("*.csv"))
if not csv_files:
    # Nothing here downloads the files, so say where they were expected.
    print(f"No .csv files found in {data_dir}")
for data_file in csv_files:
    fil_sz = round(data_file.stat().st_size / 1024)  # bytes -> multiples of 1024 bytes
    print(f"- {data_file.name}, {fil_sz:,} kB")

- sample_solution.csv, 784 kB
- test_data.csv, 48,666 kB
- train_data.csv, 586,682 kB


In [12]:
# Path of the training set inside the data directory.
data_file = data_dir.joinpath("train_data.csv")

## Data Dictionary

The WiDS 2023 Datathon is using a subset of a pre-prepared dataset in which the variables were gathered from the following datasets (source of the WiDS Datathon dataset will be revealed after the competition closes):

Temperature: Daily maximum and minimum temperature measurements at 2 meters from 1979 onwards were obtained from NOAA’s Climate Prediction Center (CPC) Global Gridded Temperature dataset and converted to Celsius. The official contest target temperature variable is tmp2m = (tmax + tmin) / 2.

ftp://ftp.cpc.ncep.noaa.gov/precip/PEOPLE/wd52ws/global_temp/


Global precipitation: Daily precipitation data from 1979 onward were obtained from NOAA’s CPC Gauge-Based Analysis of Global Daily Precipitation [42] and converted to mm.

ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/GAUGE_GLB/RT/


U.S. precipitation: Daily U.S. precipitation data in mm were collected from the CPC Unified Gauge-Based Analysis of Daily Precipitation over CONUS. Measurements were replaced with sums over the ensuing two-week period.

https://www.esrl.noaa.gov/psd/thredds/catalog/Datasets/cpc_us_precip/catalog.html


Sea surface temperature and sea ice concentration: NOAA’s Optimum Interpolation Sea Surface Temperature (SST) dataset provides SST and sea ice concentration data, daily from 1981 to the present.

ftp://ftp.cdc.noaa.gov/Projects/Datasets/noaa.oisst.v2.highres/


Multivariate ENSO index (MEI): Bimonthly MEI values from 1949 to the present were obtained from NOAA/Earth System Research Laboratory. The MEI is a scalar summary of six variables (sea-level pressure, zonal and meridional surface wind components, SST, surface air temperature, and sky cloudiness) associated with El Niño/Southern Oscillation (ENSO), an ocean-atmosphere coupled climate mode.

https://www.esrl.noaa.gov/psd/enso/mei/


Madden-Julian oscillation (MJO): Daily MJO values since 1974 are provided by the Australian Government Bureau of Meteorology. MJO is a metric of tropical convection on daily to weekly timescales and can have a significant impact on the United States sub-seasonal climate. Measurements of phase and amplitude on the target date were extracted over the two-week period.

http://www.bom.gov.au/climate/mjo/graphics/rmm.74toRealtime.txt


Relative humidity, sea level pressure, and precipitable water for the entire atmosphere: NOAA’s National Center for Environmental Prediction (NCEP)/National Center for Atmospheric Research Reanalysis dataset contains daily relative humidity (rhum) near the surface (sigma level 0.995) from 1948 to the present and daily pressure at the surface (pres) from 1979 to the present.

ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis/surface/


Geopotential height, zonal wind, and longitudinal wind: To capture polar vortex variability, daily mean geopotential heights were obtained at 10 mb from the NCEP Reanalysis dataset.

ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis.dailyavgs/pressure/


North American Multi-Model Ensemble (NMME): The North American Multi-Model Ensemble (NMME) is a collection of physics-based forecast models from various modeling centers in North America. Forecasts issued monthly from the Cansips, CanCM3, CanCM4, CCSM3, CCSM4, GFDL-CM2.1-aer04, GFDL-CM2.5, FLOR-A06 and FLOR-B01, NASA-GMAO-062012, and NCEP-CFSv2 models were downloaded from the IRI/LDEO Climate Data Library. Each forecast contains monthly mean predictions from 0.5 to 8.5 months ahead.

https://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/


Pressure and potential evaporation: ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis/surface_gauss/


Elevation: http://research.jisao.washington.edu/data_sets/elevation/elev.1-deg.nc


Köppen-Geiger climate classifications: http://koeppen-geiger.vu-wien.ac.at/present.htm

In [11]:
# Data-dictionary lookup: each key is a column-name prefix (the part that
# precedes "__" in names such as "contest-pevpr-sfc-gauss-14d__pevpr") and
# each value is the human-readable description of that group of columns.
var_prefixes = {
    "contest-slp-14d": "file containing sea level pressure (slp)",
    "nmme0-tmp2m-34w": "file containing most recent monthly NMME model forecasts",
    "contest-pres-sfc-gauss-14d": "pressure",
    "mjo1d": "MJO phase and amplitude",
    "contest-pevpr-sfc-gauss-14d": "potential evaporation",
    "contest-wind-h850-14d": "geopotential height at 850 millibars",
    "contest-wind-h500-14d": "geopotential height at 500 millibars",
    "contest-wind-h100-14d": "geopotential height at 100 millibars",
    "contest-wind-h10-14d": "geopotential height at 10 millibars",
    "contest-wind-vwnd-925-14d": "longitudinal wind at 925 millibars",
    "contest-wind-vwnd-250-14d": "longitudinal wind at 250 millibars",
    "contest-wind-uwnd-250-14d": "zonal wind at 250 millibars",
    "contest-wind-uwnd-925-14d": "zonal wind at 925 millibars",
    "contest-rhum-sig995-14d": "relative humidity",
    "contest-prwtr-eatm-14d": "precipitable water for entire atmosphere",
    "nmme-prate-34w": "weeks 3-4 weighted average of monthly NMME model forecasts for precipitation",
    "nmme-prate-56w": "weeks 5-6 weighted average of monthly NMME model forecasts for precipitation",
    "nmme0-prate-56w": "weeks 5-6 weighted average of most recent monthly NMME model forecasts for precipitation",
    "nmme0-prate-34w": "weeks 3-4 weighted average of most recent monthly NMME model forecasts for precipitation",
    "nmme-tmp2m-34w": "weeks 3-4 weighted average of most recent monthly NMME model forecasts for target label, contest-tmp2m-14d__tmp2m",
    "nmme-tmp2m-56w": "weeks 5-6 weighted average of monthly NMME model forecasts for target label, contest-tmp2m-14d__tmp2m",
    "mei": "MEI (mei), MEI rank (rank), and Niño Index Phase (nip)",
    # Key previously carried a stray trailing colon ("elevation:"), which
    # made it the only key that could not match a bare prefix.
    "elevation": "elevation",
    "contest-precip-14d": "measured precipitation",
    "climateregions": "Köppen-Geiger climate classifications",
}

In [16]:
# Read only the first row of the file to get the column names, rather than
# loading a CSV of several hundred MB.
# newline="" is what the csv module asks for so that it can handle line
# endings (including newlines inside quoted fields) itself; the explicit
# encoding keeps the result independent of the platform's default.
with open(data_file, "r", newline="", encoding="utf-8") as csv_in:
    reader = csv.reader(csv_in)
    column_names = next(reader)

In [21]:
# Show the header fields read from the CSV file.
column_names

['index',
 'lat',
 'lon',
 'startdate',
 'contest-pevpr-sfc-gauss-14d__pevpr',
 'nmme0-tmp2m-34w__cancm30',
 'nmme0-tmp2m-34w__cancm40',
 'nmme0-tmp2m-34w__ccsm30',
 'nmme0-tmp2m-34w__ccsm40',
 'nmme0-tmp2m-34w__cfsv20',
 'nmme0-tmp2m-34w__gfdlflora0',
 'nmme0-tmp2m-34w__gfdlflorb0',
 'nmme0-tmp2m-34w__gfdl0',
 'nmme0-tmp2m-34w__nasa0',
 'nmme0-tmp2m-34w__nmme0mean',
 'contest-wind-h10-14d__wind-hgt-10',
 'nmme-tmp2m-56w__cancm3',
 'nmme-tmp2m-56w__cancm4',
 'nmme-tmp2m-56w__ccsm3',
 'nmme-tmp2m-56w__ccsm4',
 'nmme-tmp2m-56w__cfsv2',
 'nmme-tmp2m-56w__gfdl',
 'nmme-tmp2m-56w__gfdlflora',
 'nmme-tmp2m-56w__gfdlflorb',
 'nmme-tmp2m-56w__nasa',
 'nmme-tmp2m-56w__nmmemean',
 'contest-rhum-sig995-14d__rhum',
 'nmme-prate-34w__cancm3',
 'nmme-prate-34w__cancm4',
 'nmme-prate-34w__ccsm3',
 'nmme-prate-34w__ccsm4',
 'nmme-prate-34w__cfsv2',
 'nmme-prate-34w__gfdl',
 'nmme-prate-34w__gfdlflora',
 'nmme-prate-34w__gfdlflorb',
 'nmme-prate-34w__nasa',
 'nmme-prate-34w__nmmemean',
 'contest-wind-h