# Process Climate Data

Assembles all climate data files downloaded from NOAA's ISD-Lite dataset into a single Pandas DataFrame and
saves it to a compressed HDF5 output file.

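Pandas and NumPy are required, along with Pandas' optional dependency `tables` (PyTables), which is needed for the HDF5 output. Matplotlib and SciPy are also imported by the first cell, although the cells shown here do not use them. The `re`, `gzip` and `os` modules are part of the Python 3 standard library.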

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import gzip
import os
import re

For every file in the `data_dir` whose name matches `file_pattern`, load the data (via gzip) into Pandas as a fixed-width file. Perform the raw processing necessary to convert the raw integers into the proper scale and assemble the datetime objects. All DataFrames get concatenated together into one big DataFrame at the end.

In [12]:
# A compiled regex that matches ISD-Lite file names. The first group in the pattern captures the
# station ID (e.g. "727935-24234"); it is followed by a hyphen, four digits and the ".gz" suffix.
# A raw string is used so that `\.` is a regex escape rather than an invalid string escape.
file_pattern = re.compile(r'([0-9-]{12})-[0-9]{4}\.gz')

def do_process(data_dir):
    """Load every ISD-Lite file in ``data_dir`` into one combined DataFrame.

    Each file whose whole name matches ``file_pattern`` is read (through gzip)
    as a fixed-width file. The tenths-scaled integer columns are divided by 10,
    a ``date`` column is assembled from the year/month/day/hour columns, and
    the station ID captured from the file name is attached as ``station_id``.

    Parameters:
        data_dir: Path of the directory holding the ``.gz`` files. A trailing
            path separator is optional.

    Returns:
        A DataFrame with the columns ``station_id, date, temp, dewpt, slp_hpa,
        wind_dir, wind_speed, skycond, precip_1hr, precip_6hr``. Values read as
        -9999 are NaN, and a ``wind_dir`` of 0 is replaced by NaN.

    Raises:
        ValueError: If no file in ``data_dir`` matches ``file_pattern``.
    """
    column_names = ('year', 'month', 'day', 'hour', 'temp', 'dewpt', 'slp_hpa', 'wind_dir',
                    'wind_speed', 'skycond', 'precip_1hr', 'precip_6hr')
    # Widths must be specified! read_fwf cannot be left to infer them.
    column_widths = (4, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6)
    # Columns divided by 10 (adjustment per the ISD format documentation).
    scaled_cols = ('wind_speed', 'temp', 'dewpt', 'slp_hpa', 'precip_1hr', 'precip_6hr')
    date_cols = ['year', 'month', 'day', 'hour']

    dfs = []
    # Sort the listing so the row order of the result does not depend on the
    # arbitrary order in which os.listdir() returns entries.
    for f in sorted(os.listdir(data_dir)):
        # fullmatch() rejects names that merely start like a data file
        # (e.g. "...-1990.gz.bak"), which would not be valid gzip input.
        m = file_pattern.fullmatch(f)
        if not m:
            continue
        # os.path.join works whether or not data_dir ends with a separator.
        with gzip.open(os.path.join(data_dir, f)) as fp:
            df = pd.read_fwf(fp, names=column_names, na_values=-9999, widths=column_widths)
        for col in scaled_cols:
            df[col] /= 10
        # Parse the date info
        df['date'] = pd.to_datetime(df[date_cols])
        df['station_id'] = m.group(1)
        dfs.append(df)

    if not dfs:
        # pd.concat would raise a ValueError here too, but without saying which
        # directory or pattern was involved.
        raise ValueError(
            f"No files matching {file_pattern.pattern!r} found in {data_dir!r}")
    df = pd.concat(dfs, ignore_index=True)

    # Perform some cleanup to remove junk

    # Set wind_dir to NaN if it was coded as 0 (calm winds). mask() yields a float
    # column directly instead of assigning NaN into a possibly-integer column.
    df['wind_dir'] = df['wind_dir'].mask(df['wind_dir'] == 0)

    cols_in_order = ['station_id', 'date', 'temp', 'dewpt', 'slp_hpa', 'wind_dir', 'wind_speed',
                     'skycond', 'precip_1hr', 'precip_6hr']
    df = df[cols_in_order]

    return df

## All Station Data Since 1980

Process the initial 1980 to present data set

In [17]:
# Combine every station file under the 1980-to-present directory into one DataFrame.
# The bare name on the last line makes the notebook display the resulting frame.
all_since_1980 = do_process(data_dir="data/all-since-1980/")
all_since_1980

Unnamed: 0,station_id,date,temp,dewpt,slp_hpa,wind_dir,wind_speed,skycond,precip_1hr,precip_6hr
0,727935-24234,1990-01-01 00:00:00,7.8,6.1,,190.0,5.1,7.0,,
1,727935-24234,1990-01-01 01:00:00,8.3,6.7,,180.0,6.7,8.0,,
2,727935-24234,1990-01-01 02:00:00,7.8,6.1,,190.0,6.7,8.0,,
3,727935-24234,1990-01-01 03:00:00,7.8,6.1,,190.0,5.1,8.0,,
4,727935-24234,1990-01-01 04:00:00,7.8,6.1,,190.0,5.1,8.0,,
...,...,...,...,...,...,...,...,...,...,...
3674377,727970-94240,2018-12-31 19:00:00,1.7,0.6,1032.7,70.0,1.5,0.0,0.0,
3674378,727970-94240,2018-12-31 20:00:00,2.8,0.6,1032.3,80.0,2.1,0.0,0.0,
3674379,727970-94240,2018-12-31 21:00:00,4.4,1.1,1031.6,,0.0,2.0,0.0,
3674380,727970-94240,2018-12-31 22:00:00,6.1,1.7,1031.8,,0.0,,0.0,


Save the data to a file for analysis. See AnalyzeAllStationsSince1980.ipynb

HDF is used because it is a convenient compressed format. A CSV of this data would be over 200 MB.

In [18]:
# Write the combined frame to a zlib-compressed (level 9) HDF5 file in "table" format.
# `key` is passed by keyword because pandas has deprecated positional arguments to
# to_hdf() other than the output path; 'table' is the documented spelling of 't'.
all_since_1980.to_hdf('data/climate_data_all_since_1980.h5', key='all1980',
                      index=False, format='table', complib='zlib', complevel=9)

## Long Run Data

In [19]:
# Combine every station file under the long-run directory into one DataFrame.
# The bare name on the last line makes the notebook display the resulting frame.
longrun = do_process(data_dir="data/long-run-1960/")
longrun

Unnamed: 0,station_id,date,temp,dewpt,slp_hpa,wind_dir,wind_speed,skycond,precip_1hr,precip_6hr
0,727935-24234,1990-01-01 00:00:00,7.8,6.1,,190.0,5.1,7.0,,
1,727935-24234,1990-01-01 01:00:00,8.3,6.7,,180.0,6.7,8.0,,
2,727935-24234,1990-01-01 02:00:00,7.8,6.1,,190.0,6.7,8.0,,
3,727935-24234,1990-01-01 03:00:00,7.8,6.1,,190.0,5.1,8.0,,
4,727935-24234,1990-01-01 04:00:00,7.8,6.1,,190.0,5.1,8.0,,
...,...,...,...,...,...,...,...,...,...,...
2473842,727970-94240,2018-12-31 19:00:00,1.7,0.6,1032.7,70.0,1.5,0.0,0.0,
2473843,727970-94240,2018-12-31 20:00:00,2.8,0.6,1032.3,80.0,2.1,0.0,0.0,
2473844,727970-94240,2018-12-31 21:00:00,4.4,1.1,1031.6,,0.0,2.0,0.0,
2473845,727970-94240,2018-12-31 22:00:00,6.1,1.7,1031.8,,0.0,,0.0,


In [20]:
# Write the long-run frame to a zlib-compressed (level 9) HDF5 file in "table" format.
# `key` is passed by keyword because pandas has deprecated positional arguments to
# to_hdf() other than the output path; 'table' is the documented spelling of 't'.
longrun.to_hdf('data/climate_data_longrun1960.h5', key='longrun',
               index=False, format='table', complib='zlib', complevel=9)