In [1]:
import pandas as pd

In [2]:
# Configuration for assembling the NOAA weather CSVs into one table.
# data_extent and data_base_path are interpolated into the input directory,
# the per-year input file names, and the output file name.
data_extent = 'only'
data_base_path = '../data/NOAA_weather_data/'

# NOTE: range() excludes its stop value, so the last year loaded is
# end_year - 1 (1998 through 2014 with the values below).
start_year = 1998
end_year = 2015
data_years = range(start_year, end_year, 1)

# One CSV per (year, data type) is expected on disk; each entry below is the
# data type suffix used in the file name.
data_types = [
    'air.sfc',  # Surface air temp
    'air.2m',   # Air temp. at 2 meters above surface
    'apcp',     # Accumulated precipitation
    'crain',    # Categorical rain at surface
    'rhum.2m',  # Relative humidity 2 meters above surface
    'dpt.2m',   # Dew point temp. 2 meters above surface
    'pres.sfc', # Pressure at surface
    'uwnd.10m', # u component of wind (positive = from west) 10 meters above surface
    'vwnd.10m', # v component of wind (positive = from south) 10 meters above surface
    'veg',      # Vegetation at surface
    'dlwrf',    # Downward long wave radiation flux at surface
    'dswrf',    # Downward short wave radiation flux at surface
    'lcdc',     # Low cloud area fraction
    'hcdc',     # High cloud area fraction
    'mcdc',     # Medium cloud area fraction
    'hpbl',     # Planetary boundary layer height
    'prate',    # Precipitation rate
    'vis',      # Visibility (trailing comma required: adjacent string literals concatenate)
    'ulwrf.sfc' # Upward long wave radiation flux at surface
]

In [3]:
# Build one wide frame per year by outer-joining every data type's CSV on
# (lat, lon, time), then stack all years into master_df.
#
# The per-year frames are collected in a list and concatenated once at the
# end: calling pd.concat inside the loop recopies every previously loaded row
# on each iteration.
yearly_frames = []

for data_year in data_years:

    # All files for a given year share this prefix; only the data type differs.
    file_prefix = f'{data_base_path}california_{data_extent}/{data_year}_california_{data_extent}_'

    # read first dataset into dataframe so we have something to join with
    first_data_type = data_types[0]
    input_file = f'{file_prefix}{first_data_type}.csv'
    df = pd.read_csv(input_file)

    for data_type in data_types[1:]: # start loop on second data_type, first was used to initialize dataframe
        input_file = f'{file_prefix}{data_type}.csv'
        incoming_df = pd.read_csv(input_file)
        # Outer join keeps rows present in only one of the two inputs
        # (missing values become NaN).
        df = pd.merge(df, incoming_df, on=['lat', 'lon', 'time'], how='outer')

    yearly_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# years were configured (matches the previous result in that case).
master_df = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

In [4]:
# Finalize the combined table and write it to a single CSV.
# Steps, in order: discard the per-year row numbers left over from
# concatenation, index by (time, lat, lon), and replace missing values with 0.
master_df = (
    master_df
    .reset_index(drop=True)
    .set_index(['time', 'lat', 'lon'], drop=True)
    .fillna(0)
)

output_file = f'{data_base_path}{start_year}-{end_year}_california_{data_extent}_all.csv'
master_df.to_csv(output_file, header=True, index=True)

In [5]:
# Preview the first rows of the combined, (time, lat, lon)-indexed table.
master_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,air.sfc,air.2m,apcp,crain,rhum.2m,dpt.2m,pres.sfc,uwnd.10m,vwnd.10m,veg
time,lat,lon,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2005-01-01 00:00:00,40.29749,-124.3408,284.87308,283.2486,1.40625,1.0,77.47234,279.43463,98986.18,5.984521,3.811234,0.0
2005-01-01 00:00:00,38.96153,-123.5579,279.62308,279.4986,2.96875,1.0,93.15984,278.4112,98086.18,5.476708,4.858109,73.1
2005-01-01 00:00:00,39.2415,-123.6393,280.68558,281.1236,5.023438,1.0,82.22234,278.27057,98486.18,4.375146,2.998734,76.0
2005-01-01 00:00:00,39.52163,-123.7215,280.68558,281.1236,6.9375,1.0,82.22234,278.27057,98386.18,6.336083,4.592484,76.0
2005-01-01 00:00:00,39.80193,-123.8045,277.49808,277.4361,4.015625,0.0,92.84734,276.31744,95386.18,5.093896,3.061234,74.4


In [6]:
# Summarize row count, column dtypes and memory usage of the combined table.
# NOTE(review): the saved output below lists 10 data columns and a 2005-2014
# time range, which does not match the data_types / start_year configured at
# the top of the notebook - presumably from an earlier run; re-run to refresh.
master_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 11978560 entries, (2005-01-01 00:00:00, 40.29749, -124.3408) to (2014-12-31 21:00:00, 34.37577, -114.3645)
Data columns (total 10 columns):
air.sfc     float64
air.2m      float64
apcp        float64
crain       float64
rhum.2m     float64
dpt.2m      float64
pres.sfc    float64
uwnd.10m    float64
vwnd.10m    float64
veg         float64
dtypes: float64(10)
memory usage: 982.7+ MB
