# NWS Prep
This notebook provides download links for NWS data, then ingests the data by station/year from the saved HTML files.

It handles cropping extra hours from the year (due to time zones) and some data clean-up.

Finally, it outputs a pickle (`.pkl`) file ready to be analyzed and used.

---
**Imports**

In [None]:
from pathlib import Path

import pandas as pd

**Function to get the appropriate file names and URLs**

In [None]:
def getPaths(site, startYr):
    """Build the download URL and local file name for one station-year.

    Parameters
    ----------
    site
        Station identifier (e.g. ``'JVEMT'``), used both in the URL's
        ``site`` query parameter and as the file-name prefix.
    startYr
        Year to request. The requested window runs from January 1 of this
        year to January 1 of the following year.

    Returns
    -------
    dict
        ``'file_path'`` holds ``'<site>-<startYr>.html'`` and ``'url'`` holds
        the weather.gov time-series link for that window.
    """
    # Query parameters in the exact order they appear in the link.
    query = {
        'site': site,
        'hours': '72',
        'units': 'english',
        'chart': 'off',
        'headers': 'none',
        'obs': 'tabular',
        'hourly': 'true',
        'pview': 'full',
        'history': 'yes',
        'start': f'{startYr}0101',
        'end': f'{startYr + 1}0101',
    }
    query_string = '&'.join(f'{key}={value}' for key, value in query.items())

    return {
        'file_path': f'{site}-{startYr}.html',
        'url': f'https://www.weather.gov/wrh/timeseries?{query_string}',
    }

## Get Each Site Wx Data

This lists every link we must load, and what to save it as.

A programmatic download would be preferable, but that would need async HTTP calls, so each link has to be opened and saved by hand.

In [None]:
# First year to pull for each station:
#   JVEMT: 2019
#   S11MT: 2019
#   SNSLP: 2007, but no 2013-2014!
#   SH7MT: 2019
#   SH4MT: 2019
#   MRPMT: 2019

sites_years = {
    'JVEMT': 2019,
    'S11MT': 2019,
    'SNSLP': 2007,
    'SH7MT': 2019,
    'SH4MT': 2019,
    'MRPMT': 2019,
}
files = set()

# Print one "file name = url" line per station-year and remember every file name.
for site, first_year in sites_years.items():
    for start in range(first_year, 2023):
        entry = getPaths(site, start)
        file_name, link = entry['file_path'], entry['url']
        files.add(file_name)
        print(file_name, '=', link, '\n')

In [None]:
# SNSLP has no data for 2013, so take that file out of the set
# (a no-op if it is not present)
files -= {'SNSLP-2013.html'}

**Create a dictionary keyed by site, where each value is a list of (file name, DataFrame) tuples, one per year**

In [None]:
# CAUTION: this cell takes 3 minutes to run!

# One list per station; each entry is a (file name, DataFrame) tuple for one year.
sites_wx = {'JVEMT': [], 'S11MT': [], 'SNSLP': [], 'SH7MT': [], 'SH4MT': [], 'MRPMT': []}

# `files` is a set, and the iteration order of a set of strings can differ
# from one kernel session to the next. Sorting the names makes the load order
# reproducible, so anything derived from it (row order, column order after
# concatenation) is the same on every Restart & Run All.
for file in sorted(files):
    print(f'Getting {file}')
    site = file[:5] # parse site abbreviation from first 5 chars
    df = pd.read_html('../raw_data/nws/' + file)[0] # b/c read_html gives a list of df's, and we have just one
    df['site'] = site
    df['file_name'] = file
    sites_wx[site].append((file, df))

In [None]:
# Build one trimmed frame per station-year, then stack them into a single frame.
dfs = []

for yearly_frames in sites_wx.values():
    for file_name, raw in yearly_frames:
        trimmed = raw.copy().iloc[17:-7] # trim hours due to GMT
        trimmed['year'] = file_name[6:10] # parse from file name
        dfs.append(trimmed)

all_wx = pd.concat(dfs)
all_wx

In [None]:
# rename columns, watch out for misordering due to file names in unordered set!
# The first two columns are renamed by position; all others by their header text.
first_col = all_wx.columns[0]
second_col = all_wx.columns[1]

new_cols = {
    first_col: "dt",
    second_col: "temp",
    'DewPoint(°F)': "dew_pt",
    'RelativeHumidity(%)': "rH",
    'HeatIndex(°F)': "heat_idx",
    'WindChill(°F)': "wind_chill",
    'WindDirection': "wind_dir",
    'WindSpeed(mph)' : "wind_spd",
    'SnowDepth(in)': "snow_depth",
    'Snowfall3 hour(in)': "snowfall_3hr",
    'Snowfall6 Hour(in)': "snowfall_6hr",
    'Snowfall24 Hour(in)': "snowfall_24hr",
    'Sea LevelPressure(mb)': "sea_lvl_press",
    'StationPressure(in Hg)': "sta_press",
    'AltimeterSetting(in Hg)': "altimeter_setting",
    'SolarRadiation(W/m²)': "solar_radiation",
    'PercentPossible(%)': "pct_possible",
    '1 HourPrecip(in)': "hr_precip"
}

all_wx = all_wx.rename(columns=new_cols)
all_wx = all_wx.drop(columns='Unnamed: 1')
all_wx.sample(4)

In [None]:
# Append the year (taken from the file name) to each timestamp string so the
# value carries a full date, then convert the column to datetimes.
stamped = [f'{d}, {y}' for d, y in zip(all_wx['dt'], all_wx['year'])]
all_wx['dt'] = stamped
all_wx['dt'] = pd.to_datetime(all_wx['dt'], format='%b %d, %I:%M %p, %Y', errors='coerce') # parse the datetime
all_wx.info()

In [None]:
# Discard the row labels carried over from the concatenated frames and
# number the rows from 0.
all_wx = all_wx.reset_index(drop=True)
all_wx

In [None]:
# NOTE(review): this import is only needed for the optional profiling report
# below, whose lines are currently commented out. Uncomment them to write
# 'all_wx-profile.html'.
import pandas_profiling

# profile = all_wx.profile_report()
# profile.to_file('all_wx-profile.html')

In [None]:
# Write the cleaned frame to disk. Create the output folder first so the
# export does not fail when the 'pkl' directory is missing (e.g. on a fresh
# checkout), after the slow ingest has already run.
Path("pkl").mkdir(parents=True, exist_ok=True)
all_wx.to_pickle("pkl/all_wx.pkl")