# NWS Prep
This notebook provides download links for NWS data then ingests the data by station/year from html files.

It handles cropping extra hours from the year (due to time zones) and some data clean-up.

Finally, it outputs a pkl ready to be analyzed and used.

---
**Imports**

In [1]:
import pandas as pd

**Function to get appropriate file names and url's**

In [2]:
def getPaths(site, startYr):
    endYr = startYr + 1
    hours = '72'
    units = 'english'
    chart = 'off'
    headers = 'none'
    obs = 'tabular'
    hourly = 'true'
    pview = 'full'
    history = 'yes'
    start = str(startYr) + '0101'
    end = str(endYr) + '0101'

    url = f'https://www.weather.gov/wrh/timeseries?site={site}&hours={hours}&units={units}&chart={chart}&headers={headers}&obs={obs}&hourly={hourly}&pview={pview}&history={history}&start={start}&end={end}'

    file_path = f'{site}-{startYr}.html'

    return {'file_path':file_path,'url':url}

## Get Each Site Wx Data

This lists every link we must load, and what to save it as.

Would love a programmatic way, but need async http calls...

In [3]:
# Earliest dates to pull:
# JVEMT: 2019
# S11MT: 2019
# SNSLP: 2007, but no 2013-2014!
# SH7MT: 2019
# SH4MT: 2019
# MRPMT: 2019

sites_years = {'JVEMT': 2019, 'S11MT': 2019, 'SNSLP': 2007, 'SH7MT': 2019, 'SH4MT': 2019, 'MRPMT': 2019}
files = set()

for site, year in sites_years.items():    
    for start in range(year, 2022):
        paths = getPaths(site, start)
        files.add(paths['file_path'])
        print(paths['file_path'],'=',paths['url'],'\n')

JVEMT-2019.html = https://www.weather.gov/wrh/timeseries?site=JVEMT&hours=72&units=english&chart=off&headers=none&obs=tabular&hourly=true&pview=full&history=yes&start=20190101&end=20200101 

JVEMT-2020.html = https://www.weather.gov/wrh/timeseries?site=JVEMT&hours=72&units=english&chart=off&headers=none&obs=tabular&hourly=true&pview=full&history=yes&start=20200101&end=20210101 

JVEMT-2021.html = https://www.weather.gov/wrh/timeseries?site=JVEMT&hours=72&units=english&chart=off&headers=none&obs=tabular&hourly=true&pview=full&history=yes&start=20210101&end=20220101 

S11MT-2019.html = https://www.weather.gov/wrh/timeseries?site=S11MT&hours=72&units=english&chart=off&headers=none&obs=tabular&hourly=true&pview=full&history=yes&start=20190101&end=20200101 

S11MT-2020.html = https://www.weather.gov/wrh/timeseries?site=S11MT&hours=72&units=english&chart=off&headers=none&obs=tabular&hourly=true&pview=full&history=yes&start=20200101&end=20210101 

S11MT-2021.html = https://www.weather.gov/wrh

In [4]:
# SNSLP 2013 has no data, must remove

files.discard('SNSLP-2013.html')

**Create dictionary of sites (keys) with list (value) as tuple of file name and dataframe per year**

In [5]:
# CAUTION: this cell takes 3 minutes to run!

sites_wx = {'JVEMT': [], 'S11MT': [], 'SNSLP': [], 'SH7MT': [], 'SH4MT': [], 'MRPMT': []}

# remove 1 file, forgot which? # snowslip 2013 no data

for file in files:
    print(f'Getting {file}')
    df = pd.read_html('../data/nws_obs/' + file)[0] # b/c read_html gives a list of df's, and we have just one
    df['site'] = file[:5] # parse site abbreviation from first 5 chars
    df['file_name'] = file
    sites_wx[file[:5]].append((file, df))

Getting S11MT-2020.html
Getting SNSLP-2021.html
Getting SNSLP-2014.html
Getting SNSLP-2009.html
Getting S11MT-2021.html
Getting SNSLP-2010.html
Getting SNSLP-2008.html
Getting MRPMT-2021.html
Getting JVEMT-2021.html
Getting MRPMT-2020.html
Getting SNSLP-2019.html
Getting SNSLP-2017.html
Getting SNSLP-2020.html
Getting MRPMT-2019.html
Getting SNSLP-2012.html
Getting SH7MT-2020.html
Getting SNSLP-2015.html
Getting SH7MT-2019.html
Getting SH4MT-2020.html
Getting SH7MT-2021.html
Getting SNSLP-2018.html
Getting SNSLP-2007.html
Getting SNSLP-2011.html
Getting SH4MT-2021.html
Getting S11MT-2019.html
Getting SH4MT-2019.html
Getting JVEMT-2019.html
Getting JVEMT-2020.html
Getting SNSLP-2016.html


In [45]:
dfs = []

for site in sites_wx:
  for yrs in sites_wx[site]:
    df = yrs[1].copy().iloc[17:-7] # trim hours due to GMT
    year = yrs[0][6:10]
    df['year'] = year # parse from file name
    dfs.append(df)

all_wx = pd.concat(dfs)
all_wx

Unnamed: 0,Date/Time (L),Temp. (°F),DewPoint(°F),RelativeHumidity(%),HeatIndex(°F),WindChill(°F),WindDirection,WindSpeed(mph),1 HourPrecip(in),SnowDepth(in),...,Snowfall24 Hour(in),site,file_name,year,Sea LevelPressure(mb),StationPressure(in Hg),AltimeterSetting(in Hg),SolarRadiation(W/m²),PercentPossible(%),Unnamed: 1
17,"Dec 31, 11:00 pm",-12.0,-16.0,80.0,,,E,2G5,0.0,18.9,...,0.0,JVEMT,JVEMT-2021.html,2021,,,,,,
18,"Dec 31, 10:00 pm",-10.0,-14.0,81.0,,,E,2G5,0.0,18.7,...,0.0,JVEMT,JVEMT-2021.html,2021,,,,,,
19,"Dec 31, 9:00 pm",-7.0,-11.0,82.0,,,E,2G5,0.0,19.1,...,0.0,JVEMT,JVEMT-2021.html,2021,,,,,,
20,"Dec 31, 8:00 pm",-5.0,-9.0,81.0,,,E,1G3,0.0,19.1,...,0.0,JVEMT,JVEMT-2021.html,2021,,,,,,
21,"Dec 31, 7:00 pm",-2.0,-7.0,79.0,,,E,1G3,0.0,19.1,...,0.0,JVEMT,JVEMT-2021.html,2021,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1855,"Oct 15, 11:00 pm",49.0,,,,,SW,14G31,,0.0,...,,MRPMT,MRPMT-2019.html,2019,,24.69,29.93,,,
1856,"Oct 15, 10:00 pm",49.0,,,,,SW,13G29,,0.0,...,,MRPMT,MRPMT-2019.html,2019,,24.70,29.95,,,
1857,"Oct 15, 9:00 pm",48.0,,,,,SW,10G25,,0.0,...,,MRPMT,MRPMT-2019.html,2019,,24.72,29.97,,,
1858,"Oct 15, 8:00 pm",48.0,,,,,SSW,10G19,,0.0,...,,MRPMT,MRPMT-2019.html,2019,,24.74,29.99,,,


In [46]:
# rename columns, watch out for misordering due to file names in unordered set!
new_cols = {
    all_wx.columns[0]: "dt",
    all_wx.columns[1]: "temp",
    'DewPoint(°F)': "dew_pt",
    'RelativeHumidity(%)': "rH",
    'HeatIndex(°F)': "heat_idx",
    'WindChill(°F)': "wind_chill",
    'WindDirection': "wind_dir",
    'WindSpeed(mph)' : "wind_spd",
    'SnowDepth(in)': "snow_depth",
    'Snowfall3 hour(in)': "snowfall_3hr",
    'Snowfall6 Hour(in)': "snowfall_6hr",
    'Snowfall24 Hour(in)': "snowfall_24hr",
    'Sea LevelPressure(mb)': "sea_lvl_press",
    'StationPressure(in Hg)': "sta_press",
    'AltimeterSetting(in Hg)': "altimeter_setting",
    'SolarRadiation(W/m²)': "solar_radiation",
    'PercentPossible(%)': "pct_possible",
    '1 HourPrecip(in)': "hr_precip"
}

all_wx.rename(columns=new_cols, inplace = True)
all_wx.drop(columns='Unnamed: 1', inplace=True)
all_wx.sample(4)

Unnamed: 0,dt,temp,dew_pt,rH,heat_idx,wind_chill,wind_dir,wind_spd,hr_precip,snow_depth,...,snowfall_6hr,snowfall_24hr,site,file_name,year,sea_lvl_press,sta_press,altimeter_setting,solar_radiation,pct_possible
5158,"Mar 28, 1:00 pm",39.0,18.0,43.0,,30.0,SSW,18G39,,77.3,...,0.0,0.0,SH7MT,SH7MT-2021.html,2021,,,,401.0,40 %
2890,"Aug 28, 9:00 pm",60.0,36.0,41.0,,,WSW,12G23,,,...,,,SNSLP,SNSLP-2014.html,2014,1000.85,23.08,29.94,0.0,--
3338,"Jun 17, 3:00 am",42.0,36.0,79.0,,,NNW,1G5,0.0,0.0,...,0.0,0.0,SH4MT,SH4MT-2021.html,2021,,,,,
6057,"Feb 19, 2:00 am",20.0,,,,8.0,SW,12G23,,31.0,...,0.0,0.7,MRPMT,MRPMT-2021.html,2021,,24.58,29.8,,


In [48]:
all_wx.dt = [f'{d}, {y}' for d, y in zip(all_wx.dt, all_wx.year)]
all_wx.dt = pd.to_datetime(all_wx.dt, format='%b %d, %I:%M %p, %Y', errors='coerce') # parse the datetime
all_wx.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 155159 entries, 17 to 1859
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   dt                 155159 non-null  datetime64[ns]
 1   temp               155159 non-null  float64       
 2   dew_pt             136839 non-null  float64       
 3   rH                 136783 non-null  float64       
 4   heat_idx           2958 non-null    float64       
 5   wind_chill         58968 non-null   float64       
 6   wind_dir           147187 non-null  object        
 7   wind_spd           155159 non-null  object        
 8   hr_precip          34541 non-null   float64       
 9   snow_depth         87075 non-null   float64       
 10  snowfall_3hr       85961 non-null   float64       
 11  snowfall_6hr       85612 non-null   float64       
 12  snowfall_24hr      85358 non-null   float64       
 13  site               155159 non-null  object   

In [49]:
all_wx.reset_index(drop=True, inplace=True)
all_wx

Unnamed: 0,dt,temp,dew_pt,rH,heat_idx,wind_chill,wind_dir,wind_spd,hr_precip,snow_depth,...,snowfall_6hr,snowfall_24hr,site,file_name,year,sea_lvl_press,sta_press,altimeter_setting,solar_radiation,pct_possible
0,2021-12-31 23:00:00,-12.0,-16.0,80.0,,,E,2G5,0.0,18.9,...,0.0,0.0,JVEMT,JVEMT-2021.html,2021,,,,,
1,2021-12-31 22:00:00,-10.0,-14.0,81.0,,,E,2G5,0.0,18.7,...,0.0,0.0,JVEMT,JVEMT-2021.html,2021,,,,,
2,2021-12-31 21:00:00,-7.0,-11.0,82.0,,,E,2G5,0.0,19.1,...,0.1,0.0,JVEMT,JVEMT-2021.html,2021,,,,,
3,2021-12-31 20:00:00,-5.0,-9.0,81.0,,,E,1G3,0.0,19.1,...,0.1,0.0,JVEMT,JVEMT-2021.html,2021,,,,,
4,2021-12-31 19:00:00,-2.0,-7.0,79.0,,,E,1G3,0.0,19.1,...,0.1,0.0,JVEMT,JVEMT-2021.html,2021,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155154,2019-10-15 23:00:00,49.0,,,,,SW,14G31,,0.0,...,0.0,,MRPMT,MRPMT-2019.html,2019,,24.69,29.93,,
155155,2019-10-15 22:00:00,49.0,,,,,SW,13G29,,0.0,...,0.0,,MRPMT,MRPMT-2019.html,2019,,24.70,29.95,,
155156,2019-10-15 21:00:00,48.0,,,,,SW,10G25,,0.0,...,0.0,,MRPMT,MRPMT-2019.html,2019,,24.72,29.97,,
155157,2019-10-15 20:00:00,48.0,,,,,SSW,10G19,,0.0,...,0.0,,MRPMT,MRPMT-2019.html,2019,,24.74,29.99,,


In [51]:
import pandas_profiling

# profile = all_wx.profile_report()
# profile.to_file('all_wx-profile.html')

In [None]:
all_wx.to_pickle("pkl/all_wx.pkl")  