# Investigation of NOAA's wind data
[The National Oceanic and Atmospheric Administration](https://www.noaa.gov) (NOAA) maintains a network of weather stations called [Automated Surface Observing Systems](http://www.hurricanescience.org/science/observation/landbased/automatedsurfaceobssystems/) (ASOS). One-minute and five-minute interval data from these stations are available at these FTP sites:
* ftp://ftp.ncdc.noaa.gov/pub/data/asos-onemin/  
* ftp://ftp.ncdc.noaa.gov/pub/data/asos-fivemin/  

The structure of the five-minute interval data is [explained in this pdf](ftp://ftp.ncdc.noaa.gov/pub/data/documentlibrary/tddoc/td6401b.pdf) from NOAA. 


### Libraries and installs

In [1]:
import pandas as pd
import pandas_profiling
import numpy as np 
import json
import os
import datetime
import re
from fastparquet import write
from matplotlib import pyplot as plt 
import gmplot

import warnings
# Suppress every warning raised from here on in the notebook session
warnings.filterwarnings('ignore')

# Let DataFrame displays show up to 500 columns before truncating
pd.options.display.max_columns = 500

### Data Folder Instructions

In [2]:
# Point the variable 'datafolder' at the base data folder on your local machine.
# Keep only your own path active and comment out everyone else's.
# NOAA data is assumed to be in a subfolder called 'noaa':
# e.g. if the base data folder is '/users/data', NOAA data should be in '/users/data/noaa'

# Angshuman's local path
datafolder = '/Users/apaul2/Documents/_Common/capstone/Project/data'

The ASOS data for the current month was downloaded from ftp://ftp.ncdc.noaa.gov/pub/download/hidden/onemin/ and stored locally. The next few cells read the five-minute data files for September 2019 for each station in the bounding box, parse the records, and split the result into one file per day for analysis.

In [21]:
# Call signs of the stations inside the 35 < lat < 40 and -125 < lon < -120 bounding box
station_list = (
    "KAPC KBLU KCCR KHWD KLVK KMAE KMCE KMOD KMRY KMYV KNUQ KOAK KOVE "
    "KPRB KSAC KSBP KSCK KSFO KSJC KSMF KSNS KSTS KUKI KVCB KWVI"
).split()

In [23]:
# ******* This cell downloads a month of data per station over FTP and can take a long time ******
# (close to 2 hours was observed when the files were read one line per chunk)
#
# Collect the raw September 2019 five-minute records of every station in
# `station_list` as strings in `lines`.

lines = [] # an array of each read line, across all stations
for station in station_list:
    filepath = "ftp://ftp.ncdc.noaa.gov/pub/data/asos-fivemin/6401-2019/64010{}201909.dat".format(station)
    try:
        # Read in large chunks instead of building one DataFrame per line.
        # As before, only the first comma-separated column of each line is kept.
        for chunk in pd.read_csv(filepath_or_buffer=filepath, encoding='utf-8', header=None, chunksize=10000):
            lines.extend(chunk.iloc[:, 0].tolist())
    except Exception as err:
        # Best effort: a station whose file is missing or unreadable is skipped,
        # but reported instead of silently dropped. Catching Exception (not a bare
        # except) lets an interrupt stop this long-running cell.
        print("SKIPPING STATION {}: {}".format(station, err))

In [25]:
def createNOAAdf(lines, fileName):
    """Parse raw ASOS records into a DataFrame and save it as a parquet file.

    Each record is a whitespace-separated string. The first six chunks sit at
    fixed positions (WBAN number + call sign, call sign + packed date/time,
    timestamp, interval, call sign, Zulu time). The optional report modifier
    and the wind group that follow are located by inspecting chunk contents.

    Parameters
    ----------
    lines : iterable of str
        Raw records, one string per observation.
    fileName : str
        Base name (without extension) of the parquet file written to
        ``<datafolder>/noaa/``. ``datafolder`` is the notebook-level variable.

    Returns
    -------
    pandas.DataFrame
        One row per record, with the columns listed in ``col_names``. Parsed
        fields are kept as strings, the flags are booleans and ``num_fields``
        is the number of whitespace-separated chunks in the record.

    Notes
    -----
    Records that end before the wind fields are kept with empty wind values;
    an "OUT OF DATA" message and the record are printed for each of them.
    """

    # Defined before the loop so the DataFrame can be built even when `lines` is empty
    col_names = ["wban_number", "call_sign", "call_sign2", "year", "month", "day", "hour", "minute", "rec_length", "date", "timestamp", "interval", "call_sign3",
                 "zulu_time", "report_modifier", "wind_data", "wind_direction", "wind_speed", "gusts", "gust_speed", "variable_winds", "variable_wind_info", "sys_maint_reqd", "num_fields"]

    # e.g. 180V240 = wind direction varies from 180 to 240 degrees
    variable_wind_pattern = re.compile(r'[0-9][0-9][0-9]V[0-9][0-9][0-9]')

    # split lines and data chunks
    data = [] # an array of arrays, inner arrays are all data for one record, outer array is all records
    for line in lines:

        # reset the optional fields to their defaults for every record
        Report_Modifier = ''
        Wind_Data = False
        Variable_Winds = False
        Gusts = False
        Wind_Direction = ''
        Wind_Speed = ''
        Gust_Speed = ''
        Variable_Wind_Info = ''
        System_Maintenance_Reqd = False

        line = line.split() # take string of one record's data and split into space separated chunks
        WBAN_Number = line[0][0:5] # The WBAN (Weather Bureau, Army, Navy) number is a unique 5-digit number
        Call_Sign = line[0][5:] # The call sign is a location identifier, three or four characters in length
        suffix = line[1][-2:] # grab the last two digits that are the year (i.e. 19 for 2019)
        Year = '20'+suffix # in YYYY format
        # Split on the FIRST occurrence of the year only. The packed date/time digits
        # can contain the year again (e.g. day 20 followed by hour 19 reads "2019"),
        # and splitting on every occurrence would truncate the date fields.
        CallSign_Date = line[1].split(Year, 1)
        Call_Sign2 = CallSign_Date[0] # this seems to be the same as Call_Sign but without initial letter
        Date = CallSign_Date[1]
        Month = Date[0:2] # in MM format
        Day = Date[2:4] # in DD format
        Hour = Date[4:6] # in HH format
        Minute = Date[6:8] # Observations are recorded on whole five-minute increments (i.e. 00,05,10,...,50,55)
        Record_Length = Date[8:11] # presumably the length of the record - TODO confirm against the NOAA format document
        Date = Date[11:] # MM/DD/YY format
        Timestamp = line[2] # in HH:MM:SS format
        Interval = line[3] # should be 5-MIN as opposed to 1-MIN
        Call_Sign3 = line[4] # the call sign is repeated a third time in the record
        Zulu_Time = line[5] # Zulu Time, or military time, or UTC

        # after this point, data could be missing/optional and data positions are not fixed
        currIndx = 6
        try:
            Next_Data = line[currIndx]
            if not any(x in Next_Data for x in ['KT','SM']):
                Report_Modifier = Next_Data # AUTO for fully automated report, COR for correction to a previously disseminated report
                currIndx += 1
            Next_Data = line[currIndx]
            if "KT" in Next_Data:
                Wind_Data = True
                Wind_Direction = Next_Data[0:3] # in tens of degrees from true north
                if Wind_Direction == 'VRB':
                    Variable_Winds = True
                Wind_Speed = Next_Data[3:5] # in whole knots (two digits)
                if Next_Data[5] == 'G':
                    Gusts = True
                    Gust_Speed = Next_Data[6:8] # speed in whole knots (two digits)
        except IndexError:
            # the record ended (or the wind chunk was too short) before the expected field
            print("OUT OF DATA AT FIELD {}".format(currIndx))
            print(line)
        finally:
            # always step past the wind position, whether or not it was present
            currIndx += 1

        try:
            Next_Data = line[currIndx]
            if Wind_Data and variable_wind_pattern.fullmatch(Next_Data):
                Variable_Wind_Info = Next_Data
                Variable_Winds = True
        except IndexError:
            print("OUT OF DATA AT FIELD {}".format(currIndx))
            print(line)

        # a trailing '$' chunk flags that the station needs maintenance
        if line[-1] == '$':
            System_Maintenance_Reqd = True

        Num_Fields = len(line)
        data.append([WBAN_Number, Call_Sign, Call_Sign2, Year, Month, Day, Hour, Minute, Record_Length, Date, Timestamp, Interval, Call_Sign3, Zulu_Time,
                     Report_Modifier, Wind_Data, Wind_Direction, Wind_Speed, Gusts, Gust_Speed, Variable_Winds, Variable_Wind_Info, System_Maintenance_Reqd, Num_Fields])

    sample_df = pd.DataFrame(data, columns = col_names)

    # save Dataframe to file
    parquet_file = "{}/noaa/{}.parquet".format(datafolder, fileName)
    write(parquet_file, sample_df,compression='GZIP')

    return sample_df

In [26]:
# Parse the downloaded records into a DataFrame; this also writes
# <datafolder>/noaa/Sep2019.parquet as a side effect.
noaa_df = createNOAAdf(lines, 'Sep2019')

OUT OF DATA AT FIELD 8
['23293KSJC', 'SJC20190919142504909/19/19', '14:25:31', '5-MIN', 'KSJC', '192225Z', 'AUTO', 'RVRNO']
OUT OF DATA AT FIELD 8
['23277KWVI', 'WVI20190907043504909/07/19', '04:35:32', '5-MIN', 'KWVI', '071235Z', 'AUTO', 'RVRNO']


In [29]:
# Keep only the rows whose wind speed is not the non-numeric value 'T'
has_usable_wind_speed = noaa_df['wind_speed'] != 'T'
noaa_df = noaa_df[has_usable_wind_speed]

### Merge lat long data for stations

In [30]:
# Load the station data that an earlier step saved to parquet
station_data_path = "{}/noaa/uniq_station_data.parquet".format(datafolder)
unique_station_df = pd.read_parquet(station_data_path)

In [31]:
# Attach the station data to every observation by WBAN number.
# Inner join: observations with no matching station row are dropped.
merged_noaa_df = pd.merge(
    left=noaa_df,
    right=unique_station_df,
    how='inner',
    on='wban_number',
)

In [33]:
# Cast the columns that hold numbers from strings to numeric dtypes
numeric_columns = ['wind_speed','gust_speed','lat','lon']
converted = merged_noaa_df[numeric_columns].apply(pd.to_numeric)
merged_noaa_df[numeric_columns] = converted

In [45]:
# Get data for bounding box (35 < lat < 40 and -125 < lon < -120)
in_bounding_box = (
    (merged_noaa_df.lat > 35) & (merged_noaa_df.lat < 40)
    & (merged_noaa_df.lon > -125) & (merged_noaa_df.lon < -120)
)
# .copy() makes the subset an independent frame, so adding a column below is not
# an assignment on a filtered slice of merged_noaa_df (SettingWithCopyWarning).
bay_noaa_df = merged_noaa_df[in_bounding_box].copy()
bay_noaa_df.reset_index(inplace=True, drop=True)

# Integer key made of the year, month, day, hour and minute strings joined together
# (e.g. 201909010005). Concatenating whole columns avoids a Python-level join per row.
bay_noaa_df['datetime'] = (
    bay_noaa_df['year'] + bay_noaa_df['month'] + bay_noaa_df['day']
    + bay_noaa_df['hour'] + bay_noaa_df['minute']
).astype('int64')
bay_noaa_df.head()

Unnamed: 0,wban_number,call_sign,call_sign2,year,month,day,hour,minute,rec_length,date,timestamp,interval,call_sign3,zulu_time,report_modifier,wind_data,wind_direction,wind_speed,gusts,gust_speed,variable_winds,variable_wind_info,sys_maint_reqd,num_fields,lat,lon,datetime
0,93227,KAPC,APC,2019,9,1,0,0,106,09/01/19,00:00:31,5-MIN,KAPC,010800Z,AUTO,True,180,4.0,False,,False,,False,19,38.21,-122.285,201909010000
1,93227,KAPC,APC,2019,9,1,0,5,106,09/01/19,00:05:31,5-MIN,KAPC,010805Z,AUTO,True,200,6.0,False,,False,,False,19,38.21,-122.285,201909010005
2,93227,KAPC,APC,2019,9,1,0,10,106,09/01/19,00:10:31,5-MIN,KAPC,010810Z,AUTO,True,240,5.0,False,,False,,False,19,38.21,-122.285,201909010010
3,93227,KAPC,APC,2019,9,1,0,15,106,09/01/19,00:15:31,5-MIN,KAPC,010815Z,AUTO,True,260,4.0,False,,False,,False,19,38.21,-122.285,201909010015
4,93227,KAPC,APC,2019,9,1,0,20,106,09/01/19,00:20:31,5-MIN,KAPC,010820Z,AUTO,True,260,3.0,False,,False,,False,19,38.21,-122.285,201909010020


In [46]:
# Two-digit day-of-month strings, '01' through '30'
days_list = ['{:02d}'.format(day_number) for day_number in range(1, 31)]

In [48]:
# Write one parquet file per day, without the columns that only served to
# identify the day and time of each observation
columns_to_drop = ['year', 'month','day','hour','minute','date','timestamp']
for day in days_list:
    datestr = '09/{}/19'.format(day)
    parquet_file = "{}/noaa/asos_201909{}.parquet".format(datafolder, day)
    is_this_day = bay_noaa_df.date == datestr
    dly_noaa_df = bay_noaa_df[is_this_day].drop(columns_to_drop, axis=1)
    write(parquet_file, dly_noaa_df,compression='GZIP')