# Investigation of NOAA's wind data
[The National Oceanic and Atmospheric Administration](https://www.noaa.gov) maintains a series of weather stations called [Automated Surface Observation Systems](http://www.hurricanescience.org/science/observation/landbased/automatedsurfaceobssystems/) (ASOS). They offer one-minute and five-minute interval data at these FTP sites:
* ftp://ftp.ncdc.noaa.gov/pub/data/asos-onemin/  
* ftp://ftp.ncdc.noaa.gov/pub/data/asos-fivemin/  

The structure of the five-minute interval data is [explained in this pdf](ftp://ftp.ncdc.noaa.gov/pub/data/documentlibrary/tddoc/td6401b.pdf) from NOAA. 


### Libraries and installs

In [14]:
import pandas as pd
import pandas_profiling
import numpy as np 
import json
import os
import datetime
import re
from fastparquet import write
from matplotlib import pyplot as plt 
import gmplot

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)

### Data Folder Instructions

In [5]:
# Use this cell to specify the paths for the data folder in your local machines
# Use the variable 'datafolder' to specify the path
# Comment out all the data paths except your own
# NOAA data ia assumed to be in a subfolder called 'noaa' 
# For example, if the base data folder is '/users/data', noaa data should be in '/users/data/noaa'

# Angshuman's local path
datafolder = "/Users/apaul2/Documents/_Common/capstone/Project/data"

The ASOS data for current month was downloaded from ftp://ftp.ncdc.noaa.gov/pub/download/hidden/onemin/ and stored locally. The next few cells read all the data files for the current month and filter out the data for a single day for analysis.

In [19]:
dirpath = "{}/noaa/fmd_201909".format(datafolder)
entries = os.listdir(dirpath)
lines = [] # an array of each read line
for entry in entries:
    filepath = os.path.join(dirpath, entry)
    try:
        for line in pd.read_csv(filepath_or_buffer=filepath , encoding='utf-8', header=None, chunksize=1):
            lines.append(line.iloc[0,0])
    except:
        pass

In [21]:
print("ASOS data for September has ",len(lines)," records as of 3 PM 09/29")

ASOS data for September has  6168774  records as of 3 PM 09/29


In [26]:
def createNOAAdf(lines, fileName):
    """ Helper function to process noaa data"""
    
    # split lines and data chunks
    data = [] # an array of arrays, inner arrays are all data for one record, outer array is all records
    for line in lines:

        # reset any variables if needed
        record = [] 
        Report_Modifier = ''
        Wind_Data = False 
        Variable_Winds = False
        Gusts = False
        Wind_Direction = ''
        Wind_Speed = ''
        Gust_Speed = ''
        Variable_Wind_Info = ''
        System_Maintenance_Reqd = False

        line = line.split() # take string of one record's data and split into space separated chunks
        WBAN_Number = line[0][0:5] # The WBAN (Weather Bureau, Army, Navy) number is a unique 5-digit number
        Call_Sign = line[0][5:] # The call sign is a location identifier, three or four characters in length 
        suffix = line[1][-2:] # grab the last two digits that are the year (i.e. 19 for 2019)
        Year = '20'+suffix # in YYYY format
        CallSign_Date = re.split(Year, line[1])
        Call_Sign2 = CallSign_Date[0] # this seems to be the same as Call_Sign but without initial letter
        Date = CallSign_Date[1]
        Month = Date[0:2] # in MM format
        Day = Date[2:4] # in DD format
        Hour = Date[4:6] # in HH format
        Minute = Date[6:8] # Observations are recorded on whole five-minute increments (i.e. 00,05,10,...,50,55)
        Record_Length = Date[8:11] # I'm not sure what this is yet - Length of record??
        Date = Date[11:] # MM/DD/YY format
        Timestamp = line[2] # in HH:MM:SS format
        Interval = line[3] # should be 5-MIN as opposed to 1-MIN
        Call_Sign3 = line[4] # for some reason, a THIRD output of the call sign. random.
        Zulu_Time = line[5] # Zulu Time, or military time, or UTC

        # after this point, data could be missing/optional and data positions are not fixed
        currIndx = 6
        try:
            Next_Data = line[currIndx]
            if not any(x in Next_Data for x in ['KT','SM']):
                Report_Modifier = Next_Data # AUTO for fully automated report, COR for correction to a previously disseminated report
                currIndx += 1
            Next_Data = line[currIndx]
            if "KT" in Next_Data:
                Wind_Data = True
                Wind_Direction = Next_Data[0:3] # in tens of degrees from true north
                if Next_Data[0:3] == 'VRB':
                    Variable_Winds = True
                Wind_Speed = Next_Data[3:5] # in whole knots (two digits)
                if Next_Data[5] == 'G':
                    Gusts = True
                    Gust_Speed = Next_Data[6:8] # speed in whole knots (two digits)
            else:
                Wind_Data = False
        except:
            print("OUT OF DATA AT FIELD {}".format(currIndx))
            print(line)
        finally:
            currIndx += 1

        try:
            Next_Data = line[currIndx]
            if Wind_Data:
                if (re.fullmatch(r'[0-9][0-9][0-9]V[0-9][0-9][0-9]', Next_Data)): #e.g. 180V240 = wind direction varies from 180 to 240 degrees
                    Variable_Wind_Info = Next_Data
                    Variable_Winds = True
        except:
            print("OUT OF DATA AT FIELD {}".format(currIndx))
            print(line)
            
        if line[-1] == '$':
            System_Maintenance_Reqd = True

        #Sea_Level_Pressure = line[13] # given in tenths of hectopascals (millibars). The last digits are recorded (125 means 1012.5)
        #Station_Type = line[18]
        Num_Fields = len(line)
        record = [WBAN_Number, Call_Sign, Call_Sign2, Year, Month, Day, Hour, Minute, Record_Length, Date, Timestamp, Interval, Call_Sign3, Zulu_Time, 
                  Report_Modifier, Wind_Data, Wind_Direction, Wind_Speed, Gusts, Gust_Speed, Variable_Winds, Variable_Wind_Info, System_Maintenance_Reqd, Num_Fields]
        col_names = ["wban_number", "call_sign", "call_sign2", "year", "month", "day", "hour", "minute", "rec_length", "date", "timestamp", "interval", "call_sign3", 
                     "zulu_time", "report_modifier", "wind_data", "wind_direction", "wind_speed", "gusts", "gust_speed", "variable_winds", "variable_wind_info", "sys_maint_reqd", "num_fields"]
        data.append(record)
    
    sample_df = pd.DataFrame(data, columns = col_names)
    
    # save Dataframe to file
    parquet_file = "{}/noaa/{}.parquet".format(datafolder, fileName)
    write(parquet_file, sample_df,compression='GZIP')
    
    return sample_df

In [27]:
noaa_df = createNOAAdf(lines, 'Sep2019')

OUT OF DATA AT FIELD 7
['93721KBWI', 'BWI20190914190004209/14/19', '19:00:31', '5-MIN', 'KBWI', '150000Z', '180']
OUT OF DATA AT FIELD 8
['93721KBWI', 'BWI20190914190004209/14/19', '19:00:31', '5-MIN', 'KBWI', '150000Z', '180']
OUT OF DATA AT FIELD 8
['24029KSHR', 'SHR20190911223504409/11/19', '22:35:31', '5-MIN', 'KSHR', '120535Z', 'AUTO', '3']
OUT OF DATA AT FIELD 8
['13961KFTW', 'FTW20190925064004909/25/19', '06:40:32', '5-MIN', 'KFTW', '251240Z', 'AUTO', 'RVRNO']
OUT OF DATA AT FIELD 7
['27406PASC', 'SCC20190914225005209/14/19', '22:50:31', '5-MIN', 'PASC', '150750Z', '08007KT0170000']
OUT OF DATA AT FIELD 8
['23191KAVX', 'AVX20190928085504909/28/19', '08:55:31', '5-MIN', 'KAVX', '281655Z', 'AUTO', 'RVRNO']
OUT OF DATA AT FIELD 7
['12876KGIF', 'GIF20190924172504109/24/19', '17:25:31', '5-MIN', 'KGIF', '242225Z', 'AU1']
OUT OF DATA AT FIELD 8
['12876KGIF', 'GIF20190924172504109/24/19', '17:25:31', '5-MIN', 'KGIF', '242225Z', 'AU1']
OUT OF DATA AT FIELD 8
['12947KCOT', 'COT2019090904

### Initial look at the data


In [28]:
noaa_df.head()

Unnamed: 0,wban_number,call_sign,call_sign2,year,month,day,hour,minute,rec_length,date,timestamp,interval,call_sign3,zulu_time,report_modifier,wind_data,wind_direction,wind_speed,gusts,gust_speed,variable_winds,variable_wind_info,sys_maint_reqd,num_fields
0,24154,KMLP,MLP,2019,9,1,0,0,135,09/01/19,00:00:31,5-MIN,KMLP,010800Z,AUTO,True,000,0,False,,False,,False,23
1,24154,KMLP,MLP,2019,9,1,0,5,136,09/01/19,00:05:31,5-MIN,KMLP,010805Z,AUTO,True,VRB,4,False,,True,,False,23
2,24154,KMLP,MLP,2019,9,1,0,10,136,09/01/19,00:10:31,5-MIN,KMLP,010810Z,AUTO,True,VRB,3,False,,True,,False,23
3,24154,KMLP,MLP,2019,9,1,0,15,138,09/01/19,00:15:31,5-MIN,KMLP,010815Z,AUTO,True,210,5,False,,False,,False,23
4,24154,KMLP,MLP,2019,9,1,0,20,138,09/01/19,00:20:31,5-MIN,KMLP,010820Z,AUTO,True,230,6,False,,False,,False,23


In [32]:
noaa_df[noaa_df.wind_speed == 'T']

Unnamed: 0,wban_number,call_sign,call_sign2,year,month,day,hour,minute,rec_length,date,timestamp,interval,call_sign3,zulu_time,report_modifier,wind_data,wind_direction,wind_speed,gusts,gust_speed,variable_winds,variable_wind_info,sys_maint_reqd,num_fields
2739677,3936,KMHK,MHK,2019,9,18,5,5,365,09/18/19,05:05:31,5-MIN,KMHK,181105Z,,True,AUK,T,False,,False,,False,57
4452667,92805,KPMP,PMP,2019,9,24,4,25,459,09/24/19,04:25:31,5-MIN,KPMP,240925Z,,True,AUK,T,False,,False,,True,85
5772065,94050,KEEO,EEO,2019,9,13,17,40,179,09/13/19,17:40:31,5-MIN,KEEO,140040Z,,True,AUK,T,False,,False,,False,33
5772077,94050,KEEO,EEO,2019,9,13,18,35,165,09/13/19,18:35:31,5-MIN,KEEO,140135Z,,True,AUK,T,False,,False,,False,27
6100357,54757,KELZ,ELZ,2019,9,8,8,15,113,09/08/19,08:15:31,5-MIN,KELZ,081315Z,,True,AUK,T,False,,False,,False,20


In [33]:
# Drop rows where wind speed is not numeric
noaa_df = noaa_df[noaa_df.wind_speed != 'T']

### Merge lat long data for stations

In [38]:
# Read from file that was stored earlier
station_df = pd.read_parquet("{}/noaa/station_data.parquet".format(datafolder))

In [39]:
station_df[station_df.wban_number == '21504']

Unnamed: 0,wban_number,descriptor,lat,lon,elev_m,begin_date,end_date
25041,21504,HILO INTERNATIONAL AIRPORT US HI PHTO,19.719,-155.053,11.6,19730101,20190920
25042,21504,HILO GENERAL LYMAN ARPT US HI PHTO,19.719,-155.053,11.0,19430415,19451228
28925,21504,HILO INTERNATIONAL AP US HI PHTO,19.719,-155.053,11.0,19491001,19721231


In [40]:
# The station dataframe has multiple records for the same wban_number based on differing elevations and descriptors.
# Create a new dataframe with unique wban_number and lat-lon values to join with the ASOS data
unique_station_df = station_df.drop(['descriptor','elev_m','begin_date','end_date'], axis=1).drop_duplicates()

In [41]:
unique_station_df[unique_station_df.wban_number == '21504']

Unnamed: 0,wban_number,lat,lon
25041,21504,19.719,-155.053


In [93]:
# Write to file
parquet_file = "{}/noaa/uniq_station_data.parquet".format(datafolder)
write(parquet_file, unique_station_df,compression='GZIP')

In [None]:
# Read from file that was stored earlier
unique_station_df = pd.read_parquet("{}/noaa/uniq_station_data.parquet".format(datafolder))

In [42]:
merged_df = pd.merge(noaa_df, unique_station_df, on='wban_number')

In [43]:
merged_df.head()

Unnamed: 0,wban_number,call_sign,call_sign2,year,month,day,hour,minute,rec_length,date,timestamp,interval,call_sign3,zulu_time,report_modifier,wind_data,wind_direction,wind_speed,gusts,gust_speed,variable_winds,variable_wind_info,sys_maint_reqd,num_fields,lat,lon
0,24154,KMLP,MLP,2019,9,1,0,0,135,09/01/19,00:00:31,5-MIN,KMLP,010800Z,AUTO,True,000,0.0,False,,False,,False,23,47.457,-115.645
1,24154,KMLP,MLP,2019,9,1,0,5,136,09/01/19,00:05:31,5-MIN,KMLP,010805Z,AUTO,True,VRB,4.0,False,,True,,False,23,47.457,-115.645
2,24154,KMLP,MLP,2019,9,1,0,10,136,09/01/19,00:10:31,5-MIN,KMLP,010810Z,AUTO,True,VRB,3.0,False,,True,,False,23,47.457,-115.645
3,24154,KMLP,MLP,2019,9,1,0,15,138,09/01/19,00:15:31,5-MIN,KMLP,010815Z,AUTO,True,210,5.0,False,,False,,False,23,47.457,-115.645
4,24154,KMLP,MLP,2019,9,1,0,20,138,09/01/19,00:20:31,5-MIN,KMLP,010820Z,AUTO,True,230,6.0,False,,False,,False,23,47.457,-115.645


In [46]:
# Convert data type of numeric columns
merged_df[['wind_speed','gust_speed','lat','lon']] = merged_df[['wind_speed','gust_speed','lat','lon']].apply(pd.to_numeric)

In [47]:
merged_df.describe()

Unnamed: 0,wind_speed,gust_speed,num_fields,lat,lon
count,6507357.0,663690.0,6595372.0,6595372.0,6595372.0
mean,5.960893,21.424103,19.53388,39.02964,-96.14671
std,4.534093,4.859211,1.950849,7.309837,20.45387
min,0.0,10.0,6.0,13.483,-170.222
25%,3.0,18.0,18.0,34.262,-108.54
50%,5.0,21.0,19.0,39.295,-92.543
75%,9.0,24.0,20.0,42.746,-81.684
max,51.0,71.0,75.0,71.283,144.8


In [48]:
merged_df[["wban_number", "call_sign", "call_sign2", "year", "month", "day", "hour", "minute", "rec_length", "date", "timestamp",
"interval", "zulu_time", "report_modifier", "wind_data", "wind_direction", "gusts", "variable_winds", "variable_wind_info", "sys_maint_reqd"]].describe().T

Unnamed: 0,count,unique,top,freq
wban_number,6595372,903,14750,14690
call_sign,6595372,903,KGFL,14690
call_sign2,6595372,903,GFL,14690
year,6595372,1,2019,6595372
month,6595372,1,09,6595372
day,6595372,29,03,268867
hour,6595372,25,00,296759
minute,6595372,13,00,549162
rec_length,6595372,251,109,709369
date,6595372,29,09/03/19,268858


In [64]:
station_df.wban_number.count(), unique_station_df.wban_number.count(), noaa_df.wban_number.count(), merged_df.wban_number.count() 

(29729, 26315, 6168769, 6595372)

In [49]:
# Write to file
parquet_file = "{}/noaa/Sep2019_withloc.parquet".format(datafolder)
write(parquet_file, merged_df,compression='GZIP')

In [55]:
# Filter out data for a single day 
Sep27_df = merged_df[merged_df.date == '09/27/19']

In [56]:
gusty = Sep27_df[Sep27_df['gusts'] == True]
print("There are",len(gusty),"records with gust data.")

There are 30949 records with gust data.


In [57]:
variable = Sep27_df[Sep27_df['variable_winds'] == True]
print("There are",len(variable),"records with variable wind data.")

There are 14925 records with variable wind data.


In [59]:
nowinddata = Sep27_df[Sep27_df['wind_data'] == False]
missing_wind = 0
for num in list(nowinddata.index):
#     print(lines[num])
    missing_wind += 1
print("There are",missing_wind,"records out of", len(lines), "without wind data.")

There are 3737 records out of 6168774 without wind data.


In [60]:
maintreqd = Sep27_df[Sep27_df['sys_maint_reqd'] == True]
print("There are {} records from systems requiring maintenance".format(len(maintreqd)))

There are 25154 records from systems requiring maintenance
