<h2>Import Libraries</h2>

In [1]:
import pandas as pd
import numpy as np
import os
import math

<h2>Import Data</h2>

In [2]:
# Build the path of every entry in the raw weather folder.
# os.listdir returns names in arbitrary order, so sort them: later steps keep
# the first row seen per timestamp, which makes the load order matter.
weather_files = sorted(os.path.join('data/weather/', f) for f in os.listdir('data/weather/'))

In [3]:
# Read every weather file once, keeping only the columns used in this
# notebook, and combine them with a single concat. Concatenating inside a loop
# re-copies the accumulated frame on every iteration.
# Each file's own row labels are preserved, so labels can repeat across files.
# An empty file list raises ValueError here instead of leaving df as None.
df = pd.concat(
    [
        pd.read_csv(filename, usecols=['DATE', 'REPORT_TYPE', 'CALL_SIGN', 'WND', 'TMP', 'DEW'])
        for filename in weather_files
    ]
)

In [4]:
# Summarise the freshly loaded columns (count / unique / top / freq).
df.describe()

Unnamed: 0,DATE,REPORT_TYPE,CALL_SIGN,WND,TMP,DEW
count,103221,103221,103221,103221,103221,103221
unique,90086,6,2,1816,359,324
top,2017-01-28T05:59:00,FM-15,KORD,"999,9,C,0000,5",99999,99999
freq,4,70098,94973,4943,3051,3052


<h2>Filter Rows</h2>

In [5]:
# Keep only the rows whose REPORT_TYPE is 'FM-15'; copy so that the column
# assignments in later cells operate on an independent frame.
df = df.loc[df['REPORT_TYPE'].eq('FM-15')].copy()

In [6]:
# Preview the first ten rows that survived the report-type filter.
df.head(10)

Unnamed: 0,DATE,REPORT_TYPE,CALL_SIGN,WND,TMP,DEW
1,2017-01-01T00:51:00,FM-15,KORD,"260,5,N,0031,5",-225,-675
2,2017-01-01T01:51:00,FM-15,KORD,"260,5,N,0021,5",-285,-725
3,2017-01-01T02:51:00,FM-15,KORD,"250,5,N,0026,5",-335,-725
4,2017-01-01T03:51:00,FM-15,KORD,"240,5,N,0031,5",-335,-675
5,2017-01-01T04:51:00,FM-15,KORD,"250,5,N,0031,5",-395,-675
6,2017-01-01T05:51:00,FM-15,KORD,"270,5,N,0021,5",-445,-725
10,2017-01-01T06:51:00,FM-15,KORD,"250,5,N,0021,5",-505,-725
11,2017-01-01T07:51:00,FM-15,KORD,"999,9,C,0000,5",-615,-835
12,2017-01-01T08:51:00,FM-15,KORD,"210,5,N,0021,5",-615,-785
13,2017-01-01T09:51:00,FM-15,KORD,"190,5,N,0026,5",-725,-895


<h2>Format Data Types</h2>

In [7]:
# Parse DATE into timezone-aware (UTC) timestamps, then snap each one to the
# nearest whole hour.
df['DATE'] = pd.to_datetime(df['DATE'], utc=True).dt.round('h')

In [8]:
# WND is a comma-separated string with five parts; spread them over
# dedicated columns.
df[['WND_DIR', 'WND_DIR_QUAL', 'WND_OBS', 'WND_SPEED', 'WND_SPEED_QUAL']] = (
    df['WND'].str.split(pat=',', expand=True)
)

# Direction and speed come out of the split as strings; cast them to integers.
df = df.astype({'WND_DIR': int, 'WND_SPEED': int})

# The file stores wind speed multiplied by 10, so scale it back.
df['WND_SPEED'] = df['WND_SPEED'].div(10)

In [9]:
# TMP is "<value>,<quality code>"; separate the reading from its quality flag.
df[['TMP', 'TMP_QUAL']] = df['TMP'].str.split(pat=',', expand=True)

# Cast the reading to an integer column.
df = df.astype({'TMP': int})

# The file stores temperature multiplied by 10, so scale it back.
df['TMP'] = df['TMP'].div(10)

In [10]:
# DEW is "<value>,<quality code>"; separate the reading from its quality flag.
df[['DEW', 'DEW_QUAL']] = df['DEW'].str.split(pat=',', expand=True)

# Cast the reading to an integer column.
df = df.astype({'DEW': int})

# The file stores dew point multiplied by 10, so scale it back.
df['DEW'] = df['DEW'].div(10)

In [11]:
# Preview the first five rows with the newly parsed columns.
df.head(5)

Unnamed: 0,DATE,REPORT_TYPE,CALL_SIGN,WND,TMP,DEW,WND_DIR,WND_DIR_QUAL,WND_OBS,WND_SPEED,WND_SPEED_QUAL,TMP_QUAL,DEW_QUAL
1,2017-01-01 01:00:00+00:00,FM-15,KORD,"260,5,N,0031,5",-2.2,-6.7,260,5,N,3.1,5,5,5
2,2017-01-01 02:00:00+00:00,FM-15,KORD,"260,5,N,0021,5",-2.8,-7.2,260,5,N,2.1,5,5,5
3,2017-01-01 03:00:00+00:00,FM-15,KORD,"250,5,N,0026,5",-3.3,-7.2,250,5,N,2.6,5,5,5
4,2017-01-01 04:00:00+00:00,FM-15,KORD,"240,5,N,0031,5",-3.3,-6.7,240,5,N,3.1,5,5,5
5,2017-01-01 05:00:00+00:00,FM-15,KORD,"250,5,N,0031,5",-3.9,-6.7,250,5,N,3.1,5,5,5


In [12]:
# Check column dtypes and non-null counts after the conversions above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70098 entries, 1 to 13012
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   DATE            70098 non-null  datetime64[ns, UTC]
 1   REPORT_TYPE     70098 non-null  object             
 2   CALL_SIGN       70098 non-null  object             
 3   WND             70098 non-null  object             
 4   TMP             70098 non-null  float64            
 5   DEW             70098 non-null  float64            
 6   WND_DIR         70098 non-null  int32              
 7   WND_DIR_QUAL    70098 non-null  object             
 8   WND_OBS         70098 non-null  object             
 9   WND_SPEED       70098 non-null  float64            
 10  WND_SPEED_QUAL  70098 non-null  object             
 11  TMP_QUAL        70098 non-null  object             
 12  DEW_QUAL        70098 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(3), int32(1), object(8)

<h2>Remove Invalid Entries</h2>

In [13]:
# Quality codes 1 and 5 mean the value was completed with no errors.
# Require such a code for every measurement that is kept in the final table.
# WND_SPEED is kept too, so its flag is checked alongside wind direction,
# temperature and dew point.
df = df[
    df[['WND_DIR_QUAL', 'WND_SPEED_QUAL', 'TMP_QUAL', 'DEW_QUAL']]
    .isin(['1', '5'])
    .all(axis=1)
]
df.describe()

Unnamed: 0,TMP,DEW,WND_DIR,WND_SPEED
count,64949.0,64949.0,64949.0,64949.0
mean,10.660842,4.133186,196.471077,4.697067
std,11.512508,10.682797,100.440841,2.201085
min,-26.7,-32.8,10.0,1.5
25%,1.7,-3.3,110.0,3.1
50%,11.1,4.4,210.0,4.1
75%,20.6,12.8,280.0,6.2
max,39.4,26.1,360.0,19.0


In [14]:
# Rows that share a (rounded) DATE are duplicates for our purposes: keep the
# first one encountered and discard the rest.
df = df.drop_duplicates(subset='DATE', keep='first')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56760 entries, 1 to 12336
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   DATE            56760 non-null  datetime64[ns, UTC]
 1   REPORT_TYPE     56760 non-null  object             
 2   CALL_SIGN       56760 non-null  object             
 3   WND             56760 non-null  object             
 4   TMP             56760 non-null  float64            
 5   DEW             56760 non-null  float64            
 6   WND_DIR         56760 non-null  int32              
 7   WND_DIR_QUAL    56760 non-null  object             
 8   WND_OBS         56760 non-null  object             
 9   WND_SPEED       56760 non-null  float64            
 10  WND_SPEED_QUAL  56760 non-null  object             
 11  TMP_QUAL        56760 non-null  object             
 12  DEW_QUAL        56760 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(3), int32(1), object(8)

<h2>Add Relative Humidity</h2>

In [15]:
# using function here: https://bmcnoldy.earth.miami.edu/Humidity.html
def get_rh(td, t):
    """Return relative humidity in percent from dew point and air temperature.

    Both inputs are in degrees Celsius. Scalars, NumPy arrays and pandas
    Series are accepted; array-like inputs are evaluated element-wise.

    Parameters
    ----------
    td : float or array-like
        Dew point temperature.
    t : float or array-like
        Air temperature.

    Returns
    -------
    float or array-like
        Relative humidity (0-100 when td <= t), with the same shape as the
        broadcast inputs.
    """
    # np.exp (rather than math.exp) lets the same formula run on whole columns.
    saturation_at_dew = np.exp((17.625 * td) / (243.04 + td))
    saturation_at_tmp = np.exp((17.625 * t) / (243.04 + t))
    return 100 * (saturation_at_dew / saturation_at_tmp)

# Compute relative humidity for each row from its dew point and temperature.
# Iterating over the two columns directly avoids DataFrame.apply(axis=1),
# which builds a Series object for every row.
df['RH'] = [get_rh(dew, tmp) for dew, tmp in zip(df['DEW'], df['TMP'])]

df.head()

Unnamed: 0,DATE,REPORT_TYPE,CALL_SIGN,WND,TMP,DEW,WND_DIR,WND_DIR_QUAL,WND_OBS,WND_SPEED,WND_SPEED_QUAL,TMP_QUAL,DEW_QUAL,RH
1,2017-01-01 01:00:00+00:00,FM-15,KORD,"260,5,N,0031,5",-2.2,-6.7,260,5,N,3.1,5,5,5,71.273049
2,2017-01-01 02:00:00+00:00,FM-15,KORD,"260,5,N,0021,5",-2.8,-7.2,260,5,N,2.1,5,5,5,71.701603
3,2017-01-01 03:00:00+00:00,FM-15,KORD,"250,5,N,0026,5",-3.3,-7.2,250,5,N,2.6,5,5,5,74.418167
4,2017-01-01 04:00:00+00:00,FM-15,KORD,"240,5,N,0031,5",-3.3,-6.7,240,5,N,3.1,5,5,5,77.333391
5,2017-01-01 05:00:00+00:00,FM-15,KORD,"250,5,N,0031,5",-3.9,-6.7,250,5,N,3.1,5,5,5,80.879103


<h2>Drop Extra Columns</h2>

In [16]:
# Trim the frame down to the columns that are carried forward:
#   - *_QUAL flags: only needed for the validity filtering step
#   - DEW: relative humidity has already been derived from it
#   - WND: the original combined wind string
#   - WND_OBS: meaning unknown, not used
#   - REPORT_TYPE / CALL_SIGN: no longer needed
df = df.drop(
    ['REPORT_TYPE', 'CALL_SIGN', 'WND', 'DEW', 'WND_OBS',
     'WND_DIR_QUAL', 'WND_SPEED_QUAL', 'TMP_QUAL', 'DEW_QUAL'],
    axis=1,
)
df.head()

Unnamed: 0,DATE,TMP,WND_DIR,WND_SPEED,RH
1,2017-01-01 01:00:00+00:00,-2.2,260,3.1,71.273049
2,2017-01-01 02:00:00+00:00,-2.8,260,2.1,71.701603
3,2017-01-01 03:00:00+00:00,-3.3,250,2.6,74.418167
4,2017-01-01 04:00:00+00:00,-3.3,240,3.1,77.333391
5,2017-01-01 05:00:00+00:00,-3.9,250,3.1,80.879103


<h2>Convert Metric to Imperial</h2>

In [17]:
# Celsius -> Fahrenheit: multiply by 9, divide by 5, add 32.
df['TMP'] = df['TMP'].mul(9).div(5).add(32)

# Metres per second -> miles per hour, using the approximate factor from
# https://www.unitconverters.net/speed/meters-per-second-to-miles-per-hour.htm
df['WND_SPEED'] = df['WND_SPEED'].mul(2.2369)

df.head()

Unnamed: 0,DATE,TMP,WND_DIR,WND_SPEED,RH
1,2017-01-01 01:00:00+00:00,28.04,260,6.93439,71.273049
2,2017-01-01 02:00:00+00:00,26.96,260,4.69749,71.701603
3,2017-01-01 03:00:00+00:00,26.06,250,5.81594,74.418167
4,2017-01-01 04:00:00+00:00,26.06,240,6.93439,77.333391
5,2017-01-01 05:00:00+00:00,24.98,250,6.93439,80.879103


In [18]:
# Summary statistics of the final numeric columns after unit conversion.
df.describe()

Unnamed: 0,TMP,WND_DIR,WND_SPEED,RH
count,56760.0,56760.0,56760.0,56760.0
mean,51.019651,196.291755,10.482135,66.983538
std,20.851079,100.176757,4.918094,17.086063
min,-16.06,10.0,3.35535,14.323454
25%,35.06,110.0,6.93439,54.823569
50%,51.08,210.0,9.17129,68.103793
75%,68.0,280.0,13.86878,80.303848
max,102.92,360.0,42.5011,100.0


<h2>Save Result</h2>

In [19]:
# Write the final frame to data/weather.parquet.
df.to_parquet('data/weather.parquet')