#### This notebook is where we will extract the pertinent weather data for the flights training data set

In [1]:
# Import pandas for df manipulations
import pandas as pd 

from functions import get_hour, time_of_day 

In [2]:
# Load the datasets
weather = pd.read_csv('data/WeatherEvents_Jan2016-Dec2020.csv')
data = pd.read_csv('data/flights.csv', low_memory = False)

In [3]:
# Inspect the data dataframe
data.head()

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,0,2018-12-15,NK,NK,NK,393,NK,N506NK,393,10397,...,425.0,,,,,,,,,
1,1,2018-12-16,UA,UA_CODESHARE,UA,4119,EV,N14993,4119,11618,...,277.0,,,,,,,,,
2,2,2018-12-15,NK,NK,NK,443,NK,N681NK,443,10821,...,925.0,,,,,,,,,
3,3,2018-12-15,NK,NK,NK,445,NK,N504NK,445,11066,...,973.0,,,,,,,,,
4,4,2018-12-15,NK,NK,NK,446,NK,N621NK,446,12889,...,1514.0,,,,,,,,,


Need to make an hour column in data

In [4]:
# Create a column with the hour that the plane departs and arrives

data['dep_hour'] = data['crs_dep_time'].apply(get_hour)
data['arr_hour'] = data['crs_arr_time'].apply(get_hour)
data['arr_hour'].dtypes

dtype('int64')

In [5]:
# Create a column that assigns a period of the day to a departure
data['dep_period'] = data['dep_hour'].apply(time_of_day)
data['arr_period'] = data['arr_hour'].apply(time_of_day)
data['arr_period'].value_counts()

Afternoon    484247
Evening      439478
Morning      369087
Night         35861
Name: arr_period, dtype: int64

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,dep_hour,arr_hour,dep_period,arr_period
0,0,2018-12-15,NK,NK,NK,393,NK,N506NK,393,10397,...,,,,,,,15,16,Afternoon,Afternoon
1,1,2018-12-16,UA,UA_CODESHARE,UA,4119,EV,N14993,4119,11618,...,,,,,,,19,21,Evening,Evening
2,2,2018-12-15,NK,NK,NK,443,NK,N681NK,443,10821,...,,,,,,,20,22,Evening,Evening
3,3,2018-12-15,NK,NK,NK,445,NK,N504NK,445,11066,...,,,,,,,6,8,Morning,Morning
4,4,2018-12-15,NK,NK,NK,446,NK,N621NK,446,12889,...,,,,,,,8,14,Morning,Afternoon


In [7]:
# Build the matchcode variable for the data table
data['origin_match_code'] = data['origin'].astype(str) + data['fl_date'].astype(str) + data['dep_period'].astype(str)
data['dest_match_code'] = data['dest'].astype(str) + data['fl_date'].astype(str) + data['arr_period'].astype(str)

In [8]:
# Let's engineer some columns a little bit
weather['date'] = pd.DatetimeIndex(weather['StartTime(UTC)']).date
weather['hour'] = pd.DatetimeIndex(weather['StartTime(UTC)']).hour
weather['period'] = weather['hour'].apply(time_of_day)
weather = weather.drop(columns = ['StartTime(UTC)', 'EndTime(UTC)'])


In [9]:
# Remove the K prefix from the airport codes
weather['AirportCode'] = weather['AirportCode'].apply(lambda x: x[1:])

In [10]:
# Encode the match code column to match values to the data dataframe
weather['match_code'] = weather['AirportCode'].astype(str) + weather['date'].astype(str) + weather['period'].astype(str)

In [11]:
# Drop unnecessary columns

weather = weather.drop(columns = ['EventId',
                                  'TimeZone',
                                 'AirportCode',
                                 'LocationLat',
                                 'LocationLng',
                                 'City',
                                 'County',
                                 'State',
                                 'ZipCode'])

In [12]:
# Drop duplicate values
weather = weather.drop_duplicates(subset = 'match_code')
weather.shape

(3097656, 6)

In [13]:
# Create dictionaries for severity and condition with match_code as the key
severity_dict = dict(zip(weather['match_code'], weather['Severity']))
condition_dict = dict(zip(weather['match_code'], weather['Type']))

In [14]:
len(severity_dict)

3097656

In [15]:
# Use the previously created dictionaries to add severity and condition to the flights dataset
data['origin_weather_condition'] = data['origin_match_code'].map(condition_dict)
data['origin_weather_severity'] = data['origin_match_code'].map(severity_dict)
data['dest_weather_condition'] = data['dest_match_code'].map(condition_dict)
data['dest_weather_severity'] = data['dest_match_code'].map(severity_dict)
data.head()

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,dep_hour,arr_hour,dep_period,arr_period,origin_match_code,dest_match_code,origin_weather_condition,origin_weather_severity,dest_weather_condition,dest_weather_severity
0,0,2018-12-15,NK,NK,NK,393,NK,N506NK,393,10397,...,15,16,Afternoon,Afternoon,ATL2018-12-15Afternoon,MSY2018-12-15Afternoon,,,,
1,1,2018-12-16,UA,UA_CODESHARE,UA,4119,EV,N14993,4119,11618,...,19,21,Evening,Evening,EWR2018-12-16Evening,RIC2018-12-16Evening,Rain,Moderate,,
2,2,2018-12-15,NK,NK,NK,443,NK,N681NK,443,10821,...,20,22,Evening,Evening,BWI2018-12-15Evening,FLL2018-12-15Evening,Rain,Light,Rain,Light
3,3,2018-12-15,NK,NK,NK,445,NK,N504NK,445,11066,...,6,8,Morning,Morning,CMH2018-12-15Morning,FLL2018-12-15Morning,,,,
4,4,2018-12-15,NK,NK,NK,446,NK,N621NK,446,12889,...,8,14,Morning,Afternoon,LAS2018-12-15Morning,ORD2018-12-15Afternoon,,,,


In [16]:
data.isna().sum()

Unnamed: 0                        0
fl_date                           0
mkt_unique_carrier                0
branded_code_share                0
mkt_carrier                       0
mkt_carrier_fl_num                0
op_unique_carrier                 0
tail_num                       1355
op_carrier_fl_num                 0
origin_airport_id                 0
origin                            0
origin_city_name                  0
dest_airport_id                   0
dest                              0
dest_city_name                    0
crs_dep_time                      0
dep_time                      10676
dep_delay                     10968
taxi_out                      11019
wheels_off                    11012
wheels_on                     11807
taxi_in                       11814
crs_arr_time                      0
arr_time                      11807
arr_delay                     14489
cancelled                         0
cancellation_code           1317461
diverted                    

In [17]:
# Replace NaN weather conditions with 'Clear' and severity with 'Light'
data['origin_weather_condition'].fillna('Clear', inplace=True)
data['origin_weather_severity'].fillna('Light', inplace=True)
data['dest_weather_condition'].fillna('Clear', inplace=True)
data['dest_weather_severity'].fillna('Light', inplace=True)
data.head()


Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,dep_hour,arr_hour,dep_period,arr_period,origin_match_code,dest_match_code,origin_weather_condition,origin_weather_severity,dest_weather_condition,dest_weather_severity
0,0,2018-12-15,NK,NK,NK,393,NK,N506NK,393,10397,...,15,16,Afternoon,Afternoon,ATL2018-12-15Afternoon,MSY2018-12-15Afternoon,Clear,Light,Clear,Light
1,1,2018-12-16,UA,UA_CODESHARE,UA,4119,EV,N14993,4119,11618,...,19,21,Evening,Evening,EWR2018-12-16Evening,RIC2018-12-16Evening,Rain,Moderate,Clear,Light
2,2,2018-12-15,NK,NK,NK,443,NK,N681NK,443,10821,...,20,22,Evening,Evening,BWI2018-12-15Evening,FLL2018-12-15Evening,Rain,Light,Rain,Light
3,3,2018-12-15,NK,NK,NK,445,NK,N504NK,445,11066,...,6,8,Morning,Morning,CMH2018-12-15Morning,FLL2018-12-15Morning,Clear,Light,Clear,Light
4,4,2018-12-15,NK,NK,NK,446,NK,N621NK,446,12889,...,8,14,Morning,Afternoon,LAS2018-12-15Morning,ORD2018-12-15Afternoon,Clear,Light,Clear,Light


In [None]:
# Write the data to a new csv
data.to_csv('data/flights_weather.csv')