#### This notebook is where we will extract the pertinent weather data for the flights training data set

In [1]:
# Import pandas for df manipulations
import pandas as pd 

from functions import get_hour, time_of_day 

In [2]:
# Load the datasets
weather = pd.read_csv('data/WeatherEvents_Jan2016-Dec2020.csv')
data = pd.read_csv('data/flights.csv', low_memory = False)

In [3]:
# Inspect the data dataframe
data.head()

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,0,2019-01-01,AA,AA_CODESHARE,AA,5606,OH,N575NN,5606,13577,...,157.0,,,,,,,,,
1,1,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,10529,...,313.0,,,,,,,,,
2,2,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,11278,...,313.0,,,,,,,,,
3,3,2019-01-01,AA,AA_CODESHARE,AA,5608,OH,N710PS,5608,11278,...,396.0,,,,,,,,,
4,4,2019-01-01,AA,AA_CODESHARE,AA,5610,OH,N515AE,5610,13931,...,290.0,,,,,,,,,


Need to make an hour column in data

In [5]:
# Create a column with the hour that the plane departs and arrives

data['dep_hour'] = data['crs_dep_time'].apply(get_hour)
data['arr_hour'] = data['crs_arr_time'].apply(get_hour)
data['arr_hour'].dtypes

dtype('int64')

In [6]:
# Create a column that assigns a period of the day to a departure
data['dep_period'] = data['dep_hour'].apply(time_of_day)
data['arr_period'] = data['arr_hour'].apply(time_of_day)
data['arr_period'].value_counts()

Afternoon    484247
Evening      439478
Morning      369087
Night         35861
Name: arr_period, dtype: int64

In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,dep_hour,arr_hour,dep_period,arr_period
0,0,2019-01-01,AA,AA_CODESHARE,AA,5606,OH,N575NN,5606,13577,...,,,,,,,21,23,Evening,Evening
1,1,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,10529,...,,,,,,,11,13,Morning,Afternoon
2,2,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,11278,...,,,,,,,10,11,Morning,Morning
3,3,2019-01-01,AA,AA_CODESHARE,AA,5608,OH,N710PS,5608,11278,...,,,,,,,20,21,Evening,Evening
4,4,2019-01-01,AA,AA_CODESHARE,AA,5610,OH,N515AE,5610,13931,...,,,,,,,12,13,Afternoon,Afternoon


In [8]:
# Build the matchcode variable for the data table
data['origin_match_code'] = data['origin'].astype(str) + data['fl_date'].astype(str) + data['dep_period'].astype(str)
data['dest_match_code'] = data['dest'].astype(str) + data['fl_date'].astype(str) + data['arr_period'].astype(str)

In [10]:
# Let's engineer some columns a little bit
weather['date'] = pd.DatetimeIndex(weather['StartTime(UTC)']).date
weather['hour'] = pd.DatetimeIndex(weather['StartTime(UTC)']).hour
weather['period'] = weather['hour'].apply(time_of_day)
weather = weather.drop(columns = ['StartTime(UTC)', 'EndTime(UTC)'])


In [11]:
# Remove the K prefix from the airport codes
weather['AirportCode'] = weather['AirportCode'].apply(lambda x: x[1:])

In [12]:
# Encode the match code column to match values to the data dataframe
weather['match_code'] = weather['AirportCode'].astype(str) + weather['date'].astype(str) + weather['period'].astype(str)

In [13]:
# Drop unnecessary columns

weather = weather.drop(columns = ['EventId',
                                  'TimeZone',
                                 'AirportCode',
                                 'LocationLat',
                                 'LocationLng',
                                 'City',
                                 'County',
                                 'State',
                                 'ZipCode'])

In [14]:
# Drop duplicate values
weather = weather.drop_duplicates(subset = 'match_code')
weather.shape

(3097656, 6)

In [15]:
# Create dictionaries for severity and condition with match_code as the key
severity_dict = dict(zip(weather['match_code'], weather['Severity']))
condition_dict = dict(zip(weather['match_code'], weather['Type']))

In [16]:
len(severity_dict)

3097656

In [17]:
# Use the previously created dictionaries to add severity and condition to the flights dataset
data['origin_weather_condition'] = data['origin_match_code'].map(condition_dict)
data['origin_weather_severity'] = data['origin_match_code'].map(severity_dict)
data['dest_weather_condition'] = data['dest_match_code'].map(condition_dict)
data['dest_weather_severity'] = data['dest_match_code'].map(severity_dict)
data.head()

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,dep_hour,arr_hour,dep_period,arr_period,origin_match_code,dest_match_code,origin_weather_condition,origin_weather_severity,dest_weather_condition,dest_weather_severity
0,0,2019-01-01,AA,AA_CODESHARE,AA,5606,OH,N575NN,5606,13577,...,21,23,Evening,Evening,MYR2019-01-01Evening,CLT2019-01-01Evening,,,,
1,1,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,10529,...,11,13,Morning,Afternoon,BDL2019-01-01Morning,DCA2019-01-01Afternoon,Rain,Moderate,,
2,2,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,11278,...,10,11,Morning,Morning,DCA2019-01-01Morning,BDL2019-01-01Morning,Rain,Light,Rain,Moderate
3,3,2019-01-01,AA,AA_CODESHARE,AA,5608,OH,N710PS,5608,11278,...,20,21,Evening,Evening,DCA2019-01-01Evening,GSP2019-01-01Evening,,,,
4,4,2019-01-01,AA,AA_CODESHARE,AA,5610,OH,N515AE,5610,13931,...,12,13,Afternoon,Afternoon,ORF2019-01-01Afternoon,CLT2019-01-01Afternoon,,,,


In [18]:
# Replace NaN weather conditions with 'Clear' and severity with 'Light'
data['origin_weather_condition'].fillna('Clear', inplace=True)
data['origin_weather_severity'].fillna('Light', inplace=True)
data['dest_weather_condition'].fillna('Clear', inplace=True)
data['dest_weather_severity'].fillna('Light', inplace=True)
data.head()


Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,dep_hour,arr_hour,dep_period,arr_period,origin_match_code,dest_match_code,origin_weather_condition,origin_weather_severity,dest_weather_condition,dest_weather_severity
0,0,2019-01-01,AA,AA_CODESHARE,AA,5606,OH,N575NN,5606,13577,...,21,23,Evening,Evening,MYR2019-01-01Evening,CLT2019-01-01Evening,Clear,Light,Clear,Light
1,1,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,10529,...,11,13,Morning,Afternoon,BDL2019-01-01Morning,DCA2019-01-01Afternoon,Rain,Moderate,Clear,Light
2,2,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,11278,...,10,11,Morning,Morning,DCA2019-01-01Morning,BDL2019-01-01Morning,Rain,Light,Rain,Moderate
3,3,2019-01-01,AA,AA_CODESHARE,AA,5608,OH,N710PS,5608,11278,...,20,21,Evening,Evening,DCA2019-01-01Evening,GSP2019-01-01Evening,Clear,Light,Clear,Light
4,4,2019-01-01,AA,AA_CODESHARE,AA,5610,OH,N515AE,5610,13931,...,12,13,Afternoon,Afternoon,ORF2019-01-01Afternoon,CLT2019-01-01Afternoon,Clear,Light,Clear,Light


In [20]:
# Write the data to a new csv
data.to_csv('data/flights_weather.csv')