In [1]:
# Import pandas for df manipulations
import pandas as pd 

In [2]:
# Load the datasets
weather = pd.read_csv('WeatherEvents_Jan2016-Dec2020.csv')
data = pd.read_csv('data/flights.csv', low_memory = False)

In [3]:
# Inspect the data dataframe
data.head()

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,0,2019-01-01,AA,AA_CODESHARE,AA,5606,OH,N575NN,5606,13577,...,157.0,,,,,,,,,
1,1,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,10529,...,313.0,,,,,,,,,
2,2,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,11278,...,313.0,,,,,,,,,
3,3,2019-01-01,AA,AA_CODESHARE,AA,5608,OH,N710PS,5608,11278,...,396.0,,,,,,,,,
4,4,2019-01-01,AA,AA_CODESHARE,AA,5610,OH,N515AE,5610,13931,...,290.0,,,,,,,,,


Need to make an hour column in data

In [4]:
# Extract the hour from the flight departure column
def get_hour(time):
    
    """
    Params - A time in format HHMM.0
    
    Returns  the two digits representing the hour of the flight
    """
    
    s = str(int(time))
    if len(s) <= 2:
        return 0
    else:
        return int(s[:-2])
    
# Function to assign a time of day label to the dataframes.  
# Allows for better matching of weather to our flights dataframe
def time_of_day(hour):
    """
    Input an hour of the day.  
    Output Morning, Afternoon, Evening, Night
    """
    if 0 >= hour > 6:
        return 'Night'
    elif 6>= hour > 12:
        return 'Morning'
    elif 12 >= hour > 6:
        return'Afternoon'
    else:
        return 'Evening'
    

In [5]:
# Create a column with the hour that the plane departs

data['hour'] = data['crs_dep_time'].apply(get_hour)

In [6]:
# Create a column that assigns a period of the day to a departure
data['period'] = data['hour'].apply(time_of_day)

In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,hour,period
0,0,2019-01-01,AA,AA_CODESHARE,AA,5606,OH,N575NN,5606,13577,...,,,,,,,,,21,Evening
1,1,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,10529,...,,,,,,,,,11,Afternoon
2,2,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,11278,...,,,,,,,,,10,Afternoon
3,3,2019-01-01,AA,AA_CODESHARE,AA,5608,OH,N710PS,5608,11278,...,,,,,,,,,20,Evening
4,4,2019-01-01,AA,AA_CODESHARE,AA,5610,OH,N515AE,5610,13931,...,,,,,,,,,12,Afternoon


In [8]:
# Build the matchcode variable for the data table
data['match_code'] = data['origin'].astype(str) + data['fl_date'].astype(str) + data['period'].astype(str)

In [9]:
# Inspect weather
weather.head()

Unnamed: 0,EventId,Type,Severity,StartTime(UTC),EndTime(UTC),TimeZone,AirportCode,LocationLat,LocationLng,City,County,State,ZipCode
0,W-1,Snow,Light,2016-01-06 23:14:00,2016-01-07 00:34:00,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0
1,W-2,Snow,Light,2016-01-07 04:14:00,2016-01-07 04:54:00,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0
2,W-3,Snow,Light,2016-01-07 05:54:00,2016-01-07 15:34:00,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0
3,W-4,Snow,Light,2016-01-08 05:34:00,2016-01-08 05:54:00,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0
4,W-5,Snow,Light,2016-01-08 13:54:00,2016-01-08 15:54:00,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0


In [10]:
# Let's engineer some columns a little bit
weather['date'] = pd.DatetimeIndex(weather['StartTime(UTC)']).date
weather['hour'] = pd.DatetimeIndex(weather['StartTime(UTC)']).hour
weather['period'] = weather['hour'].apply(time_of_day)
weather = weather.drop(columns = ['StartTime(UTC)', 'EndTime(UTC)'])


In [11]:
# Remove the K prefix from the airport codes
weather['AirportCode'] = weather['AirportCode'].apply(lambda x: x[1:])

In [12]:
weather[weather['AirportCode'] == 'JFK']

Unnamed: 0,EventId,Type,Severity,TimeZone,AirportCode,LocationLat,LocationLng,City,County,State,ZipCode,date,hour,period
281085,W-281138,Rain,Light,US/Eastern,JFK,40.6392,-73.7639,Jamaica,Queens,NY,11430.0,2016-01-10,3,Evening
281086,W-281139,Rain,Light,US/Eastern,JFK,40.6392,-73.7639,Jamaica,Queens,NY,11430.0,2016-01-10,8,Afternoon
281087,W-281140,Rain,Moderate,US/Eastern,JFK,40.6392,-73.7639,Jamaica,Queens,NY,11430.0,2016-01-10,9,Afternoon
281088,W-281141,Rain,Heavy,US/Eastern,JFK,40.6392,-73.7639,Jamaica,Queens,NY,11430.0,2016-01-10,10,Afternoon
281089,W-281142,Rain,Moderate,US/Eastern,JFK,40.6392,-73.7639,Jamaica,Queens,NY,11430.0,2016-01-10,11,Afternoon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283842,W-283895,Rain,Light,US/Eastern,JFK,40.6392,-73.7639,Jamaica,Queens,NY,11430.0,2020-12-25,8,Afternoon
283843,W-283896,Rain,Light,US/Eastern,JFK,40.6392,-73.7639,Jamaica,Queens,NY,11430.0,2020-12-25,16,Evening
283844,W-283897,Rain,Light,US/Eastern,JFK,40.6392,-73.7639,Jamaica,Queens,NY,11430.0,2020-12-31,10,Afternoon
283845,W-283898,Rain,Moderate,US/Eastern,JFK,40.6392,-73.7639,Jamaica,Queens,NY,11430.0,2020-12-31,13,Evening


In [13]:
# Encode the match code column to match values to the data dataframe
weather['match_code'] = weather['AirportCode'].astype(str) + weather['date'].astype(str) + weather['period'].astype(str)

In [14]:
# Drop unnecessary columns

weather = weather.drop(columns = ['EventId',
                                  'TimeZone',
                                 'AirportCode',
                                 'LocationLat',
                                 'LocationLng',
                                 'City',
                                 'County',
                                 'State',
                                 'ZipCode'])

In [15]:
weather.shape

(6274206, 6)

In [16]:
weather = weather.drop_duplicates(subset = 'match_code')
weather.shape

(2344678, 6)

In [17]:
severity_dict = dict(zip(weather['match_code'], weather['Severity']))
condition_dict = dict(zip(weather['match_code'], weather['Type']))

In [18]:
len(severity_dict)

2344678

In [19]:
data['weather_condition'] = data['match_code'].map(condition_dict)
data['weather_severity'] = data['match_code'].map(severity_dict)
data.head()

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,hour,period,match_code,weather_condition,weather_severity
0,0,2019-01-01,AA,AA_CODESHARE,AA,5606,OH,N575NN,5606,13577,...,,,,,,21,Evening,MYR2019-01-01Evening,,
1,1,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,10529,...,,,,,,11,Afternoon,BDL2019-01-01Afternoon,Rain,Light
2,2,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,11278,...,,,,,,10,Afternoon,DCA2019-01-01Afternoon,Rain,Light
3,3,2019-01-01,AA,AA_CODESHARE,AA,5608,OH,N710PS,5608,11278,...,,,,,,20,Evening,DCA2019-01-01Evening,Rain,Light
4,4,2019-01-01,AA,AA_CODESHARE,AA,5610,OH,N515AE,5610,13931,...,,,,,,12,Afternoon,ORF2019-01-01Afternoon,,


In [20]:
# Replace NaN weather conditions with 'Clear' and severity with 'Light'
data['weather_condition'].fillna('Clear', inplace=True)
data['weather_severity'].fillna('Light', inplace=True)
data.head()


Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,hour,period,match_code,weather_condition,weather_severity
0,0,2019-01-01,AA,AA_CODESHARE,AA,5606,OH,N575NN,5606,13577,...,,,,,,21,Evening,MYR2019-01-01Evening,Clear,Light
1,1,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,10529,...,,,,,,11,Afternoon,BDL2019-01-01Afternoon,Rain,Light
2,2,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,11278,...,,,,,,10,Afternoon,DCA2019-01-01Afternoon,Rain,Light
3,3,2019-01-01,AA,AA_CODESHARE,AA,5608,OH,N710PS,5608,11278,...,,,,,,20,Evening,DCA2019-01-01Evening,Rain,Light
4,4,2019-01-01,AA,AA_CODESHARE,AA,5610,OH,N515AE,5610,13931,...,,,,,,12,Afternoon,ORF2019-01-01Afternoon,Clear,Light


In [25]:
data.to_csv('data/flights_weather.csv')

In [24]:
data.head()

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,hour,period,match_code,weather_condition,weather_severity
0,0,2019-01-01,AA,AA_CODESHARE,AA,5606,OH,N575NN,5606,13577,...,,,,,,21,Evening,MYR2019-01-01Evening,Clear,Light
1,1,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,10529,...,,,,,,11,Afternoon,BDL2019-01-01Afternoon,Rain,Light
2,2,2019-01-01,AA,AA_CODESHARE,AA,5607,OH,N254PS,5607,11278,...,,,,,,10,Afternoon,DCA2019-01-01Afternoon,Rain,Light
3,3,2019-01-01,AA,AA_CODESHARE,AA,5608,OH,N710PS,5608,11278,...,,,,,,20,Evening,DCA2019-01-01Evening,Rain,Light
4,4,2019-01-01,AA,AA_CODESHARE,AA,5610,OH,N515AE,5610,13931,...,,,,,,12,Afternoon,ORF2019-01-01Afternoon,Clear,Light
