<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Chapter-5" data-toc-modified-id="Chapter-5-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Chapter 5</a></span></li></ul></div>

## Chapter 5

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from dask import delayed

In [3]:
# Define @delayed-function read_flights
@delayed
def read_flights(filename):

    # Read in the DataFrame: df
    df = pd.read_csv(filename, parse_dates=['FL_DATE'])

    # Replace 0s in df['WEATHER_DELAY'] with np.nan
    df['WEATHER_DELAY'] = df['WEATHER_DELAY'].replace(0, np.nan)

    # Return df
    return df

In [25]:
df = read_flights('flightdelays/flightdelays-2016-1.csv')
df.info().compute()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89165 entries, 0 to 89164
Data columns (total 23 columns):
FL_DATE                89165 non-null datetime64[ns]
UNIQUE_CARRIER         89165 non-null object
FL_NUM                 89165 non-null int64
ORIGIN                 89165 non-null object
ORIGIN_CITY_NAME       89165 non-null object
ORIGIN_STATE_ABR       89165 non-null object
ORIGIN_STATE_NM        89165 non-null object
DEST                   89165 non-null object
DEST_CITY_NAME         89165 non-null object
DEST_STATE_ABR         89165 non-null object
DEST_STATE_NM          89165 non-null object
CRS_DEP_TIME           89165 non-null int64
DEP_DELAY              89165 non-null float64
CRS_ARR_TIME           89165 non-null int64
ARR_DELAY              86628 non-null float64
CANCELLED              89165 non-null float64
DIVERTED               89165 non-null float64
CARRIER_DELAY          14255 non-null float64
WEATHER_DELAY          731 non-null float64
NAS_DELAY              1425

In [26]:
from glob import glob
import dask.dataframe as dd

dataframes = []
filenames = glob('flightdelays/*.csv')

In [27]:
# Loop over filenames with index filename
for filename in filenames:
    # Apply read_flights to filename; append to dataframes
    dataframes.append(read_flights(filename))

# Compute flight delays: flight_delays
flight_delays = dd.from_delayed(dataframes)

# Print average of 'WEATHER_DELAY' column of flight_delays
print(flight_delays['WEATHER_DELAY'].mean().compute())

51.29467680608365


In [28]:
# Define @delayed-function read_weather with input filename
@delayed
def read_weather(filename):
    # Read in filename: df
    df = pd.read_csv(filename, parse_dates=['Date'])

    # Clean 'PrecipitationIn'
    df['PrecipitationIn'] = pd.to_numeric(df['PrecipitationIn'],
                                         errors='coerce')

    # Create the 'Airport' column
    df['Airport'] = filename.split('.')[0].split('/')[1]

    # Return df
    return df

In [35]:
df = read_weather('weatherdata/DEN.csv')
df.compute()

Unnamed: 0,Date,Max TemperatureF,Mean TemperatureF,Min TemperatureF,Max Dew PointF,MeanDew PointF,Min DewpointF,Max Humidity,Mean Humidity,Min Humidity,...,Mean VisibilityMiles,Min VisibilityMiles,Max Wind SpeedMPH,Mean Wind SpeedMPH,Max Gust SpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees,Airport
0,2016-01-01,35,21,6,10,1,-9,80,50,19,...,10,7,20,9,22.0,0.00,0,,216,DEN
1,2016-01-02,41,29,16,11,4,-1,50,37,24,...,10,10,17,10,22.0,0.00,2,,205,DEN
2,2016-01-03,46,31,15,19,11,2,68,49,30,...,10,10,14,7,29.0,0.00,3,,193,DEN
3,2016-01-04,45,32,18,23,14,9,73,55,37,...,10,10,10,6,12.0,0.00,6,,154,DEN
4,2016-01-05,45,34,23,19,16,12,63,48,33,...,10,10,13,7,14.0,0.00,3,,246,DEN
5,2016-01-06,46,33,19,27,20,14,81,62,43,...,10,10,16,8,17.0,0.00,4,,166,DEN
6,2016-01-07,35,32,28,31,28,22,92,79,66,...,4,1,12,5,12.0,0.15,8,Snow,307,DEN
7,2016-01-08,29,24,18,27,19,15,92,83,74,...,2,0,16,6,19.0,0.23,8,Fog-Snow,38,DEN
8,2016-01-09,28,17,6,19,13,3,92,78,63,...,6,1,14,4,15.0,,6,Snow,203,DEN
9,2016-01-10,32,19,5,16,8,2,88,71,53,...,6,0,21,8,22.0,,1,Fog-Snow,242,DEN


In [39]:
weather_dfs = []
filenames = glob('weatherdata/???.csv')
print(filenames)

['weatherdata/DFW.csv', 'weatherdata/DEN.csv', 'weatherdata/MCO.csv', 'weatherdata/ATL.csv', 'weatherdata/ORD.csv']


In [40]:
# Loop over filenames with filename
for filename in filenames:
    # Invoke read_weather on filename; append result to weather_dfs
    weather_dfs.append(read_weather(filename))

# Call dd.from_delayed() with weather_dfs: weather
weather = dd.from_delayed(weather_dfs)

# Print result of weather.nlargest(1, 'Max TemperatureF')
print(weather.nlargest(1, 'Max TemperatureF').compute())

          Date  Max TemperatureF  Mean TemperatureF  Min TemperatureF  \
224 2016-08-12               107                 93                79   

     Max Dew PointF  MeanDew PointF  Min DewpointF  Max Humidity  \
224              75              71             66            79   

     Mean Humidity  Min Humidity   ...     Mean VisibilityMiles  \
224             53            27   ...                        8   

     Min VisibilityMiles  Max Wind SpeedMPH  Mean Wind SpeedMPH  \
224                    0                 41                  10   

     Max Gust SpeedMPH  PrecipitationIn  CloudCover             Events  \
224               54.0             0.82           5  Rain-Thunderstorm   

     WindDirDegrees  Airport  
224             214      DFW  

[1 rows x 24 columns]


In [47]:
weather.head()

Unnamed: 0,Date,Max TemperatureF,Mean TemperatureF,Min TemperatureF,Max Dew PointF,MeanDew PointF,Min DewpointF,Max Humidity,Mean Humidity,Min Humidity,...,Mean VisibilityMiles,Min VisibilityMiles,Max Wind SpeedMPH,Mean Wind SpeedMPH,Max Gust SpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees,Airport
0,2016-01-01,49,44,39,32,28,24,70,55,39,...,10,10,18,11,24.0,0.0,7,,355,DFW
1,2016-01-02,51,45,38,28,25,20,70,50,30,...,10,10,12,6,13.0,0.0,8,,298,DFW
2,2016-01-03,60,49,37,33,30,26,82,57,32,...,10,10,15,8,17.0,0.0,0,,314,DFW
3,2016-01-04,54,43,32,32,29,27,82,60,38,...,10,10,10,5,12.0,0.0,0,,6,DFW
4,2016-01-05,47,40,32,31,29,27,82,70,57,...,10,10,15,8,19.0,0.0,6,,121,DFW


In [42]:
# Make cleaned Boolean Series from weather['Events']: is_snowy
is_snowy = weather['Events'].str.contains('Snow').fillna(False)

# Create filtered DataFrame with weather.loc & is_snowy: got_snow
got_snow = weather.loc[is_snowy]

# Groupby 'Airport' column; select 'PrecipitationIn'; aggregate sum(): result
result = got_snow.groupby('Airport')['PrecipitationIn'].sum()

# Compute & print the value of result
print(result.compute())

Airport
DEN    5.59
ATL    1.94
ORD    3.91
Name: PrecipitationIn, dtype: float64


In [65]:
import time

def percent_delayed(df):
    return (df['WEATHER_DELAY'].count() / len(df)) * 100

weather_delays = flight_delays.merge(weather,
                                     left_on=['FL_DATE', 'ORIGIN'],
                                     right_on=['Date', 'Airport'],
                                     how='left')
weather_delays.head()

Unnamed: 0,FL_DATE,UNIQUE_CARRIER,FL_NUM,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_NM,DEST,DEST_CITY_NAME,DEST_STATE_ABR,...,Mean VisibilityMiles,Min VisibilityMiles,Max Wind SpeedMPH,Mean Wind SpeedMPH,Max Gust SpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees,Airport
0,2016-04-01,OO,5800,IAH,"Houston, TX",TX,Texas,CLT,"Charlotte, NC",NC,...,,,,,,,,,,
1,2016-04-01,UA,377,BIL,"Billings, MT",MT,Montana,DEN,"Denver, CO",CO,...,,,,,,,,,,
2,2016-04-01,UA,789,PDX,"Portland, OR",OR,Oregon,DEN,"Denver, CO",CO,...,,,,,,,,,,
3,2016-04-01,EV,3728,IAH,"Houston, TX",TX,Texas,DSM,"Des Moines, IA",IA,...,,,,,,,,,,
4,2016-04-01,UA,215,DEN,"Denver, CO",CO,Colorado,MSP,"Minneapolis, MN",MN,...,10.0,10.0,23.0,9.0,29.0,0.0,3.0,,230.0,DEN


In [66]:
# Print time in milliseconds to compute percent_delayed on weather_delays
t_start = time.time()
print(percent_delayed(weather_delays).compute())
t_end = time.time()
print((t_end-t_start)*1000)

# Call weather_delays.persist(): persisted_weather_delays
persisted_weather_delays = weather_delays.persist()

# Print time in milliseconds to compute percent_delayed on persisted_weather_delays
t_start = time.time()
print(percent_delayed(persisted_weather_delays).compute())
t_end = time.time()
print((t_end-t_start)*1000)

0.8039985937667181
15106.848001480103
0.8039985937667181
22.253036499023438


In [75]:
# Group persisted_weather_delays by 'Events': by_event
by_event = persisted_weather_delays.groupby('Events')

# Count 'by_event['WEATHER_DELAY'] column & divide by total number of delayed flights
pct_delayed = by_event['WEATHER_DELAY'].count() / persisted_weather_delays['WEATHER_DELAY'].count() * 100

# Compute & print five largest values of pct_delayed
print(pct_delayed.nlargest(5).compute())

# Calculate mean of by_event['WEATHER_DELAY'] column & return the 5 largest entries: avg_delay_time
avg_delay_time = by_event['WEATHER_DELAY'].mean().nlargest(5)

# Compute & print avg_delay_time
print(avg_delay_time.compute())

Events
Rain-Thunderstorm        14.638783
Snow                      5.975014
Fog-Snow                  2.444324
Rain                      2.199891
Fog-Rain-Thunderstorm     1.466594
Name: WEATHER_DELAY, dtype: float64
Events
Rain-Snow                77.725490
Thunderstorm             72.500000
Fog-Rain                 55.000000
Rain-Thunderstorm        47.335807
Fog-Rain-Thunderstorm    39.685185
Name: WEATHER_DELAY, dtype: float64
