# Features to represent weather of day

In [1]:
# import Modules
import pandas as pd

In [2]:
#Function for getting weather from scripts
def getting_weather(df,start_date,end_date):
    '''Given the weather dataframe and string of dates of interest (YYYY-MM-DD), provides weather data for dates provided and airport code.  '''
    filter1 = (df['StartTime(UTC)'] > start_date) & (df['StartTime(UTC)'] <= end_date)
    df = df.loc[filter1]
    df2 = df[['Type','Severity','StartTime(UTC)','iata_code']]
    return df2


In [3]:
#For testing purposes, importing a file from training data (not on github)
test = pd.read_csv('fl_samples.csv')

In [4]:
# Read the relevant weather_all.csv into a dataframe
weather = pd.read_csv('../../data/Weather_data/weather_all.csv')
#Plug and play example, changing dates
df_weather = getting_weather(weather,'2019-01-01','2019-01-31')

In [24]:
test.head(10)

Unnamed: 0,fl_date,origin,dep_delay,arr_delay,air_time
0,2019-01-01,ABE,-22.0,-30.0,136.0
1,2019-01-01,ABE,-9.0,-19.0,102.0
2,2019-01-01,ABE,-9.0,-6.0,111.0
3,2019-01-01,ABE,-9.0,-3.0,104.0
4,2019-01-01,ABE,-6.0,-25.0,86.0
5,2019-01-01,ABE,-6.0,-10.0,22.0
6,2019-01-01,ABE,-5.0,-4.0,134.0
7,2019-01-01,ABE,-3.0,-9.0,113.0
8,2019-01-01,ABE,-1.0,-8.0,86.0
9,2019-01-01,ABE,-1.0,11.0,123.0


In [25]:
test.shape

(165811, 5)

In [5]:
df_weather.head(10)

Unnamed: 0,Type,Severity,StartTime(UTC),iata_code
257,Rain,Light,2019-01-06,PSP
258,Rain,Light,2019-01-06,PSP
259,Fog,Moderate,2019-01-07,PSP
260,Rain,Light,2019-01-12,PSP
261,Rain,Light,2019-01-12,PSP
262,Rain,Light,2019-01-14,PSP
263,Rain,Moderate,2019-01-14,PSP
264,Rain,Light,2019-01-14,PSP
265,Rain,Moderate,2019-01-14,PSP
266,Rain,Light,2019-01-15,PSP


In [10]:
df_weather.dtypes

Type              object
Severity          object
StartTime(UTC)    object
iata_code         object
dtype: object

In [6]:
#renaming column to merge dataframes
df_weather = df_weather.rename(columns = {'iata_code':'origin', 'StartTime(UTC)':'fl_date'})

In [7]:
# Drop duplicates in the data frame
df_weather = df_weather.drop_duplicates(['origin','fl_date','Severity','Type'])

In [26]:
#Merge the data frame
test.merge(df_weather , how = 'left', on = ['origin', 'fl_date'])

Unnamed: 0,fl_date,origin,dep_delay,arr_delay,air_time,Type,Severity
0,2019-01-01,ABE,-22.0,-30.0,136.0,,
1,2019-01-01,ABE,-9.0,-19.0,102.0,,
2,2019-01-01,ABE,-9.0,-6.0,111.0,,
3,2019-01-01,ABE,-9.0,-3.0,104.0,,
4,2019-01-01,ABE,-6.0,-25.0,86.0,,
...,...,...,...,...,...,...,...
321029,2019-01-08,YKM,36.0,25.0,28.0,Snow,Light
321030,2019-01-08,YKM,36.0,25.0,28.0,Rain,Light
321031,2019-01-08,YUM,-6.0,18.0,40.0,,
321032,2019-01-08,YUM,-3.0,-4.0,30.0,,


In [8]:
# Drop duplicates in the data frame
df_weather 

Unnamed: 0,Type,Severity,fl_date,origin
257,Rain,Light,2019-01-06,PSP
259,Fog,Moderate,2019-01-07,PSP
260,Rain,Light,2019-01-12,PSP
262,Rain,Light,2019-01-14,PSP
263,Rain,Moderate,2019-01-14,PSP
...,...,...,...,...
971784,Rain,Heavy,2019-01-20,GSP
971786,Rain,Light,2019-01-23,GSP
971788,Rain,Light,2019-01-24,GSP
971789,Rain,Moderate,2019-01-24,GSP


In [9]:
df_dummy_weather = df_weather.copy()

In [10]:
#check to see what things there are, that I can eliminate (Other and Unk can be in the same column)
df_dummy_weather["Severity"].unique()

array(['Light', 'Moderate', 'Heavy', 'UNK', 'Severe', 'Other'],
      dtype=object)

In [11]:
df_dummy_weather["Type"].unique()

array(['Rain', 'Fog', 'Precipitation', 'Snow', 'Storm', 'Cold', 'Hail'],
      dtype=object)

In [12]:
df_dummy_weather

Unnamed: 0,Type,Severity,fl_date,origin
257,Rain,Light,2019-01-06,PSP
259,Fog,Moderate,2019-01-07,PSP
260,Rain,Light,2019-01-12,PSP
262,Rain,Light,2019-01-14,PSP
263,Rain,Moderate,2019-01-14,PSP
...,...,...,...,...
971784,Rain,Heavy,2019-01-20,GSP
971786,Rain,Light,2019-01-23,GSP
971788,Rain,Light,2019-01-24,GSP
971789,Rain,Moderate,2019-01-24,GSP


In [13]:
# combine the UNK severity type with Other, simplifies, may also need the same code to convert NaNs
df_dummy_weather = df_dummy_weather.replace('UNK', 'Other')

In [14]:
df_dummy_weather["Severity"].unique()

array(['Light', 'Moderate', 'Heavy', 'Other', 'Severe'], dtype=object)

In [15]:
#replace severity as ranking, leave 0 for when you have NaN's as "Sunny" - Will be useful with NaNs
severity_nums = {"Severity": {"Sunny": 0, "Other": 1, "Light": 2, "Moderate": 3, "Heavy": 4, "Severe": 5}} 
df_dummy_weather = df_dummy_weather.replace(severity_nums)

In [16]:
df_dummy_weather.shape

(8958, 4)

In [17]:
#Get dummies for the Type of weather
df_dummy_final = pd.get_dummies(df_dummy_weather,columns = ['Type'])

In [18]:
df_dummy_final

Unnamed: 0,Severity,fl_date,origin,Type_Cold,Type_Fog,Type_Hail,Type_Precipitation,Type_Rain,Type_Snow,Type_Storm
257,2,2019-01-06,PSP,0,0,0,0,1,0,0
259,3,2019-01-07,PSP,0,1,0,0,0,0,0
260,2,2019-01-12,PSP,0,0,0,0,1,0,0
262,2,2019-01-14,PSP,0,0,0,0,1,0,0
263,3,2019-01-14,PSP,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
971784,4,2019-01-20,GSP,0,0,0,0,1,0,0
971786,2,2019-01-23,GSP,0,0,0,0,1,0,0
971788,2,2019-01-24,GSP,0,0,0,0,1,0,0
971789,3,2019-01-24,GSP,0,0,0,0,1,0,0


In [65]:
#Merging with Test data to get sample final results
df1 = test.merge(df_dummy_final , how = 'left', on = ['origin', 'fl_date'])

In [68]:
df1['Type_Rain'] = df1['Type_Rain'].fillna(0)
df1

Unnamed: 0,fl_date,origin,dep_delay,arr_delay,air_time,Severity,Type_Cold,Type_Fog,Type_Hail,Type_Precipitation,Type_Rain,Type_Snow,Type_Storm
0,2019-01-01,ABE,-22.0,-30.0,136.0,,,,,,0.0,,
1,2019-01-01,ABE,-9.0,-19.0,102.0,,,,,,0.0,,
2,2019-01-01,ABE,-9.0,-6.0,111.0,,,,,,0.0,,
3,2019-01-01,ABE,-9.0,-3.0,104.0,,,,,,0.0,,
4,2019-01-01,ABE,-6.0,-25.0,86.0,,,,,,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
209178,2019-01-08,YKM,36.0,25.0,28.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
209179,2019-01-08,YKM,36.0,25.0,28.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
209180,2019-01-08,YUM,-6.0,18.0,40.0,,,,,,0.0,,
209181,2019-01-08,YUM,-3.0,-4.0,30.0,,,,,,0.0,,


Creating functions for generalized use

In [22]:
def merging_weather(df,df2):
    '''Merges weather dataframe of interest into features table based on dates and origin city (IATA CODE), 
    Where the first dataframe is weather, and the second is table of features.'''
    df = df.rename(columns = {'iata_code':'origin', 'StartTime(UTC)':'fl_date'})
    df = df.drop_duplicates(['origin','fl_date','Severity','Type'])
    df = pd.get_dummies(df,columns = ['Type'])
    final = df2.merge(df, how = 'left', on = ['origin', 'fl_date'])
    return final

In [21]:
def cleaning_weather(df):
    ''' Cleans the weather data for the dataframe provided, replacing NaNs with Zeros for no weather, and no severity. '''
    severity_nums = {"Severity": {"Sunny": 0, "Other": 1, "Light": 2, "Moderate": 3, "Heavy": 4, "Severe": 5}} 
    df = df.replace(severity_nums)
    df['Type_Rain'] = df['Type_Rain'].fillna(0)
    df['Type_Hail'] = df['Type_Hail'].fillna(0)
    df['Type_Cold'] = df['Type_Cold'].fillna(0)
    df['Type_Fog'] = df['Type_Fog'].fillna(0)
    df['Type_Snow'] = df['Type_Snow'].fillna(0)
    df['Type_Storm'] = df['Type_Storm'].fillna(0)
    df['Type_Precipitation'] = df['Type_Precipitation'].fillna(0)
    df['Severity'] = df['Severity'].fillna(0)

In [23]:
df3 = merging_weather(df_weather,test)

In [24]:
cleaning_weather(df3)

KeyError: 'Type_Rain'