In [1]:
import pandas as pd
pd.set_option("display.max_columns", None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw flights test set from CSV (path is relative to the notebook's working directory).
df = pd.read_csv("data/flights_test_raw.csv")

In [3]:
# Preview the first rows to see the available columns and value formats.
df.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2020-01-01 00:00:00.000,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363
1,2020-01-01 00:00:00.000,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363
2,2020-01-01 00:00:00.000,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",2020,2130,N,70,1,333
3,2020-01-01 00:00:00.000,WN,WN,WN,4761,WN,N551WN,4761,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",1340,1455,N,75,1,333
4,2020-01-01 00:00:00.000,WN,WN,WN,5162,WN,N968WN,5162,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",915,1035,N,80,1,333


In [4]:
# Work on an independent copy so the frame loaded from disk stays untouched.
test_raw = df.copy()

## Check for missing values (NaN)

In [5]:
def missing(x):
    """Summarise the missing values of a DataFrame, column by column.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with two columns: ``number`` (count of null
        cells) and ``percent`` (null cells divided by the number of rows, i.e.
        a fraction between 0 and 1). Only columns with at least one null value
        are kept, ordered from most to least missing.
    """
    null_mask = x.isnull()
    null_counts = null_mask.sum().sort_values(ascending=False)
    # count() on the boolean mask is the number of rows in each column.
    null_shares = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
    report = pd.concat([null_counts, null_shares], axis=1, keys=['number', 'percent'])
    has_missing = report["percent"] > 0
    return report[has_missing]

In [6]:
# Fill na function
def fill_columns(fill, columns, df):
    """Replace missing values in the given columns of ``df`` with ``fill``.

    Parameters
    ----------
    fill : scalar
        Value written wherever a cell is null.
    columns : iterable of str
        Names of the columns to fill; other columns are left unchanged.
    df : pandas.DataFrame
        Frame to modify. It is updated in place; nothing is returned.
    """
    for col in columns:
        # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form acts on a temporary object, which raises a
        # FutureWarning on pandas 2.x and leaves `df` unchanged under Copy-on-Write.
        df[col] = df[col].fillna(fill)

In [7]:
# List the columns of the raw test set that contain missing values.
missing(test_raw)

Unnamed: 0,number,percent
tail_num,1499,0.002269


In [8]:
# tail_num is the only column with missing values; drop it instead of imputing.
test_ = test_raw.drop(columns=['tail_num'])

In [9]:
# Normalise the two join keys used for the weather merge.
# - fl_date: keep only the leading YYYY-MM-DD part of the timestamp string.
# - origin_city_name: strip the trailing ", ST" state suffix ("Ontario, CA" -> "Ontario").
#   A suffix pattern is used instead of a blind `[0:-4]` slice so that re-running
#   the cell does not chop four more characters off names that were already cleaned.
# The vectorised .str accessors also pass missing values through instead of raising.
test_ = test_.assign(
    fl_date=test_["fl_date"].str[:10],
    origin_city_name=test_["origin_city_name"].str.replace(r", [A-Z]{2}$", "", regex=True),
)
test_.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2020-01-01,WN,WN,WN,5888,WN,5888,13891,ONT,Ontario,14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363
1,2020-01-01,WN,WN,WN,6276,WN,6276,13891,ONT,Ontario,14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363
2,2020-01-01,WN,WN,WN,4598,WN,4598,13891,ONT,Ontario,14831,SJC,"San Jose, CA",2020,2130,N,70,1,333
3,2020-01-01,WN,WN,WN,4761,WN,4761,13891,ONT,Ontario,14831,SJC,"San Jose, CA",1340,1455,N,75,1,333
4,2020-01-01,WN,WN,WN,5162,WN,5162,13891,ONT,Ontario,14831,SJC,"San Jose, CA",915,1035,N,80,1,333


In [10]:
# Columns carried forward from the flights table; the origin city is kept
# only so the weather data can be joined on it.
very_important_columns = [
    'fl_date',
    'crs_dep_time',
    'origin_city_name',
]
test_checkpoint = test_.filter(items=very_important_columns)

In [11]:
# Inspect the reduced flights frame (date, scheduled departure time, origin city).
test_checkpoint

Unnamed: 0,fl_date,crs_dep_time,origin_city_name
0,2020-01-01,1810,Ontario
1,2020-01-01,1150,Ontario
2,2020-01-01,2020,Ontario
3,2020-01-01,1340,Ontario
4,2020-01-01,915,Ontario
...,...,...,...
660551,2020-01-31,1859,Washington
660552,2020-01-31,1515,Washington
660553,2020-01-31,2205,New York
660554,2020-01-31,1035,Chicago


## Import and merge weather data

In [12]:
# Load the weather events table (path is relative to the notebook's working directory).
weather = pd.read_csv("data/Weather.csv")

In [13]:
# Check how the event start timestamps were parsed; the output below shows
# they are plain strings, which is why the date is extracted by slicing text.
type(weather["StartTime(UTC)"][0])

str

In [14]:
# Keep only the event type, start timestamp and city from the weather events.
# (The previous empty `pd.DataFrame()` assignment was dead code: it was
# overwritten immediately by this line.)
# NOTE(review): despite the name, no date filtering has happened yet at this point.
weather_jan_2020 = weather.filter(items=['Type', 'StartTime(UTC)', 'City'])

In [15]:
# Reduce the start timestamp to its leading YYYY-MM-DD part so it can be
# compared with the flight date. The vectorised .str slice replaces the
# Python-level list comprehension and passes missing values through
# instead of raising a TypeError.
weather_jan_2020['StartTime(UTC)'] = weather_jan_2020['StartTime(UTC)'].str[:10]

In [16]:
# List the distinct raw event types; these are the keys the remapping must cover.
weather_jan_2020.Type.unique()

array(['Snow', 'Fog', 'Cold', 'Storm', 'Rain', 'Precipitation', 'Hail'],
      dtype=object)

In [17]:
# Collapse the seven raw event types into four coarse weather labels.
# Any type missing from this mapping would become NaN after .map().
# NOTE(review): 'Cold' -> 'Sunny' is a modelling choice by the original author.
weather_dict = dict(
    Snow='Snowy',
    Fog='Cloudy',
    Cold='Sunny',
    Storm='Rainy',
    Rain='Rainy',
    Precipitation='Cloudy',
    Hail='Snowy',
)
weather_jan_2020["Type"] = weather_jan_2020["Type"].map(weather_dict)

#### checkpoint

In [18]:
# Take a real copy as the checkpoint. A plain assignment only creates a second
# name for the same object, so any later in-place change to weather_checkpoint
# (such as replacing its column labels) would silently alter weather_jan_2020 too.
weather_checkpoint = weather_jan_2020.copy()

In [19]:
# Rename the weather columns to match the flight table's join keys.
# An explicit old -> new mapping does not depend on column order, and
# rename() returns a new frame rather than relabelling the existing object.
weather_checkpoint = weather_checkpoint.rename(
    columns={
        'Type': 'weather',
        'StartTime(UTC)': 'fl_date',
        'City': 'origin_city_name',
    }
)

In [20]:
# Abandoned attempt (didn't work), kept for reference only:
# weather_checkpoint['fl_date'] = pd.to_datetime(weather_checkpoint['fl_date'], format='%Y-%m-%d')
# weather_checkpoint['fl_date'] = pd.DatetimeIndex(weather_checkpoint['fl_date']).day
# weather_grouped = weather_checkpoint.groupby(by=['fl_date','origin_city_name']).mean()

In [21]:
# Keep only weather events starting in the first week of January 2020.
# The dates are ISO-formatted strings, so plain string comparison orders them correctly.
week_start, week_end = '2020-01-01', '2020-01-07'
event_dates = weather_checkpoint['fl_date']
in_first_week = (event_dates >= week_start) & (event_dates <= week_end)
weather_checkpoint = weather_checkpoint[in_first_week]

In [22]:
# Confirm which weather labels remain after the date filter.
weather_checkpoint.weather.unique()

array(['Snowy', 'Sunny', 'Cloudy', 'Rainy'], dtype=object)

In [23]:
# Unused ordinal encoding of the weather labels, kept for reference only:
# weather_to_numeric = {
#     'Snowy': 4,
#     'Rainy': 3,
#     'Sunny': 2,
#     'Cloudy': 1
# }

In [24]:
# Re-check the weather labels; this repeats the check made right after the date filter.
weather_checkpoint.weather.unique()

array(['Snowy', 'Sunny', 'Cloudy', 'Rainy'], dtype=object)

In [25]:
# Preview the filtered events. A city can have several events on the same day,
# so the next step is to reduce them to one row per (city, date).
weather_checkpoint.head() # now group fl_date

Unnamed: 0,weather,fl_date,origin_city_name
1721,Snowy,2020-01-01,Saguache
1722,Snowy,2020-01-02,Saguache
1723,Snowy,2020-01-02,Saguache
1724,Snowy,2020-01-02,Saguache
1725,Snowy,2020-01-02,Saguache


In [26]:
def _most_frequent(values):
    """Return the most common value of a Series (first entry of value_counts)."""
    return values.value_counts().index[0]


# Reduce to one row per (city, date) by keeping the most frequent weather label of the day.
# NOTE(review): the key is the bare city name, so same-named cities are pooled together.
weather_checkpoint = (
    weather_checkpoint
    .groupby(['origin_city_name', 'fl_date'])['weather']
    .apply(_most_frequent)
    .reset_index()
)

In [27]:
# Preview the result: one weather label per (origin_city_name, fl_date).
weather_checkpoint.head()

Unnamed: 0,origin_city_name,fl_date,weather
0,Abbeville,2020-01-02,Rainy
1,Abbeville,2020-01-03,Rainy
2,Abbeville,2020-01-04,Cloudy
3,Aberdeen,2020-01-02,Rainy
4,Aberdeen,2020-01-03,Rainy


### Merge flights with weather

In [28]:
# Attach the daily weather label to each flight via (fl_date, origin_city_name).
# - how="inner" is spelled out because it is what the default already did:
#   flights with no matching weather row are DROPPED, not kept with NaN.
# - validate="many_to_one" makes pandas raise if the weather side ever has more
#   than one row per key, which would otherwise silently duplicate flights.
# NOTE(review): in the recorded outputs the flights frame ends at index 660554
# and the merged frame at 59298, so most flights are lost here. Confirm that
# restricting predictions to the matched flights is intended.
flights_weather = test_checkpoint.merge(
    weather_checkpoint,
    on=['fl_date', 'origin_city_name'],
    how='inner',
    validate='many_to_one',
)

In [29]:
# Preview the merged frame to check that a weather column was attached.
flights_weather.head() # success?

Unnamed: 0,fl_date,crs_dep_time,origin_city_name,weather
0,2020-01-01,1810,Ontario,Rainy
1,2020-01-01,1150,Ontario,Rainy
2,2020-01-01,2020,Ontario,Rainy
3,2020-01-01,1340,Ontario,Rainy
4,2020-01-01,915,Ontario,Rainy


In [30]:
# Count nulls per column. Zero nulls is expected here regardless of match quality:
# merge() defaults to an inner join, so unmatched flights are removed rather than
# kept with NaN weather. Compare row counts to judge how many flights matched.
flights_weather.isna().sum() # YES!!!

fl_date             0
crs_dep_time        0
origin_city_name    0
weather             0
dtype: int64

In [31]:
# The city was only needed as a join key; remove it now that weather is attached.
flights_weather = flights_weather.drop(columns=['origin_city_name'])

In [32]:
# Preview the frame after dropping the city column.
flights_weather.head()

Unnamed: 0,fl_date,crs_dep_time,weather
0,2020-01-01,1810,Rainy
1,2020-01-01,1150,Rainy
2,2020-01-01,2020,Rainy
3,2020-01-01,1340,Rainy
4,2020-01-01,915,Rainy


In [33]:
# Checkpoint: take a real copy. A plain assignment would make df_ a second name
# for flights_weather, so in-place edits to df_ would also change the "checkpoint".
df_ = flights_weather.copy()

## Remap crs_dep_time to a departure-hour weight

In [34]:
# Weight per scheduled departure hour:
#   00-05 -> 0.1, 06-10 -> 0.8, 11-15 -> 1.0, 16-20 -> 0.8, 21-23 -> 0.5
# (The previous header comment listed different bands and weights than the
# mapping actually used; the mapping values are unchanged.)
crs_dep_time_remap = {
    hour: weight
    for hours, weight in (
        (range(0, 6), 0.10),
        (range(6, 11), 0.80),
        (range(11, 16), 1.0),
        (range(16, 21), 0.80),
        (range(21, 24), 0.50),
    )
    for hour in hours
}

# crs_dep_time is an HHMM integer (e.g. 1810), so integer division by 100 gives the hour.
# Derive from flights_weather and build a new frame instead of overwriting the
# column in place: re-running the cell then gives the same result, whereas the
# in-place `// 100` divided the already-converted hours again (18 -> 0).
# Hours outside 0-23 are not in the mapping and would yield a NaN weight.
dep_hour = flights_weather["crs_dep_time"] // 100
df_ = flights_weather.assign(
    crs_dep_time=dep_hour,
    dep_time_hour_weight=dep_hour.map(crs_dep_time_remap),
)

In [35]:
# Verify the hour mapping left no NaN weights (an hour missing from the mapping would show up here).
df_.isna().sum()

fl_date                 0
crs_dep_time            0
weather                 0
dep_time_hour_weight    0
dtype: int64

In [36]:
# Extract the two-digit month ("MM") from the YYYY-MM-DD date string.
# The vectorised .str slice replaces the Python-level list comprehension and
# passes missing values through instead of raising. The value stays a string.
# TODO (original author): convert fl_date to datetime and also derive the day of week.
df_["month"] = df_["fl_date"].str[5:7]

In [37]:
# Remove the date now that the month has been extracted.
# Original author's note: don't drop it next time (no date remains in the output).
df_ = df_.drop(columns=["fl_date"])

In [38]:
# The saved model expects a month_weight feature; use a constant 0.5 for every row.
# NOTE(review): presumably a placeholder; confirm the value the model was trained with.
df_ = df_.assign(month_weight=0.5)

In [39]:
# Inspect the feature frame before encoding the weather column.
df_

Unnamed: 0,crs_dep_time,weather,dep_time_hour_weight,month,month_weight
0,18,Rainy,0.8,01,0.5
1,11,Rainy,1.0,01,0.5
2,20,Rainy,0.8,01,0.5
3,13,Rainy,1.0,01,0.5
4,9,Rainy,0.8,01,0.5
...,...,...,...,...,...
59295,6,Snowy,0.8,01,0.5
59296,5,Rainy,0.1,01,0.5
59297,16,Rainy,0.8,01,0.5
59298,17,Snowy,0.8,01,0.5


## One-hot encode weather

In [40]:
# One-hot encode the weather label, with 'Cloudy' as the dropped baseline.
# Fixing the category list makes the output columns independent of which labels
# happen to occur in this data: without it, a label that is absent produces no
# column, and drop_first removes whichever label sorts first among those present.
# dtype=int keeps the dummies as 0/1 on every pandas version (newer versions
# default to True/False).
# NOTE(review): assumes the model was trained on weather_Rainy, weather_Snowy and
# weather_Sunny; confirm against the training notebook.
WEATHER_CATEGORIES = ['Cloudy', 'Rainy', 'Snowy', 'Sunny']
df_ = df_.assign(
    weather=pd.Categorical(df_['weather'], categories=WEATHER_CATEGORIES)
)
df_ = pd.get_dummies(df_, columns=['weather'], drop_first=True, dtype=int)
df_.head()

Unnamed: 0,crs_dep_time,dep_time_hour_weight,month,month_weight,weather_Rainy,weather_Snowy,weather_Sunny
0,18,0.8,1,0.5,1,0,0
1,11,1.0,1,0.5,1,0,0
2,20,0.8,1,0.5,1,0,0
3,13,1.0,1,0.5,1,0,0
4,9,0.8,1,0.5,1,0,0


In [41]:
# Load the saved decision-tree model and predict for every row of df_.
# ("0.08" is the original author's note, presumably the model's score; TODO confirm.)
# NOTE(review): pickle.load can execute arbitrary code, so only load a model file you trust.
import pickle

with open('finalized_Decision_Tree.sav', 'rb') as model_file:
    dt = pickle.load(model_file)

# Columns are passed as they are in df_; they must match what the model was fitted on.
y_pred_dt = dt.predict(df_)

In [42]:
# Peek at the first 200 predictions as a sanity check.
y_pred_dt[:200]

array([28., 27., 19., 35., 31., 36., 36., 13., 35., 35., 39., 19.,  3.,
       28., 35., 35., 36., 24., 35., 13., 24., 27., 36., 31., 36.,  7.,
       31., 36., 39., 27., 39., 31., 39., 36., 13., 35., 38., 35., 36.,
       19., 27., 35., 24., 13., 27., 38., 19., 35., 35., 31., 39., 35.,
       36.,  7., 24.,  7., 35., 39., 38., 36., 31., 31., 31.,  7., 27.,
       35., 24., 27., 31., 35., 13., 19., 36., 27., 39., 35., 28., 36.,
       18., 27., 31., 36., 19., 35., 27., 39., 36., 19., 35., 35., 28.,
       35., 31., 31., 19., 38., 36., 35., 31., 39., 36., 31., 27., 35.,
       24., 13., 13., 38., 38., 36., 36., 35., 35., 36., 24., 36., 36.,
       31., 35., 39., 35., 36., 39., 35., 35., 19., 27., 19., 36., 39.,
       28.,  3., 36., 28.,  3., 35., 27., 35., 27., 38., 27., 39., 28.,
       38., 35., 31., 31., 31.,  3., 36., 13., 31., 13., 36., 13., 24.,
       35., 27., 35., 35., 24., 19., 31., 35., 35., 36., 35., 27., 36.,
       35.,  7.,  7.,  7., 35., 27., 28., 27., 31., 18., 13., 36

## Save CSV for submission

In [43]:
# Start the submission from a copy of the feature frame. With a plain assignment,
# adding the prediction column to final_submission would also add it to df_, and
# re-running the prediction cell would then feed arr_delay to the model as a feature.
final_submission = df_.copy()

In [44]:
# Attach the model predictions as the arr_delay column (one value per row, in row order).
final_submission['arr_delay'] = y_pred_dt

In [45]:
# Inspect the submission frame: feature columns plus the predicted arr_delay.
final_submission

Unnamed: 0,crs_dep_time,dep_time_hour_weight,month,month_weight,weather_Rainy,weather_Snowy,weather_Sunny,arr_delay
0,18,0.8,01,0.5,1,0,0,28.0
1,11,1.0,01,0.5,1,0,0,27.0
2,20,0.8,01,0.5,1,0,0,19.0
3,13,1.0,01,0.5,1,0,0,35.0
4,9,0.8,01,0.5,1,0,0,31.0
...,...,...,...,...,...,...,...,...
59295,6,0.8,01,0.5,0,1,0,38.0
59296,5,0.1,01,0.5,1,0,0,18.0
59297,16,0.8,01,0.5,1,0,0,36.0
59298,17,0.8,01,0.5,0,1,0,32.0


In [47]:
# Write the submission (feature columns plus predicted arr_delay) to CSV without the index.
# NOTE(review): the date and every flight identifier were dropped earlier, so rows in this
# file cannot be matched back to individual flights. Confirm the expected submission format.
final_submission.to_csv('lou_nik_submission.csv', index=False) 