In [1]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from mlxtend.regressor import StackingCVRegressor
import joblib
from sklearn.impute import KNNImputer, MissingIndicator
from sklearn.pipeline import FeatureUnion, make_pipeline


In [52]:
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

In [3]:
flights_training = pd.read_csv("../input/split/flights_training.csv", index_col="Unnamed: 0")

  mask |= (ar1 == a)


In [8]:
flights_validation = pd.read_csv("../input/split/flights_validation.csv", index_col="Unnamed: 0")
flights_validation.head()

Unnamed: 0,id,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,...,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,LATITUDE_origin,LONGITUDE_origin,LATITUDE_arrival,LONGITUDE_arrival
1123620,123408,2015,1,9,5,AA,1595,N499AA,AUS,DFW,...,9.0,651.0,65.0,190,755,-18.0,30.19453,-97.66987,32.89595,-97.0372
1388925,957303,2015,3,7,6,WN,123,N644SW,CMH,STL,...,7.0,912.0,90.0,409,940,-14.0,39.99799,-82.89188,38.74769,-90.35999
999832,2092628,2015,5,17,7,DL,1918,N690DL,SNA,ATL,...,11.0,850.0,259.0,1919,1604,-22.0,33.67566,-117.86822,33.64044,-84.42694
561882,624558,2015,2,12,4,AA,2384,N3DTAA,LAS,LAX,...,16.0,1714.0,67.0,236,1702,60.0,36.08036,-115.15233,33.94254,-118.40807
2324779,2004474,2015,5,11,1,MQ,3168,N3AEMQ,ORD,OKC,...,13.0,1714.0,138.0,693,1913,-10.0,41.9796,-87.90446,35.39309,-97.60073


In [4]:
flights_training.head()

Unnamed: 0,id,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,...,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,LATITUDE_origin,LONGITUDE_origin,LATITUDE_arrival,LONGITUDE_arrival
106842,635691,2015,2,13,5,WN,315,N725SW,ONT,PHX,...,6.0,1043.0,65.0,325,1245,-4.0,34.056,-117.60119,33.43417,-112.00806
2794621,1371861,2015,4,1,3,MQ,3120,N657MQ,DFW,GRI,...,14.0,2117.0,115.0,561,2210,45.0,32.89595,-97.0372,40.96747,-98.30861
2738024,489731,2015,2,3,2,EV,5212,N852AS,DTW,HPN,...,13.0,1546.0,104.0,505,1600,73.0,42.21206,-83.34884,41.06696,-73.70757
336049,2466470,2015,6,9,2,UA,283,N496UA,LAX,ORD,...,14.0,1517.0,254.0,1744,2059,-4.0,33.94254,-118.40807,41.9796,-87.90446
2640700,1398962,2015,4,3,5,OO,7363,N449SW,MSP,ABR,...,13.0,1356.0,77.0,257,1502,-20.0,44.88055,-93.21692,45.44906,-98.42183


In [5]:
flights_training.columns

Index(['id', 'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
       'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'TAXI_OUT', 'WHEELS_OFF',
       'SCHEDULED_TIME', 'DISTANCE', 'SCHEDULED_ARRIVAL', 'ARRIVAL_DELAY',
       'LATITUDE_origin', 'LONGITUDE_origin', 'LATITUDE_arrival',
       'LONGITUDE_arrival'],
      dtype='object')

In [57]:
numerical_cols = [cname for cname in flights_training.columns if flights_training[cname].dtype in ['int64', 'float64']]
numerical_cols

['id',
 'YEAR',
 'MONTH',
 'DAY',
 'DAY_OF_WEEK',
 'FLIGHT_NUMBER',
 'SCHEDULED_DEPARTURE',
 'DEPARTURE_TIME',
 'TAXI_OUT',
 'WHEELS_OFF',
 'SCHEDULED_TIME',
 'DISTANCE',
 'SCHEDULED_ARRIVAL',
 'ARRIVAL_DELAY',
 'LATITUDE_origin',
 'LONGITUDE_origin',
 'LATITUDE_arrival',
 'LONGITUDE_arrival']

In [40]:
flights_training["YEAR"].unique()
flights_training["MONTH"].unique()
flights_training["DAY"].unique()
flights_training["AIRLINE"].unique()
flights_training["FLIGHT_NUMBER"].nunique()


6596

In [6]:
# Number of missing values in each column of training data
missing_val_count_by_column = (flights_training.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

LATITUDE_origin      1917
LONGITUDE_origin     1917
LATITUDE_arrival     1931
LONGITUDE_arrival    1931
dtype: int64


In [33]:
flights_training[flights_training.isnull().any(axis=1)]

Unnamed: 0,id,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,...,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,LATITUDE_origin,LONGITUDE_origin,LATITUDE_arrival,LONGITUDE_arrival
838001,1643376,2015,4,19,7,EV,4579,N12166,ECP,IAH,...,15.0,641.0,119.0,572,801,22.0,,,29.98047,-95.33972
2801122,9327,2015,1,1,4,DL,1958,N937AT,ATL,ECP,...,11.0,1657.0,69.0,240,1659,-18.0,33.64044,-84.42694,,
2801722,1600570,2015,4,16,4,DL,2427,N921AT,ATL,ECP,...,11.0,915.0,62.0,240,907,-7.0,33.64044,-84.42694,,
1018176,1531626,2015,4,12,7,DL,2616,N973DL,ECP,ATL,...,12.0,603.0,69.0,240,759,-13.0,,,33.64044,-84.42694
2800788,2504811,2015,6,11,4,EV,4574,N16911,IAH,ECP,...,39.0,1951.0,115.0,572,2109,12.0,29.98047,-95.33972,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2801445,883743,2015,3,2,1,DL,1733,N935DL,ATL,ECP,...,29.0,1126.0,70.0,240,1111,0.0,33.64044,-84.42694,,
2801742,1653298,2015,4,19,7,DL,1403,N343NB,ATL,ECP,...,14.0,1647.0,68.0,240,1638,-6.0,33.64044,-84.42694,,
2802442,1334876,2015,3,30,1,WN,767,N955WN,STL,ECP,...,6.0,1758.0,100.0,634,1800,90.0,38.74769,-90.35999,,
838003,1659443,2015,4,20,1,EV,4579,N12167,ECP,IAH,...,13.0,614.0,119.0,572,801,11.0,,,29.98047,-95.33972


In [36]:
type(flights_training.ORIGIN_AIRPORT)

pandas.core.series.Series

In [58]:
flights_training_new = flights_training
flights_training_new.head()

Unnamed: 0,id,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,LATITUDE_origin,LONGITUDE_origin,LATITUDE_arrival,LONGITUDE_arrival
106842,635691,2015,2,13,5,WN,315,N725SW,ONT,PHX,1040,1037.0,6.0,1043.0,65.0,325,1245,-4.0,34.056,-117.60119,33.43417,-112.00806
2794621,1371861,2015,4,1,3,MQ,3120,N657MQ,DFW,GRI,2015,2103.0,14.0,2117.0,115.0,561,2210,45.0,32.89595,-97.0372,40.96747,-98.30861
2738024,489731,2015,2,3,2,EV,5212,N852AS,DTW,HPN,1416,1533.0,13.0,1546.0,104.0,505,1600,73.0,42.21206,-83.34884,41.06696,-73.70757
336049,2466470,2015,6,9,2,UA,283,N496UA,LAX,ORD,1445,1503.0,14.0,1517.0,254.0,1744,2059,-4.0,33.94254,-118.40807,41.9796,-87.90446
2640700,1398962,2015,4,3,5,OO,7363,N449SW,MSP,ABR,1345,1343.0,13.0,1356.0,77.0,257,1502,-20.0,44.88055,-93.21692,45.44906,-98.42183


In [9]:
from sklearn.preprocessing import OneHotEncoder

OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(flights_training[["AIRLINE"]]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(flights_validation[["AIRLINE"]]))

# One-hot encoding removed index; put it back
OH_cols_train.index = flights_training.index
OH_cols_valid.index = flights_validation.index

# Remove categorical columns (will replace with one-hot encoding)
num_flight_train = flights_training.drop(["AIRLINE", "id", "YEAR", "FLIGHT_NUMBER", "TAIL_NUMBER", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT"], axis=1)
num_flight_valid = flights_validation.drop(["AIRLINE", "id", "YEAR", "FLIGHT_NUMBER", "TAIL_NUMBER", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT"], axis=1)

# Add one-hot encoded columns to numerical features
OH_flight_train = pd.concat([num_flight_train, OH_cols_train], axis=1)
OH_flight_valid = pd.concat([num_flight_valid, OH_cols_valid], axis=1)

OH_flight_train.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,SCHEDULED_DEPARTURE,DEPARTURE_TIME,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,...,4,5,6,7,8,9,10,11,12,13
106842,2,13,5,1040,1037.0,6.0,1043.0,65.0,325,1245,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2794621,4,1,3,2015,2103.0,14.0,2117.0,115.0,561,2210,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2738024,2,3,2,1416,1533.0,13.0,1546.0,104.0,505,1600,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
336049,6,9,2,1445,1503.0,14.0,1517.0,254.0,1744,2059,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2640700,4,3,5,1345,1343.0,13.0,1356.0,77.0,257,1502,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [60]:
y_train = OH_flight_train[["ARRIVAL_DELAY"]]
X_train = OH_flight_train.drop(["ARRIVAL_DELAY"], axis=1)

y_val = OH_flight_valid[["ARRIVAL_DELAY"]]
X_val = OH_flight_valid.drop(["ARRIVAL_DELAY"], axis=1)

X_train.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,SCHEDULED_DEPARTURE,DEPARTURE_TIME,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,LATITUDE_origin,LONGITUDE_origin,LATITUDE_arrival,LONGITUDE_arrival,0,1,2,3,4,5,6,7,8,9,10,11,12,13
106842,2,13,5,1040,1037.0,6.0,1043.0,65.0,325,1245,34.056,-117.60119,33.43417,-112.00806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2794621,4,1,3,2015,2103.0,14.0,2117.0,115.0,561,2210,32.89595,-97.0372,40.96747,-98.30861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2738024,2,3,2,1416,1533.0,13.0,1546.0,104.0,505,1600,42.21206,-83.34884,41.06696,-73.70757,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
336049,6,9,2,1445,1503.0,14.0,1517.0,254.0,1744,2059,33.94254,-118.40807,41.9796,-87.90446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2640700,4,3,5,1345,1343.0,13.0,1356.0,77.0,257,1502,44.88055,-93.21692,45.44906,-98.42183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
transformer = FeatureUnion(
			transformer_list=[
			('features', KNNImputer(n_neighbors=3)),
			('indicators', MissingIndicator())])
imputer = transformer.fit(X_train, y_train)

In [49]:
svr = make_pipeline(KNNImputer(n_neighbors=2), RobustScaler(), SVR(C= 20, epsilon= 0.2))
rf = make_pipeline(SimpleImputer(strategy = 'mean'), RobustScaler(), RandomForestRegressor())
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =42)
xgboost = XGBRegressor(
                        learning_rate=0.05,n_estimators=3460,
                        max_depth=5, min_child_weight=1,
                        gamma=0, subsample=0.7,
                        colsample_bytree=0.7,
                        objective='reg:linear', nthread=-1,
                        scale_pos_weight=1, seed=27,
                        reg_alpha=0.00006
)

stack_gen = StackingCVRegressor(regressors=(svr, rf, gbr, xgboost),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

In [61]:
svr_fit = svr.fit(X_train, y_train)
rf_fit = rf.fit(X_train, y_train)
gbr_fit = gbr.fit(X_train, y_train)
xgboost_fit = xgboost.fit(X_train, y_train)
stack_gen_fit = stack_gen.fit(X_train, y_train)

  return f(**kwargs)


In [2]:
y_pred = xgboost_fit.predict(X_val)
mean_squared_error(y_val, y_pred)

NameError: name 'xgboost_fit' is not defined