In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

In [3]:
loaded_model = pickle.load(open('classifier_random.pkl', 'rb'))

In [4]:
loaded_model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [5]:
df = pd.read_csv('D:/Downloads/df2016.csv')

In [7]:
df['FlightDate'] = pd.to_datetime(df['FlightDate'])
df['FlightArrDate'] = pd.to_datetime(df['FlightArrDate'])
df['Origin'] = str(df['Origin'])
df['Dest'] = str(df['Dest'])
df['index'] = np.arange(len(df)) 

In [8]:
#Features to consider
df_using = df[['index','pressure_x','pressure_y','DewPointF_y','DewPointF_x','WindChillF_x','WindChillF_y','RoundedCRSDepTime','WindGustKmph_y','precipMM_x','precipMM_y','humidity_y','tempF_y','windspeedKmph_y','windspeedKmph_x','weatherCode_x','WindGustKmph_x','humidity_x','tempF_x','weatherCode_y','RoundedCRSArrTime']]

In [9]:
#Train test split for classifier
X_train_class,X_test_class,y_train_class,y_test_class  = train_test_split(df_using ,df.ArrDel15,random_state = 42,test_size = 0.2)

In [10]:
X_train_class.keys()

Index(['index', 'pressure_x', 'pressure_y', 'DewPointF_y', 'DewPointF_x',
       'WindChillF_x', 'WindChillF_y', 'RoundedCRSDepTime', 'WindGustKmph_y',
       'precipMM_x', 'precipMM_y', 'humidity_y', 'tempF_y', 'windspeedKmph_y',
       'windspeedKmph_x', 'weatherCode_x', 'WindGustKmph_x', 'humidity_x',
       'tempF_x', 'weatherCode_y', 'RoundedCRSArrTime'],
      dtype='object')

In [11]:
#Creating a new dataframe to extract the test set of our classifier
new_df = X_test_class

In [12]:
#Predicting whether or not the flight would be delayed on arrival
new_df['y_pred_class'] = loaded_model.predict(X_test_class)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [13]:
#Using only those values for which delay is predicted
new_df = new_df[new_df['y_pred_class'] == 1 ]
new_df

Unnamed: 0,index,pressure_x,pressure_y,DewPointF_y,DewPointF_x,WindChillF_x,WindChillF_y,RoundedCRSDepTime,WindGustKmph_y,precipMM_x,...,tempF_y,windspeedKmph_y,windspeedKmph_x,weatherCode_x,WindGustKmph_x,humidity_x,tempF_x,weatherCode_y,RoundedCRSArrTime,y_pred_class
413057,413057,1014,1017,59,24,62,61,800,12,0.0,...,62,10,5,113,6,39,61,266,900.0,1.0
541520,541520,1004,1012,53,36,66,64,1500,18,0.9,...,64,13,21,353,23,35,65,353,2100.0,1.0
849887,849887,1014,1018,74,73,83,86,1600,29,1.0,...,86,14,19,386,30,68,81,356,1700.0,1.0
475618,475618,1009,1004,43,57,63,65,1500,17,0.0,...,65,15,20,116,23,82,65,113,1600.0,1.0
91548,91548,1016,1011,66,59,66,85,1200,31,0.0,...,87,26,10,119,11,79,67,113,1700.0,1.0
89555,89555,1007,1020,56,37,76,58,2100,9,0.0,...,58,7,27,113,29,35,74,116,2200.0,1.0
42696,42696,1010,1011,34,49,49,28,1700,18,1.3,...,29,16,12,353,17,92,52,332,1800.0,1.0
542363,542363,1012,1017,53,65,68,61,900,8,1.3,...,61,7,15,389,17,93,67,122,1000.0,1.0
814596,814596,1011,1012,72,71,87,80,1700,14,0.0,...,79,7,7,116,8,35,86,176,1900.0,1.0
814227,814227,1020,1013,55,53,63,89,1100,10,0.6,...,92,9,8,353,10,72,64,113,1500.0,1.0


In [14]:
#Creating another dataframe for index and ArrDelayMinutes
another_df = pd.DataFrame()
another_df['index'] = df.index
another_df['ArrDelay'] = df.ArrDelayMinutes

In [16]:
#Merging the two dataframes on index to obtain ArrDelay values for those records in X_test
final_df = new_df.join(another_df, on = 'index', lsuffix = '__x', rsuffix = '__y')

In [17]:
#Y_test_regression
Y_testing_for_regression = final_df.ArrDelay

In [18]:
#Dropping y_pred_class in our X_test
new_df = new_df.drop(['y_pred_class'], axis=1)

In [20]:
#Regression stage
df = df[df['ArrDel15'] == 1]
df_using = df[['index','pressure_x','pressure_y','DewPointF_y','DewPointF_x','WindChillF_x','WindChillF_y','RoundedCRSDepTime','WindGustKmph_y','precipMM_x','precipMM_y','humidity_y','tempF_y','windspeedKmph_y','windspeedKmph_x','weatherCode_x','WindGustKmph_x','humidity_x','tempF_x','weatherCode_y','RoundedCRSArrTime']]
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(df_using,df.ArrDelayMinutes,random_state = 42,test_size = 0.2)

In [21]:
# XGB Regressor
regressor_model =  XGBRegressor()
regressor_model.fit(X_train_reg, y_train_reg)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [22]:
#Predicting using regressor model and using new_df as test case
y_realistic_pred = regressor_model.predict(new_df)

In [25]:
#Mean squared error and RMSE
print('MAE: {}'.format(mean_absolute_error(Y_testing_for_regression, y_realistic_pred)))
print('RMSE: {}'.format(sqrt(mean_squared_error(Y_testing_for_regression, y_realistic_pred))))

MAE: 53.93457622385115
RMSE: 67.38443051381572
