In [147]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss,roc_auc_score

from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

In [148]:
df = pd.read_csv('data/airline_delay_train_new.csv',parse_dates =['FlightDate'])

In [149]:
df.head()

Unnamed: 0,FlightDate,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,Day_of_Week
0,2010-01-17,17:05,MQ,CVG,DFW,812,1,Sunday
1,2010-01-29,17:03,MQ,OMA,ORD,416,0,Friday
2,2010-01-31,18:03,US,SJC,PHX,622,0,Sunday
3,2010-01-26,16:42,YV,MTJ,DEN,197,0,Tuesday
4,2010-01-06,17:53,US,PHL,ORD,678,0,Wednesday


#Process pipeline 
- Separate target from features 
- Pipeline 
- CV split 
- Scores 

In [150]:
y = df['dep_delayed_15min']
X = df.drop(['dep_delayed_15min'], axis=1)

In [151]:
X_train, X_test, y_train, y_test = train_test_split( 
                        X,y,test_size = 0.20, random_state = 42) 

In [142]:
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

In [152]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [159]:
from sklearn.ensemble import RandomForestClassifier
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(random_state= 42))])

In [163]:
#from sklearn.model_selection import cross_val_score
#scores = cross_val_score(baseline_pipeline,X, y,cv=3, scoring ='roc_auc')
# the parameters frid expects the same name as the pipeline "classifier"
from sklearn.model_selection import GridSearchCV
param_grid = {
    'classifier__max_depth': [80, 100],
    'classifier__max_features': [2, 3],
    'classifier__min_samples_leaf': [3, 4, 5],
    'classifier__min_samples_split': [8, 10, 12],
    'classifier__n_estimators': [100, 200, 300]
}

In [None]:
def model_fit():
    grid_search = GridSearchCV(estimator = pipe, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
    grid_search.fit(X_train, y_train)
    print ("the best model params: ", grid_search.best_params_)
    best_grid = grid_search.best_estimator_
    return best_grid

In [164]:
grid_search = GridSearchCV(estimator = pipe, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [165]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='constant'),
                                                                         Index(['Distance'], dtype='object')),
                                                                        ('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         Index(

In [166]:
grid_search.best_params_

{'classifier__max_depth': 80,
 'classifier__max_features': 2,
 'classifier__min_samples_leaf': 3,
 'classifier__min_samples_split': 8,
 'classifier__n_estimators': 100}

In [167]:
best_grid = grid_search.best_estimator_

In [None]:
def model_pedict():
    testscore = best_grid.predict_proba(X_test)
    return log_loss(y_test, testscore)

In [169]:
testscore = best_grid.predict_proba(X_test)

In [172]:
log_loss(y_test, testscore)

0.4828014996428915

In [179]:
import xgboost as xgb
model = xgb.XGBClassifier()
pipe_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])

In [184]:
param_grid_xgboost = {
    'model__n_estimators': [500,700]
}

In [185]:
grid_xgb = GridSearchCV(pipe_xgb, param_grid_xgboost, cv=3, n_jobs=-1, scoring='roc_auc')

In [186]:
grid_xgb.fit(X_train, y_train)





GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='constant'),
                                                                         Index(['Distance'], dtype='object')),
                                                                        ('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         Index(













In [188]:
best_xgb = grid_xgb.best_estimator_

In [189]:
best_xgb_score = best_xgb.predict_proba(X_test)

In [190]:
log_loss(y_test, best_xgb_score)

0.4529898514021672

In [191]:
test = pd.read_csv('data/airline_delay_test_new.csv')

In [194]:
ext_test = test.drop('dep_delayed_15min', axis=1)

In [195]:
best_xgb_score_test = best_xgb.predict_proba(ext_test)

In [197]:
log_loss(test['dep_delayed_15min'], best_xgb_score_test)

0.4513241335009996