In [2]:
from sqlalchemy import create_engine
import pandas as pd

def query_features():
    engine = create_engine('sqlite:///./../../data/processed/airlines.db')
    
    with engine.connect() as conn:
        features = pd.read_sql(
        """
        SELECT 
            f.departure_was_delayed_15 as delayed,
            f.airport as departure_airport,
            f.departure_month as month,
            f.departure_date as date,
            f.departure_hod as hour_of_departure,
            f.carrier as carrier,
            f.distance as flight_distance,
            f.elapsed_time_scheduled as elapsed_time,
            f.hourly_visibility as visibility,
            f.hourly_dry_bulb_temp_f as temperature_f,
            f.hourly_precipitation as percipitation,
            f.hourly_wind_speed as wind_speed,
            f.hourly_wind_gust_speed as gust_speed,
            f.hourly_station_pressure
        FROM 
            features AS f
        """, 
        conn)
    
    for col in ['carrier', 'departure_airport']:
        features[col] = features[col].astype('category')
    
    return pd.get_dummies(features, drop_first=True)

# Get summary data for selected date range, counting flights delayed more than 15 minutes on departure
features = query_features()
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74588 entries, 0 to 74587
Data columns (total 25 columns):
delayed                    74588 non-null int64
month                      74588 non-null int64
date                       74588 non-null int64
hour_of_departure          74588 non-null int64
flight_distance            74588 non-null int64
elapsed_time               74588 non-null int64
visibility                 74588 non-null float64
temperature_f              74588 non-null float64
percipitation              74588 non-null float64
wind_speed                 74588 non-null float64
gust_speed                 74588 non-null float64
hourly_station_pressure    74588 non-null float64
departure_airport_LAX      74588 non-null uint8
departure_airport_MDT      74588 non-null uint8
carrier_AS                 74588 non-null uint8
carrier_B6                 74588 non-null uint8
carrier_DL                 74588 non-null uint8
carrier_EV                 74588 non-null uint8
carrier_F9     

In [3]:
y = features['delayed']
X = features.drop('delayed', axis=1)

In [4]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

def conduct_grid_search_and_report_metrics(X, y, param_grid, pipeline, scoring='accuracy', random_state=12):
    """Conducts a grid search with supplied parameters, reports out related metrics"""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=random_state)
    
    # Create GridSearch, set the specififed 
    grid = GridSearchCV(pipeline, cv=5, n_jobs=1, param_grid=parameters, scoring=scoring)
    
    # Conduct Gridsearch
    grid.fit(X=X_train, y=y_train)
    
    y_pred = grid.predict(X_test)
    
    print(f"Scoring Methodology:", scoring)
    print(f"         Best Score:", grid.best_score_)
    print("")
    print("Test Set Results ")
    print("Accuracy:", accuracy_score(y_pred=y_pred, y_true=y_test))
    print(classification_report(y_pred=y_pred, y_true=y_test))
    
    print(grid.best_estimator_)
    
    # Return the fitted grid for use/inspection
    return grid

In [10]:
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline

from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier

def create_smaller():
    # create model
    model = Sequential()
    model.add(Dense(100, input_dim=25, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

    
parameters = [
    {
        'classify': [create_smaller()],
    }
]

pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('select', SelectKBest()),
    ('approximate', None),
    ('classify', None)
])

In [11]:
grid_accuracy = conduct_grid_search_and_report_metrics(X, y, parameters, pipeline, scoring='accuracy')

ValueError: Error when checking input: expected dense_9_input to have shape (None, 25) but got array with shape (39978, 10)

In [14]:
grid_f1 = conduct_grid_search_and_report_metrics(X, y, parameters, pipeline, scoring='f1')

Scoring Methodology: f1
         Best Score: 0.41416427361222696

Test Set Results 
Accuracy: 0.723583180987203
             precision    recall  f1-score   support

          0       0.84      0.80      0.82     19401
          1       0.37      0.45      0.41      5214

avg / total       0.74      0.72      0.73     24615

Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('select', SelectKBest(k=10, score_func=<function f_classif at 0x10b799d08>)), ('approximate', None), ('classify', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None,...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])


In [15]:
grid_f1_weighted = conduct_grid_search_and_report_metrics(X, y, parameters, pipeline, scoring='f1_weighted')

Scoring Methodology: f1_weighted
         Best Score: 0.7558936998728272

Test Set Results 
Accuracy: 0.7245988218565915
             precision    recall  f1-score   support

          0       0.84      0.80      0.82     19401
          1       0.38      0.45      0.41      5214

avg / total       0.74      0.72      0.73     24615

Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('select', SelectKBest(k=10, score_func=<function f_classif at 0x10b799d08>)), ('approximate', None), ('classify', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None,...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])


In [16]:
grid_roc_auc = conduct_grid_search_and_report_metrics(X, y, parameters, pipeline, scoring='roc_auc')

Scoring Methodology: roc_auc
         Best Score: 0.6980126574054293

Test Set Results 
Accuracy: 0.727320739386553
             precision    recall  f1-score   support

          0       0.84      0.80      0.82     19401
          1       0.38      0.45      0.41      5214

avg / total       0.75      0.73      0.74     24615

Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('select', SelectKBest(k=10, score_func=<function f_classif at 0x10b799d08>)), ('approximate', None), ('classify', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None,...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])
