In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# Load the cleaned flights data and keep a random sample of 100 rows.
# random_state pins the draw: without it every kernel restart samples different rows,
# so the table shown below and every downstream number would change between runs.
flights = pd.read_csv('data/flights_clean.csv').sample(100, random_state=123)
flights

Unnamed: 0,airline,day,day_of_week,departure_delay,destination_airport,destination_latitude,destination_longitude,distance,month,origin_airport,...,scheduled_arrival,scheduled_departure,scheduled_time,state_destination,state_origin,taxi_in,taxi_out,day_of_year,origin_temperature,destination_temperature
2994967,WN,8,2,65.0,BWI,39.17540,-76.66820,611,9,MDW,...,30,1305,105.0,MD,IL,6.0,7.0,251,77.0,79.6
267013,AA,17,5,-1.0,ORD,41.97960,-87.90446,646,4,RDU,...,785,710,135.0,IL,NC,8.0,10.0,107,64.8,60.2
1645008,WN,13,5,0.0,MDW,41.78598,-87.75242,283,2,CMH,...,650,630,80.0,IL,OH,5.0,7.0,44,35.4,11.0
2906972,DL,25,2,-4.0,JFK,40.63975,-73.77893,1005,8,TPA,...,1070,894,176.0,NY,FL,15.0,12.0,237,86.7,80.6
2147813,OO,20,1,1.0,SLC,40.78839,-111.97777,1195,4,IAH,...,852,715,197.0,UT,TX,7.0,21.0,110,66.2,57.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383917,UA,20,3,-8.0,ORD,41.97960,-87.90446,1744,5,LAX,...,320,1404,236.0,IL,CA,3.0,14.0,140,60.3,43.3
1070608,AS,19,6,-2.0,SEA,47.44898,-122.30931,2496,9,BOS,...,669,480,369.0,WA,MA,4.0,18.0,262,69.9,62.8
2781470,B6,2,7,-3.0,BOS,42.36435,-71.00518,395,8,BUF,...,898,821,77.0,MA,NY,5.0,8.0,214,71.9,78.0
784643,UA,27,1,6.0,DEN,39.85841,-104.66700,991,7,PDX,...,1150,938,152.0,CO,OR,8.0,12.0,208,63.4,77.8


In [4]:
# Response variable: the departure delay of each sampled flight
pre_y = flights['departure_delay']

In [5]:
# Predictor matrix: everything except the response and the listed
# airline / airport / state identifier columns
pre_X = flights.drop(
    ['departure_delay', 'airline', 'destination_airport',
     'origin_airport', 'state_destination', 'state_origin'],
    axis='columns',
)

In [6]:
# Hold out a test set using train_test_split's default split proportions.
# random_state fixes the partition so the train/test rows are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(pre_X, pre_y, random_state=123)

In [7]:
# Glue each predictor frame back to its response column so the formula
# interface can be given a single DataFrame per split
train, test = (
    pd.concat([features, response], axis='columns')
    for features, response in ((X_train, y_train), (X_test, y_test))
)

In [1]:
# Table of candidate polynomial/interaction terms: one column per predictor, one row per
# combination of exponents, plus 'sum_degree' (the total degree of that combination).
from itertools import product

predictor_set = list(train.columns.difference(['departure_delay']))
values = np.arange(0,len(predictor_set))

# product(values, repeat=n) enumerates all n**n exponent tuples for n predictors.
# NOTE(review): this grows extremely fast with the number of predictors - confirm it is
# feasible for the column set used here.
polynomial_transformations = (
    pd.DataFrame(product(values, repeat=len(predictor_set)), columns=predictor_set)
    # Row label 0 is the all-zero exponent tuple (no term at all)
    .drop(index=0)
    .assign(sum_degree=lambda grid: grid.astype(int).sum(axis=1))
    # count_zeros is only a temporary sort key
    .assign(count_zeros=lambda grid: (grid == 0).astype(int).sum(axis=1))
    # Most zeros first, then lowest total degree first
    .sort_values(by=['count_zeros', 'sum_degree'], ascending=[False, True])
    .drop(columns=['count_zeros'])
    .reset_index(drop=True)
)

NameError: name 'train' is not defined

In [None]:
# Number of folds: 5-fold cross validation
k = 5

# Seed NumPy's global generator before the shuffle, which is given no random_state of its own
np.random.seed(123)

# Shuffle the training rows before they are divided into K folds
train = train.sample(frac=1)

# Rounded number of training rows per fold
fold_size = np.round(len(train)/k)

In [None]:
# Preview of the formula for the model that contains only the individual predictors
f"departure_delay~{'+'.join(predictor_set)}"

In [None]:
def KFoldCV(selected_interactions, interaction_being_tested):
    """Return the mean k-fold cross-validation RMSE of one candidate model.

    The model regresses `departure_delay` on every predictor in `predictor_set`
    plus the interaction terms passed in by the caller.

    Parameters
    ----------
    selected_interactions : str
        Formula fragment with the interactions already accepted: either ''
        or a run of '+term' pieces.
    interaction_being_tested : str
        Formula fragment for the candidate interaction ('' or '+term').

    Returns
    -------
    float
        Average of the per-fold RMSEs over the `k` folds.

    Notes
    -----
    Reads the notebook globals `train`, `k`, `predictor_set` and `sm`.
    NOTE(review): `sm` must provide `ols(formula, data=...)`; no import that binds
    it is visible in this part of the notebook (presumably
    `statsmodels.formula.api`) - confirm it is imported before this cell runs.
    """
    formula = ('departure_delay~' + '+'.join(predictor_set)
               + selected_interactions + interaction_being_tested)

    # Fixed seed: every call must score its model on the *same* folds, otherwise the
    # difference between two candidates' RMSEs also contains the noise of two different splits.
    shuffled = train.sample(frac=1, random_state=123)

    # Split row positions rather than the DataFrame itself, then select with iloc.
    fold_positions = np.array_split(np.arange(shuffled.shape[0]), k)

    rmses = []
    for i in range(k):
        train_positions = np.concatenate([fold_positions[j] for j in range(k) if j != i])
        fold_train = shuffled.iloc[train_positions]
        fold_test = shuffled.iloc[fold_positions[i]]

        model = sm.ols(formula, data=fold_train).fit()

        # RMSE on the held-out fold, measured against the response column the model was
        # fitted on (departure_delay). Plain NumPy arrays are used so that a NaN prediction
        # propagates into the score rather than being skipped.
        residuals = (fold_test['departure_delay'] - model.predict(fold_test)).to_numpy()
        rmses.append(np.sqrt(np.mean(residuals ** 2)))

    return sum(rmses)/len(rmses)

In [None]:
# Greedy forward selection of interaction terms, one degree at a time, from degree 2 up to
# degree 12. For a given degree 'd', every candidate term of that degree is scored with 5-fold
# cross validation and the one giving the lowest RMSE is added to the model. This repeats until
# no term of degree 'd' lowers the RMSE any further; then terms of degree 'd+1' are considered.

def _interaction_term(exponents, predictors):
    """Build the formula fragment for one row of exponents.

    A predictor with exponent 1 contributes 'name', one with exponent > 1 contributes
    'I(name**exponent)', and one with exponent 0 is left out. The factors are joined
    with '*' and prefixed with '+', e.g. '+I(a**2)*b'.
    """
    factors = []
    for predictor in predictors:
        power = exponents[predictor]
        if power > 1:
            factors.append('I(' + predictor + '**' + str(power) + ')')
        elif power == 1:
            factors.append(predictor)
    return '+' + '*'.join(factors)


# 5-fold cross validation RMSE of the initial model that has only the individual predictors
cv_previous_model = KFoldCV(selected_interactions = '', interaction_being_tested = '')
interaction_being_tested = '+'
selected_interactions = ''

# Considering interactions of degree 'd' = 2 to 12
for d in np.arange(2,13):
    # Candidate terms of degree 'd'
    degree_set = polynomial_transformations.loc[polynomial_transformations.sum_degree==d, :]

    # Keep adding terms of degree 'd' until none of them reduces the cross-validation RMSE
    while True:

        # (cv, term, row label) for every candidate that beats the current model
        improving = []

        # Iterating over all remaining candidates of degree 'd'
        for index, row in degree_set.iterrows():
            interaction_being_tested = _interaction_term(row, predictor_set)

            # 5-fold cross validation RMSE of the current model plus the term being tested
            cv = KFoldCV(selected_interactions, interaction_being_tested)

            if cv<cv_previous_model:
                improving.append((cv, interaction_being_tested, index))
            interaction_being_tested = '+'

        # Stop working on degree 'd' once no candidate improves on the current model
        if not improving:
            break

        # Select the candidate with the lowest 5-fold cross validation RMSE
        best_cv, best_term, best_index = min(improving, key=lambda candidate: candidate[0])
        selected_interactions = selected_interactions + best_term
        cv_previous_model = best_cv

        # Candidates that did not help are discarded for the rest of this degree. The term just
        # selected is discarded too: it is already in the model, so testing it again would only
        # cost another round of fits. Selecting the kept row labels once replaces growing a
        # DataFrame row by row with pd.concat.
        kept_labels = [label for _, _, label in improving if label != best_index]
        degree_set = degree_set.loc[kept_labels]

        # Print the progress after each model update, i.e., after an interaction term is selected
        print("Degree of interactions being considered:",d, ", 5-fold CV RMSE:", cv_previous_model)

In [None]:
# Display the formula fragment holding every interaction term accepted by the search loop
selected_interactions

In [None]:
# Refit on the whole training set using the individual predictors plus the selected interactions
final_formula = 'departure_delay~' + '+'.join(predictor_set) + selected_interactions
fitted = sm.ols(final_formula, data=train).fit()
model = fitted
model.summary()