## Feature Engineering

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns
from src.modules.feature_engineering import scale_data
from src.modules.load_source_data import load_data_set, load_agg_data

In [17]:
def print_null_features(df):
    '''
    Print a per-column summary of missing values and return the null counts.

    The report lists, for each column, the number of null rows ('Total') and
    the fraction of rows that are null ('Percent', on a 0-1 scale). Only the
    first 30 rows of the report are printed.

    Parameters
    ----------
    df : pandas DataFrame
        Test or Train dataset.

    Returns
    -------
    total : pandas.core.series.Series
        Null count for every column of ``df`` (columns without nulls are
        included with a count of 0), sorted from most to fewest nulls.

    '''
    null_mask = df.isnull()
    null_counts = null_mask.sum()

    # Rank counts and fractions independently; concat then lines the two
    # Series up on their shared column-name index.
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / null_mask.count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

    for line in ("Missing data:", "-------------", missing_data.head(30)):
        print(line)

    return total


def num_null_replacement(df):
    '''
    Replace missing values in the numeric features of a dataset with 0.

    A missing-data report is printed before and after the replacement so the
    effect can be checked. Non-numeric columns (e.g. object/string features)
    are left untouched, and the input frame itself is not modified.

    Parameters
    ----------
    df : pandas DataFrame
        Test or Train dataset.

    Returns
    -------
    cleaned : pandas DataFrame
        Copy of ``df`` whose numeric columns have nulls filled with 0.

    '''

    # Report the state before cleaning; `total` holds the null count per column.
    total = print_null_features(df)

    # Work on a copy so the caller's frame is not silently mutated.
    cleaned = df.copy()

    for feature in total[total > 0].index:
        # Restrict to numeric dtypes: the integer 0 is not a meaningful
        # replacement for e.g. datetime or categorical columns.
        if pd.api.types.is_numeric_dtype(cleaned[feature]):
            cleaned[feature] = cleaned[feature].fillna(0)

    # Report again to show which columns still contain nulls.
    print_null_features(cleaned)

    return cleaned

In [6]:
# Load the feature matrix X and the target variable y, then report both shapes.
X, y = load_data_set()
print(X.shape, y.shape, sep="\n")

(9808, 27)
(9808,)


In [7]:
# Lift the column display limit so wide frames are shown in full, then preview X.
pd.options.display.max_columns = None
X.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,year,month,day,day_of_week,week_of_year,crs_dep_hour,crs_arr_hour
0,2018-10-30,UA,UA_CODESHARE,UA,3997,EV,N13903,3997,12266,IAH,"Houston, TX",13256,MFE,"Mission/McAllen/Edinburg, TX",1455,1617,N,82,1,316,2018,10,30,2,44,14,16
1,2018-10-30,F9,F9,F9,529,F9,N301FR,529,10599,BHM,"Birmingham, AL",11292,DEN,"Denver, CO",1315,1518,N,183,1,1083,2018,10,30,2,44,13,15
2,2018-10-30,AA,AA,AA,2318,AA,N167AN,2318,11298,DFW,"Dallas/Fort Worth, TX",12892,LAX,"Los Angeles, CA",1740,1853,N,193,1,1235,2018,10,30,2,44,17,18
3,2018-10-30,AA,AA,AA,2375,AA,N962NN,2375,11298,DFW,"Dallas/Fort Worth, TX",14869,SLC,"Salt Lake City, UT",1855,2048,N,173,1,989,2018,10,30,2,44,18,20
4,2018-10-31,UA,UA,UA,746,UA,N68836,746,12173,HNL,"Honolulu, HI",14771,SFO,"San Francisco, CA",1433,2236,N,303,1,2398,2018,10,31,3,44,14,22


In [8]:
# Build the training frame from X without the columns listed in to_drop.
# DataFrame.drop returns a new frame, so X itself is left unchanged.
to_drop = [
    'mkt_carrier', 'tail_num', 'origin_city_name', 'dest_city_name', 'dup',
    'fl_date', 'branded_code_share', 'op_unique_carrier', 'origin', 'dest',
    'flights',
]
train = X.drop(columns=to_drop)

In [9]:
# Sanity check: the 27 loaded columns minus the 11 dropped ones should leave 16.
train.shape

(9808, 16)

In [10]:
# Load the aggregated features (passengers, fuel consumption, ...); presumably they
# are merged onto the rows of train — verify against load_agg_data.
# NOTE(review): this rebinds `train` to load_agg_data(train), so re-running the cell
# passes the already-augmented frame back in — confirm load_agg_data tolerates that.
train = load_agg_data(train)

In [11]:
# Check the shape after adding the aggregated features: the row count should be
# unchanged and only the number of columns should grow.
train.shape

(9808, 52)

In [12]:
# Persist the training features (train) and the target (y) to local disk.
# Note: this snapshot is written before the null replacement and encoding
# performed in the later cells.
train.to_csv('./data/train.csv', index=False)
y.to_csv('./data/target.csv', index=False)

In [13]:
# Alphabetical list of the feature names currently in train.
sorted(train.columns)

['arr_hour_avg_air_time',
 'arr_hour_avg_arr_delay',
 'arr_hour_avg_carrier_delay',
 'arr_hour_avg_late_aircraft_delay',
 'arr_hour_avg_nas_delay',
 'arr_hour_avg_security_delay',
 'arr_hour_avg_weather_delay',
 'carrier_month_avg_passengers',
 'carrier_month_avg_seats',
 'carrier_month_passengers',
 'carrier_month_seats',
 'crs_arr_hour',
 'crs_arr_time',
 'crs_dep_hour',
 'crs_dep_time',
 'crs_elapsed_time',
 'day',
 'day_of_week',
 'day_of_week_avg_air_time',
 'day_of_week_avg_arr_delay',
 'day_of_week_avg_carrier_delay',
 'day_of_week_avg_late_aircraft_delay',
 'day_of_week_avg_nas_delay',
 'day_of_week_avg_security_delay',
 'day_of_week_avg_weather_delay',
 'dest_airport_id',
 'dest_airport_month_flight_seats',
 'dest_airport_month_passengers',
 'dest_total_flights',
 'distance',
 'mkt_carrier_fl_num',
 'mkt_unique_carrier',
 'month',
 'month_avg_air_time',
 'month_avg_arr_delay',
 'month_avg_carrier_delay',
 'month_avg_fuel_comsumption',
 'month_avg_late_aircraft_delay',
 'month_

In [16]:
# Fill missing values in the numeric features with 0.
# num_null_replacement prints the missing-data report before and after filling.
train_n_null = num_null_replacement(train)

Missing data:
                                     Total   Percent
month_flight_avg_passengers           1915  0.195249
month_flight_seats                    1915  0.195249
month_flight_passengers               1915  0.195249
month_flight_avg_seats                1915  0.195249
arr_hour_avg_weather_delay              29  0.002957
arr_hour_avg_security_delay             29  0.002957
arr_hour_avg_late_aircraft_delay        29  0.002957
arr_hour_avg_nas_delay                  29  0.002957
arr_hour_avg_carrier_delay              29  0.002957
month_avg_late_aircraft_delay           22  0.002243
month_avg_security_delay                22  0.002243
month_avg_nas_delay                     22  0.002243
month_avg_weather_delay                 22  0.002243
month_avg_carrier_delay                 22  0.002243
day_of_week_avg_nas_delay                5  0.000510
day_of_week_avg_weather_delay            5  0.000510
day_of_week_avg_security_delay           5  0.000510
day_of_week_avg_late_aircraft_de

In [None]:
# List the object-dtype columns that remain after numeric null replacement.
train_n_null.dtypes[train_n_null.dtypes == 'object']

In [None]:
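# TODO: restore `train_cleaned = scale_encoder(train_n_null)` — scale_encoder is not defined or imported in the visible cells, yet train_cleaned is needed by the final save cell.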

def one_hot_encode(X):
    '''
    One-hot encode the object-dtype (categorical) columns of a dataset.

    Parameters
    ----------
    X : pandas DataFrame
        Dataset whose object-dtype columns should be encoded.

    Returns
    -------
    df_dummy : pandas DataFrame
        Dummy/indicator columns for the object-dtype features of ``X`` only;
        the non-object columns are not included.

    '''
    # Select the categorical columns from X itself rather than from the
    # notebook-level `train` frame, so the function works for any input.
    cat_feats = X.dtypes[X.dtypes == 'object'].index.tolist()
    df_dummy = pd.get_dummies(X[cat_feats])
    return df_dummy
# Encode the object-dtype columns of train; df_dummy holds only the indicator columns.
# NOTE(review): df_dummy is not used by any later visible cell.
df_dummy = one_hot_encode(train)

In [None]:
# FIXME(review): train_cleaned is never assigned in the visible cells (its only
# assignment is commented out above), so this raises NameError on a fresh run.
# Unlike the earlier saves this call omits index=False, so the index is written
# as an extra column — confirm that is intended.
train_cleaned.to_csv('./data/train_cleaned.csv')