# 04 - Feature engineering and data preparation for model building

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime

In [11]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings raised by the
# cells below (e.g. about assigning to a slice); consider narrowing it.
warnings.filterwarnings('ignore')

### Load data

In [49]:
# Read the interim flights CSV into memory and display its (rows, columns).
interim_csv = '../data/interim/flights_interim.csv'
flights = pd.read_csv(interim_csv)
flights.shape

(1266210, 22)

In [50]:
def get_perfomed_flights(flights):
    """Return the flights whose departure date is not after the latest collection date.

    Parameters
    ----------
    flights : pandas.DataFrame
        Must contain 'dDate' and 'collectionDate' columns that
        ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows where
        ``dDate <= max(collectionDate)``; all columns and the original
        index labels are kept.
    """
    departure_dates = pd.to_datetime(flights['dDate'])
    max_collection_day = pd.to_datetime(flights['collectionDate']).max()
    # Return an explicit copy: callers add columns to the result, and doing
    # that on a bare boolean-mask selection triggers pandas'
    # SettingWithCopyWarning.
    return flights.loc[departure_dates <= max_collection_day].copy()

In [51]:
# Keep only flights that had departed by the last collection day, then
# display the resulting (rows, columns).
pf = get_perfomed_flights(flights)
pf.shape

(152465, 22)

In [61]:
# Inspect the column names of the engineered frame.
# NOTE(review): `pf_features` is only assigned by a cell further down
# (`pf_features = build_features(pf)`), so this cell fails when the notebook
# is run top to bottom on a fresh kernel.
pf_features.columns

Index(['collectionDate', 'dDate', 'dTime', 'aDate', 'aTime', 'dTimeUTC',
       'aTimeUTC', 'flyFrom', 'flyTo', 'airlines', 'flight_no', 'fly_duration',
       'distance', 'route', 'price', 'seats', 'cityFrom', 'cityCodeFrom',
       'cityTo', 'cityCodeTo', 'countryFrom', 'countryTo', 'log_price',
       'day_of_month', 'day_of_week', 'session', 'orig-dest', 'airline',
       'days_until_dep', 'hops', 'direct', 'competition'],
      dtype='object')

In [62]:
# Display the list of feature columns selected for export.
# NOTE(review): `columns` is defined in the "Save processed data" section
# below, so this cell fails on a fresh top-to-bottom run.
columns

['flyFrom',
 'flyTo',
 'orig-dest',
 'day_of_month',
 'day_of_week',
 'days_until_dep',
 'session',
 'airline',
 'hops',
 'direct',
 'competition']

In [63]:
def get_agg_flights(flights):
    """Aggregate flights that share collection date, endpoints, departure date and times.

    Rows are grouped by 'collectionDate', 'flyFrom', 'flyTo', 'dDate',
    'dTime' and 'aTime'. For each group the result holds the mean
    'fly_duration', the mean 'price' and the number of distinct 'airline'
    values.

    Parameters
    ----------
    flights : pandas.DataFrame
        Must contain the six grouping columns plus 'fly_duration', 'price'
        and 'airline'.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping keys as regular columns
        followed by the three aggregated columns.
    """
    group_keys = ['collectionDate', 'flyFrom', 'flyTo', 'dDate', 'dTime', 'aTime']
    summary_spec = {
        'fly_duration': 'mean',
        'price': 'mean',
        'airline': 'nunique',
    }
    value_columns = list(summary_spec)
    return (
        flights
        .groupby(group_keys)[value_columns]
        .agg(summary_spec)
        .reset_index()
    )

### Missing values

In [5]:
# TODO: decide how to handle missing 'seats' values. Disabled candidate:
# flights['seats'].fillna(0, inplace=True)

### Outliers?

### Adding new features

In [29]:
def build_features(df):
    """Derive model features from a flights table.

    The input frame is not modified; all columns are added to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'price', 'dDate', 'dTime', 'collectionDate', 'flyFrom',
        'flyTo', 'airlines' (comma-separated string) and 'route'
        ('->'-separated string). 'dDate', 'dTime' and 'collectionDate' must
        be parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        The input columns plus:

        - 'log_price': natural log of 'price'
        - 'day_of_month': day number of 'dDate'
        - 'day_of_week': English weekday name of 'dDate'
        - 'session': categorical 'night' / 'morning' / 'afternoon' /
          'evening' for departure hours [0, 6), [6, 12), [12, 18), [18, 24)
        - 'orig-dest': 'flyFrom' and 'flyTo' joined with '-'
        - 'airline': first entry of 'airlines'
        - 'days_until_dep': whole days between 'collectionDate' and 'dDate'
          (integer)
        - 'hops': number of intermediate stops in 'route'
        - 'direct': True when 'hops' is 0
        - 'competition': number of distinct 'airline' values sharing the
          same 'flyFrom', 'flyTo' and 'dDate'

        The index is reset by the final merge.
    """
    # Work on a copy so the caller's frame (possibly a filtered selection of
    # another frame) is never modified as a side effect.
    df = df.copy()

    departure = pd.to_datetime(df['dDate'])
    collected = pd.to_datetime(df['collectionDate'])

    # log transformation on target
    df['log_price'] = np.log(df['price'])

    # Day of month and day of the week (English names)
    df['day_of_month'] = departure.dt.day
    df['day_of_week'] = departure.dt.day_name()

    # Session (night, morning, afternoon, evening).
    # Fixed 6-hour bins on the departure hour: the bin edges must not depend
    # on the earliest/latest departure time present in this particular frame,
    # otherwise the same time can get different labels in different datasets.
    departure_hour = pd.to_datetime(df['dTime']).dt.hour
    df['session'] = pd.cut(
        departure_hour,
        bins=[0, 6, 12, 18, 24],
        right=False,
        labels=['night', 'morning', 'afternoon', 'evening'],
    )

    # Route
    df['orig-dest'] = df['flyFrom'] + '-' + df['flyTo']

    # Airline: first carrier listed in the comma-separated 'airlines' field
    df['airline'] = df['airlines'].str.split(',').str[0]

    # Days until departure, stored as a number rather than as text
    df['days_until_dep'] = (departure - collected).dt.days

    # Hopping: N airports in the route means N - 1 '->' separators and
    # N - 2 intermediate stops
    df['hops'] = df['route'].str.count('->') - 1
    df['direct'] = df['hops'] == 0

    # Competition factor: distinct airlines per (origin, destination, date)
    competition = (
        df.groupby(['flyFrom', 'flyTo', 'dDate'])['airline']
        .nunique()
        .rename('competition')
        .reset_index()
    )
    return pd.merge(df, competition, on=['dDate', 'flyFrom', 'flyTo'])

In [60]:
# Engineer features for the already-departed flights subset.
pf_features = build_features(pf)

In [23]:
# Engineer features for the full flights table.
flights_features = build_features(flights)

## Save processed data

In [26]:
# Feature columns written to the processed files.
# NOTE(review): neither 'price' nor 'log_price' is in this list, so the
# exported files carry no target column - confirm that is intended.
columns = [
    'flyFrom',
    'flyTo',
    'orig-dest',
    'day_of_month',
    'day_of_week',
    'days_until_dep',
    'session',
    'airline',
    'hops',
    'direct',
    'competition',
]

flights_processed = flights_features.loc[:, columns]

In [27]:
# Write the selected feature columns to disk, omitting the index column.
flights_processed.to_csv('../data/processed/flights_processed.csv', index=False)

In [47]:
# Same column selection for the departed-flights subset, written to its own
# CSV without the index column.
pf_processed = pf_features.loc[:, columns]
pf_processed.to_csv(path_or_buf='../data/processed/pf_processed.csv', index=False)