# 04 - Feature Engineering and Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tqdm import tqdm
# Register tqdm's pandas integration; `progress_apply` in get_hist_prices relies on it.
tqdm.pandas()

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

### Load data

In [3]:
# Read the interim flights dataset; the path is relative to the working directory.
flights = pd.read_csv('../data/interim/flights_interim.csv')

### Missing values

In [4]:
# Count missing values per column.
flights.isna().sum()

dTime                  0
dTimeUTC               0
aTime                  0
aTimeUTC               0
airlines               0
fly_duration           0
flyFrom                0
cityFrom               0
cityCodeFrom           0
flyTo                  0
cityTo                 0
cityCodeTo             0
distance               0
price                  0
route                  0
countryFrom            0
countryTo              0
flight_no              0
seats             164627
collectionDate         0
dDate                  0
aDate                  0
dtype: int64

In [5]:
# Drop the 'seats' column in place: it is the only column with missing values
# (164627 NaNs in the isna() summary above).
flights.drop('seats', axis=1, inplace=True)

In [6]:
def fill_missing(df):
    """
        Fills in the missing collection days by duplicating the flights of the
        previous day.

        Every calendar day between the earliest and the latest 'collectionDate'
        that has no rows receives a copy of the rows of the closest earlier day
        (which may itself be a filled day), with 'collectionDate' overwritten.

        Args:
            - df: Dataframe with flights data. 'collectionDate' is expected to
              hold 'YYYY-MM-DD' strings.
        Returns:
            - The same dataframe when nothing is missing; otherwise a new
              dataframe with the filled rows appended after the original ones.
              Original index labels are kept, so filled rows repeat the labels
              of the rows they were copied from.
    """
    if df.empty:
        return df

    # Group once so each day's rows can be looked up without rescanning df.
    rows_by_date = {
        date: rows for date, rows in df.groupby('collectionDate', sort=False)
    }

    # Use the true min/max: unique() returns values in order of appearance,
    # which is only chronological when the input happens to be sorted.
    parsed_dates = pd.to_datetime(df['collectionDate'].unique())
    all_days = pd.date_range(parsed_dates.min(), parsed_dates.max()).strftime('%Y-%m-%d')

    filled_frames = []
    last_available = None
    for day in all_days:
        if day in rows_by_date:
            last_available = rows_by_date[day]
            continue
        if last_available is None:
            # No earlier day to copy from (date strings not in the expected format).
            continue
        filled = last_available.copy()
        filled['collectionDate'] = day
        filled_frames.append(filled)
        # A run of consecutive missing days keeps copying the last filled day.
        last_available = filled

    if not filled_frames:
        return df
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the whole frame for every missing day.
    return pd.concat([df, *filled_frames])

In [7]:
# Add rows for collection days that are absent from the data.
flights = fill_missing(flights)

### Outliers?

### Adding new features

In [8]:
def build_features(df):
    """
        Adds the engineered feature columns used downstream.

        Args:
            - df: Dataframe with flights data. Reads the columns 'price',
              'dDate', 'dTime', 'aTime', 'collectionDate', 'flyFrom', 'flyTo',
              'airlines', 'route' and 'fly_duration'.
        Returns:
            - A new dataframe (the input is not modified) with the columns
              'log_price', 'day_of_month', 'day_of_week', 'session',
              'orig-dest', 'airline', 'days_until_dep', 'hops', 'direct',
              'competition' and 'id' added.
    """
    df = df.copy()

    # Parse each date column once instead of once per derived feature.
    departure_date = pd.to_datetime(df['dDate'])
    departure_time = pd.to_datetime(df['dTime'])
    collected = pd.to_datetime(df['collectionDate'])

    # log transformation
    df['log_price'] = np.log(df['price'])

    # Day of month
    df['day_of_month'] = departure_date.dt.day

    # Day of the week, as an English day name ('Monday' ... 'Sunday')
    df['day_of_week'] = departure_date.dt.day_name()

    # Session of the day, from the departure hour. Fixed 6-hour bins are used
    # because cutting the raw timestamps into 4 equal-width bins would split
    # the whole observed time span rather than the time of day.
    df['session'] = pd.cut(
        departure_time.dt.hour,
        bins=[0, 6, 12, 18, 24],
        right=False,
        labels=['night', 'morning', 'afternoon', 'evening'],
    )

    # Route
    df['orig-dest'] = df['flyFrom'] + '-' + df['flyTo']

    # Airline: first entry of the comma-separated 'airlines' value
    df['airline'] = df['airlines'].str.split(',').str[0]

    # Days until Departure
    df['days_until_dep'] = (departure_date - collected).dt.days

    # Hopping: segments of 'route' split on '->', minus origin and destination
    df['hops'] = df['route'].str.split('->').str.len() - 2
    df['direct'] = df['hops'] == 0

    # Competition Factor: distinct airlines per origin, destination and departure date
    competition = df.groupby(['flyFrom', 'flyTo', 'dDate'])['airline'].nunique().reset_index()
    competition.columns = ['flyFrom', 'flyTo', 'dDate', 'competition']
    df = pd.merge(df, competition, on=['dDate', 'flyFrom', 'flyTo'])

    # id flight: rows sharing all of these attributes get the same id
    df['id'] = df.groupby(
        ['dDate', 'flyFrom', 'flyTo', 'dTime', 'aTime', 'airline', 'fly_duration']
    ).ngroup()

    return df

In [9]:
# Derive the engineered feature columns (returns a new dataframe).
flights_features = build_features(flights)

In [10]:
# (rows, columns) after feature engineering.
flights_features.shape

(632177, 32)

In [11]:
import multiprocessing
import numpy as np


def parallelize_dataframe(df, func):
    """
        Applies `func` to row-wise chunks of `df` in worker processes and
        concatenates the results in chunk order.

        Args:
            - df: Dataframe to process.
            - func: Callable that takes a dataframe chunk and returns a
              dataframe. It is sent to worker processes by multiprocessing,
              so it must be picklable (e.g. a module-level function).
        Returns:
            - Dataframe built by concatenating the per-chunk results.
    """
    # Leave one core free so the machine stays responsive, but never drop
    # below one worker (Pool(0) raises on a single-core host).
    num_cores = max(1, multiprocessing.cpu_count() - 1)
    num_partitions = num_cores  # number of partitions to split dataframe

    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes.
    row_chunks = np.array_split(np.arange(len(df)), num_partitions)
    df_split = [df.iloc[rows] for rows in row_chunks]

    pool = multiprocessing.Pool(num_cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Release the workers even when func raises.
        pool.close()
        pool.join()
    return pd.concat(results)

In [12]:
def get_hist_prices(df, progress_bar=True):
    """
        Attaches to every row the price history of its flight.

        Rows are ordered by 'collectionDate', the prices of each 'id' are
        gathered into a list, and that list is joined back onto `df` as the
        column 'hist_prices'.

        Args:
            - df: Dataframe with flights data ('id', 'collectionDate', 'price').
            - progress_bar: When true, the per-flight aggregation goes through
              `progress_apply` and shows a progress bar.

        Returns:
            - Result of merging `df` with the per-id price lists on 'id'.
    """
    prices_by_flight = df.sort_values(by='collectionDate').groupby(['id'])['price']
    if progress_bar:
        collect = prices_by_flight.progress_apply
    else:
        collect = prices_by_flight.apply
    history = collect(list).reset_index(name='hist_prices')
    return pd.merge(df, history, on='id')

In [13]:
# Attach the per-flight price history (progress bar enabled by default).
flights_hist = get_hist_prices(flights_features)

100%|██████████| 116618/116618 [00:03<00:00, 32358.16it/s]


In [14]:
def filter_flights(df):
    """ Keeps only the flights whose price history is complete.

        A row is kept when the number of entries in its 'hist_prices' list
        equals the largest 'days_until_dep' observed for its 'id'.

        Args:
            - df: Dataframe with flights data ('id', 'days_until_dep', 'hist_prices')
        Returns
            - new_df: Copy of the rows that satisfy the condition above.
    """
    observed_counts = df['hist_prices'].map(len)
    longest_horizon = df.groupby('id')['days_until_dep'].transform('max')
    is_complete = longest_horizon == observed_counts
    return df.loc[is_complete].copy()

In [15]:
# Keep only flights with a complete price history, then show the resulting shape.
flights_valid = filter_flights(flights_hist)
flights_valid.shape

(66107, 33)

In [16]:
def get_waiting_days(df):
    """
        Add new feature 'waiting_days' that indicates the days to wait to
        get the best price among the remaining days until flight departure.
        This will be the target variable.

        For each row, the prices after the row's own position in 'hist_prices'
        are inspected. If the lowest of them is below the row's 'price',
        'waiting_days' is the 1-based offset of its first occurrence;
        otherwise it is 0. Rows with 'days_until_dep' <= 1 always get 0.

        The row's position is computed as len(hist_prices) - days_until_dep,
        which assumes the history is ordered oldest first and has one entry
        per remaining day.

        Args:
            - df: Dataframe with flights data ('price', 'hist_prices', 'days_until_dep')
        Returns
            - Same dataframe (modified in place) with the new target: 'waiting_days'
    """
    # A plain list is used because np.append copies the whole array on every
    # call, which makes the loop quadratic in the number of rows.
    waiting_days_list = []
    for row in tqdm(df.itertuples(), total=df.shape[0]):
        days_until_dep = int(row.days_until_dep)
        waiting_days = 0
        if days_until_dep > 1:
            hist = row.hist_prices
            idx = len(hist) - days_until_dep
            next_days_prices = hist[idx + 1:]
            # argmin returns the first occurrence, i.e. the earliest cheapest day.
            idx_min = int(np.argmin(next_days_prices))
            if next_days_prices[idx_min] < row.price:
                waiting_days = idx_min + 1
        waiting_days_list.append(waiting_days)
    df['waiting_days'] = np.array(waiting_days_list, dtype=int)
    return df


In [17]:
# Compute the 'waiting_days' target; the function also adds the column to flights_valid in place.
flights_target = get_waiting_days(flights_valid)

100%|██████████| 66107/66107 [00:02<00:00, 23529.25it/s]


In [28]:
def get_labels(df):
    """
        Adds the binary column 'buy' derived from 'waiting_days'.

        'buy' is 0 where 'waiting_days' equals 0 and 1 everywhere else.
        NOTE(review): 1 therefore marks rows where waiting is better, despite
        the column name — confirm this polarity is intended.

        Args:
            - df: Dataframe with a 'waiting_days' column
        Returns
            - Same dataframe (modified in place) with the new column 'buy'
    """
    should_wait = df['waiting_days'] != 0
    df['buy'] = should_wait.astype('int64')
    return df

In [29]:
# Add the binary 'buy' label derived from 'waiting_days'.
flights_label = get_labels(flights_target)

In [30]:
# Class balance of the 'buy' label.
flights_label['buy'].value_counts()

0    41575
1    24532
Name: buy, dtype: int64

### Split train and test

In [31]:
def split_data_old(df, test_size=0.3):
    """
        Split data into train and test sets by departure date ('dDate').

        The cut-off date is the earliest departure date plus
        int(n_days * (1 - test_size) + 2) days, where n_days is the number of
        days between the earliest and the latest departure date.

        Args:
            - df: Dataframe with flights data
            - test_size: Fraction of the departure-date span reserved for test
        Returns
            - (train, test): Copies of the rows departing on or before the
              cut-off date, and after it, respectively.
    """
    departures = pd.to_datetime(df['dDate'])
    first_day = departures.min()
    span_days = (departures.max() - first_day).days

    # Offset, in days from the first departure, of the splitting date
    offset_days = int(span_days * (1 - test_size) + 2)
    cutoff = first_day + timedelta(days=offset_days)

    train_rows = departures <= cutoff
    test_rows = departures > cutoff
    return df[train_rows].copy(), df[test_rows].copy()

In [40]:
def split_data(df, test_days=14):
    """
        Split data into train and test sets, holding out the latest departures.

        Args:
            - df: Dataframe with flights data ('dDate')
            - test_days: Number of days before the latest departure date that
              defines the cut-off
        Returns
            - (train, test): Copies of the rows departing on or before the
              cut-off date, and after it, respectively.
    """
    departures = pd.to_datetime(df['dDate'])
    cutoff = departures.max() - timedelta(days=test_days)
    train_rows = departures <= cutoff
    test_rows = departures > cutoff
    return df[train_rows].copy(), df[test_rows].copy()

In [41]:
# Split by departure date with the default test_days, then show both shapes.
train, test = split_data(flights_label)
train.shape, test.shape

((52669, 35), (13438, 35))

In [42]:
# Fraction of all labelled rows that ended up in the test set.
test.shape[0]/flights_label.shape[0]

0.2032765062701378

## Save processed data

In [43]:
# Columns written to the processed train/test files.
columns = ['id','flyFrom', 'flyTo', 'orig-dest','dDate','day_of_month', 'day_of_week', 'fly_duration', 'distance',
           'days_until_dep', 'session', 'airline', 'hops', 'direct', 'competition','price','log_price', 'hist_prices','waiting_days', 'buy']

In [44]:
# Write both splits, restricted to `columns`, as CSV without the index.
# NOTE(review): 'hist_prices' holds Python lists, which CSV stores as text —
# presumably parsed back by the consumer; verify.
train[columns].to_csv('../data/processed/train.csv', index=False)
test[columns].to_csv('../data/processed/test.csv', index=False)