# 04 - Feature Engineering and Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings such as chained-assignment
# notices; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

### Load data

In [129]:
# Read the interim train/test splits; paths are relative to the notebook's
# working directory.
train = pd.read_csv('../data/interim/train.csv')
test = pd.read_csv('../data/interim/test.csv')

### Missing values

In [4]:
train.isna().sum()

collectionDate        0
dDate                 0
dTime                 0
aDate                 0
aTime                 0
dTimeUTC              0
aTimeUTC              0
flyFrom               0
flyTo                 0
airlines              0
flight_no             0
fly_duration          0
distance              0
route                 0
price                 0
seats             32620
cityFrom              0
cityCodeFrom          0
cityTo                0
cityCodeTo            0
countryFrom           0
countryTo             0
dtype: int64

In [116]:
# Removing 'seats' column because it has many missing values.
# The drop is done in place because later cells keep using `train` / `test`;
# errors='ignore' keeps the cell re-runnable once the column is already gone.
for df in [train, test]:
    df.drop(columns='seats', inplace=True, errors='ignore')

### Outliers (TODO: no outlier handling is applied in this notebook yet)

### Adding new features

In [130]:
def build_features(df):
    """Return a copy of ``df`` with the engineered flight features added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns read below: ``price``, ``dDate``, ``dTime``,
        ``aTime``, ``collectionDate``, ``flyFrom``, ``flyTo``, ``airlines``,
        ``route`` and ``fly_duration``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the same rows, in the
        same order and with the same index, plus these columns:
        ``log_price``, ``day_of_month``, ``day_of_week``, ``session``,
        ``orig-dest``, ``airline``, ``days_until_dep``, ``hops``, ``direct``,
        ``competition`` and ``id``.
    """
    df = df.copy()

    # Parse each date/time column once and reuse the result.
    departure = pd.to_datetime(df['dDate'])
    departure_time = pd.to_datetime(df['dTime'])
    collected = pd.to_datetime(df['collectionDate'])

    # log transformation
    df['log_price'] = np.log(df['price'])

    # Day of month
    df['day_of_month'] = departure.dt.day

    # Day of the week, as the English day name ('Monday' ... 'Sunday')
    df['day_of_week'] = departure.dt.day_name()

    # Session (night / morning / evening).
    # Fixed time-of-day edges are used instead of `bins=3`: equal-width bins
    # are derived from the min/max of whichever frame is passed in, so train
    # and test would otherwise be cut at different boundaries.
    #   night: 00:00-07:59, morning: 08:00-15:59, evening: 16:00-23:59
    df['session'] = pd.cut(
        departure_time.dt.hour,
        bins=[0, 8, 16, 24],
        right=False,
        labels=['night', 'morning', 'evening'],
    )

    # Route
    df['orig-dest'] = df['flyFrom'] + '-' + df['flyTo']

    # Airline: first carrier of the comma-separated `airlines` field
    df['airline'] = df['airlines'].str.split(',').str[0]

    # Days until Departure.
    # Kept as a string column because the downstream helpers in this notebook
    # cast it with int() / use it as string column labels.
    df['days_until_dep'] = (departure - collected).dt.days.astype(str)

    # Hopping: number of intermediate stops in the 'A->B->C' route string
    df['hops'] = df['route'].str.count('->') - 1
    df['direct'] = df['hops'] == 0

    # Competition Factor: distinct airlines serving the same origin,
    # destination and departure date. `transform` broadcasts the per-group
    # count back onto every row without a merge, so row order and index are
    # left untouched.
    df['competition'] = (
        df.groupby(['flyFrom', 'flyTo', 'dDate'])['airline'].transform('nunique')
    )

    # id flight: one integer per distinct combination of the columns below
    df['id'] = df.groupby(
        ['dDate', 'flyFrom', 'flyTo', 'dTime', 'aTime', 'airline', 'fly_duration']
    ).ngroup()

    return df

In [131]:
# Engineer the features independently for each split.
train_features = build_features(train)
test_features = build_features(test)

In [143]:
train_features.shape, test_features.shape

((115484, 33), (36981, 33))

In [127]:
train_features.columns

Index(['collectionDate', 'dDate', 'dTime', 'aDate', 'aTime', 'dTimeUTC',
       'aTimeUTC', 'flyFrom', 'flyTo', 'airlines', 'flight_no', 'fly_duration',
       'distance', 'route', 'price', 'cityFrom', 'cityCodeFrom', 'cityTo',
       'cityCodeTo', 'countryFrom', 'countryTo', 'log_price', 'day_of_month',
       'day_of_week', 'session', 'orig-dest', 'airline', 'days_until_dep',
       'hops', 'direct', 'competition', 'id', 'hist_prices_x', 'hist_prices_y',
       'hist_prices'],
      dtype='object')

In [128]:
test_features.columns

Index(['collectionDate', 'dDate', 'dTime', 'aDate', 'aTime', 'dTimeUTC',
       'aTimeUTC', 'flyFrom', 'flyTo', 'airlines', 'flight_no', 'fly_duration',
       'distance', 'route', 'price', 'cityFrom', 'cityCodeFrom', 'cityTo',
       'cityCodeTo', 'countryFrom', 'countryTo', 'log_price', 'day_of_month',
       'day_of_week', 'session', 'orig-dest', 'airline', 'days_until_dep',
       'hops', 'direct', 'competition', 'id', 'hist_prices_x', 'hist_prices_y',
       'hist_prices_x', 'hist_prices_y'],
      dtype='object')

In [9]:
import multiprocessing
import numpy as np


def parallelize_dataframe(df, func):
    """Apply ``func`` to row-wise chunks of ``df`` in a process pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split into consecutive row chunks.
    func : callable
        Receives one chunk (a DataFrame) and returns an object accepted by
        ``pd.concat``. It is sent to worker processes, so it must be
        picklable.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-chunk results, in chunk order.
    """
    # Leave one core free to not freeze the machine, but never drop to zero
    # workers on a single-core host (that would crash the split and the pool).
    num_cores = max(1, multiprocessing.cpu_count() - 1)

    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame is deprecated in recent pandas versions.
    row_chunks = np.array_split(np.arange(len(df)), num_cores)
    df_split = [df.iloc[rows] for rows in row_chunks]

    pool = multiprocessing.Pool(num_cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Release the workers even when `func` raises.
        pool.close()
        pool.join()
    return pd.concat(results)

In [10]:
def get_daystowait(df):
    """Add ``days_to_wait``: days until the nearest later observation of the
    same flight with a strictly lower price.

    For every row, the other rows sharing its ``id`` are searched for
    observations closer to departure (smaller ``days_until_dep``) whose
    ``price`` is lower. The result is the gap in days to the closest such
    observation, or 0 when there is none.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id``, ``price`` and ``days_until_dep`` (any
        values castable to int).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an integer ``days_to_wait`` column.
    """
    df = df.copy()

    # Cast once up front instead of once per row.
    days = df['days_until_dep'].astype(int).to_numpy()
    prices = df['price'].to_numpy()

    # Results are stored by row position and assigned as a plain array, so the
    # values line up with the rows whatever the frame's index looks like
    # (assigning a freshly built Series would align on index labels instead).
    days_to_wait = np.zeros(len(df), dtype=int)

    # Only rows of the same flight are compared, so work group by group rather
    # than filtering the whole frame for every row.
    for positions in df.groupby('id').indices.values():
        flight_days = days[positions]
        flight_prices = prices[positions]
        for pos, day, price in zip(positions, flight_days, flight_prices):
            cheaper_later = (flight_days < day) & (flight_prices < price)
            if cheaper_later.any():
                # The closest following day is the largest remaining-days value.
                days_to_wait[pos] = day - flight_days[cheaper_later].max()

    df['days_to_wait'] = days_to_wait
    return df

In [148]:
def get_hist_prices(df):
    """Attach to every row the price history of its flight.

    Prices are averaged per (``id``, ``days_until_dep``) and laid out as one
    list per flight, ordered by ascending ``days_until_dep``. The list has one
    slot for every distinct ``days_until_dep`` value present in ``df``; slots
    with no observation for that flight hold NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id``, ``days_until_dep`` (values castable to int)
        and ``price``.

    Returns
    -------
    pandas.DataFrame
        New frame equal to ``df`` merged with a ``hist_prices`` column. The
        input is not modified.
    """
    # Discard the output of a previous run so the merge below cannot produce
    # `hist_prices_x` / `hist_prices_y` duplicates when the cell is re-executed.
    df = df.drop(columns='hist_prices', errors='ignore')

    grouped = df.groupby(['id', 'days_until_dep'])['price'].mean().reset_index() # tweak: min or mean?
    pivot = grouped.pivot(index='id', columns='days_until_dep', values='price')

    # Order the day columns numerically using the labels as they are, which
    # works whether `days_until_dep` holds strings or integers.
    day_cols = sorted(pivot.columns, key=int)

    # NOTE(review): position k of the list is the k-th smallest observed day,
    # not necessarily day k+1; consumers that index by day number assume the
    # observed days are 1..max with no gaps -- confirm against the data.
    hist = pd.DataFrame({
        'id': pivot.index.to_numpy(),
        'hist_prices': pivot[day_cols].to_numpy().tolist(),
    })
    return pd.merge(df, hist, on='id')

In [149]:
# Attach the per-flight price history to each split.
train_hist = get_hist_prices(train_features)
test_hist = get_hist_prices(test_features)

In [150]:
train_hist.shape, test_hist.shape

((115484, 34), (36981, 34))

In [86]:
from tqdm import tqdm

def get_waiting_days(df):
    """Add ``waiting_days``: how many days to wait for the lowest upcoming price.

    For each row, the first ``days_until_dep - 1`` entries of ``hist_prices``
    are read as the prices of the following days, nearest day first. If the
    minimum of those prices is strictly lower than the row's ``price``, the
    result is the number of days until that minimum (the nearest one on
    ties); otherwise it is 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``hist_prices`` (a list of prices per row, NaN for
        missing days), ``days_until_dep`` (castable to int) and ``price``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an integer ``waiting_days`` column; the input
        frame is left untouched.
    """
    df = df.copy()
    waiting_days = []

    rows = zip(df['hist_prices'], df['days_until_dep'], df['price'])
    for hist, days_until_dep, price in tqdm(rows, total=df.shape[0]):
        # Clamp at zero: a negative stop would make the slice wrap around and
        # read prices from the far end of the history.
        horizon = max(int(days_until_dep) - 1, 0)
        next_days_prices = hist[:horizon][::-1]

        # init at 0
        days_to_wait = 0

        if (~np.isnan(next_days_prices)).sum() > 0:
            # Missing days must never win the argmin.
            next_days_prices = np.nan_to_num(next_days_prices, nan=np.inf)
            idx = np.argmin(next_days_prices)
            if next_days_prices[idx] < price:
                days_to_wait = int(idx) + 1
        waiting_days.append(days_to_wait)

    # One positional assignment instead of a chained `df[col].iloc[i] = ...`
    # per row, which is unreliable in pandas and treated the index label as a
    # row position.
    df['waiting_days'] = np.asarray(waiting_days, dtype=int)
    return df

In [151]:
%%time
# Compute the waiting-days target for each split (row-by-row loop, timed by
# the cell magic above).
train_target = get_waiting_days(train_hist)
test_target = get_waiting_days(test_hist)

100%|██████████| 115484/115484 [00:23<00:00, 4890.58it/s]
100%|██████████| 36981/36981 [00:07<00:00, 4724.48it/s]

CPU times: user 31.4 s, sys: 1.32 s, total: 32.7 s
Wall time: 31.5 s





In [152]:
def get_labels(df, factor=0.2):
    """Return a copy of ``df`` with a binary ``buy`` column.

    ``buy`` is 0 where ``waiting_days`` equals 0 and 1 everywhere else.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``waiting_days`` column.
    factor : float, optional
        Accepted but not used by the current implementation.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the added ``buy`` column.

    NOTE(review): ``buy`` is 1 exactly when ``waiting_days`` is non-zero,
    which reads as the opposite of the column name -- confirm the intended
    polarity with the consumers of this label.
    """
    labelled = df.copy()
    nonzero_wait = labelled['waiting_days'] != 0
    labelled['buy'] = nonzero_wait.astype(int)
    return labelled

In [153]:
# Derive the binary `buy` label from `waiting_days` for each split.
train_prepared = get_labels(train_target)
test_prepared = get_labels(test_target)

In [154]:
train_prepared['buy'].value_counts()

0    87287
1    28197
Name: buy, dtype: int64

## Save processed data

In [157]:
# Columns written to the processed files: the engineered features followed by
# the price columns and the two targets (`waiting_days`, `buy`).
columns = ['flyFrom', 'flyTo', 'orig-dest','dDate','day_of_month', 'day_of_week', 'fly_duration', 'distance',
           'days_until_dep', 'session', 'airline', 'hops', 'direct', 'competition','price','log_price' ,'waiting_days', 'buy']

In [159]:
# Write only the selected columns, without the index, to the processed folder.
train_prepared[columns].to_csv('../data/processed/train_processed.csv', index=False)
test_prepared[columns].to_csv('../data/processed/test_processed.csv', index=False)