# 04 - Feature Engineering and Data Preprocessing

In [43]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [44]:
import warnings
# Blanket filter: suppresses every warning category from every module for the
# rest of the session, including deprecation notices from pandas/numpy.
warnings.filterwarnings('ignore')

### Load data

In [45]:
# Read the interim train/test CSVs; the paths are relative to the directory
# the notebook is run from.
train = pd.read_csv('../data/interim/train.csv')
test = pd.read_csv('../data/interim/test.csv')

### Missing values

In [46]:
train.isna().sum()

collectionDate        0
dDate                 0
dTime                 0
aDate                 0
aTime                 0
dTimeUTC              0
aTimeUTC              0
flyFrom               0
flyTo                 0
airlines              0
flight_no             0
fly_duration          0
distance              0
route                 0
price                 0
seats             32620
cityFrom              0
cityCodeFrom          0
cityTo                0
cityCodeTo            0
countryFrom           0
countryTo             0
dtype: int64

In [47]:
# Drop the 'seats' column because it has many missing values.
# Rebinding (instead of an in-place drop) together with errors='ignore' keeps
# this cell idempotent: re-running it after the column is already gone no
# longer raises a KeyError.
train = train.drop(columns='seats', errors='ignore')
test = test.drop(columns='seats', errors='ignore')

### Outliers?

### Adding new features

In [48]:
def build_features(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    Columns read: 'price', 'dDate', 'dTime', 'aTime', 'collectionDate',
    'flyFrom', 'flyTo', 'airlines', 'route', 'fly_duration'.

    Columns added, in this order: 'log_price', 'day_of_month', 'day_of_week',
    'session', 'orig-dest', 'airline', 'days_until_dep', 'hops', 'direct',
    'competition', 'id'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw quotes; the input frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        The copy with the new columns. The merge used for 'competition'
        returns a fresh default index.
    """
    df = df.copy()

    # Parse the date/time columns once and reuse them below.
    departure = pd.to_datetime(df['dDate'])
    collected = pd.to_datetime(df['collectionDate'])
    # NOTE(review): assumes 'dTime' is parseable by pd.to_datetime and carries
    # the local departure time of day — confirm against the raw data.
    departure_hour = pd.to_datetime(df['dTime']).dt.hour

    # log transformation (non-positive prices produce -inf / NaN)
    df['log_price'] = np.log(df['price'])

    # Day of month
    df['day_of_month'] = departure.dt.day

    # Day of the week as its English name ('Monday' ... 'Sunday')
    df['day_of_week'] = departure.dt.day_name()

    # Session (night, morning, evening) from fixed hour-of-day bins:
    # [0, 8) night, [8, 16) morning, [16, 24) evening.
    # Fixed edges give every frame (train, test, any subset) the same binning;
    # equal-width bins over the observed range would move with each frame's
    # own earliest/latest departure.
    df['session'] = pd.cut(departure_hour, bins=[0, 8, 16, 24], right=False,
                           labels=['night', 'morning', 'evening'])

    # Route
    df['orig-dest'] = df['flyFrom'] + '-' + df['flyTo']

    # Airline: first carrier of the comma-separated list
    df['airline'] = df['airlines'].str.split(',').str[0]

    # Days until Departure.
    # Kept as a string column on purpose: later cells build string-named pivot
    # columns from it and cast with int() where a number is needed.
    df['days_until_dep'] = (departure - collected).dt.days.astype(str)

    # Hopping: number of intermediate stops in the '->'-separated route
    df['hops'] = df['route'].str.split('->').str.len() - 2
    df['direct'] = df['hops'] == 0

    # Competition Factor: distinct airlines quoting the same origin,
    # destination and departure date
    competition = df.groupby(['flyFrom', 'flyTo', 'dDate'])['airline'].nunique().reset_index()
    competition.columns = ['flyFrom', 'flyTo', 'dDate', 'competition']
    df = pd.merge(df, competition, on=['dDate', 'flyFrom', 'flyTo'])

    # id flight: rows sharing all of these keys get the same integer id
    df['id'] = df.groupby(['dDate', 'flyFrom', 'flyTo', 'dTime', 'aTime', 'airline', 'fly_duration']).ngroup()

    return df

In [49]:
# Apply the same feature builder to each split independently.
train_features = build_features(train)
test_features = build_features(test)

In [50]:
train_features.shape, test_features.shape

((115484, 32), (36981, 32))

In [37]:
import multiprocessing
import numpy as np


def parallelize_dataframe(df, func):
    """Apply ``func`` to row-wise chunks of ``df`` and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; it is split into consecutive row chunks.
    func : callable
        Takes one chunk (a DataFrame) and returns a pandas object accepted by
        ``pd.concat``. When more than one worker is used it is sent to worker
        processes, so it must be picklable.

    Returns
    -------
    pandas.DataFrame
        ``pd.concat`` of the per-chunk results, in chunk order.
    """
    # Leave one core free so the machine stays responsive, but never go below
    # one worker: cpu_count() - 1 is 0 on a single-core host, which would make
    # both the split and the pool raise.
    num_cores = max(1, multiprocessing.cpu_count() - 1)

    # Split row positions rather than the frame itself so every chunk is a
    # DataFrame obtained through .iloc.
    position_chunks = np.array_split(np.arange(len(df)), num_cores)
    df_split = [df.iloc[positions] for positions in position_chunks]

    if num_cores == 1:
        # A one-process pool would only add pickling overhead; run in-process.
        return pd.concat([func(chunk) for chunk in df_split])

    # The context manager shuts the pool down even when func raises in a worker.
    with multiprocessing.Pool(num_cores) as pool:
        return pd.concat(pool.map(func, df_split))

In [23]:
def get_daystowait(df):
    """Add 'days_to_wait': days until the nearest later quote that is cheaper.

    For every row, the rows of the same flight ('id') with a strictly smaller
    'days_until_dep' and a strictly lower 'price' are the candidates. The
    candidate closest in time (largest 'days_until_dep') is chosen and
    'days_to_wait' is the difference in days; it is 0 when there is no
    candidate.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'id', 'price' and 'days_until_dep' (values castable to
        int). The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the integer 'days_to_wait' column added.
    """
    df = df.copy()
    days = df['days_until_dep'].astype(int).to_numpy()
    prices = df['price'].to_numpy()
    days_to_wait = np.zeros(len(df), dtype=int)

    # Work flight by flight on positional indices: each group is compared only
    # with itself instead of rescanning the whole frame for every row.
    for positions in df.groupby('id').indices.values():
        flight_days = days[positions]
        flight_prices = prices[positions]

        # candidates[i, j] is True when row j is closer to departure than
        # row i and also cheaper than row i.
        closer = flight_days[None, :] < flight_days[:, None]
        cheaper = flight_prices[None, :] < flight_prices[:, None]
        candidates = closer & cheaper
        has_drop = candidates.any(axis=1)

        # Non-candidates are filled with the group's smallest day so they can
        # never win the max over real candidates.
        nearest = np.where(candidates, flight_days[None, :], flight_days.min()).max(axis=1)
        days_to_wait[positions] = np.where(has_drop, flight_days - nearest, 0)

    # Positional assignment: correct whatever index labels df carries.
    df['days_to_wait'] = days_to_wait
    return df

In [184]:
# Mean price for every (flight id, days_until_dep) combination.
grouped = train_features.groupby(['id', 'days_until_dep'])['price'].mean().reset_index()
# Wide layout: one row per id, one column per days_until_dep value; cells are
# NaN where a flight has no quote for that value.
pivot = grouped.pivot(index='id', columns='days_until_dep', values='price').reset_index()

In [188]:
# The day columns are labelled with strings; order them by numeric value and
# keep 'id' as the first column.
columns = [int(label) for label in list(pivot.columns)[1:]]
cols_sorted = sorted(columns)
cols_str = [str(day) for day in cols_sorted]
new_cols_names = ['id', *cols_str]
pivot = pivot.loc[:, new_cols_names]

In [190]:
# Collapse each row into a list of its per-day prices; the leading element of
# every row is the 'id' value, so it is sliced off.
pivot['hist_prices'] = [values[1:] for values in pivot.values.tolist()]

In [192]:
# Keep only the id and its price history, then attach that history to every
# training row (pd.merge defaults to an inner join on 'id').
pivot = pivot[['id', 'hist_prices']]
df = pd.merge(train_features, pivot, on='id')

In [193]:
df['hist_prices'].head()

0    [78.0, nan, nan, nan, nan, nan, nan, nan, nan,...
1    [91.0, nan, nan, nan, nan, nan, nan, nan, nan,...
2    [91.0, nan, nan, nan, nan, nan, nan, nan, nan,...
3    [108.0, nan, nan, nan, nan, nan, nan, nan, nan...
4    [112.0, nan, nan, nan, nan, nan, nan, nan, nan...
Name: hist_prices, dtype: object

In [219]:
# Scratch array with one missing value; the bare name `nan` is not defined in
# this notebook, so the NumPy constant is used.
lista = np.array([np.nan, 22, 125, 2])

In [226]:
# NOTE(review): `.sum()` binds tighter than `~`, so this is ~(number of NaNs),
# i.e. a bitwise NOT of an integer count, not a count of non-NaN entries.
# Counting non-NaN values needs parentheses: (~np.isnan(values)).sum().
~np.isnan([np.nan, 22]).sum()

array([False,  True])

In [207]:
# NOTE(review): `val` is not assigned in any cell visible in this notebook, so
# this display depends on leftover kernel state.
idx, val

(3, 2)

In [242]:
# Exploration on the last 20 rows: for each quote, look at the prices recorded
# for the same flight on later collection days and report the cheapest one.
for row in df.tail(20).itertuples():
    hist = row.hist_prices
    row_days = int(row.days_until_dep)
    # First row_days-1 entries of the history, reversed.
    # NOTE(review): presumably entry k holds the price k+1 days before
    # departure, which makes this "the following days, nearest first" — that
    # only holds if the pivot columns are the contiguous days 1, 2, 3, ...
    next_days_prices = hist[:row_days-1][::-1]
    # Skip rows with no recorded price on any following day.
    if (~np.isnan(next_days_prices)).sum() > 0:
        # Replace NaN by +inf so a missing price can never be the minimum.
        next_days_prices = np.nan_to_num(next_days_prices, nan=np.inf)
        idx = np.argmin(next_days_prices)
        min_price = next_days_prices[idx]
        
        if min_price < row.price:
            # Position in the reversed slice, counted from 1.
            days_to_wait = idx+1
            price_diff = row.price - min_price
        else:
            days_to_wait = 0
            price_diff = 0
            
        print(row_days, row.price, next_days_prices, days_to_wait, price_diff)
    #print(min_price)
    # TODO: next, think about how the evaluator needs to work with this:
    # min_price or price_diff

5 399 [347. 399. 479. 258.] 4 141.0
4 347 [399. 479. 258.] 3 89.0
3 399 [479. 258.] 2 141.0
2 479 [258.] 1 221.0
6 235 [ inf  inf  inf 344.  inf] 0 0
5 243 [237. 243.  inf  inf] 1 6.0
4 237 [243.  inf  inf] 0 0
4 244 [284. 284.  inf] 0 0
3 284 [284.  inf] 0 0
4 249 [ inf 315. 315.] 0 0
2 315 [315.] 0 0
2 309 [311.] 0 0


In [216]:
%%time
# Add the 'days_to_wait' target column to both feature frames.
train_target = get_daystowait(train_features)
test_target = get_daystowait(test_features)

KeyboardInterrupt: 

In [25]:
def get_labels(df, factor=0.2):
    """Return a copy of ``df`` with the binary 'buy' label added.

    'buy' is 0 where 'days_to_wait' equals 0 and 1 everywhere else.
    NOTE(review): as coded, 1 marks the rows with a non-zero 'days_to_wait';
    confirm that this polarity is what the column name 'buy' is meant to say.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'days_to_wait'. The input frame is not modified.
    factor : float, optional
        Not used by the function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The copy with the integer 'buy' column.
    """
    df = df.copy()
    # Vectorised comparison instead of a Python-level call per row.
    df['buy'] = df['days_to_wait'].ne(0).astype(int)
    return df

In [26]:
# Add the binary 'buy' label to both splits.
train_prepared = get_labels(train_target)
test_prepared = get_labels(test_target)

In [27]:
train_prepared['buy'].value_counts()

0    86801
1    28683
Name: buy, dtype: int64

## Save processed data

In [28]:
# Columns written to the processed CSVs (features plus the 'days_to_wait' and
# 'buy' targets). Note that this rebinds `columns`, a name an earlier cell
# uses for the pivot's day labels.
columns = ['flyFrom', 'flyTo', 'orig-dest','dDate','day_of_month', 'day_of_week', 'fly_duration', 'distance',
           'days_until_dep', 'session', 'airline', 'hops', 'direct', 'competition','price','log_price' ,'days_to_wait', 'buy']

In [29]:
# Write the selected columns of each split to the processed data folder,
# without the index column.
train_prepared[columns].to_csv('../data/processed/train.csv', index=False)
test_prepared[columns].to_csv('../data/processed/test.csv', index=False)

In [32]:
df = train_prepared.copy()

In [33]:
# NOTE(review): this grouping key omits 'fly_duration', which build_features
# includes when it assigns 'id', so the ids written here may differ from the
# ones build_features produced.
df['id'] = df.groupby(['dDate', 'flyFrom', 'flyTo', 'dTime', 'aTime', 'airline']).ngroup()

In [34]:
df['id'].nunique()

28486

In [35]:
# Mean price for every (id, days_until_dep) combination.
df2 = df.groupby(['id', 'days_until_dep'])['price'].mean().reset_index()

In [36]:
# One row per id, one column per days_until_dep value. The labels are strings,
# so the columns come out in lexicographic order ('1', '10', '11', ...).
pivot = df2.pivot(index='id', columns='days_until_dep', values='price')
pivot

days_until_dep,1,10,11,12,13,14,15,16,17,18,...,24,25,26,3,4,5,6,7,8,9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,163.0,,,,,,,,,,...,,,,,,,,,,
1,773.0,,,,,,,,,,...,,,,,,,,,,
2,426.0,,,,,,,,,,...,,,,,,,,,,
3,532.0,,,,,,,,,,...,,,,,,,,,,
4,454.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28481,,,,,,,,,,,...,,1070.0,,,,,,,,
28482,,,,,,,,,,,...,,1070.0,,,,,,,,
28483,,,,,,,,,1390.0,,...,,1145.0,,,,,,,,
28484,,,,,,,,,,,...,,1145.0,1109.0,,,,,,,


In [42]:
# Number of rows in the frame; len() gives the same count as iterating over
# itertuples() without the Python-level loop.
x = len(df)
print(x)

115484


In [None]:
aa = list(map(str,np.arange(1, 8)))

In [None]:
aa

In [None]:
# Attach the pivoted per-day price columns to every row (pd.merge defaults to
# an inner join on 'id').
full = pd.merge(df, pivot.reset_index(), on='id')

In [None]:
# NOTE(review): the lambda ignores its argument, so every row receives the
# same list ['1', ..., '7'] — confirm whether the range was meant to depend on
# the row's days_until_dep.
full['range'] = full['days_until_dep'].apply(lambda x: list(map(str, np.arange(1, 8))))

In [None]:
# `full[]` is a syntax error (empty subscript); preview the merged frame
# instead until the intended column selection is decided.
full.head()