# 04 - Feature Engineering and Data preparation for model building

In [7]:
import pandas as pd
import numpy as np
from datetime import datetime

## Load data

In [3]:
# Read the interim flights dataset (path is relative to the notebook's
# working directory) and preview the first rows.
flights = pd.read_csv(
    filepath_or_buffer='../data/interim/flights_interim.csv',
)
flights.head(5)

Unnamed: 0,collectionDate,dDate,dTime,aDate,aTime,dTimeUTC,aTimeUTC,flyFrom,flyTo,airlines,...,distance,route,price,seats,cityFrom,cityCodeFrom,cityTo,cityCodeTo,countryFrom,countryTo
0,2021-01-31,2021-02-01,15:00,2021-02-01,16:25,2021-02-01 14:00:00,2021-02-01 15:25:00,MAD,BCN,UX,...,483.25,MAD -> BCN,78,,Madrid,MAD,Barcelona,BCN,Spain,Spain
1,2021-01-31,2021-02-01,09:35,2021-02-01,10:50,2021-02-01 08:35:00,2021-02-01 09:50:00,MAD,BCN,VY,...,483.25,MAD -> BCN,91,,Madrid,MAD,Barcelona,BCN,Spain,Spain
2,2021-01-31,2021-02-01,19:45,2021-02-01,21:05,2021-02-01 18:45:00,2021-02-01 20:05:00,MAD,BCN,IB,...,483.25,MAD -> BCN,91,5.0,Madrid,MAD,Barcelona,BCN,Spain,Spain
3,2021-01-31,2021-02-01,19:15,2021-02-02,08:50,2021-02-01 18:15:00,2021-02-02 07:50:00,MAD,BCN,UX,...,483.25,MAD -> BCN,108,3.0,Madrid,MAD,Barcelona,BCN,Spain,Spain
4,2021-01-31,2021-02-01,14:50,2021-02-02,08:50,2021-02-01 13:50:00,2021-02-02 07:50:00,MAD,BCN,UX,...,483.25,MAD -> BCN,112,3.0,Madrid,MAD,Barcelona,BCN,Spain,Spain


## Dealing with missing values and outliers

In [4]:
# Number of missing entries in each column of the loaded frame.
flights.isnull().sum(axis=0)

collectionDate         0
dDate                  0
dTime                  0
aDate                  0
aTime                  0
dTimeUTC               0
aTimeUTC               0
flyFrom                0
flyTo                  0
airlines               0
flight_no              0
fly_duration           0
distance               0
route                  0
price                  0
seats             306707
cityFrom               0
cityCodeFrom           0
cityTo                 0
cityCodeTo             0
countryFrom            0
countryTo              0
dtype: int64

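Only the `seats` column has missing values (306,707 rows). For now we keep the column and fill its missing values with 0 in the next section.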

## Feature Engineering

### Missing values

In [86]:
# Replace missing `seats` values with 0.
# The filled column is assigned back to the frame rather than calling
# fillna(..., inplace=True) on flights['seats']: the in-place form operates on
# an intermediate object produced by the column selection, which raises a
# FutureWarning in pandas 2.x and leaves `flights` untouched once
# Copy-on-Write is enabled.
# NOTE(review): 0 presumably stands for "seat count not reported" rather than
# "no seats left" — confirm this is the intended encoding for the model.
flights['seats'] = flights['seats'].fillna(0)

### Outliers?

**TODO:** outliers are not handled yet; no rows are filtered or capped in this notebook.

### Adding new features

In [87]:
# Parse the two date columns once; the weekday and the days-to-departure
# features below are both derived from them.
collected = pd.to_datetime(flights['collectionDate'])
departure = pd.to_datetime(flights['dDate'])

# Day of the week (name of the departure weekday)
# .dt.dayofweek is the vectorised equivalent of reading day_of_week row by row
# (Monday=0 ... Sunday=6).
days_of_week = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
flights['day_of_week'] = departure.dt.dayofweek.map(days_of_week)

# Session (night, morning, afternoon, evening)
# Bin the departure hour on fixed six-hour windows:
#   [0, 6) night, [6, 12) morning, [12, 18) afternoon, [18, 24) evening.
# Explicit edges keep the session boundaries independent of the earliest and
# latest departure time present in the data, which is what bins=4 would use.
# dTime is expected as 'HH:MM' (as in the preview of the loaded data); parsing
# with an explicit format fails loudly if that ever changes.
departure_hour = pd.to_datetime(flights['dTime'], format='%H:%M').dt.hour
flights['session'] = pd.cut(
    departure_hour,
    bins=[0, 6, 12, 18, 24],
    right=False,
    labels=['night', 'morning', 'afternoon', 'evening'],
)

# Route (overwrites the loaded 'route' column with '<flyFrom>-<flyTo>')
flights['route'] = flights['flyFrom'] + '-' + flights['flyTo']

# Days to Departure
# .dt.days yields whole days as integers instead of the first token of the
# timedelta's string representation.
daysToDep = departure - collected
flights['days_until_dep'] = daysToDep.dt.days

## Save processed data

In [88]:
# Columns kept for modelling: route identifiers, engineered features and the
# target ('price') last.
columns = [
    'flyFrom',
    'flyTo',
    'route',
    'days_until_dep',
    'fly_duration',
    'distance',
    'day_of_week',
    'session',
    'seats',
    'price',
]

# Keep only those columns, in the order listed above, and preview the result.
flights_processed = flights.loc[:, columns]
flights_processed.head(5)

Unnamed: 0,flyFrom,flyTo,route,days_until_dep,fly_duration,distance,day_of_week,session,seats,price
0,MAD,BCN,MAD-BCN,1,1.416667,483.25,Monday,afternoon,0.0,78
1,MAD,BCN,MAD-BCN,1,1.25,483.25,Monday,morning,0.0,91
2,MAD,BCN,MAD-BCN,1,1.333333,483.25,Monday,evening,5.0,91
3,MAD,BCN,MAD-BCN,1,13.583333,483.25,Monday,evening,3.0,108
4,MAD,BCN,MAD-BCN,1,18.0,483.25,Monday,afternoon,3.0,112


In [83]:
# Write the processed frame to disk without the row index.
flights_processed.to_csv(
    path_or_buf='../data/processed/flights_processed.csv',
    index=False,
)