# Data Preparation

This notebook prepares and cleans the dataset for the exploratory data analysis.

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Load the data

In [59]:
# Column labels for the raw export, in file order. They are passed through
# `names=` below, so pandas uses them as the header instead of reading one
# from the file.
columns = [
    'dTime', 'dTimeUTC', 'aTime', 'aTimeUTC',
    'airlines', 'fly_duration',
    'flyFrom', 'cityFrom', 'cityCodeFrom',
    'flyTo', 'cityTo', 'cityCodeTo',
    'distance', 'price', 'route',
    'countryFrom', 'countryTo',
    'flight_no', 'seats', 'collectionDate',
]

# Load the raw flight records into a DataFrame labelled with the names above.
flights = pd.read_csv('../data/raw/flights_raw.csv', names=columns)

In [46]:
# Preview the first five raw rows to check that the columns line up.
flights.head(n=5)

Unnamed: 0,dTime,dTimeUTC,aTime,aTimeUTC,airlines,fly_duration,flyFrom,cityFrom,cityCodeFrom,flyTo,cityTo,cityCodeTo,distance,price,route,countryFrom,countryTo,flight_no,seats,collectionDate
0,2021-02-01 15:00:00,1612188000,2021-02-01 16:25:00,1612193100,UX,1h 25m,MAD,Madrid,MAD,BCN,Barcelona,BCN,483.25,78,MAD -> BCN,Spain,Spain,7703,,2021-01-31 18:19:43.233765
1,2021-02-01 09:35:00,1612168500,2021-02-01 10:50:00,1612173000,VY,1h 15m,MAD,Madrid,MAD,BCN,Barcelona,BCN,483.25,91,MAD -> BCN,Spain,Spain,1003,,2021-01-31 18:19:43.233765
2,2021-02-01 19:45:00,1612205100,2021-02-01 21:05:00,1612209900,IB,1h 20m,MAD,Madrid,MAD,BCN,Barcelona,BCN,483.25,91,MAD -> BCN,Spain,Spain,1946,5.0,2021-01-31 18:19:43.233765
3,2021-02-01 19:15:00,1612203300,2021-02-02 08:50:00,1612252200,UX,13h 35m,MAD,Madrid,MAD,BCN,Barcelona,BCN,483.25,108,MAD -> BCN,Spain,Spain,6097,3.0,2021-01-31 18:19:43.233765
4,2021-02-01 14:50:00,1612187400,2021-02-02 08:50:00,1612252200,UX,18h 0m,MAD,Madrid,MAD,BCN,Barcelona,BCN,483.25,112,MAD -> BCN,Spain,Spain,6067,3.0,2021-01-31 18:19:43.233765


### Cleaning date columns

In [61]:
# Split the local departure/arrival timestamps ('YYYY-MM-DD HH:MM:SS', as in
# the preview above) into a date column and an 'HH:MM' time column.
# The vectorised `.str` accessor replaces the row-wise lambdas: each column is
# split only once, and missing values propagate as NaN instead of raising.
# NOTE: this overwrites dTime/aTime, so the cell must run exactly once per
# load; reload `flights` before re-running it.
departure_parts = flights['dTime'].str.split(' ')
arrival_parts = flights['aTime'].str.split(' ')

flights['dDate'] = departure_parts.str[0]
flights['dTime'] = departure_parts.str[1].str[:5]  # keep 'HH:MM', drop seconds
flights['aDate'] = arrival_parts.str[0]
flights['aTime'] = arrival_parts.str[1].str[:5]  # keep 'HH:MM', drop seconds

In [60]:
# Convert the epoch-second UTC columns into pandas timestamps.
# `pd.to_datetime(..., unit='s')` is used instead of
# `datetime.utcfromtimestamp`: `datetime` is never imported in this notebook
# (so the old cell fails on a fresh kernel) and `utcfromtimestamp` is
# deprecated since Python 3.12. The result is timezone-naive, holding the UTC
# wall-clock time, and the conversion is vectorised.
flights['dTimeUTC'] = pd.to_datetime(flights['dTimeUTC'], unit='s')
flights['aTimeUTC'] = pd.to_datetime(flights['aTimeUTC'], unit='s')

In [63]:
# Keep only the date part of the collection timestamp (the text before the
# first space). The vectorised `.str` accessor replaces the row-wise lambda
# and lets missing values pass through as NaN instead of raising.
flights['collectionDate'] = flights['collectionDate'].str.split(' ').str[0]

In [64]:
# Spot-check every date/time column touched by the cleaning cells above.
flights[
    [
        'collectionDate',
        'dDate', 'dTime',
        'aDate', 'aTime',
        'dTimeUTC', 'aTimeUTC',
    ]
].head()

Unnamed: 0,collectionDate,dDate,dTime,aDate,aTime,dTimeUTC,aTimeUTC
0,2021-01-31,2021-02-01,15:00,2021-02-01,16:25,2021-02-01 14:00:00,2021-02-01 15:25:00
1,2021-01-31,2021-02-01,09:35,2021-02-01,10:50,2021-02-01 08:35:00,2021-02-01 09:50:00
2,2021-01-31,2021-02-01,19:45,2021-02-01,21:05,2021-02-01 18:45:00,2021-02-01 20:05:00
3,2021-01-31,2021-02-01,19:15,2021-02-02,08:50,2021-02-01 18:15:00,2021-02-02 07:50:00
4,2021-01-31,2021-02-01,14:50,2021-02-02,08:50,2021-02-01 13:50:00,2021-02-02 07:50:00


### Store data for exploratory data analysis

In [69]:
# Column order for the interim dataset, grouped by theme.
columns = [
    # collection / local schedule / UTC schedule
    'collectionDate',
    'dDate', 'dTime',
    'aDate', 'aTime',
    'dTimeUTC', 'aTimeUTC',
    # flight identification and route
    'flyFrom', 'flyTo',
    'airlines', 'flight_no',
    'fly_duration', 'distance', 'route',
    # fare and availability
    'price', 'seats',
    # origin / destination details
    'cityFrom', 'cityCodeFrom',
    'cityTo', 'cityCodeTo',
    'countryFrom', 'countryTo',
]

In [70]:
# Keep only the columns listed in `columns`, in that order.
flights = flights.loc[:, columns]

In [71]:
# Preview the first five rows of the reordered frame before saving it.
flights.head(n=5)

Unnamed: 0,collectionDate,dDate,dTime,aDate,aTime,dTimeUTC,aTimeUTC,flyFrom,flyTo,airlines,...,distance,route,price,seats,cityFrom,cityCodeFrom,cityTo,cityCodeTo,countryFrom,countryTo
0,2021-01-31,2021-02-01,15:00,2021-02-01,16:25,2021-02-01 14:00:00,2021-02-01 15:25:00,MAD,BCN,UX,...,483.25,MAD -> BCN,78,,Madrid,MAD,Barcelona,BCN,Spain,Spain
1,2021-01-31,2021-02-01,09:35,2021-02-01,10:50,2021-02-01 08:35:00,2021-02-01 09:50:00,MAD,BCN,VY,...,483.25,MAD -> BCN,91,,Madrid,MAD,Barcelona,BCN,Spain,Spain
2,2021-01-31,2021-02-01,19:45,2021-02-01,21:05,2021-02-01 18:45:00,2021-02-01 20:05:00,MAD,BCN,IB,...,483.25,MAD -> BCN,91,5.0,Madrid,MAD,Barcelona,BCN,Spain,Spain
3,2021-01-31,2021-02-01,19:15,2021-02-02,08:50,2021-02-01 18:15:00,2021-02-02 07:50:00,MAD,BCN,UX,...,483.25,MAD -> BCN,108,3.0,Madrid,MAD,Barcelona,BCN,Spain,Spain
4,2021-01-31,2021-02-01,14:50,2021-02-02,08:50,2021-02-01 13:50:00,2021-02-02 07:50:00,MAD,BCN,UX,...,483.25,MAD -> BCN,112,3.0,Madrid,MAD,Barcelona,BCN,Spain,Spain


In [73]:
# Write the cleaned frame to the interim data folder; the row index is not
# written to the file.
flights.to_csv(
    '../data/interim/flights_interim.csv',
    index=False,
)