# CRUCE DE LAS TRES TABLAS OBTENIDAS Y LIMPIADAS

## En este Notebook se va a desarrollar el proceso para identificar cada registro de vuelo con el parte METAR anterior a la hora programada de salida de cada vuelo.

Para ello, debido al gran número de registros que se van a "cruzar" entre los diferentes DataFrames, se va a utilizar la librería <span style="color:red">**DASK**</span> por su mejor eficiencia y velocidad respecto a Pandas.

In [1]:
# %pip install "dask[complete]"
# %pip install dask_ml

In [2]:
import time
import numpy as np
import pandas as pd
import dask
import dask.dataframe as dd
import warnings
warnings.filterwarnings('ignore')

## Se cargan en DataFrame de *Dask* los 2 datasets previamente limpiados

In [3]:
fl_dask = dd.read_csv("../data/flights/flights_total.csv",dtype={"arr_mins_of_delay": "object", "dep_mins_of_delay": "object"})
mt_dask = dd.read_csv("../data/metars/metars_2017_2023.csv")


In [4]:
fl_dask.head()

Unnamed: 0,flight_id,departure_date_time,cod_flight_IATA,cod_flight_ICAO,day,week_day,status,airliner,cod_airliner_IATA,cod_airliner_ICAO,...,dep_situation,dep_mins_of_delay,city,cod_airport_IATA,cod_airport_ICAO,arrival,arr_situation,arr_mins_of_delay,duration,resta
0,1,2023-10-31 23:59:00,IB6833,IBE6833,2023-10-31,Tuesday,Landed,Iberia,IB,IBE,...,late,35,Santiago,SCL,SCEL,09:13 UTC-03,early,-6,13h,41
1,2,2023-10-31 23:59:00,IB6841,IBE6841,2023-10-31,Tuesday,Landed,Iberia,IB,IBE,...,late,33,Buenos Aires,EZE,SAEZ,08:29 UTC-03,on time,0,12h,33
2,3,2023-10-31 23:59:00,IB6011,IBE6011,2023-10-31,Tuesday,Landed,Iberia,IB,IBE,...,late,25,Montevideo,MVD,SUMU,08:48 UTC-03,early,-6,12h,31
3,4,2023-10-31 23:55:00,IB6589,IBE6589,2023-10-31,Tuesday,Landed,Iberia,IB,IBE,...,late,24,Bogota,BOG,SKBO,03:52 UTC-05,early,-27,10h,51
4,5,2023-10-31 23:55:00,IB6409,IBE6409,2023-10-31,Tuesday,Landed,Iberia,IB,IBE,...,late,48,Mexico City,MEX,MMMX,04:55 CST,early,-19,11h,67


In [5]:
mt_dask.head()

Unnamed: 0,Metar_id,Date_time,Day,Hour,Condition,Temperature,Wind,Gusts,Relative_hum,Pressure
0,1,2023-10-31 23:30:00,2023-10-31,23:30,Fair,8,3,0,93,1017.0
1,2,2023-10-31 23:00:00,2023-10-31,23:00,Fair,8,1,0,87,1017.0
2,3,2023-10-31 22:30:00,2023-10-31,22:30,Fair,8,0,0,93,1017.0
3,4,2023-10-31 22:00:00,2023-10-31,22:00,Clear,8,0,0,93,1017.0
4,5,2023-10-31 21:30:00,2023-10-31,21:30,Clear,8,0,0,93,1017.0


## Se homogeniza el nombre de las columnas por las que se van a cruzar los registros, se ponen en un formato *datetime* común y se ordenan.

In [6]:
fl_dask = fl_dask.rename(columns={'departure_date_time': 'Date_time'})

In [7]:
mt_dask['Date_time'] = dd.to_datetime(mt_dask['Date_time'])
fl_dask['Date_time'] = dd.to_datetime(fl_dask['Date_time'])


In [8]:
fl_dask = fl_dask.sort_values('Date_time')
mt_dask = mt_dask.sort_values('Date_time')


## Se realiza un merge entre los dataframes de vuelos y metars por el Date_time, cogiendo el metar justamente anterior

In [9]:
%%time
fl_dask = fl_dask.sort_values('Date_time')
mtr_dask = mt_dask.sort_values('Date_time')

# Realiza el merge asof para encontrar el parte meteorológico más cercano en el tiempo
fl_dask = dd.merge_asof(fl_dask, mt_dask, on='Date_time', direction='backward')


CPU times: total: 1min 41s
Wall time: 2min 39s


In [10]:
pd.set_option('display.max_columns', None)

In [11]:
fl_dask.head()

Unnamed: 0,Date_time,flight_id,cod_flight_IATA,cod_flight_ICAO,day,week_day,status,airliner,cod_airliner_IATA,cod_airliner_ICAO,Scheduled_dep,depart_time,dep_situation,dep_mins_of_delay,city,cod_airport_IATA,cod_airport_ICAO,arrival,arr_situation,arr_mins_of_delay,duration,resta,Metar_id,Day,Hour,Condition,Temperature,Wind,Gusts,Relative_hum,Pressure
0,2017-11-01 00:05:00,922599,TP1005,TAP1005,2017-11-01,Wednesday,Landed,TAP - Air Portugal,TP,TAP,00:05,00:12,late,7,Porto,OPO,LPPR,00:05 WET,early,-10,52m,17,108890,2017-11-01,00:00,Clear,10,3,0,87,1021.0
1,2017-11-01 00:10:00,922598,IB3118,IBE3118,2017-11-01,Wednesday,Landed,Iberia,IB,IBE,00:10,00:15,late,5,Lisbon,LIS,LPPT,00:09 WET,early,-15,53m,20,108890,2017-11-01,00:00,Clear,10,3,0,87,1021.0
2,2017-11-01 00:13:00,922597,FX5036,FDX5036,2017-11-01,Wednesday,Landed,Federal Express (FedEx),FX,FDX,00:13,00:13,on time,0,Paris,CDG,LFPG,01:40 CET,on time,0,1h,0,108890,2017-11-01,00:00,Clear,10,3,0,87,1021.0
3,2017-11-01 00:40:00,922596,IB6409,IBE6409,2017-11-01,Wednesday,Landed,Iberia,IB,IBE,00:40,01:11,late,31,Mexico City,MEX,MMMX,05:33 CST,early,-16,11h,47,108889,2017-11-01,00:30,Clear,9,4,0,87,1022.0
4,2017-11-01 00:40:00,922595,IB6841,IBE6841,2017-11-01,Wednesday,Landed,Iberia,IB,IBE,00:40,00:53,late,13,Buenos Aires,EZE,SAEZ,08:47 UTC-03,early,-37,12h,50,108889,2017-11-01,00:30,Clear,9,4,0,87,1022.0


In [12]:
fl_dask = fl_dask.sort_values('Date_time', ascending=False)
fl_dask.head()

Unnamed: 0,Date_time,flight_id,cod_flight_IATA,cod_flight_ICAO,day,week_day,status,airliner,cod_airliner_IATA,cod_airliner_ICAO,Scheduled_dep,depart_time,dep_situation,dep_mins_of_delay,city,cod_airport_IATA,cod_airport_ICAO,arrival,arr_situation,arr_mins_of_delay,duration,resta,Metar_id,Day,Hour,Condition,Temperature,Wind,Gusts,Relative_hum,Pressure
497985,2023-10-31 23:59:00,1,IB6833,IBE6833,2023-10-31,Tuesday,Landed,Iberia,IB,IBE,23:59,00:34,late,35,Santiago,SCL,SCEL,09:13 UTC-03,early,-6,13h,41,1,2023-10-31,23:30,Fair,8,3,0,93,1017.0
497984,2023-10-31 23:59:00,3,IB6011,IBE6011,2023-10-31,Tuesday,Landed,Iberia,IB,IBE,23:59,00:24,late,25,Montevideo,MVD,SUMU,08:48 UTC-03,early,-6,12h,31,1,2023-10-31,23:30,Fair,8,3,0,93,1017.0
497983,2023-10-31 23:59:00,2,IB6841,IBE6841,2023-10-31,Tuesday,Landed,Iberia,IB,IBE,23:59,00:32,late,33,Buenos Aires,EZE,SAEZ,08:29 UTC-03,on time,0,12h,33,1,2023-10-31,23:30,Fair,8,3,0,93,1017.0
497982,2023-10-31 23:55:00,6,IB6827,IBE6827,2023-10-31,Tuesday,Landed,Iberia,IB,IBE,23:55,00:27,late,32,Sao Paulo,GRU,SBGR,06:49 UTC-03,early,-15,10h,47,1,2023-10-31,23:30,Fair,8,3,0,93,1017.0
497981,2023-10-31 23:55:00,4,IB6589,IBE6589,2023-10-31,Tuesday,Landed,Iberia,IB,IBE,23:55,00:19,late,24,Bogota,BOG,SKBO,03:52 UTC-05,early,-27,10h,51,1,2023-10-31,23:30,Fair,8,3,0,93,1017.0


In [13]:
fl_dask = fl_dask.rename(columns={'day': 'Date'})

In [16]:
fl_dask.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 31 entries, Date_time to Pressure
dtypes: datetime64[ns](1), object(22), float64(1), int64(7)

In [17]:
forma_df = fl_dask.shape[0].compute(), fl_dask.shape[1]
print("Forma del DataFrame:", forma_df)

Forma del DataFrame: (922599, 31)


In [14]:
fl_dask.to_csv("../data/flights/flights_with_metars.csv", index = False)

['C:\\Users\\daarr\\Desktop\\Ironhack\\Proyectos\\Final_project_MAD_Flights\\data\\flights\\flights_with_metars.csv\\0.part',
 'C:\\Users\\daarr\\Desktop\\Ironhack\\Proyectos\\Final_project_MAD_Flights\\data\\flights\\flights_with_metars.csv\\1.part']