In [2]:
# importamos las librerías que necesitamos

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar linealidad de las relaciones entre las variables
# y la distribución de las variables
# ------------------------------------------------------------------------------
import scipy.stats as stats
from scipy.stats import chi2_contingency, ttest_ind

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when a DataFrame is displayed (no truncation)

# Warning handling
# -----------------------------------------------------------------------
# NOTE(review): "ignore" with no category/module filter silences every warning
# (including deprecation warnings) for the rest of the session — confirm this is intended.
import warnings
warnings.filterwarnings("ignore")


In [3]:
# Load the European flights dataset from the local CSV file (path is relative to the notebook)
df = pd.read_csv("archivos/european_flights.csv")

# Preview the first rows
display(df.head())

Unnamed: 0,YEAR,MONTH_NUM,MONTH_MON,FLT_DATE,APT_ICAO,APT_NAME,STATE_NAME,FLT_DEP_1,FLT_ARR_1,FLT_TOT_1,FLT_DEP_IFR_2,FLT_ARR_IFR_2,FLT_TOT_IFR_2,Pivot Label
0,2016,1,JAN,2016-01-01T00:00:00Z,EBAW,Antwerp,Belgium,4,3,7,,,,Antwerp (EBAW)
1,2016,1,JAN,2016-01-01T00:00:00Z,EBBR,Brussels,Belgium,174,171,345,174.0,161.0,335.0,Brussels (EBBR)
2,2016,1,JAN,2016-01-01T00:00:00Z,EBCI,Charleroi,Belgium,45,47,92,45.0,45.0,90.0,Charleroi (EBCI)
3,2016,1,JAN,2016-01-01T00:00:00Z,EBLG,Liège,Belgium,6,7,13,,,,Liège (EBLG)
4,2016,1,JAN,2016-01-01T00:00:00Z,EBOS,Ostend-Bruges,Belgium,7,7,14,,,,Ostend-Bruges (EBOS)


In [4]:
def _mostrar_descripcion(df, **seleccion):
    """Display ``df.describe(**seleccion).T``, or print a notice if nothing can be described."""
    try:
        resumen = df.describe(**seleccion).T
    except ValueError:
        # pandas raises ValueError when the include/exclude filter leaves no
        # columns to describe (e.g. a purely numeric frame with include="O").
        print("No hay columnas de este tipo.")
        return
    display(resumen)


def exploracion(df):
    """Print and display a quick exploratory summary of ``df``.

    The summary covers: shape, number and percentage of duplicated rows, which
    columns do / do not contain nulls, a preview (first five rows) of the
    per-column table, and ``describe()`` for object and non-object columns.

    Relies on the notebook-provided ``display`` function for rich output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to explore. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``"% nulos"``,
        ``"% no_nulos"`` (percentages rounded to 2 decimals, as strings),
        ``"tipo_dato"`` and ``"num_valores_unicos"``.
    """
    n_filas, n_columnas = df.shape

    n_nulos = df.isna().sum()
    n_no_nulos = df.notna().sum()
    if n_filas:
        pct_nulos = (n_nulos / n_filas * 100).round(2)
        pct_no_nulos = (n_no_nulos / n_filas * 100).round(2)
    else:
        # A frame with no rows has nothing to divide by; report 0.0% instead of NaN.
        pct_nulos = n_nulos.astype(float)
        pct_no_nulos = n_no_nulos.astype(float)

    df_info = pd.DataFrame({
        "% nulos": pct_nulos.astype(str) + "%",
        "% no_nulos": pct_no_nulos.astype(str) + "%",
        "tipo_dato": df.dtypes,
        "num_valores_unicos": df.nunique(),
    })

    # Classify columns from the raw null counts, not from the rounded percentage
    # string: a column with very few nulls rounds to "0.0%" but still has nulls.
    columnas_con_nulos = list(n_nulos[n_nulos > 0].index)
    columnas_sin_nulos = list(n_nulos[n_nulos == 0].index)

    # Compute the duplicate count once and express it as a real percentage (x100).
    n_duplicados = int(df.duplicated().sum())
    pct_duplicados = round(n_duplicados / n_filas * 100, 2) if n_filas else 0.0

    print(f"""El DataFrame tiene {n_filas} filas y {n_columnas} columnas.
Tiene {n_duplicados} datos duplicados, lo que supone un porcentaje de {pct_duplicados}% de los datos.
Hay {len(columnas_con_nulos)} columnas con datos nulos, y son:
{columnas_con_nulos}
y sin nulos hay {len(columnas_sin_nulos)} columnas y son:
{columnas_sin_nulos}
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:""")
    # Only a five-row preview is shown here; the complete table is the return value.
    display(df_info.head())
    print("Principales estadísticos de las columnas categóricas:")
    _mostrar_descripcion(df, include="O")
    print("Principales estadísticos de las columnas numéricas:")
    _mostrar_descripcion(df, exclude="O")
    return df_info

# Explore the flights data; as the last expression of the cell, the returned summary table is rendered as well
exploracion(df)

El DataFrame tiene 688099 filas y 14 columnas.
Tiene 0 datos duplicados, lo que supone un porcentaje de 0.0% de los datos.
Hay 3 columnas con datos nulos, y son:
['FLT_DEP_IFR_2', 'FLT_ARR_IFR_2', 'FLT_TOT_IFR_2']
y sin nulos hay 11 columnas y son:
['YEAR', 'MONTH_NUM', 'MONTH_MON', 'FLT_DATE', 'APT_ICAO', 'APT_NAME', 'STATE_NAME', 'FLT_DEP_1', 'FLT_ARR_1', 'FLT_TOT_1', 'Pivot Label']
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
YEAR,0.0%,100.0%,int64,7
MONTH_NUM,0.0%,100.0%,int64,12
MONTH_MON,0.0%,100.0%,object,12
FLT_DATE,0.0%,100.0%,object,2343
APT_ICAO,0.0%,100.0%,object,332


Principales estadísticos de las columnas categóricas:


Unnamed: 0,count,unique,top,freq
MONTH_MON,688099,12,JAN,63638
FLT_DATE,688099,2343,2021-10-25T00:00:00Z,323
APT_ICAO,688099,332,LOWW,2343
APT_NAME,688099,333,Vienna,2343
STATE_NAME,688099,42,France,138694
Pivot Label,688099,333,Vienna (LOWW),2343


Principales estadísticos de las columnas numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
YEAR,688099.0,2018.823684,1.841191,2016.0,2017.0,2019.0,2020.0,2022.0
MONTH_NUM,688099.0,6.301461,3.460197,1.0,3.0,6.0,9.0,12.0
FLT_DEP_1,688099.0,63.238884,110.738988,0.0,5.0,17.0,71.0,847.0
FLT_ARR_1,688099.0,63.278576,110.707515,0.0,5.0,17.0,71.0,813.0
FLT_TOT_1,688099.0,126.51746,221.415893,0.0,10.0,35.0,141.0,1628.0
FLT_DEP_IFR_2,208314.0,143.703097,153.594306,0.0,38.0,91.0,195.0,1039.0
FLT_ARR_IFR_2,208314.0,143.61053,153.359483,0.0,38.0,91.0,195.0,817.0
FLT_TOT_IFR_2,208314.0,287.313628,306.855582,0.0,76.0,182.0,390.0,1624.0


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
YEAR,0.0%,100.0%,int64,7
MONTH_NUM,0.0%,100.0%,int64,12
MONTH_MON,0.0%,100.0%,object,12
FLT_DATE,0.0%,100.0%,object,2343
APT_ICAO,0.0%,100.0%,object,332
APT_NAME,0.0%,100.0%,object,333
STATE_NAME,0.0%,100.0%,object,42
FLT_DEP_1,0.0%,100.0%,int64,808
FLT_ARR_1,0.0%,100.0%,int64,806
FLT_TOT_1,0.0%,100.0%,int64,1599


In [5]:
# General information about the flights DataFrame.
# DataFrame.info() writes its report to stdout and returns None, so it must be
# called on its own: interpolating it into an f-string printed "Informacion df: None".
print('Informacion df:')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 688099 entries, 0 to 688098
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   YEAR           688099 non-null  int64  
 1   MONTH_NUM      688099 non-null  int64  
 2   MONTH_MON      688099 non-null  object 
 3   FLT_DATE       688099 non-null  object 
 4   APT_ICAO       688099 non-null  object 
 5   APT_NAME       688099 non-null  object 
 6   STATE_NAME     688099 non-null  object 
 7   FLT_DEP_1      688099 non-null  int64  
 8   FLT_ARR_1      688099 non-null  int64  
 9   FLT_TOT_1      688099 non-null  int64  
 10  FLT_DEP_IFR_2  208314 non-null  float64
 11  FLT_ARR_IFR_2  208314 non-null  float64
 12  FLT_TOT_IFR_2  208314 non-null  float64
 13  Pivot Label    688099 non-null  object 
dtypes: float64(3), int64(5), object(6)
memory usage: 73.5+ MB
Informacion df: None


Conclusión sobre el dataset de vuelos (CSV): podemos utilizarlo para comparar los vuelos entre los meses del año desde 2016 hasta 2022, así como las diferencias entre países y la media de la UE.

In [6]:
# Load the busiest-airports (2022) dataset from the local CSV file (path is relative to the notebook)
df_aero = pd.read_csv("archivos/modified_busiest_airports_2022.csv")

# Preview the first rows
display(df_aero.head())

Unnamed: 0,Rank,lat,long,Airport,Location,Country,Code (IATA/ICAO),Total passengers
0,1.0,33.6324,-84.4277,Hartsfield–Jackson Atlanta International Airport,"Atlanta, Georgia",United States,ATL/KATL,93699630
1,2.0,32.8998,-97.0403,Dallas Fort Worth International Airport,"Dallas–Fort Worth, Texas",United States,DFW/KDFW,73362946
2,3.0,39.8561,-104.6737,Denver International Airport,"Denver, Colorado",United States,DEN/KDEN,69286461
3,4.0,41.9742,-87.9073,O'Hare International Airport,"Chicago, Illinois",United States,ORD/KORD,68340619
4,5.0,25.2532,55.3657,Dubai International Airport,"Garhoud, Dubai",United Arab Emirates,DXB/OMDB,66069981


In [7]:
# Explore the airports data; as the last expression of the cell, the returned summary table is rendered as well
exploracion(df_aero)

El DataFrame tiene 50 filas y 8 columnas.
Tiene 0 datos duplicados, lo que supone un porcentaje de 0.0% de los datos.
Hay 0 columnas con datos nulos, y son:
[]
y sin nulos hay 8 columnas y son:
['Rank', 'lat', 'long', 'Airport', 'Location', 'Country', 'Code (IATA/ICAO)', 'Total passengers']
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
Rank,0.0%,100.0%,float64,50
lat,0.0%,100.0%,float64,50
long,0.0%,100.0%,float64,50
Airport,0.0%,100.0%,object,50
Location,0.0%,100.0%,object,50


Principales estadísticos de las columnas categóricas:


Unnamed: 0,count,unique,top,freq
Airport,50,50,Hartsfield–Jackson Atlanta International Airport,1
Location,50,50,"Atlanta, Georgia",1
Country,50,24,United States,19
Code (IATA/ICAO),50,50,ATL/KATL,1


Principales estadísticos de las columnas numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rank,50.0,25.5,14.57738,1.0,13.25,25.5,37.75,50.0
lat,50.0,30.80229,18.18602,-33.95,25.25493,35.3817,41.67637,52.30806
long,50.0,-15.4512,81.81594,-122.3816,-83.64846,-2.012985,37.07592,151.1817
Total passengers,50.0,44428280.0,14712120.0,28754310.0,31657840.0,41295570.0,52025240.0,93699630.0


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
Rank,0.0%,100.0%,float64,50
lat,0.0%,100.0%,float64,50
long,0.0%,100.0%,float64,50
Airport,0.0%,100.0%,object,50
Location,0.0%,100.0%,object,50
Country,0.0%,100.0%,object,24
Code (IATA/ICAO),0.0%,100.0%,object,50
Total passengers,0.0%,100.0%,int64,50


Conclusión sobre el dataset de aeropuertos: no lo utilizaría, ya que no podemos separar por meses. Tenemos los datos totales del año 2022, pero no hay manera de sacar ninguna conclusión sobre las fechas navideñas ni de hacer comparativas.