### Package Python

In [1]:
import pandas as pd
import numpy as np
import xarray as xr
import os as os
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
from pandas.plotting import autocorrelation_plot
from sklearn import linear_model

### Settings Jupyter

In [2]:
# set the max columns to none
pd.set_option('display.max_columns', None)

### Functions

In [3]:
def plotar(titulo: str, labelx: str, labely: str, x: str, y: str, dataset: dict, fontt: int, fontlx: int, fontly: int) -> dict: 
    """Função para plotagem de gráfico"""
    sns.set_palette('Accent')
    sns.set_style('darkgrid')
    ax = sns.lineplot(x = x, y = y, data = dataset)
    ax.figure.set_size_inches(12, 6)
    ax.set_title(titulo, loc='left', fontsize=fontt)
    ax.set_xlabel(labelx, fontsize=fontlx)
    ax.set_ylabel(labely, fontsize=fontly)
    ax = ax
    return ax

In [4]:
def plot_comparacao(x, y1,y2,y3, y4=None, params=False, dataset=None, titulo=None):
    plt.figure(figsize=(16,12))
    if params:
        ax = plt.subplot(4,1,1)
        ax.set_title(titulo, fontsize=18, loc='left')
        sns.lineplot(x = x, y = y1, data = dataset) 
        ax = plt.subplot(4,1,2)
        sns.lineplot(x = x, y = y2, data = dataset) 
        ax = plt.subplot(4,1,3)
        sns.lineplot(x = x, y = y3, data = dataset) 
        ax = plt.subplot(4,1,4)
        sns.lineplot(x = x, y = y4, data = dataset)     
    else:
        ax = plt.subplot(3,1,1)
        ax.set_title(titulo, fontsize=18, loc='left')
        sns.lineplot(x = x, y = y1, data = dataset) 
        ax = plt.subplot(3,1,2)
        sns.lineplot(x = x, y = y2, data = dataset) 
        ax = plt.subplot(3,1,3)
        sns.lineplot(x = x, y = y3, data = dataset)  
    ax = ax
    return ax

### Objective of this notebook.

This analyze aims to explore the ANAC database of flights from January 2022. It was used basic database for this study.

### Reading of bases

In [5]:
_path = os.getcwd()
_path

'c:\\git\\Gol_Cases'

In [6]:
_path1 = os.path.join(_path, 'data', 'basica2022-01.txt')
_path1

'c:\\git\\Gol_Cases\\data\\basica2022-01.txt'

In [7]:
_df = pd.read_csv(_path1, sep=';', encoding='latin1')
_df.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(69597, 110)

* Cleaning not necessary data for analyze.

In [8]:
_df = _df.drop(columns=[
'sg_empresa_icao',	
'sg_empresa_iata',
'nr_singular',	
'id_di',	
'cd_di',
'ds_di',
'ds_grupo_di', 
'nr_ano_referencia',
'nr_semestre_referencia',
'nr_trimestre_referencia',
'nr_ano_mes_referencia',
'id_tipo_linha',
'cd_tipo_linha',
'ds_tipo_linha',
'ds_natureza_tipo_linha',
'ds_servico_tipo_linha',
'nr_ano_partida_real',
'nr_semestre_partida_real',
'nm_semestre_partida_real',
'nr_trimestre_partida_real',
'nm_trimestre_partida_real',
'nr_mes_partida_real',
'nm_mes_partida_real',
'nr_semana_partida_real',
'nm_dia_semana_partida_real',
'nr_ano_mes_partida_real',
'sg_icao_origem',
'sg_uf_origem',
'nm_regiao_origem',	
'nr_ano_chegada_real',	
'nr_semestre_chegada_real',
'nm_semestre_chegada_real',
'nr_trimestre_chegada_real',
'nm_trimestre_chegada_real',
'nr_mes_chegada_real',
'nm_mes_chegada_real',
'nr_ano_mes_chegada_real',
'ds_matricula',
'sg_uf_destino',
'nm_regiao_destino',
'nr_escala_destino',
'lt_combustivel',
'kg_bagagem_livre',
'kg_bagagem_excesso',
'nr_decolagem',
'id_arquivo',
'nm_arquivo',
'nr_linha',	
'dt_sistema'])
_df.shape

(69597, 61)

In [9]:
_df = _df.rename(columns={'nr_assentos_ofertados':'pax_ofertados', 'nr_passag_pagos':'pax', 'nr_rpk':'rpk'})

In [10]:
_df.dtypes

id_basica                int64
id_empresa               int64
nm_empresa              object
nm_pais                 object
ds_tipo_empresa         object
                         ...  
nr_bagagem_gratis_km     int64
nr_ask                   int64
rpk                      int64
nr_atk                   int64
nr_rtk                   int64
Length: 61, dtype: object

Verifying nan values in the columns.

In [11]:
_df.describe()

Unnamed: 0,id_basica,id_empresa,nr_voo,nr_mes_referencia,nr_semana_referencia,nr_dia_referencia,nr_dia_partida_real,id_aerodromo_origem,nr_etapa,nr_semana_chegada_real,nr_dia_chegada_real,id_equipamento,id_aerodromo_destino,pax_ofertados,kg_payload,km_distancia,pax,nr_passag_gratis,kg_carga_paga,kg_carga_gratis,kg_correio,kg_peso,nr_pax_gratis_km,nr_carga_paga_km,nr_carga_gratis_km,nr_correio_km,nr_bagagem_paga_km,nr_bagagem_gratis_km,nr_ask,rpk,nr_atk,nr_rtk
count,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69596.0,69596.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0,69597.0
mean,22641910.0,1000946.0,3490.833944,1.0,3.571246,15.898473,15.901504,621.361898,1.055591,3.588022,15.910225,187.901835,623.448583,154.157967,19613.930442,1494.948403,123.306163,1.892467,1667.116973,6.296378,97.127965,12290.853686,3125.178312,8564423.0,17100.49,161172.8,12019.16,1530424.0,281952.8,224111.7,46148.29,28228.93175
std,20536.45,394.5483,2255.404553,0.0,1.300965,8.905715,8.904413,1884.908792,0.292194,1.298763,8.89859,188.504497,1892.198097,66.30414,13506.770209,1795.244317,57.833278,2.585987,7688.578916,71.073279,963.443954,8684.315281,10002.984847,49408510.0,372065.6,1777468.0,37337.98,4861559.0,549642.5,422182.2,112489.2,70439.806904
min,22604550.0,1000002.0,2.0,1.0,1.0,1.0,1.0,5.0,0.0,1.0,1.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22624000.0,1000854.0,1802.0,1.0,2.0,8.0,8.0,218.0,1.0,2.0,8.0,16.0,218.0,118.0,14998.0,496.0,95.0,0.0,0.0,0.0,0.0,8654.0,0.0,0.0,0.0,0.0,0.0,206584.0,59312.0,45018.0,7160.0,4264.0
50%,22642010.0,1000909.0,3178.0,1.0,4.0,16.0,16.0,427.0,1.0,4.0,16.0,101.0,427.0,170.0,18165.0,874.0,130.0,1.0,53.0,0.0,0.0,11432.0,757.0,37823.0,0.0,0.0,0.0,637704.0,144210.0,108585.0,16283.0,9865.0
75%,22659810.0,1001093.0,4432.0,1.0,5.0,24.0,24.0,633.0,1.0,5.0,24.0,272.0,633.0,186.0,19800.0,1800.0,160.0,3.0,445.0,0.0,0.0,14188.0,2495.0,439236.0,0.0,0.0,7686.0,1630210.0,297792.0,249018.0,34735.0,23021.0
max,22677710.0,1007484.0,9998.0,1.0,6.0,31.0,31.0,21967.0,7.0,6.0,31.0,578.0,21967.0,516.0,132631.0,12217.0,423.0,39.0,112046.0,7790.0,29061.0,112046.0,264908.0,980930000.0,59430200.0,173552000.0,1328650.0,96776600.0,6303970.0,5167790.0,1230060.0,980930.0


In [12]:
_counts_cols = _df.count().reset_index()
_counts_cols.columns = ['columns', 'values']
_counts_cols

Unnamed: 0,columns,values
0,id_basica,69597
1,id_empresa,69597
2,nm_empresa,69597
3,nm_pais,69597
4,ds_tipo_empresa,69597
...,...,...
56,nr_bagagem_gratis_km,69597
57,nr_ask,69597
58,rpk,69597
59,nr_atk,69597


In [13]:
_counts_cols[_counts_cols['values']!=69597]

Unnamed: 0,columns,values
19,sg_iata_origem,69588
25,hr_chegada_real,69595
27,nr_semana_chegada_real,69596
28,nm_dia_semana_chegada_real,69596
29,nr_dia_chegada_real,69596
35,sg_iata_destino,69588
48,nr_horas_voadas,69595
50,nr_velocidade_media,69595


In [14]:
_df =_df.fillna(0)

Removing white spaces.

In [15]:
_cols = _df.select_dtypes(include='object')
_cols = _cols.columns.to_list()
for i in _cols:
    _df[i] = _df[i].str.strip()


In [16]:
_df.loc[19].to_list()

[22675653,
 1000358,
 'AEROVIAS DEL CONTINENTE AMERICANO S.A. AVIANCA',
 'COLÔMBIA',
 'ESTRANGEIRA REGULAR',
 86,
 '2022-01-11',
 '1º SEMESTRE',
 '1º TRIMESTRE',
 1,
 'JANEIRO',
 3,
 'TERÇA-FEIRA',
 11,
 'INTERNACIONAL',
 '07:28:00',
 '2022-01-11',
 11,
 301,
 'GRU',
 'GUARULHOS - GOVERNADOR ANDRÉ FRANCO MONTORO',
 'GUARULHOS',
 'BRASIL',
 'AMÉRICA DO SUL',
 1,
 '13:19:00',
 '2022-01-11',
 3.0,
 'TERÇA-FEIRA',
 11.0,
 15,
 'A320',
 'AIRBUS A320-100/200',
 111,
 'SKBO',
 'BOG',
 'EL DORADO INTERNATIONAL AIRPORT',
 'BOGOTÁ',
 'COLÔMBIA',
 'AMÉRICA DO SUL',
 150,
 17600,
 4336,
 150,
 0,
 57,
 0,
 0,
 '5,85',
 13557,
 '741,17',
 0,
 247152,
 0,
 0,
 0,
 0,
 650400,
 650400,
 76313,
 58783]