In [85]:
import os
import pandas as pd
import glob
from typing import List, Literal
import requests
from multiprocessing.pool import ThreadPool
from dotenv import load_dotenv
import json
import time

In [86]:
# Populate the process environment from the local ``.env`` file, then read
# the key sent to the airport-info API. ``API_KEY`` is None when the variable
# is not defined.
load_dotenv('.env')
API_KEY = os.environ.get('API_KEY')

In [181]:
class IngestionRawAirport:
    """
    A class to extract data from airports.

    Attributes
    ----------
    source : Literal['VRA', 'AIR_CIA']
        The source of the data.
    delimiter : str, optional
        Delimiter character used in CSV files (default is ';').
    enconding : str, optional
        Enconding used in JSON files (default is 'utf-8-sig').
    orient : str, optional
        Indication of expected JSON string format.
    output_format: Literal['csv', 'parquet', 'json']
        Format of the output file.
    folder_path : str
        The path of the folder.
    file_format : Literal['csv', 'json']
        Format of the input file.
    icaos_list: list
        List to be filled with the ICAOs
        
    Methods
    -------
    get_file_paths() :
        Get a list of all files in the folder with the given file_format.
    get_df_from_csv(file_path: str, **kwargs) :
        Load the file from a CSV file to a dataframe.
    get_df_from_json(file_path: str, **kwargs) :
        Load the file from a JSON file to a dataframe.
    move_file(file_path: str) :
        Moves the file when the extraction is complete.
    extract(file_path: str, **kwargs) :
        Extract the data from a file and return a dataframe.
    normalize_columns_vra(df: pd.DataFrame) :
        Changes the columns names of VRA to snake case.
    normalize_columns_air_cia(self, df: pd.DataFrame) :
        Changes the columns names of AIR CIA to snake case.
    transform(df: pd.DataFrame) :
        Apply all transformations to the dataframe.
    add_icaos_to_list(df: pd.DataFrame) :
        Get a list of ICAOs from a dataframe
    request_icao(icao: str) :
        Request from a API the information for the ICAO
    get_icaos_data(icaos: List[str]):
        Get the data for all the ICAOs and create a dataframe.
    save_file(df: pd.DataFrame, file_name: str, source: str | None, **kwargs) :
        Saves the dataframe in the specified file format.
    process_icaos(df: pd.DataFrame, vra_suffix: str) :
        Processes the infomation from the API.
    process_file(file_path: str) :
        Process a file.
    process_data() :
        Process all the data from the folder.
    """

    def __init__(self, 
                 source: Literal['VRA', 'AIR_CIA'],
                 delimiter: str = ';',
                 encoding: str = 'utf-8-sig',
                 orient: str = 'records',
                 output_format: Literal['csv', 'parquet', 'json'] = 'parquet'
                 ) -> None:
        """
        Constructs all the necessary atributes for the airport data object.

        Parameters
        ----------
        source : Literal['VRA', 'AIR_CIA']
            The source of the data.
        file_format : Literal['csv', 'json']
            Format of the input file.
        delimiter : str, optional
            Delimiter character used in CSV files (default is ';').
        enconding : str, optional
            Enconding used in JSON files (default is 'utf-8-sig').
        orient : str, optional
            Indication of expected JSON string format.
        output_format: Literal['csv', 'parquet', 'json']
            Format of the output file.
        icaos: list
            List with all the ICAOs from the VRAs files.
        """

        self.source = source
        self.delimiter = delimiter
        self.encoding = encoding
        self.orient = orient
        self.output_format = output_format
        self.icaos_list = []

        with open('config.json') as f:
            config = json.load(f)

        self.folder_path = config[source]['folder_path']
        self.file_format = config[source]['file_format']


    def get_file_paths(self) -> List[str]:
        """
        Get a list of all files in the folder with the given file_format.

        Returns
        -------
        List[str]
            A list of strings.
        """

        file_paths = glob.glob(os.path.join(self.folder_path, f"*.{self.file_format}"))
        if not file_paths:
            raise ValueError(f"No {self.file_format} files found in the folder {self.folder_path}")
        
        return file_paths
    
    
    def get_df_from_csv(self, file_path: str, **kwargs) -> pd.DataFrame:
        """
        Load the file from a CSV file to a dataframe

        Parameters
        ----------
        file_path : str
            File path to the CSV file.
        kwargs : kwargs
            Keyword arguments.
        Returns
        -------
        pd.DataFrame
            A dataframe.
        """

        if kwargs:
            return pd.read_csv(file_path, **kwargs)
        else:
            return pd.read_csv(file_path, delimiter=self.delimiter)
    
    
    def get_df_from_json(self, file_path: str, **kwargs) -> pd.DataFrame:
        """
        Load the file from a JSON file to a dataframe.

        Parameters
        ----------
        file_path : str
            File path to the JSON file.
        kwargs : kwargs
            Keyword arguments.
            
        Returns
        -------
        pd.DataFrame
            A dataframe.
        """
        
        if kwargs:
            return pd.read_json(file_path, **kwargs)
        else:
            return pd.read_json(file_path, encoding=self.encoding, orient=self.orient)

    def move_file(self, file_path: str) -> None:
        """
        Moves the file when the extraction is complete.

        Parameters
        ----------
        file_path : str
            File path to the source file
        
        Returns
        -------
        None
        """

        new_path = file_path.replace('landing', 'proceeded')
        folder = '/'.join(new_path.split('/')[:-1])
        if not os.path.exists(folder):
            os.makedirs(folder)
        os.rename(file_path, new_path)   

    def extract(self, file_path: str, **kwargs) -> pd.DataFrame:
        """
        Extract the data from a file and return a dataframe.

        Parameters
        ----------
        file_path : str
            File path to the file.
        kwargs : dict
            Keyword arguments.

        Returns
        -------
        pd.DataFrame
            A dataframe.
        """

        df = getattr(self, f'get_df_from_{self.file_format}')(file_path, **kwargs)
        self.move_file(file_path)
        
        return df
    

    def normalize_columns_vra(self, df: pd.DataFrame) -> None:
        """
        Changes the columns names of VRA to snake case.

        Parameters
        ----------
        df : pd.DataFrame
            A dataframe.

        Returns
        -------
        None
        """

        column_names_vra = {
            'ICAOEmpresaAérea': 'ICAO_empresa_area',
            'NúmeroVoo': 'numero_voo',
            'CódigoAutorização': 'codigo_autorizacao',
            'CódigoTipoLinha': 'codigo_tipo_linha',
            'ICAOAeródromoOrigem': 'ICAO_aerodromo_origem',
            'ICAOAeródromoDestino': 'ICAO_aerodromo_destino',
            'PartidaPrevista': 'partida_prevista',
            'PartidaReal': 'partida_real',
            'ChegadaPrevista': 'chegada_prevista',
            'ChegadaReal': 'chegada_real',
            'SituaçãoVoo': 'situacao_voo',
            'CódigoJustificativa': 'codigo_justificativa'}

        df.rename(columns=column_names_vra, inplace=True)


    def normalize_columns_air_cia(self, df: pd.DataFrame) -> None:
        """
        Changes the columns names of AIR CIA to snake case.

        Parameters
        ----------
        df : pd.DataFrame
            A dataframe.

        Returns
        -------
        None
        """

        column_names_air_cia = {
            'Razão Social': 'razao_social',
            'ICAO IATA': 'icao_iata',
            'CNPJ': 'cnpj',
            'Atividades Aéreas': 'atividades_aereas',
            'Endereço Sede': 'endereco_sede',
            'Telefone': 'telefone',
            'E-Mail': 'email',
            'Decisão Operacional': 'decisao_operacional',
            'Data Decisão Operacional': 'data_decisao_operacional',
            'Validade Operacional': 'validade_operacional'}

        df.rename(columns=column_names_air_cia, inplace=True)


    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Apply all transformations to the dataframe.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe to be transformed.

        Returns
        -------
        pd.DataFrame
            Dataframe with all transformations applied.
        """

        getattr(self, f'normalize_columns_{self.source.lower()}')(df)

        if self.source == 'VRA':
            df['codigo_autorizacao'] = df['codigo_autorizacao'].astype(str)

        df['timestamp'] = int(time.time())

        return df
    

    def add_to_icaos_list(self, df: pd.DataFrame) -> None:
        """
        Add the ICAO from the dataframe to the list.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe with a column with the ICAOs.
        
        Returns
        -------
        None
        """
        
        icaos_dest = df['ICAO_aerodromo_destino'].unique()
        icaos_orig = df['ICAO_aerodromo_origem'].unique()
        icaos = set([*icaos_dest] + [*icaos_orig])
        self.icaos_list.extend(icaos)
    

    def request_icao(self, icao: str) -> dict:
        """
        Request from a API the information for the ICAO

        Parameters
        ----------
        icao : str
            Code for an airport, should be a string of 4 letters.

        Returns
        -------
        dict
            Info about the airport.
        """

        url = "https://airport-info.p.rapidapi.com/airport"

        querystring = {"icao": icao}

        headers = {
            "X-RapidAPI-Key": API_KEY,
            "X-RapidAPI-Host": "airport-info.p.rapidapi.com"
        }

        response = requests.get(url, headers=headers, params=querystring)
        payload = response.json()

        if not payload.get('error') and not payload.get('message'):
            return payload    
    

    def get_icaos_data(self) -> List[dict]:
        """
        Get the data for all the ICAOs.

        Returns
        -------
        List[dict]
            A list of dicts with ICAOs info.
        """

        icaos = set(self.icaos_list)

        with ThreadPool(10) as pool:
            data = pool.map(self.request_icao, icaos)

        return data
    
    
    def save_file(self, df: pd.DataFrame, file_name: str, 
                  source: str | None = None,  **kwargs) -> None:
        """
        Saves the dataframe in the specified file format.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe to be saved as a file.
        file_name : str
            The name of the file.
        source : str |  None (optional)
            source of the data
        kwargs : kwargs
            Keyword arguments

        Returns
        -------
        None
        """

        if source == 'icaos':
            folder_path = f"{self.folder_path.replace('VRA', 'icaos').replace('landing', 'staged')}/{self.output_format}"
        else:
            folder_path = f"{self.folder_path.replace('landing', 'staged')}/{self.output_format}"
            
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        
        filepath = f'{folder_path}/{file_name}.{self.output_format}'

        if self.output_format == 'json' and not kwargs:
            kwargs = {'orient': 'records', 'indent': 4}
            
        getattr(df, f'to_{self.output_format}')(filepath, index=False, **kwargs)


    def process_icaos(self) -> None:
        """
        Processes the infomation from the API.

        Returns
        -------
        None
        """

        raw_data = self.get_icaos_data()
        data = [i for i in raw_data if pd.notna(i)]
        df_icaos = pd.DataFrame(data)
        timestamp = int(time.time())
        df_icaos['timestamp'] = timestamp
        self.save_file(df_icaos, file_name=f'icaos_info_{timestamp}', source='icaos')


    def process_file(self, file_path: str) -> None:
        """
        Process a file.

        Parameters
        ----------
        file_path : str 
            The path to the folder.
        
        Returns
        -------
        None
        """
        
        df = self.extract(file_path)
        df = self.transform(df)

        file_name = file_path.split('/')[-1].split('.')[0]

        if self.source == 'VRA':
            self.add_to_icaos_list(df)

        self.save_file(df, file_name)


    def process_data(self) -> None:
        """
        Process all the data from the folder.
        """
        
        file_paths = self.get_file_paths()
        
        with ThreadPool(10) as pool:
            pool.map(self.process_file, file_paths)

        self.process_icaos()        


In [88]:
# Run the ingestion once per configured source, in this order.
sources = ['VRA', 'AIR_CIA']

for source in sources:
    # A fresh instance per source: each one reads its own folder and file
    # format from config.json and keeps its own ICAO list.
    ing = IngestionRawAirport(source=source)
    ing.process_data()