In [2]:
import os
import pandas as pd
from typing import List, Literal
import json

In [104]:
class IngestionSilverAirport:
    """
    Clean one bronze-layer airport dataset and write it to the silver layer.

    The per-source settings (input folder, input format, primary keys and
    ordering column) are read from a JSON configuration file keyed by source.

    Attributes
    ----------
    source : Literal['VRA', 'AIR_CIA', 'icaos']
        The source of the data.
    output_format : Literal['csv', 'parquet', 'json']
        Format of the output file.
    folder_path : str
        The path of the input (bronze) folder.
    file_format : Literal['csv', 'json', 'parquet']
        Format of the input file.
    pks : List[str]
        The primary keys of the table (loaded from the configuration; not
        used by any method of this class).
    order_by : str
        The ordering column (loaded from the configuration; not used by any
        method of this class).

    Methods
    -------
    get_df_from_csv(file_path: str, **kwargs) -> pd.DataFrame | None :
        Load a CSV file into a dataframe.
    get_df_from_json(file_path: str, **kwargs) -> pd.DataFrame | None :
        Load a JSON file into a dataframe.
    get_df_from_parquet(file_path: str, **kwargs) -> pd.DataFrame | None :
        Load a parquet file into a dataframe.
    extract(file_path: str, **kwargs) -> pd.DataFrame | None :
        Load a file using the reader matching the configured input format.
    create_columns_air_cia(df: pd.DataFrame) -> pd.DataFrame :
        Split the 'icao_iata' column into 'icao' and 'iata'.
    strip_columns(df: pd.DataFrame) -> pd.DataFrame :
        Remove leading and trailing whitespace from every string value.
    str_to_none(df: pd.DataFrame) -> pd.DataFrame :
        Transform strings that represent null values into None.
    transform(df: pd.DataFrame) -> pd.DataFrame :
        Apply all transformations to the dataframe.
    save_file(df: pd.DataFrame, **kwargs) -> None :
        Save the dataframe in the configured output format.
    process_data() -> None :
        Extract, transform and save the data of the configured source.
    """

    # Strings that str_to_none treats as missing values.
    _NULL_STRINGS = ('', 'N/A')

    def __init__(self,
                 source: Literal['VRA', 'AIR_CIA', 'icaos'],
                 output_format: Literal['csv', 'parquet', 'json'] = 'parquet',
                 config_path: str = 'config.json'
                 ) -> None:
        """
        Construct all the necessary attributes for the airport data object.

        Parameters
        ----------
        source : Literal['VRA', 'AIR_CIA', 'icaos']
            The source of the data.
        output_format : Literal['csv', 'parquet', 'json']
            Format of the output file.
        config_path : str
            Path of the JSON configuration file. Defaults to 'config.json'
            in the current working directory.

        Raises
        ------
        KeyError
            If the source, or one of its required settings, is missing from
            the configuration file.
        """

        self.source = source
        self.output_format = output_format

        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(config_path, encoding='utf-8') as f:
            config = json.load(f)

        try:
            source_config = config[source]
        except KeyError:
            raise KeyError(
                f"Source {source!r} not found in {config_path}"
            ) from None

        self.folder_path = source_config['folder_path']
        self.file_format = source_config['file_format']
        self.pks = source_config['pks']
        self.order_by = source_config['order_by']

    @staticmethod
    def _read_if_exists(reader, file_path: str, **kwargs) -> 'pd.DataFrame | None':
        """Call reader(file_path, **kwargs), or report a missing file and return None."""

        if not os.path.exists(file_path):
            print(f"File {file_path} not found")
            return None

        return reader(file_path, **kwargs)

    def get_df_from_csv(self, file_path: str, **kwargs) -> 'pd.DataFrame | None':
        """
        Load a CSV file into a dataframe.

        Parameters
        ----------
        file_path : str
            The path of the file.
        kwargs : kwargs
            Keyword arguments forwarded to pd.read_csv.

        Returns
        -------
        pd.DataFrame | None
            The loaded dataframe, or None (after printing a message) when
            the file does not exist.
        """

        return self._read_if_exists(pd.read_csv, file_path, **kwargs)

    def get_df_from_json(self, file_path: str, **kwargs) -> 'pd.DataFrame | None':
        """
        Load a JSON file into a dataframe.

        Parameters
        ----------
        file_path : str
            The path of the file.
        kwargs : kwargs
            Keyword arguments forwarded to pd.read_json.

        Returns
        -------
        pd.DataFrame | None
            The loaded dataframe, or None (after printing a message) when
            the file does not exist.
        """

        return self._read_if_exists(pd.read_json, file_path, **kwargs)

    def get_df_from_parquet(self, file_path: str, **kwargs) -> 'pd.DataFrame | None':
        """
        Load a parquet file into a dataframe.

        Parameters
        ----------
        file_path : str
            The path of the file.
        kwargs : kwargs
            Keyword arguments forwarded to pd.read_parquet.

        Returns
        -------
        pd.DataFrame | None
            The loaded dataframe, or None (after printing a message) when
            the file does not exist.
        """

        return self._read_if_exists(pd.read_parquet, file_path, **kwargs)

    def extract(self, file_path: str, **kwargs) -> 'pd.DataFrame | None':
        """
        Load a file using the reader matching the configured input format.

        Parameters
        ----------
        file_path : str
            The path of the file.
        kwargs : dict
            Keyword arguments forwarded to the reader.

        Returns
        -------
        pd.DataFrame | None
            The loaded dataframe, or None when the file does not exist.
        """

        # Dispatch by name: 'csv' -> get_df_from_csv, 'json' -> get_df_from_json, ...
        reader = getattr(self, f'get_df_from_{self.file_format}')

        return reader(file_path, **kwargs)

    def create_columns_air_cia(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Split the 'icao_iata' column into separate 'icao' and 'iata' columns.

        The value is split on the first space only. Rows without a space get
        a missing 'iata'. The dataframe is modified in place and also
        returned.

        Parameters
        ----------
        df : pd.DataFrame
            original dataframe, containing an 'icao_iata' column

        Returns
        -------
        df : pd.DataFrame
            the same dataframe, with 'icao' and 'iata' added and 'icao_iata'
            removed
        """

        # n=1 caps the result at two parts; reindex guarantees both parts
        # exist even when no value contains a space. Without these, the
        # number of expanded columns depends on the data and the assignment
        # below fails whenever it is not exactly two.
        parts = (
            df['icao_iata']
            .str.split(' ', n=1, expand=True)
            .reindex(columns=[0, 1])
        )
        df['icao'] = parts[0]
        df['iata'] = parts[1]
        df.drop(columns=['icao_iata'], inplace=True)

        return df

    def strip_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Remove leading and trailing whitespace from every string value.

        Non-string values (numbers, None, NaN, ...) are left untouched, also
        when they share an object column with strings.

        Parameters
        ----------
        df : pd.DataFrame
            original dataframe

        Returns
        -------
        df : pd.DataFrame
            modified dataframe
        """

        def strip_value(value):
            return value.strip() if isinstance(value, str) else value

        def strip_column(column: pd.Series) -> pd.Series:
            if column.dtype == object:
                # Element-wise on purpose: the .str accessor would replace
                # every non-string value of a mixed column with NaN.
                return column.map(strip_value)
            if pd.api.types.is_string_dtype(column.dtype):
                # Dedicated string dtypes only hold strings and missing
                # values, so the vectorised accessor is safe here.
                return column.str.strip()
            return column

        return df.apply(strip_column)

    def str_to_none(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Transform strings that represent null values ('' and 'N/A') into None.

        Parameters
        ----------
        df : pd.DataFrame
            original dataframe

        Returns
        -------
        df : pd.DataFrame
            modified dataframe
        """

        null_strings = self._NULL_STRINGS

        def to_none(value):
            # The isinstance guard keeps values such as pd.NA, whose
            # comparison result has no truth value, out of the `in` test.
            if isinstance(value, str) and value in null_strings:
                return None
            return value

        # Column-wise Series.map works on every pandas version, unlike
        # DataFrame.map which only exists from pandas 2.1.
        return df.apply(lambda column: column.map(to_none))

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Apply all transformations to the dataframe.

        Parameters
        ----------
        df : pd.DataFrame
            A dataframe to transform.

        Returns
        -------
        pd.DataFrame
            A dataframe with all transformations applied.
        """

        if self.source == 'AIR_CIA':
            df = self.create_columns_air_cia(df)

        # Strip first so that whitespace-only values become '' and are then
        # turned into None.
        df = self.strip_columns(df)
        df = self.str_to_none(df)

        return df

    def save_file(self, df: pd.DataFrame, **kwargs) -> None:
        """
        Save the dataframe in the configured output format.

        The file is written to the input folder path with 'bronze' replaced
        by 'silver', as 'silver_<source>.<output_format>'. The folder is
        created when missing.

        Parameters
        ----------
        df : pd.DataFrame
            A dataframe to be saved as a file.
        kwargs : kwargs
            Keyword arguments forwarded to the pandas writer. They override
            the defaults: for JSON, orient='records' and indent=4; the index
            is not written unless the caller asks for it.

        Returns
        -------
        None
        """

        output_folder = self.folder_path.replace('bronze', 'silver')
        os.makedirs(output_folder, exist_ok=True)

        filepath = f'{output_folder}/silver_{self.source}.{self.output_format}'

        if self.output_format == 'json':
            # Merge instead of replace, so passing a single option does not
            # silently fall back to pandas' own orient.
            options = {'orient': 'records', 'indent': 4, **kwargs}
            # Only 'split' and 'table' accept index=False on every pandas
            # version; 'records' and 'values' never write the index anyway,
            # and 'index'/'columns' reject index=False.
            if options['orient'] in ('split', 'table'):
                options.setdefault('index', False)
        else:
            options = {'index': False, **kwargs}

        getattr(df, f'to_{self.output_format}')(filepath, **options)

    def process_data(self) -> None:
        """
        Extract, transform and save the data of the configured source.

        Reads '<folder_path>/bronze_<source>.<file_format>', applies the
        transformations and writes the result with save_file.

        Returns
        -------
        None

        Raises
        ------
        FileNotFoundError
            If the bronze input file does not exist.
        """

        file_path = f'{self.folder_path}/bronze_{self.source}.{self.file_format}'
        df = self.extract(file_path)
        if df is None:
            # Fail with the real cause instead of an AttributeError or
            # TypeError on None further down the pipeline.
            raise FileNotFoundError(f"File {file_path} not found")

        df = self.transform(df)
        self.save_file(df)

In [105]:
# Run the bronze -> silver ingestion once for every known source,
# using the default output format.
sources = ['VRA', 'AIR_CIA', 'icaos']

for source in sources:
    ing = IngestionSilverAirport(source=source)
    ing.process_data()