In [2]:
import pandas as pd

class DataTransformer:
    """
    Transforms data into a unnested dataframe.
    """

    def __init__(self):
        pass

    def transform_data(self, data):
        """
        Transforms input data (json or dataframe) into an unnested dataframe.

        Args:
        - data: Input json data or dataframe.

        Returns:
        - data: transformed data in a table format (dataframe)
        """
        if not isinstance(data, pd.DataFrame):
            df = pd.json_normalize(data)
        else:
            df = data

        self._explode_lists(df)
        self._expand_dicts(df)
        self._drop_empty_columns(df)

        return df

    def _explode_lists(self, df):
        """
        Explodes columns containing lists in the dataframe.

        Args:
        - df: Input dataframe.
        """
        for col in df.columns:
            if df[col].apply(lambda x: isinstance(x, list)).any():
                df = df.explode(col)

    def _expand_dicts(self, df):
        """
        Expands columns containing dictionaries in the dataframe.

        Args:
        - df: Input dataframe.
        """
        for col in df.columns:
            if df[col].apply(lambda x: isinstance(x, dict)).any():
                df = pd.concat([
                    df.drop([col], axis=1),
                    df[col].apply(pd.Series)
                    .rename(columns={
                        "delay": f"{col}_delay",
                        "time": f"{col}_time",
                        "uncertainty": f"{col}_uncertainty"
                    })], axis=1)

    def _drop_empty_columns(self, df):
        """
        Drops columns that contain only NaN values.

        Args:
        - df: Input dataframe.
        """
        cols_to_drop = df.columns[df.isnull().all()]
        df = df.drop(cols_to_drop, axis=1)

In [9]:
import urllib.request
import os

def get_dublin_bus_live_data():
    """
    Fetches live data from the Dublin Bus API.
    """
    url = os.environ.get('API_URL')
    print(url)
    hdr ={
        # Request headers
        'Cache-Control': 'no-cache',
        'x-api-key': os.environ.get('API_KEY'),
    }
    
    try:
        req = urllib.request.Request(url, headers=hdr)
        req.get_method = lambda: 'GET'
        response = urllib.request.urlopen(req)
        print(f"Response code {response.getcode()}")
        return response.read()
    except Exception as e:
        print(f'Extraction error: {e}')
        raise

def transform_dublin_bus_data(json_format_data):
    """
    Transforms Dublin Bus raw data (json) into tabular form.
    """
    tabular_format_data = DataTransformer().transform_data(json_format_data)
    return tabular_format_data


def load_dublin_bus_data(tabular_data):
    print(tabular_data)
    pass
    
def dublinbus_etl():
    
    # fetch the dublin bus live data (json format)
    dublin_bus_raw_data = get_dublin_bus_live_data()
    
    # returns data in tabular form
    dublin_bus_clean_data = transform_dublin_bus_data(dublin_bus_raw_data) 
    
    # load the clean data into database
    load_dublin_bus_data(dublin_bus_clean_data)    

In [10]:
dublinbus_etl()

None
Extraction error: unknown url type: 'None'


ValueError: unknown url type: 'None'