# Building an ETL Pipeline

## Importing Libraries

In [165]:
import json         # Exposes an API familiar to users of the standard library marshal and pickle modules
import os
from datetime import datetime, timezone
from urllib.parse import quote

import ipywidgets as widgets
import pandas as pd # Data Transformation
import pytest       # Makes it easy to write small, readable tests
import requests     # Establishing connection with the web (API)
import sqlalchemy
from dotenv import load_dotenv
from sqlalchemy.engine import Engine, create_engine

## [E] Extraction

### API
- Data Source: https://docs.coincap.io/#89deffa0-ab03-4e0a-8d92-637a857d2c91
- Data comes in a JSON format (semi-structured data) from a 3rd Party

### Extraction Function

In [166]:
def extract(api_get_request_url, timeout=30):
    """
    Create and return a raw JSON extracted from an external API via GET Request.

    Args:
        api_get_request_url: The URL for API's GET Request.
        timeout: Seconds to wait for the server before giving up (default 30).
            Passed straight to `requests.get` so a stalled server cannot
            block the notebook forever.

    Returns:
        raw_data_json: the response body decoded from JSON.

    Raises:
        RuntimeError: If the response status code is anything other than 200.
    """

    # GET Request. Package the request, send the request and catch the response r
    r = requests.get(api_get_request_url, timeout=timeout)

    # Fail loudly on a non-success status: there is no payload to return, and
    # falling through would only surface later as an unrelated error.
    if r.status_code != 200:
        raise RuntimeError(f"Error. Non-success status code: {r.status_code}")

    # Decode the JSON body of the response and hand it back
    return r.json()

In [167]:
# Function to Convert Dates to UNIX Milliseconds

from datetime import datetime

def convert_to_unix_milliseconds(start_date_str, end_date_str, date_format='%Y-%m-%d %H:%M:%S'):
    """
    Converts start and end dates to UNIX time in milliseconds.

    Date strings that carry no UTC offset are interpreted as UTC, so the
    result is the same whatever the timezone of the machine running the
    notebook. If `date_format` parses an offset (e.g. with `%z`), that
    offset is honoured instead.

    Parameters:
    start_date_str (str): The start date as a string.
    end_date_str (str): The end date as a string.
    date_format (str): The format of the input date strings (default is '%Y-%m-%d %H:%M:%S').

    Returns:
    tuple: A tuple containing the start and end dates in UNIX milliseconds,
    or None when a date string does not match `date_format` (the parsing
    error is printed).
    """
    def _to_unix_ms(date_str):
        # Parse the string, pin naive results to UTC, then convert seconds -> milliseconds
        parsed = datetime.strptime(date_str, date_format)
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return int(parsed.timestamp() * 1000)

    try:
        return _to_unix_ms(start_date_str), _to_unix_ms(end_date_str)
    except ValueError as e:
        print(f"An error occurred: {e}")
        return None

# Date window of the request, converted to UNIX milliseconds
start_date_str, end_date_str = '2023-01-01 12:00:00', '2024-05-31 12:00:00'
start_unix_ms, end_unix_ms = convert_to_unix_milliseconds(start_date_str, end_date_str)
print(
    f"Start UNIX time in milliseconds: {start_unix_ms}",
    f"End UNIX time in milliseconds: {end_unix_ms}",
    sep="\n",
)

# Pieces of the GET Request URL
coin_str, interval_str = 'bitcoin', 'd1'
# From here on the two *_date_str names hold the millisecond integers, not the date strings
start_date_str, end_date_str = start_unix_ms, end_unix_ms
print(f"This is the time series URL for {coin_str}:")
bitcoin_url = (
    f"https://api.coincap.io/v2/assets/{coin_str}/history"
    f"?interval={interval_str}&start={start_date_str}&end={end_date_str}"
)
print(bitcoin_url)

Start UNIX time in milliseconds: 1672574400000
End UNIX time in milliseconds: 1717156800000
This is the time series URL for bitcoin:
https://api.coincap.io/v2/assets/bitcoin/history?interval=d1&start=1672574400000&end=1717156800000


In [168]:
# Call the extract() function on the bitcoin time series URL built above.
# The result is kept in raw_data_json, which the following cells inspect and transform.
# d1 = the day interval represents average of 24 hour periods (https://docs.coincap.io/#89deffa0-ab03-4e0a-8d92-637a857d2c91)
raw_data_json = extract(bitcoin_url)

In [169]:
# Quick look at the payload: list its top-level names (keys)
print("Printing JSON names (keys):")
for key in raw_data_json.keys():
    print(f"-> {key}")

# Serialize the payload to indented JSON text
json_str = json.dumps(raw_data_json, indent=2)

# Disabled (read-only) text area that holds the JSON text
json_output = widgets.Textarea(
    description='JSON:',
    placeholder='JSON data',
    value=json_str,
    layout=widgets.Layout(height='200px', width='80%'),
    disabled=True,
)

# Render the widget in the cell output
display(json_output)

Printing JSON names (keys):
-> data
-> timestamp


Textarea(value='{\n  "data": [\n    {\n      "priceUsd": "16708.5235619029337193",\n      "time": 167261760000…

## [T] Transformation
* Exploring JSON Structure
* Data Normalization
    * Transforming the JSON into a table (as a Dataframe)
* Data Exploration & Cleaning
    * Check for: Missing Data, Data Types, Rounding needs.
* Transforming into a CSV

### Exploring JSON Structure
* Exploring the JSON name-value pairs


In [170]:
# Top-level shape of the payload returned by the API
print(f"This is the Json's type from the API request: {type(raw_data_json)}")
print(f"It's a dict, so let's check how many keys it has: {len(raw_data_json)}")

# Keep the keys and the values as two separate lists
raw_data_json_keys_ls = [*raw_data_json.keys()]
raw_data_json_values_ls = [*raw_data_json.values()]

print("It is a nested JSON, with two key-value pais: data and timestamp.")
print("The actual information comes in the 'data' key along with its associated 'timestamp' as another key.")

# The first value is read as the list of records; its length is the row count
potential_rows = len(raw_data_json_values_ls[0])
print(f"How many key-value pairs in the 'data' key: {potential_rows}")

# The number of fields in the first record is the column count
potential_columns = len(raw_data_json_values_ls[0][0])
print(f"-> There are {potential_columns} features in a single record of 'data'.")
print(f"-> In total, there are {potential_rows} records stored in 'data' and each record has {potential_columns} features.")
print("We should see the same when storing in Dataframe.")


This is the Json's type from the API request: <class 'dict'>
It's a dict, so let's check how many keys it has: 2
It is a nested JSON, with two key-value pais: data and timestamp.
The actual information comes in the 'data' key along with its associated 'timestamp' as another key.
How many key-value pairs in the 'data' key: 516
-> There are 3 features in a single record of 'data'.
-> In total, there are 516 records stored in 'data' and each record has 3 features.
We should see the same when storing in Dataframe.


### Transformation Function
Notes: 
* The list of records are stored in the 'data' key of the JSON file, as seen above.
* Before creating the transformation function, we performed a data exploration:
    * The price data was converted to Numerical (float) just in case.
    * The date seems to follow the ISO 8601 format. Thus, it was parsed with Pandas `to_datetime` and reduced to the calendar date.
    * There was no **Missing** (`NULL`) data in this dataset (every column is fully populated), so the transformation function does not need any missing-value handling.

In [171]:
# For the Converting part
# Selecting the columns and their correct types
cols_datatypes = {
    'priceUsd' : float
}

# For the Rounding part
# Selecting the columns we want to round to some decimal places
cols_to_round = ['priceUsd']

In [172]:
def transform(raw_data_json, cols_datatypes, cols_to_round):
    """
    Normalize the raw JSON into a cleaned, tabular DataFrame.

    Steps, in order:
        1. Flatten the records stored under the 'data' key into a DataFrame.
        2. Cast the columns listed in `cols_datatypes` (columns that are not
           present in the data are skipped).
        3. Parse the 'date' column and keep only the calendar date.
        4. Drop the 'time' column.
        5. Round the columns listed in `cols_to_round` to 2 decimal places.

    The 'date' and 'time' column names are hardcoded here, and no
    missing-value handling is performed.

    Args:
        raw_data_json: The raw json returned by the extraction function.
        cols_datatypes: A dictionary of columns and the associated types you want for them.
        cols_to_round: A list of the columns to round to 2 decimal places
            (may be empty or None to skip rounding).

    Returns:
        cleaned_data_df: the normalized and cleaned dataset in the DataFrame format.

    Raises:
        KeyError: If 'date', 'time' or any column in `cols_to_round` is
            missing from the records. Failing here keeps a bad payload from
            turning into a silent `None` result.
    """

    # 1) Data Normalization
    # record_path = "data": path to list of records.
    # sep = ".": Nested records will generate names separated by 'sep'.
    normalized_df = pd.json_normalize(raw_data_json, record_path="data", sep=".")

    # Validate up front so the error names the columns that are actually missing
    cols_to_round = list(cols_to_round or [])
    missing_cols = [
        col for col in ["date", "time", *cols_to_round]
        if col not in normalized_df.columns
    ]
    if missing_cols:
        raise KeyError(f"Columns missing from the 'data' records: {missing_cols}")

    # 2) Data Cleaning
    # Cast only the requested columns that exist in the data
    dtypes_present = {
        col: datatype
        for col, datatype in cols_datatypes.items()
        if col in normalized_df.columns
    }
    converted_df = normalized_df.astype(dtypes_present) if dtypes_present else normalized_df

    cleaned_data_df = (
        converted_df
        # .dt.date keeps plain Python date objects, so the column dtype is object
        .assign(date=lambda df: pd.to_datetime(df["date"]).dt.date)
        .drop(columns=["time"])
    )

    # Round the requested columns to 2 decimal places
    if cols_to_round:
        cleaned_data_df[cols_to_round] = cleaned_data_df[cols_to_round].round(2)

    return cleaned_data_df

# Transform the raw_data_json into a cleaned and tabular dataframe.
# On failure, report it and re-raise: swallowing the error would let the
# following cells run against a missing or stale `cleaned_data_df`.
try:
    cleaned_data_df = transform(raw_data_json, cols_datatypes, cols_to_round)
except Exception:
    print("There is an error with the transform function.")
    raise

In [173]:
# Checking the cleaned dataset: column names, non-null counts and dtypes
cleaned_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   priceUsd  516 non-null    float64
 1   date      516 non-null    object 
dtypes: float64(1), object(1)
memory usage: 8.2+ KB


In [174]:
# Checking the first records of the cleaned dataset
cleaned_data_df.head()

Unnamed: 0,priceUsd,date
0,16708.52,2023-01-02
1,16682.71,2023-01-03
2,16822.19,2023-01-04
3,16826.73,2023-01-05
4,16821.14,2023-01-06


* Transforming into a CSV

In [175]:
# Function to transform a Python Dataframe into a CSV file 

def df_to_csv(df, file_name):
    """
    Write a pandas DataFrame to a CSV file, without the index column.

    The outcome is reported with a printed message: a confirmation on
    success, or the error text when the write fails. Errors are not
    re-raised.

    Parameters:
    df (pd.DataFrame): The DataFrame to write.
    file_name (str): Destination path of the CSV file.

    Returns:
    None
    """
    try:
        df.to_csv(file_name, index=False)
        outcome = f"DataFrame successfully saved to {file_name}"
    except Exception as e:
        outcome = f"An error occurred while saving the DataFrame to CSV: {e}"
    print(outcome)

In [176]:
# Calling the function for the Crypto dataset
# NOTE(review): the destination is a hardcoded absolute path — confirm it exists in the environment where the notebook runs
df_to_csv(cleaned_data_df, '/workspace/sources/bitcoin_time_series.csv')

DataFrame successfully saved to /workspace/sources/bitcoin_time_series.csv


## [L] Load
* Loading data to Postgres.
    * Open a SQL connection with SQLAlchemy
    * .to_sql()
* Data Quality checks:
    * Validate that data was correctly persisted in postgres
        * Ensure it can be queried
            * pd.read_sql()
        * Make sure counts match
        * Validate each row is present

In [177]:
# Create a function that creates the SQL engine with SQLAlchemy
def create_db_engine(connection_uri: str) -> Engine:
    """
    Build a SQLAlchemy engine from a database connection URI.

    Args:
        connection_uri (str): The connection URI for the database.

    Returns:
        Engine: The engine that `create_engine` returns for that URI.
    """
    return create_engine(connection_uri)

In [178]:
# Load environment variables from .env file
load_dotenv()

# Retrieve individual components from environment variables
user = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PASSWORD')
host = os.getenv('POSTGRES_HOST')
db_name = os.getenv('POSTGRES_DB')

# Fail early if any component is missing or empty
if not all([user, password, host, db_name]):
    raise ValueError("One or more environment variables for the database connection are not set")

# Construct the connection URI. The user and password are percent-encoded so
# that reserved characters such as '@', ':' or '/' cannot break the URI.
connection_uri = f"postgresql://{quote(user, safe='')}:{quote(password, safe='')}@{host}/{db_name}"

# Create the database engine. It is not disposed here because the load cell
# further down still uses it.
db_engine = create_db_engine(connection_uri)

In [179]:
# Load/Persist data in Postgres
def load(cleaned_data_df, con_engine, table_name="bitcoin_price", if_exists="replace"):
    """
    Load/persist data in PostgreSQL database.

    Args:
        cleaned_data_df (DataFrame): The DataFrame containing the data to be loaded.
        con_engine (Engine): SQLAlchemy engine for database connection.
        table_name (str): Name of the destination table (default "bitcoin_price").
        if_exists (str): Passed to `DataFrame.to_sql` to decide what happens
            when the table already exists (default "replace").

    Returns:
        None
    """
    # to_sql: Write records stored in a DataFrame to a SQL database.
    # index=False keeps the DataFrame index out of the table.
    cleaned_data_df.to_sql(name=table_name, con=con_engine, if_exists=if_exists, index=False)

Note:
* In the `load` function, `to_sql` checks whether a table named `bitcoin_price` already exists. If it does, the `if_exists="replace"` argument drops the table before inserting the new values.
* If you want something different, please check the [to_sql()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html) documentation.

In [180]:
# Call the load function to load the transformed data to persistent storage
load(cleaned_data_df, db_engine)

# Query the data back from the bitcoin_price table and check the head of the
# DataFrame, confirming that the persisted data can be read
to_validate = pd.read_sql("SELECT * FROM bitcoin_price", con=db_engine)
print(to_validate.head())

   priceUsd        date
0  16708.52  2023-01-02
1  16682.71  2023-01-03
2  16822.19  2023-01-04
3  16826.73  2023-01-05
4  16821.14  2023-01-06
