# Building an ETL Pipeline

## Importing Libraries

In [92]:
import json         # Exposes an API familiar to users of the standard library marshal and pickle modules
import os
from urllib.parse import quote  # Percent-encodes credentials embedded in the database URI

import ipywidgets as widgets
import pandas as pd # Data Transformation
import pytest       # Makes it easy to write small, readable tests
import requests     # Establishing connection with the web (API)
import sqlalchemy
from dotenv import load_dotenv
from sqlalchemy.engine import Engine, create_engine

## [E] Extraction

### API
- Data Source: https://docs.coincap.io/#89deffa0-ab03-4e0a-8d92-637a857d2c91
- Data comes in a JSON format (semi-structured data) from a 3rd Party

### Extraction Function

In [93]:
def extract(api_get_request_url, timeout=30):
    """
    Fetch raw JSON from an external API via a GET request.

    Args:
        api_get_request_url: The URL for the API's GET request.
        timeout: Seconds to wait for the server before giving up
            (passed straight to ``requests.get``). Defaults to 30.

    Returns:
        raw_data_json: The response body decoded from JSON.

    Raises:
        requests.HTTPError: If the response status code is not 200.
        requests.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    # Without a timeout a stalled server would hang the notebook indefinitely.
    response = requests.get(api_get_request_url, timeout=timeout)

    # Fail loudly on a non-success status. Previously this path only printed a
    # message and then crashed with UnboundLocalError on the return statement.
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Error. Non-success status code: {response.status_code}",
            response=response,
        )

    # Decode the JSON body into Python objects
    return response.json()

# Pull the current asset listing from the CoinCap API
coincap_assets_url = "https://api.coincap.io/v2/assets"
raw_data_json = extract(coincap_assets_url)

In [94]:
# Quick look at the response: list its top-level names first
print("Printing JSON names (keys):")
for key in raw_data_json:
    print("->", key)

# Serialize the response to an indented string so it can be shown as text
json_str = json.dumps(raw_data_json, indent=2)

# Read-only text area holding the serialized JSON
textarea_layout = widgets.Layout(width='80%', height='200px')
json_output = widgets.Textarea(
    value=json_str,
    placeholder='JSON data',
    description='JSON:',
    disabled=True,
    layout=textarea_layout,
)

# Render the widget in the cell output
display(json_output)

Printing JSON names (keys):
-> data
-> timestamp


Textarea(value='{\n  "data": [\n    {\n      "id": "bitcoin",\n      "rank": "1",\n      "symbol": "BTC",\n   …

## [T] Transformation
* Exploring JSON Structure
* Data Normalization
    * Transforming the JSON into a table (as a Dataframe)
* Data Exploration & Cleaning
    * Check for: Missing Data, Data Types, Rounding needs.
* Transforming into a CSV

### Exploring JSON Structure
* Exploring the JSON name-value pairs


In [95]:
# Exploring the JSON structure
print("This is the Json's type from the API request:", type(raw_data_json))
print("It's a dict, so let's check how many keys it has:", len(raw_data_json))

# Saving Keys and Values as Lists
raw_data_json_keys_ls = list(raw_data_json.keys())
raw_data_json_values_ls = list(raw_data_json.values())

# Look the records up by key instead of by position in values(): positional
# access silently reads the wrong entry if the API ever reorders its keys.
data_records = raw_data_json["data"]

print("It is a nested JSON, with two key-value pais: data and timestamp.")
print("The actual information comes in the 'data' key along with its associated 'timestamp' as another key.")
print("How many key-value pairs in the 'data' key:", len(data_records))
# Guard the empty case so an empty response does not raise IndexError here
potential_columns = len(data_records[0]) if data_records else 0
potential_rows = len(data_records)
print(f"-> There are {potential_columns} features in a single record of 'data'.")
print(f"-> In total, there are {potential_rows} records stored in 'data' and each record has {potential_columns} features.")
print("We should see the same when storing in Dataframe.")


This is the Json's type from the API request: <class 'dict'>
It's a dict, so let's check how many keys it has: 2
It is a nested JSON, with two key-value pais: data and timestamp.
The actual information comes in the 'data' key along with its associated 'timestamp' as another key.
How many key-value pairs in the 'data' key: 100
-> There are 12 features in a single record of 'data'.
-> In total, there are 100 records stored in 'data' and each record has 12 features.
We should see the same when storing in Dataframe.


### Transformation Function
Notes: 
* The list of records is stored under the 'data' key of the JSON response, as seen above.
* Before creating the transformation function, we performed a data exploration:
    * There were some columns that are supposed to be **Numerical** data, but are stored as **Categorical** (`object`) data. We converted them.
    * Some values were **Missing** (`NULL`) in the 'maxSupply' and 'explorer' columns. We fill them with hardcoded defaults inside the transformation function (`0` for 'maxSupply', `'not available'` for 'explorer').

In [96]:
# Columns that arrive as strings but hold floating-point values
_FLOAT_COLUMNS = [
    'supply',
    'maxSupply',
    'marketCapUsd',
    'volumeUsd24Hr',
    'priceUsd',
    'changePercent24Hr',
    'vwap24Hr',
]

# For the Converting part
# Target type per column: 'rank' is an integer, every other listed column is a float
cols_datatypes = {'rank': int, **dict.fromkeys(_FLOAT_COLUMNS, float)}

# For the Rounding part
# Every float column except 'volumeUsd24Hr' gets rounded to some decimal places
cols_to_round = [col for col in _FLOAT_COLUMNS if col != 'volumeUsd24Hr']

In [97]:
def transform(raw_data_json, cols_datatypes, cols_to_round, fill_values=None):
    """
    Turn the raw API JSON into a cleaned, tabular DataFrame.

    Steps, in order:
        1. Normalize the list of records stored under the "data" key into rows.
        2. Cast the columns named in ``cols_datatypes`` to the requested types.
        3. Fill missing values using ``fill_values``.
        4. Round the columns named in ``cols_to_round`` to two decimal places.

    Columns named in ``cols_datatypes``, ``fill_values`` or ``cols_to_round``
    that are absent from the normalized data are skipped instead of raising.

    Args:
        raw_data_json: The raw JSON returned by the extraction function; the
            records are read from its "data" key.
        cols_datatypes: A dictionary of columns and the associated types you
            want for them.
        cols_to_round: A list of columns to round to two decimal places.
        fill_values: Optional dictionary mapping a column name to the value
            used to replace its missing entries. Defaults to
            ``{"maxSupply": 0, "explorer": "not available"}``.

    Returns:
        cleaned_data_df: The normalized, type-converted, filled and rounded
        DataFrame.
    """
    if fill_values is None:
        fill_values = {"maxSupply": 0, "explorer": "not available"}

    # 1) Data Normalization
    # record_path = "data": path to list of records.
    # sep = ".": Nested records will generate names separated by 'sep'.
    raw_normalized_data_df = pd.json_normalize(raw_data_json, record_path="data", sep=".")

    # 2) Data Cleaning
    # Converting Categorical to Numerical (only for columns that exist).
    # astype returns a new frame, so the normalized frame is left untouched.
    present_datatypes = {
        col: datatype
        for col, datatype in cols_datatypes.items()
        if col in raw_normalized_data_df.columns
    }
    converted_data_df = raw_normalized_data_df.astype(present_datatypes)

    # Handling Missing Values
    # DataFrame.fillna with a dict fills per column and ignores unknown keys.
    cleaned_data_df = converted_data_df.fillna(value=fill_values)

    # Rounding columns (only those that exist)
    present_cols_to_round = [col for col in cols_to_round if col in cleaned_data_df.columns]
    if present_cols_to_round:
        cleaned_data_df[present_cols_to_round] = cleaned_data_df[present_cols_to_round].round(2)

    return cleaned_data_df

# Transform the raw_data_json into a cleaned and tabular dataframe
try:
    cleaned_data_df = transform(raw_data_json, cols_datatypes, cols_to_round)
except Exception as error:
    # Report what went wrong, then re-raise: swallowing the error would leave
    # cleaned_data_df undefined and make later cells fail with a NameError
    # that hides the real cause.
    print(f"There is an error with the transform function: {error}")
    raise

In [98]:
# Checking the cleaned dataset: column dtypes and non-null counts,
# to confirm the type conversions and missing-value fills took effect
cleaned_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 100 non-null    object 
 1   rank               100 non-null    int64  
 2   symbol             100 non-null    object 
 3   name               100 non-null    object 
 4   supply             100 non-null    float64
 5   maxSupply          100 non-null    float64
 6   marketCapUsd       100 non-null    float64
 7   volumeUsd24Hr      100 non-null    float64
 8   priceUsd           100 non-null    float64
 9   changePercent24Hr  100 non-null    float64
 10  vwap24Hr           100 non-null    float64
 11  explorer           100 non-null    object 
dtypes: float64(7), int64(1), object(4)
memory usage: 9.5+ KB


In [99]:
# Checking the first records (head() shows the first five rows by default)
cleaned_data_df.head()

Unnamed: 0,id,rank,symbol,name,supply,maxSupply,marketCapUsd,volumeUsd24Hr,priceUsd,changePercent24Hr,vwap24Hr,explorer
0,bitcoin,1,BTC,Bitcoin,19710390.0,21000000.0,1367663000000.0,3961673000.0,69387.91,-0.33,69670.5,https://blockchain.info/
1,ethereum,2,ETH,Ethereum,120154200.0,0.0,440701800000.0,2951669000.0,3667.8,-0.79,3691.5,https://etherscan.io/
2,tether,3,USDT,Tether,112481000000.0,0.0,112518500000.0,8539388000.0,1.0,-0.1,1.0,https://www.omniexplorer.info/asset/31
3,binance-coin,4,BNB,BNB,166801100.0,166801148.0,107491600000.0,529941500.0,644.43,-5.45,656.49,https://etherscan.io/token/0xB8c77482e45F1F44d...
4,solana,5,SOL,Solana,460784300.0,0.0,73051810000.0,326534500.0,158.54,-1.69,160.41,https://explorer.solana.com/


* Transforming into a CSV

In [100]:
# Function to transform a Python Dataframe into a CSV file 

def df_to_csv(df, file_name):
    """
    Write a pandas DataFrame to a CSV file, without the index column.

    A failure while writing is reported on stdout rather than raised.

    Parameters:
    df (pd.DataFrame): The DataFrame to write.
    file_name (str): Path of the CSV file to create.

    Returns:
    None
    """
    try:
        df.to_csv(file_name, index=False)
    except Exception as error:
        print(f"An error occurred while saving the DataFrame to CSV: {error}")
    else:
        print(f"DataFrame successfully saved to {file_name}")

In [101]:
# Calling the function for the Crypto dataset:
# writes the cleaned DataFrame to a CSV file (df_to_csv omits the index column)
df_to_csv(cleaned_data_df, '/workspace/sources/crypto_mkt.csv')

DataFrame successfully saved to /workspace/sources/crypto_mkt.csv


## [L] Load
* Loading data to Postgres.
    * Open a SQL connection with SQLAlchemy
    * .to_sql()
* Data Quality checks:
    * Validate that data was correctly persisted in postgres
        * Ensure it can be queried
            * pd.read_sql()
        * Make sure counts match
        * Validate each row is present

In [102]:
# Function that creates the SQL engine via SQLAlchemy
def create_db_engine(connection_uri: str) -> Engine:
    """
    Build a SQLAlchemy engine for the given connection URI.

    Args:
        connection_uri (str): The connection URI for the database.

    Returns:
        Engine: The engine produced by ``create_engine`` for that URI.
    """
    return create_engine(connection_uri)

In [103]:
# Load environment variables from .env file
load_dotenv()

# Retrieve individual components from environment variables
user = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PASSWORD')
host = os.getenv('POSTGRES_HOST')
db_name = os.getenv('POSTGRES_DB')

# Fail early, and say which variables are missing or empty
required_env = {
    'POSTGRES_USER': user,
    'POSTGRES_PASSWORD': password,
    'POSTGRES_HOST': host,
    'POSTGRES_DB': db_name,
}
missing_env = [name for name, value in required_env.items() if not value]
if missing_env:
    raise ValueError(
        "One or more environment variables for the database connection are not set: "
        + ", ".join(missing_env)
    )

# Construct the connection URI.
# The user name and password are percent-encoded so that characters such as
# '@', ':' or '/' in the credentials cannot be mistaken for URI delimiters.
connection_uri = (
    f"postgresql://{quote(user, safe='')}:{quote(password, safe='')}@{host}/{db_name}"
)

# Create the database engine.
# It is intentionally not disposed here: later cells still use it. Call
# db_engine.dispose() once all database work in the notebook is finished.
db_engine = create_db_engine(connection_uri)

In [104]:
# Load/Persist data in Postgres
def load(cleaned_data_df, con_engine, table_name="crypto_mkt", if_exists="replace"):
    """
    Load/persist a DataFrame into a SQL table.

    Args:
        cleaned_data_df (DataFrame): The DataFrame containing the data to be loaded.
        con_engine (Engine): SQLAlchemy engine (or other connection accepted by
            ``DataFrame.to_sql``) for the target database.
        table_name (str): Name of the destination table. Defaults to "crypto_mkt".
        if_exists (str): What ``to_sql`` does when the table already exists
            ("fail", "replace" or "append"). Defaults to "replace", which
            drops the existing table before inserting the new rows.

    Returns:
        None
    """
    # to_sql: Write records stored in a DataFrame to a SQL database.
    # index=False keeps the DataFrame index out of the table.
    cleaned_data_df.to_sql(name=table_name, con=con_engine, if_exists=if_exists, index=False)

Note:
* The Load function writes to a table named 'crypto_mkt'. If that table already exists, the `if_exists="replace"` argument drops it before inserting the new values.
* If you want something different, please check the [to_sql()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html) documentation.

In [105]:
# Persist the transformed data through the load function
load(cleaned_data_df, db_engine)

# Read the crypto_mkt table back and show its first rows
validation_query = "SELECT * FROM crypto_mkt"
to_validate = pd.read_sql(validation_query, con=db_engine)
print(to_validate.head())

             id  rank symbol      name        supply    maxSupply  \
0       bitcoin     1    BTC   Bitcoin  1.971039e+07   21000000.0   
1      ethereum     2    ETH  Ethereum  1.201542e+08          0.0   
2        tether     3   USDT    Tether  1.124810e+11          0.0   
3  binance-coin     4    BNB       BNB  1.668011e+08  166801148.0   
4        solana     5    SOL    Solana  4.607843e+08          0.0   

   marketCapUsd  volumeUsd24Hr  priceUsd  changePercent24Hr  vwap24Hr  \
0  1.367663e+12   3.961673e+09  69387.91              -0.33  69670.50   
1  4.407018e+11   2.951669e+09   3667.80              -0.79   3691.50   
2  1.125185e+11   8.539388e+09      1.00              -0.10      1.00   
3  1.074916e+11   5.299415e+08    644.43              -5.45    656.49   
4  7.305181e+10   3.265345e+08    158.54              -1.69    160.41   

                                            explorer  
0                           https://blockchain.info/  
1                              https