In [2]:
# Importing the required libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime

This `log_progress` function appends a message with a timestamp to a file named "code_log.txt", aiding in tracking progress or events during code execution.


In [3]:

def log_progress(message):
    """Append a timestamped progress message to ``./code_log.txt``.

    Each call adds one line of the form
    ``<Year>-<Mon>-<Day>-<Hour>:<Minute>:<Second> : <message>`` to the log
    file in the current working directory, creating the file if needed.

    Parameters
    ----------
    message : object
        Text to record. It is formatted with ``str()`` semantics, so
        non-string values are accepted as well.
    """
    # '%b' is the documented directive for the abbreviated month name.
    # The previous '%h' is a C-library alias that Python does not list among
    # its format codes, so it is not guaranteed to work on every platform.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'  # Year-Monthname-Day-Hour:Minute:Second
    timestamp = datetime.now().strftime(timestamp_format)

    # Append mode keeps the entries of earlier runs.
    with open("./code_log.txt", "a") as log_file:
        log_file.write(f"{timestamp} : {message}\n")


The `extract` function downloads the page at the given URL and parses it with BeautifulSoup. It then walks the rows of the first table body on the page, keeps only the rows that have exactly three cells, and stores the bank name and market capitalization (taken from the second and third cells) in a DataFrame. Finally, it returns the DataFrame containing the extracted data.


In [4]:
def extract(url, table_attribs):
    """Scrape the first table body found at ``url`` into a DataFrame.

    Only rows with exactly three ``<td>`` cells are used; any other row
    (for example a header row without ``<td>`` cells) is skipped. The text
    of the second cell becomes the name and the text of the third cell is
    parsed as a float.

    Parameters
    ----------
    url : str
        Address of the page to download.
    table_attribs : list of str
        Two column names, in order, for the name and the numeric value
        taken from each row (e.g. ``["Name", "MC_USD_Billion"]``).

    Returns
    -------
    pandas.DataFrame
        One row per accepted table row, with columns ``table_attribs``.
        Empty (but with the right columns) when no row qualifies.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page contains no ``<tbody>`` element.
    ValueError
        If a third cell cannot be parsed as a float.
    """
    # A timeout prevents the notebook from hanging forever on a stalled
    # connection; raise_for_status stops us from parsing an error page.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    first_body = soup.find_all('tbody')[0]

    # Collect plain tuples and build the frame once at the end. Growing a
    # DataFrame with pd.concat inside the loop is quadratic and depends on
    # how pandas treats the dtypes of the initial empty frame.
    records = []
    for row in first_body.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 3:
            continue
        name = cells[1].text.strip()
        value = float(cells[2].text.strip())
        records.append((name, value))

    return pd.DataFrame(records, columns=table_attribs)


The `transform` function extends the DataFrame with three new columns, each containing the market capitalization converted from USD into another currency. It reads exchange rate data from a CSV file, converts it into a dictionary, and uses it to perform the currency conversions. The output is a DataFrame that includes the three new columns for market values in GBP, EUR, and INR, each rounded to two decimal places.


In [5]:
def transform(df, csv_path):
    """Add GBP, EUR and INR market-cap columns to ``df`` and return it.

    The exchange-rate CSV at ``csv_path`` must provide a ``Currency`` and a
    ``Rate`` column. For each target currency the ``MC_USD_Billion`` column
    is multiplied by the matching rate and rounded to two decimals.

    Note that ``df`` is modified in place; the returned object is the same
    DataFrame that was passed in.
    """
    rate_table = pd.read_csv(csv_path)
    usd_to = dict(zip(rate_table['Currency'], rate_table['Rate']))

    # Column order matters to downstream output: GBP, then EUR, then INR.
    for code in ('GBP', 'EUR', 'INR'):
        converted = df['MC_USD_Billion'] * usd_to[code]
        df[f'MC_{code}_Billion'] = np.round(converted, 2)

    return df

The following three functions will:
1. Save the DataFrame to a CSV file.
2. Save the DataFrame to a database table using the provided table name and connection.
3. Execute a query on the specified database table and print the output.


In [6]:
def load_to_csv(df, output_path):
    """Write ``df`` to a CSV file at ``output_path``.

    The DataFrame index is written as the first column (pandas default).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to save.
    output_path : str or path-like
        Destination file; an existing file is overwritten.
    """
    # Use the parameter, not a notebook-level global, so the function
    # writes where the caller asked and works outside this notebook.
    df.to_csv(output_path)


def load_to_db(df, sql_connection, table_name):
    """Store ``df`` as table ``table_name`` through ``sql_connection``.

    An existing table with the same name is replaced, and the DataFrame
    index is not written as a column.
    """
    df.to_sql(
        name=table_name,
        con=sql_connection,
        if_exists='replace',
        index=False,
    )


def run_query(query_statement, sql_connection):
    """Print ``query_statement`` followed by the result of running it.

    The statement is echoed first, then executed through ``sql_connection``
    and the resulting DataFrame is printed. Nothing is returned.
    """
    print(query_statement)
    result_frame = pd.read_sql(sql=query_statement, con=sql_connection)
    print(result_frame)

## Run the code to get the ETL pipeline output

In [9]:
# URL of the webpage containing data about the largest banks
url = 'https://web.archive.org/web/20230908091635/https://en.wikipedia.org/wiki/List_of_largest_banks'

# Attributes of the table we want to extract from the webpage
table_attribs = ["Name", "MC_USD_Billion"]

# Location of the CSV file containing exchange rate information
csv_path0 = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-PY0221EN-Coursera/labs/v2/exchange_rate.csv'

# Path to save the transformed data as a CSV file
csv_path = './Largest_banks_data.csv'

# Name of the database to save the data
db_name = 'Banks.db'

# Name of the table in the database to save the data
table_name = 'Largest_banks'

log_progress('Preliminaries complete. Initiating ETL process')

# Extract: scrape the bank table from the archived page
df = extract(url, table_attribs)
log_progress('Data extraction complete. Initiating Transformation process')

# Transform: add the GBP / EUR / INR market-cap columns
df = transform(df, csv_path0)
log_progress('Data transformation complete. Initiating loading process')

# Load (1/2): save the transformed data to a CSV file
load_to_csv(df, csv_path)
log_progress('Data saved to CSV file')

# Load (2/2): save to SQLite. Connect via db_name so the setting above is
# actually honoured instead of a second hard-coded file name.
sql_connection = sqlite3.connect(db_name)
try:
    log_progress('SQL Connection initiated.')

    load_to_db(df, sql_connection, table_name)
    log_progress('Data loaded to Database as table. Running the query')

    # Run queries on the database table and print the output
    for query_statement in (
        f"SELECT * from {table_name} ",
        f"SELECT AVG(MC_GBP_Billion) FROM  {table_name} ",
        f"SELECT Name FROM  {table_name} LIMIT 5",
    ):
        run_query(query_statement, sql_connection)

    log_progress('Process Complete.')
finally:
    # Close the connection even if loading or a query fails.
    sql_connection.close()


SELECT * from Largest_banks 
                                      Name  MC_USD_Billion  MC_GBP_Billion  \
0                           JPMorgan Chase          432.92          346.34   
1                          Bank of America          231.52          185.22   
2  Industrial and Commercial Bank of China          194.56          155.65   
3               Agricultural Bank of China          160.68          128.54   
4                                HDFC Bank          157.91          126.33   
5                              Wells Fargo          155.87          124.70   
6                        HSBC Holdings PLC          148.90          119.12   
7                           Morgan Stanley          140.83          112.66   
8                  China Construction Bank          139.82          111.86   
9                            Bank of China          136.81          109.45   

   MC_EUR_Billion  MC_INR_Billion  
0          402.62        35910.71  
1          215.31        19204.58  
2   