In [40]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime 

In [41]:
def extract(url, table_attributes):
    """Scrape the country / GDP table found at ``url`` into a DataFrame.

    The page is downloaded and parsed with BeautifulSoup, and the rows of
    the third ``<tbody>`` element on the page are scanned. A row is kept
    only when its first cell contains a hyperlink and its third cell is
    not the em-dash placeholder.

    Parameters
    ----------
    url : str
        Address of the page to download.
    table_attributes : list of str
        Column labels of the returned frame. Rows are built with the keys
        "Country" and "GDP_USD_millions", so these labels are expected to
        be those two names.

    Returns
    -------
    pandas.DataFrame
        One row per kept table row. Values are the unparsed strings taken
        from the first child of the country link and of the third cell.

    Raises
    ------
    requests.RequestException
        If the download fails, times out, or returns an HTTP error status.
    IndexError
        If the page contains fewer than three ``<tbody>`` elements.
    """
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() surfaces HTTP errors instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    tables = soup.find_all('tbody')
    if len(tables) < 3:
        raise IndexError(
            f"expected at least 3 <tbody> elements at {url}, found {len(tables)}"
        )

    # Collect plain dicts and build the frame once; concatenating a
    # one-row frame per table row copies the whole frame every iteration.
    records = []
    for row in tables[2].find_all('tr'):
        col = row.find_all('td')
        # The third cell is read below, so rows with fewer cells are skipped.
        if len(col) < 3:
            continue
        # `in` on a bs4 Tag is a membership test over the cell's direct
        # children, so this matches a cell holding the bare '—' string.
        if col[0].find('a') is None or '—' in col[2]:
            continue
        records.append({
            # str() stores plain strings rather than parse-tree nodes.
            "Country": str(col[0].a.contents[0]),
            "GDP_USD_millions": str(col[2].contents[0]),
        })

    return pd.DataFrame(records, columns=table_attributes)

In [42]:
def transform(df):
    """Convert the GDP column from millions (text) to billions (float).

    The values in "GDP_USD_millions" are strings with comma thousands
    separators. They are parsed to floats, divided by 1000, rounded to two
    decimal places, and returned under the column name "GDP_USD_billions".

    The input frame is not modified; a new frame is returned. (Writing the
    converted values back into the caller's frame would leave it holding
    billions under a "millions" label.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "GDP_USD_millions" column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the GDP column converted and renamed.
    """
    gdp_millions = (
        df["GDP_USD_millions"]
        .astype(str)
        .str.replace(",", "", regex=False)  # drop thousands separators
        .astype(float)
    )
    gdp_billions = (gdp_millions / 1000).round(2)
    return (
        df.assign(GDP_USD_millions=gdp_billions)
          .rename(columns={"GDP_USD_millions": "GDP_USD_billions"})
    )

In [43]:
def load_to_csv(df, csv_path):
    """Write ``df`` to a CSV file at ``csv_path``.

    pandas' default options are used, so the frame's index is written as
    the first (unnamed) column. Nothing is returned.
    """
    df.to_csv(path_or_buf=csv_path)

In [44]:
def load_to_db(df, sql_connection, table_name):
    """Store ``df`` as the table ``table_name`` on ``sql_connection``.

    An existing table with that name is replaced, and the frame's index
    is not written as a column. Nothing is returned.
    """
    df.to_sql(
        name=table_name,
        con=sql_connection,
        if_exists='replace',
        index=False,
    )

In [45]:
def run_query(query_statement, sql_connection):
    """Echo a SQL statement, run it, and print the rows it returns.

    The statement text is printed first, then executed on
    ``sql_connection`` through pandas, and the resulting DataFrame is
    printed. Nothing is returned.
    """
    print(query_statement)
    result_frame = pd.read_sql(sql=query_statement, con=sql_connection)
    print(result_frame)


In [46]:
def log_progress(message, log_file="./etl_project_log.txt"):
    """Append a timestamped progress message to the ETL log file.

    Each call adds one line of the form ``<timestamp> : <message>``.

    Parameters
    ----------
    message : str
        Text to record.
    log_file : str, optional
        Path of the log file, opened in append mode. Defaults to
        "./etl_project_log.txt".
    """
    # %b (abbreviated month name) is the documented, portable directive;
    # %h is a platform-specific alias that not every C library accepts.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'  # Year-Monthname-Day-Hour:Minute:Second
    timestamp = datetime.now().strftime(timestamp_format)
    with open(log_file, "a") as f:
        f.write(timestamp + ' : ' + message + '\n')

In [47]:
# Pipeline configuration: source page, output locations, and column labels.
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29'
# SQLite database file name.
db_name = 'World_Economies.db'
# Table that load_to_db() writes and the final query reads.
table_name = 'Countries_by_GDP'
# Destination of the CSV written by load_to_csv().
csv_path = './Countries_by_GDP.csv'
# Column labels of the frame returned by extract(), before transform() renames the GDP column.
table_attributes = ["Country", "GDP_USD_millions"]
# NOTE(review): not referenced by any cell visible here — confirm whether it is still needed.
count = 0

In [48]:
# Run the ETL pipeline end to end, logging each stage as it completes.
log_progress('Preliminaries complete. Initiating ETL process')
df = extract(url, table_attributes)
log_progress('Data extraction complete. Initiating Transformation process')
df = transform(df)
log_progress('Data transformation complete. Initiating loading process')
load_to_csv(df, csv_path)
log_progress('Dataframe saved to csv')
# Connect through the configured db_name so that changing the configuration
# cell actually changes which database file is written.
sql_connection = sqlite3.connect(db_name)
log_progress('SQL Connection initiated.')
load_to_db(df, sql_connection, table_name)
log_progress('Data loaded to Database as table. Running the query')

In [51]:
# Inspect the transformed frame; pandas abbreviates long frames in the printed form.
print(df)

              Country  GDP_USD_billions
0       United States          26949.64
1      European Union          18351.13
2               China          17700.90
3             Germany           4429.84
4               Japan           4230.86
..                ...               ...
187  Marshall Islands              0.28
188             Palau              0.27
189          Kiribati              0.25
190             Nauru              0.15
191            Tuvalu              0.06

[192 rows x 2 columns]


In [50]:
# Select the rows of the loaded table whose GDP is at least 100 billion USD.
query_statement = f"select * from {table_name} WHERE GDP_USD_billions >= 100"
run_query(query_statement, sql_connection)
log_progress('Process Complete.')
# The connection is closed here, so this cell cannot be re-run without reconnecting first.
sql_connection.close()

select * from Countries_by_GDP WHERE GDP_USD_billions >= 100
           Country  GDP_USD_billions
0    United States          26949.64
1   European Union          18351.13
2            China          17700.90
3          Germany           4429.84
4            Japan           4230.86
..             ...               ...
65     Puerto Rico            117.52
66           Kenya            112.75
67            Oman            108.28
68        Bulgaria            103.10
69       Guatemala            102.76

[70 rows x 2 columns]
