# Web Scraping

In [1]:
import pandas as pd
import logging

logging.basicConfig(level=logging.INFO)

In [2]:
# Indonesian Wikipedia page "Daftar orang terkaya di Indonesia"; this is the page scraped below.
url = "https://id.wikipedia.org/wiki/Daftar_orang_terkaya_di_Indonesia"

In [3]:
def scrape(url):
    """Log the target address, then parse the HTML tables found at ``url``.

    Args:
        url: Address handed straight to ``pandas.read_html``.

    Returns:
        Whatever ``pandas.read_html(url, header=None)`` returns (a list of
        DataFrames, one per table it parsed).
    """
    logging.info(f"Scraping website with url: '{url}' ...")
    parsed_tables = pd.read_html(url, header=None)
    return parsed_tables

In [4]:
# scrape() returns every table parsed from the page; keep only the one at index 7.
# NOTE(review): the index is positional, so it will point at a different table if the
# page layout changes. Presumably this is the 2020 list (it is passed to transform()
# with tahun=2020 later) -- confirm against the live page.
dfs = scrape(url)[7]

INFO:root:Scraping website with url: 'https://id.wikipedia.org/wiki/Daftar_orang_terkaya_di_Indonesia' ...
INFO:numexpr.utils:NumExpr defaulting to 2 threads.


In [None]:
# Display the table currently held in `dfs`.
dfs

### Practice
Lakukan web scraping dengan pandas `read_html` seperti di atas.

# Cleaning Data


In [6]:
import re

In [7]:
def is_money_miliar(string_money):
    """Tell whether a money string is expressed in "miliar".

    The check is case-insensitive. Bracketed markers such as ``[M]`` or ``[1]``
    and surrounding whitespace are ignored, so a value like ``"38,8 miliar[M]"``
    or ``"35,5 miliar "`` is still recognised; a plain ``endswith`` would report
    False for those and the caller would skip the x1000 scaling.

    Args:
        string_money: Text such as ``"35.5 miliar"`` or ``"980 juta"``.

    Returns:
        True when the text, after removing bracketed markers and surrounding
        whitespace, ends with ``"miliar"``; False otherwise.
    """
    without_markers = re.sub(r"\[[^\]]*\]", "", string_money.lower())
    return without_markers.strip().endswith("miliar")

In [8]:
def transform_money_format(string_money):
    """Reduce a money string to its bare numeric text.

    Steps: lower-case the input, turn decimal commas into dots, drop all
    whitespace, then remove bracketed markers (``[M]``, ``[J]``, ``[1]`` ...),
    the unit words ``miliar`` / ``juta`` and any ``?``.

    The previous pattern was a single character class, so it deleted individual
    characters rather than whole tokens; a numeric footnote such as ``[1]`` left
    its digit behind and silently changed the number (``"38,8 miliar[1]"`` became
    ``"38.81"``).

    Args:
        string_money: Text such as ``"35,5 miliar"`` or ``"980 Juta"``.

    Returns:
        The cleaned string, e.g. ``"35.5"`` or ``"980"``. It is still a string;
        the caller converts it with ``float``.
    """
    normalized = re.sub(r"\s+", "", string_money.lower().replace(",", "."))
    # Alternation of whole tokens: any [...] marker, a unit word, or a literal "?".
    return re.sub(r"\[[^\]]*\]|miliar|juta|\?", "", normalized)

In [9]:
def transform(df, tahun):
    """Build the cleaned table from a scraped DataFrame.

    The Indonesian column headers are renamed to snake_case, a constant
    ``tahun`` column is added, and the net-worth text is converted to a float
    in ``kekayaan_bersih_usd_juta`` (values recognised as "miliar" are
    multiplied by 1000, the others are kept as parsed).

    Args:
        df: DataFrame whose columns include "Nomor Urut", "Nama", "Perusahaan"
            and "Kekayaan Bersih (US$)".
        tahun: Value stored in every row of the ``tahun`` column.

    Returns:
        A DataFrame with the columns ``nomor_urut``, ``tahun``, ``nama``,
        ``perusahaan`` and ``kekayaan_bersih_usd_juta``, in that order.
    """
    logging.info("Transforming DataFrame ...")

    def to_usd_juta(raw_value):
        # Decide the scale first, then parse, mirroring the order of the checks.
        scale = 1000 if is_money_miliar(raw_value) else 1
        return float(transform_money_format(raw_value)) * scale

    output_columns = ["nomor_urut", "tahun", "nama", "perusahaan", "kekayaan_bersih_usd_juta"]

    cleaned = df.rename(
        columns={
            "Nomor Urut": "nomor_urut",
            "Nama": "nama",
            "Perusahaan": "perusahaan",
            "Kekayaan Bersih (US$)": "kekayaan_bersih_usd",
        }
    )
    cleaned["tahun"] = tahun
    cleaned["kekayaan_bersih_usd_juta"] = cleaned["kekayaan_bersih_usd"].apply(to_usd_juta)

    return cleaned[output_columns]

In [10]:
# Clean the scraped table and tag every row with the year 2020.
df_2020 = transform(dfs, 2020)

INFO:root:Transforming DataFrame ...


In [None]:
# Display the cleaned table.
df_2020

### Practice
Lakukan cleaning data seperti di atas.

# Storing Data to Database

In [12]:
from sqlalchemy import create_engine

In [13]:
import os
from urllib.parse import quote_plus

# Database settings. Each value can be overridden through an environment variable of the
# same name, so real credentials never have to be typed into the notebook; the literal
# fallbacks keep the values this notebook was originally written with.
DB_NAME = os.environ.get("DB_NAME", "web_scraping_db")
DB_USER = os.environ.get("DB_USER", "username")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "secret")
DB_HOST = os.environ.get("DB_HOST", "34.71.186.247")
DB_PORT = os.environ.get("DB_PORT", "5432")
# User and password are URL-quoted so characters such as "@" or "/" cannot break the URL.
CONNECTION_STRING = (
    f"postgresql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)
TABLE_NAME = "orang_terkaya_indonesia"

In [14]:
# Show the assembled connection URL.
# NOTE(review): this echoes the password into the saved cell output; clear the output
# (or drop this cell) before sharing the notebook with real credentials.
CONNECTION_STRING

'postgresql://username:secret@34.71.186.247:5432/web_scraping_db'

In [15]:
def write_to_postgres(df, db_name, table_name, connection_string):
    """Write ``df`` to a table in the database addressed by ``connection_string``.

    Args:
        df: DataFrame to store; its index is not written.
        db_name: Database name. Used only in the log message.
        table_name: Name of the target table.
        connection_string: URL passed to ``create_engine``.
    """
    engine = create_engine(connection_string)

    logging.info(f"Writing dataframe to database: '{db_name}', table: '{table_name}' ...")
    try:
        # if_exists="replace": an existing table with this name is replaced, not appended to.
        df.to_sql(name=table_name, con=engine, if_exists="replace", index=False)
    finally:
        # A fresh engine is built on every call, so release its pooled connections here
        # instead of leaving them open until garbage collection.
        engine.dispose()


In [None]:
pip install psycopg2-binary==2.8.6

In [None]:
# Store the cleaned 2020 table in the configured database table.
write_to_postgres(
    df=df_2020,
    db_name=DB_NAME,
    table_name=TABLE_NAME,
    connection_string=CONNECTION_STRING,
)

### Practice
Lakukan storing data ke PostgreSQL seperti di atas.

# Read Data from Database

In [17]:
def read_from_postgres(db_name, table_name, connection_string):
    """Read a whole table from the database addressed by ``connection_string``.

    Args:
        db_name: Database name. Used only in the log message.
        table_name: Name of the table to read.
        connection_string: URL passed to ``create_engine``.

    Returns:
        The result of ``pandas.read_sql_table`` for ``table_name``.
    """
    engine = create_engine(connection_string)

    logging.info(f"Reading postgres database: '{db_name}', table: '{table_name}' ...")
    try:
        return pd.read_sql_table(table_name, con=engine)
    finally:
        # A fresh engine is built on every call, so release its pooled connections here
        # instead of leaving them open until garbage collection.
        engine.dispose()

In [18]:
# Load the stored table back from the database.
result_df = read_from_postgres(
    db_name=DB_NAME,
    table_name=TABLE_NAME,
    connection_string=CONNECTION_STRING,
)

INFO:root:Reading postgres database: 'web_scraping_db', table: 'orang_terkaya_indonesia' ...


In [None]:
# Print a heading followed by the full table as plain text, one item per line.
print("Daftar Orang Terkaya di Indonesia:", result_df.to_string(), sep="\n")

### Practice
Lakukan read data dari PostgreSQL seperti di atas.

# Unittest

In [None]:
import unittest

class TestTransformer(unittest.TestCase):
    """Unit tests for is_money_miliar and transform_money_format."""

    def test_is_money_miliar_when_string_money_contains_miliar(self):
        """A value expressed in miliar is reported as miliar."""
        string_money = "35.5 miliar"
        actual = is_money_miliar(string_money)

        # Assert on the function result; asserting on the non-empty input string
        # would pass regardless of what the function returns.
        self.assertTrue(actual)

    def test_is_money_miliar_when_string_money_not_contains_miliar(self):
        """A value expressed in juta is not reported as miliar."""
        string_money = "980 juta"
        actual = is_money_miliar(string_money)

        self.assertFalse(actual)

    def test_transform_money_format_when_money_is_juta(self):
        """The unit word is removed regardless of its letter case."""
        string_money = "980 Juta"
        actual = transform_money_format(string_money)

        self.assertEqual(actual, "980")

    def test_transform_money_format_when_money_is_miliar(self):
        """A dot-decimal miliar value keeps its digits unchanged."""
        string_money = "35.6 miliar"
        actual = transform_money_format(string_money)

        # The expected text must carry the same digits as the input.
        self.assertEqual(actual, "35.6")

    def test_transform_money_format_when_money_contains_comma(self):
        """A decimal comma is converted to a dot."""
        string_money = "35,5 miliar"
        actual = transform_money_format(string_money)

        self.assertEqual(actual, "35.5")

# argv is overridden so unittest ignores the kernel's own command-line arguments, and
# exit=False keeps unittest from calling sys.exit() inside the notebook.
unittest.main(
    argv=[""],
    verbosity=2,
    exit=False,
)

### Practice
Lakukan unit test seperti di atas.