# Platts Fujairah Statistics - Inventories  (weekly)

Extracting bunkers sale and inventories data from Platts Fujairah Statistics:

URL: https://fujairah.platts.com/fujairah


Requirement for using selenium for automating browser interaction:

- install browser driver:
https://sites.google.com/a/chromium.org/chromedriver/downloads
- have Chrome browser installed

If you have a problem while starting selenium, it's likely your Chrome has been upgraded.
Check the link above for new version of the webdriver, put it at ..\drivers and try again.


## Workflow

This notebook performs the following steps:

1. Open the website
1. Select the data points 
1. Download the data points in a dataframe stocks
    


## Requirements

In [None]:
pip install selenium

In [1]:
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import numpy as np
import datetime
import sqlalchemy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

## Selenium parametres

In [2]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--ignore-ssl-errors=yes')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument("--start-maximized")
driver = webdriver.Chrome('/Users/LYNGOPOULOU_D/PycharmProjects/scraper/drivers/chromedriver')


In [3]:
def download_data():
    try:
        main_df = pd.DataFrame([])
        url = 'https://fujairah.platts.com/fujairah'
        driver.get(url)
        time.sleep(10)
        table = driver.find_element_by_id("panel-1074-table")
        table_rows = table.find_elements_by_tag_name('tr')
        stocks = []
        for tr in table_rows:
            td = tr.find_elements_by_tag_name('td')
            row = [tr.text for tr in td]
            stocks.append(row)
        panel = driver.find_element_by_id("component-1082")
        #panel = driver.find_element_by_xpath('.//div[@class = "x-component date-updated-outer table-last-column x-component-default"]')
        converted_date = panel.find_element_by_class_name('date-updated').text
        unit = driver.find_element_by_id("component-1070").text
        driver.close()
    except Exception as e:
        raise e
    return stocks, converted_date, unit



In [4]:
def transform_data(stocks, converted_date, unit):
    stocks = pd.DataFrame(stocks)
    stocks.columns = ['fuel', 'volume']
    stocks['volume'] = stocks['volume'].str.replace(',', '')
    stocks['volume'] = pd.to_numeric(stocks['volume'])
    stocks['date'] =  pd.to_datetime(converted_date)
    stocks['unit']= unit[unit.find('(')+1:unit.find(')')]
    return stocks



In [5]:
stocks, converted_date, unit = download_data()

In [6]:
stocks_df = transform_data(stocks, converted_date, unit)

In [7]:
stocks_df.head()

Unnamed: 0,fuel,volume,date,unit
0,Light Distillates,5111,2021-09-27,Mbbl
1,Middle Distillates,3698,2021-09-27,Mbbl
2,Heavy Distillates & Residues,6724,2021-09-27,Mbbl


In [2]:
from datetime import date

prefix = "plats_fuj_stocks"

today_calendar = date.today().isocalendar()
file_code = f"{prefix}_{today_calendar[0]}W{today_calendar[1]:02}"
file_code

'plats_fuj_stocks_2021W40'

In [4]:
f"{0:02}"

'00'

In [1]:
%pwd

'C:\\Users\\ROSA_L\\PycharmProjects\\scraper\\notebooks'

In [2]:
%cd ..

C:\Users\ROSA_L\PycharmProjects\scraper


In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import logging
logger = logging.getLogger()
logging.basicConfig(level=logging.DEBUG)
logger.setLevel(logging.DEBUG)

In [7]:
from scraper.core import factory

job = factory.get_scraper_job('com_platts_fujairah', 'platts_fujairah_stocks')

DEBUG:scraper.core.factory:Loading module scraper.jobs.com_platts_fujairah.platts_fujairah_stocks
DEBUG:scraper.core.factory:Getting class PlattsFujairahStocksJob
INFO:scraper.core.job:Temporary table name: #com_platts_fujairah_stocks_temp, final table name: com_platts_fujairah_stocks_data
DEBUG:selenium.webdriver.remote.remote_connection:POST http://127.0.0.1:50283/session {"capabilities": {"firstMatch": [{}], "alwaysMatch": {"browserName": "chrome", "platformName": "any", "goog:chromeOptions": {"prefs": {"download.default_directory": "C:\\Users\\ROSA_L\\PycharmProjects\\scraper\\filestore"}, "extensions": [], "args": ["--headless", "--disable-dev-shm-usage", "window-size=1920x1480"]}}}, "desiredCapabilities": {"browserName": "chrome", "version": "", "platform": "ANY", "goog:chromeOptions": {"prefs": {"download.default_directory": "C:\\Users\\ROSA_L\\PycharmProjects\\scraper\\filestore"}, "extensions": [], "args": ["--headless", "--disable-dev-shm-usage", "window-size=1920x1480"]}}}
D

In [8]:
job.run()

INFO:scraper.jobs.com_platts_fujairah.platts_fujairah_stocks:Defining sources to load.
DEBUG:scraper.core.job:download: True, parallel download: True
INFO:scraper.jobs.com_platts_fujairah.platts_fujairah_stocks:Downloading data from https://fujairah.platts.com/fujairah
DEBUG:selenium.webdriver.remote.remote_connection:POST http://127.0.0.1:50283/session/d7951bde187fbdda3e075dcdbee43571/url {"url": "https://fujairah.platts.com/fujairah"}
DEBUG:urllib3.connectionpool:http://127.0.0.1:50283 "POST /session/d7951bde187fbdda3e075dcdbee43571/url HTTP/1.1" 200 14
DEBUG:selenium.webdriver.remote.remote_connection:Finished Request
DEBUG:selenium.webdriver.remote.remote_connection:POST http://127.0.0.1:50283/session/d7951bde187fbdda3e075dcdbee43571/element {"using": "css selector", "value": "[id=\"panel-1074-table\"]"}
DEBUG:urllib3.connectionpool:http://127.0.0.1:50283 "POST /session/d7951bde187fbdda3e075dcdbee43571/element HTTP/1.1" 200 88
DEBUG:selenium.webdriver.remote.remote_connection:Finis

In [6]:
job.get_sources()

INFO:scraper.jobs.com_platts_fujairah.platts_fujairah_stocks:Defining sources to load.


In [8]:
vars(job.sources[0])

{'code': 'platts_fuj_stocks_2021W40',
 'url': 'https://fujairah.platts.com/fujairah',
 'path': 'platts_fuj_stocks_2021W40.csv',
 'long_name': ' Platts Fujairah Stocks - Weekly Data - 2021 Week 40'}

In [9]:
job.download_source(job.sources[0])

INFO:scraper.jobs.com_platts_fujairah.platts_fujairah_stocks:Downloading data from https://fujairah.platts.com/fujairah
DEBUG:selenium.webdriver.remote.remote_connection:POST http://127.0.0.1:61786/session/37f94f0b8a685a366d5482390a202edd/url {"url": "https://fujairah.platts.com/fujairah"}
DEBUG:urllib3.connectionpool:http://127.0.0.1:61786 "POST /session/37f94f0b8a685a366d5482390a202edd/url HTTP/1.1" 200 14
DEBUG:selenium.webdriver.remote.remote_connection:Finished Request
DEBUG:selenium.webdriver.remote.remote_connection:POST http://127.0.0.1:61786/session/37f94f0b8a685a366d5482390a202edd/element {"using": "css selector", "value": "[id=\"panel-1074-table\"]"}
DEBUG:urllib3.connectionpool:http://127.0.0.1:61786 "POST /session/37f94f0b8a685a366d5482390a202edd/element HTTP/1.1" 200 88
DEBUG:selenium.webdriver.remote.remote_connection:Finished Request
DEBUG:selenium.webdriver.remote.remote_connection:POST http://127.0.0.1:61786/session/37f94f0b8a685a366d5482390a202edd/element/4a2c94fa-41

# Initilize the table

Let's create the table.


In [7]:
from scraper.core import factory

job = factory.get_scraper_job('com_platts_fujairah', 'platts_fujairah_stocks', full_load=True)
job.run()

DEBUG:scraper.core.factory:Loading module scraper.jobs.com_platts_fujairah.platts_fujairah_stocks
DEBUG:scraper.core.factory:Getting class PlattsFujairahStocksJob
INFO:scraper.core.job:Temporary table name: #com_platts_fujairah_stocks_temp, final table name: com_platts_fujairah_stocks_data
DEBUG:selenium.webdriver.remote.remote_connection:POST http://127.0.0.1:62077/session {"capabilities": {"firstMatch": [{}], "alwaysMatch": {"browserName": "chrome", "platformName": "any", "goog:chromeOptions": {"prefs": {"download.default_directory": "C:\\Users\\ROSA_L\\PycharmProjects\\scraper\\filestore"}, "extensions": [], "args": ["--headless", "--disable-dev-shm-usage", "window-size=1920x1480"]}}}, "desiredCapabilities": {"browserName": "chrome", "version": "", "platform": "ANY", "goog:chromeOptions": {"prefs": {"download.default_directory": "C:\\Users\\ROSA_L\\PycharmProjects\\scraper\\filestore"}, "extensions": [], "args": ["--headless", "--disable-dev-shm-usage", "window-size=1920x1480"]}}}
D

In [11]:
job.data

Unnamed: 0,fuel,volume,date,unit,date_created,date_modified
0,Light Distillates,5111,2021-09-27,Mbbl,2021-10-04 16:53:37.214780,NaT
1,Middle Distillates,3698,2021-09-27,Mbbl,2021-10-04 16:53:37.214780,NaT
2,Heavy Distillates & Residues,6724,2021-09-27,Mbbl,2021-10-04 16:53:37.214780,NaT


# Test the parallelize()

We need to test the parallelize() method, which seems to not raise exceptions raised in the called function.

### Current problem

The parallelize() function does not raises exceptions raised by the called function:

In [36]:
from concurrent.futures import ThreadPoolExecutor

def parallelize(function, param_list, max_workers=5):
    """
    Parallelizes the execution of the given function.
    :param function: the function to run in parallel.
    :param param_list: the list of parameters for each execution of the function.
    :param max_workers: maximum number of workers (it defaults to 5).
    :return: None
    """
    print(f'Executing function {function.__name__}() over {len(param_list)} items with a maximum of {max_workers} parallel workers.')
    with ThreadPoolExecutor(max_workers) as executor:
        futures = executor.map(function, param_list)           
        #return [result for result in executor.map(function, param_list)]

class TestParallel:
    
    number_of_calls: int = 0
    
    @classmethod
    def test_call(cls, param):
        cls.number_of_calls += 1
        
        if cls.number_of_calls > 5:
            raise Exception("This is the test exception")
        
        print(f"test_call() with parameter: {param}")
        
        return param

params = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
            
test_parallel = TestParallel()

parallelize(test_parallel.test_call, params)

Executing function test_call() over 7 items with a maximum of 5 parallel workers.
test_call() with parameter: a
test_call() with parameter: b
test_call() with parameter: c
test_call() with parameter: d
test_call() with parameter: e


### Solution
The concurrent.futures.map() function raises generated exceptions only when we *consume* the results in the returned *iterator*.
So, if we don't consume the result of a problematic execution, **it will not raise the exception generated by it**.

So we need to explicity consume all the results by building a list comprehension (not a generator comprehension).

The following code gives the expected result:

In [37]:
#from scraper.core.utils import parallelize
from concurrent.futures import ThreadPoolExecutor

def parallelize(function, param_list, max_workers=5):
    """
    Parallelizes the execution of the given function.
    :param function: the function to run in parallel.
    :param param_list: the list of parameters for each execution of the function.
    :param max_workers: maximum number of workers (it defaults to 5).
    :return: None
    """
    print(f'Executing function {function.__name__}() over {len(param_list)} items with a maximum of {max_workers} parallel workers.')
    with ThreadPoolExecutor(max_workers) as executor:
        #futures = executor.map(function, param_list)           
        return [result for result in executor.map(function, param_list)]

class TestParallel:
    
    number_of_calls: int = 0
    
    @classmethod
    def test_call(cls, param):
        cls.number_of_calls += 1
        
        if cls.number_of_calls > 5:
            raise Exception("This is the test exception")
        
        print(f"test_call() with parameter: {param}")
        
        return param

params = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
            
test_parallel = TestParallel()

parallelize(test_parallel.test_call, params)

Executing function test_call() over 7 items with a maximum of 5 parallel workers.
test_call() with parameter: a
test_call() with parameter: b
test_call() with parameter: c
test_call() with parameter: d
test_call() with parameter: e


Exception: This is the test exception

In [None]:
from scraper.core.utils import get_country_dict

country_list = get_country_dict()

print(f"country list: {country_list}")
print("\n")

print(f"country list values: {country_list.values()}")

In [46]:
"Aruba" in country_list.values()

False

In [66]:
import functools
import pycountry
from timeit import timeit

@functools.lru_cache(maxsize=1)
def get_country_dict():
    country_list = list(pycountry.countries)
    country_name_dict = {country.name: country.alpha_3 for country in country_list}
    country_name_begin_dict = {country.name.split(',')[0]: country.alpha_3 for country in country_list}
    country_alpha_2_dict = {country.alpha_2: country.alpha_3 for country in country_list}
    country_official_name_dict = {}

    # Build official name dictionary in try except, in order to skip countries with no official name
    for country in country_list:
        if 'official_name' in country.__dict__['_fields']:
            country_official_name_dict[country.official_name] = country.alpha_3

    country_dict = {'name': country_name_dict,
                    'alpha_2': country_alpha_2_dict,
                    'official_name': country_official_name_dict,
                    'name_begin': country_name_begin_dict
                    }
    return country_dict

# COUNTRY_DICT = get_country_dict()

def get_country_iso3(country_field):
    for country_mapping in get_country_dict().values():
        if country_field in country_mapping:        
            return country_mapping[country_field]
    return country_field

get_country_iso3('Brazil')
get_country_iso3('Brazil')
get_country_iso3('Aruba')
get_country_iso3('Senegal')

get_country_dict.cache_info()

CacheInfo(hits=3, misses=1, maxsize=1, currsize=1)