# Scrapper for Monthly Refinery data - Russia

Scrapper intended to get monthly refinery information for primary and secondary oil products for Russia.

# Requirements/limitations

This script runs currently only on Windows (necessary to convert Excel file from .xlsx to .xls for Pandas performance).

## Data source

Argus fundamentals file "Russian refinery output.xlsx".

## How to use

1. run the script : it will generate a .xls and .csv file on the same directory
1. import data into an excel file using PowerQuery and pivot *Period* column using Value as value.




In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%cd ..

C:\Users\ROSA_L\PycharmProjects\scraper


In [3]:
from scraper.core import factory
from scraper.core.dimension import Updater
import logging

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.basicConfig(level=logging.DEBUG)

In [17]:
ru = factory.get_scraper_job('com_argusmedia', 'ru_ref_output')

DEBUG:scraper.core.factory:Loading module scraper.jobs.com_argusmedia.ru_ref_output
DEBUG:scraper.core.factory:Getting class RuRefOutputJob
DEBUG:scraper.jobs.com_argusmedia.ru_ref_output:loading product mapping C:\Users\ROSA_L\PycharmProjects\scraper\scraper\jobs\com_argusmedia\product_mapping.csv
DEBUG:scraper.jobs.com_argusmedia.ru_ref_output:Mapping loaded: 61 rows and 3 columns.
DEBUG:scraper.jobs.com_argusmedia.ru_ref_output:loading product mapping C:\Users\ROSA_L\PycharmProjects\scraper\scraper\jobs\com_argusmedia\entity_mapping.csv
DEBUG:scraper.jobs.com_argusmedia.ru_ref_output:Mapping loaded: 103 rows and 3 columns.


In [18]:
ru.run(download=False)

DEBUG:scraper.core.job:remove_existing_dynamic_dim: query - http://vipenta.iea.org:8000/dimension/source
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): vipenta.iea.org:8000
DEBUG:urllib3.connectionpool:http://vipenta.iea.org:8000 "GET /dimension/source HTTP/1.1" 200 770751
DEBUG:scraper.core.job:self.dynamic_dim['source'] size before: 1
DEBUG:scraper.core.job:self.dynamic_dim['source'] size after: 0
INFO:scraper.core.utils:download_and_get_checksum: 153.99909019470215 ms
DEBUG:scraper.core.job:rm_sources_up_to_date: processing com_argusmedia_ru_ref_output
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): vipenta.iea.org:8000
DEBUG:urllib3.connectionpool:http://vipenta.iea.org:8000 "GET /dimension/source?code=com_argusmedia_ru_ref_output HTTP/1.1" 200 772
INFO:scraper.core.utils:rm_sources_up_to_date: 41.002750396728516 ms
DEBUG:scraper.jobs.com_argusmedia.ru_ref_output:Transforming data...
DEBUG:scraper.jobs.com_argusmedia.ru_ref_output:Loading provider ...

In [7]:
for x in ['Company/refinery', 'Crude throughput', "Crude throughput '000 b/d", "±% Mar ('000 b/d)", 'Crude throughput YTD', "Crude throughput YTD '000 b/d", '±% YTD 2019', 'Naphtha', '±% Mar', 'Naphtha YTD', '±% YTD 2019.1', 'Jet-kerosine', '±% Mar.1', 'Jet-kerosine YTD', '±% YTD 2019.2']:
    print(x)
    print("±%" not in x)

Company/refinery
True
Crude throughput
True
Crude throughput '000 b/d
True
±% Mar ('000 b/d)
False
Crude throughput YTD
True
Crude throughput YTD '000 b/d
True
±% YTD 2019
False
Naphtha
True
±% Mar
False
Naphtha YTD
True
±% YTD 2019.1
False
Jet-kerosine
True
±% Mar.1
False
Jet-kerosine YTD
True
±% YTD 2019.2
False


In [21]:
for x in ['entity', 'detail']:
    ru_upd = Updater(RussianRefOutputJob,x)
    ru_upd.run(download=False)

2019-05-14 18:41:27,676 - scraper.core.utils - DEBUG: loading product mapping C:\Users\ROSA_L\PycharmProjects\scrapper\scraper\jobs\com_argusmedia\product_mapping.csv
2019-05-14 18:41:27,682 - scraper.core.utils - DEBUG: Mapping loaded: 61 rows and 3 columns.
2019-05-14 18:41:27,683 - scraper.core.utils - DEBUG: loading product mapping C:\Users\ROSA_L\PycharmProjects\scrapper\scraper\jobs\com_argusmedia\entity_mapping.csv
2019-05-14 18:41:27,688 - scraper.core.utils - DEBUG: Mapping loaded: 103 rows and 3 columns.
2019-05-14 18:41:27,865 - scraper.core.utils - INFO: download_and_get_checksum: 176.12910270690918 ms
2019-05-14 18:41:27,866 - scraper.core.utils - DEBUG: Transforming data...
2019-05-14 18:41:27,867 - scraper.core.utils - DEBUG: Loading provider ...
2019-05-14 18:41:27,867 - scraper.core.utils - DEBUG: Adding provider to dynamic_dim: COM_ARGUSMEDIA
2019-05-14 18:41:27,868 - scraper.core.utils - INFO: Ignoring the following sheets: []
2019-05-14 18:41:32,818 - scraper.core.u

In [60]:
entity_df = ru.mappings['entity']\
                        .drop_duplicates(subset='short_name')\
                        .rename(columns={"short_name": "code"})\
                        .assign(category="refinery")

entity_df["meta_data"] = entity_df.apply(lambda x: {'url': x['detail_url']}, axis=1)
entity_df.drop(columns=["detail_url"], inplace=True)
entity_df.to_dict('records')

Int64Index([40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
            ...
            70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
           dtype='int64', length=120)

In [15]:
detail_df = ru.mappings['product'] \
                .drop_duplicates(subset=['detail_code']) \
                .assign(category="REFINERY_PRODUCTS")
detail_df = detail_df.query("detail_code != 'TOTAL'")

detail_df['code'] = "COM_ARGUSMEDIA_" + detail_df["detail_code"]
detail_df["json"] = detail_df.apply(lambda x: {'detail': 'None'}, axis=1)
detail_df.rename(columns={"detail_desc": "description"}, inplace=True)
detail_df.drop(columns=['product_code', 'detail_code'], inplace=True)

detail_df

Unnamed: 0_level_0,description,category,code,json
argus_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FUEL OIL 0.5%,Fuel Oil 0.5%,REFINERY_PRODUCTS,COM_ARGUSMEDIA_FUELOIL05,{'detail': 'None'}
FUEL OIL 1%,Fuel Oil 1.0%,REFINERY_PRODUCTS,COM_ARGUSMEDIA_FUELOIL10,{'detail': 'None'}
FUEL OIL 1.5%,Fuel Oil 1.5%,REFINERY_PRODUCTS,COM_ARGUSMEDIA_FUELOIL15,{'detail': 'None'}
FUEL OIL 2%,Fuel Oil 2.0%,REFINERY_PRODUCTS,COM_ARGUSMEDIA_FUELOIL20,{'detail': 'None'}
FUEL OIL 2.5%,Fuel Oil 2.5%,REFINERY_PRODUCTS,COM_ARGUSMEDIA_FUELOIL25,{'detail': 'None'}
FUEL OIL 3%,Fuel Oil 3.0%,REFINERY_PRODUCTS,COM_ARGUSMEDIA_FUELOIL30,{'detail': 'None'}
FUEL OIL 3.5%,Fuel Oil 3.5%,REFINERY_PRODUCTS,COM_ARGUSMEDIA_FUELOIL35,{'detail': 'None'}
GASOIL 0.001%,Gasoil 0.001%,REFINERY_PRODUCTS,COM_ARGUSMEDIA_GASOIL0001,{'detail': 'None'}
GASOIL 0.005%,Gasoil 0.005%,REFINERY_PRODUCTS,COM_ARGUSMEDIA_GASOIL0005,{'detail': 'None'}
GASOIL 0.035%,Gasoil 0.035%,REFINERY_PRODUCTS,COM_ARGUSMEDIA_GASOIL0035,{'detail': 'None'}


In [10]:
[ x['code'] for x in ru.dynamic_dim['detail']]

['COM_ARGUSMEDIA_FUELOIL05',
 'COM_ARGUSMEDIA_FUELOIL10',
 'COM_ARGUSMEDIA_FUELOIL15',
 'COM_ARGUSMEDIA_FUELOIL20',
 'COM_ARGUSMEDIA_FUELOIL25',
 'COM_ARGUSMEDIA_FUELOIL30',
 'COM_ARGUSMEDIA_FUELOIL35',
 'COM_ARGUSMEDIA_GASOIL0001',
 'COM_ARGUSMEDIA_GASOIL0005',
 'COM_ARGUSMEDIA_GASOIL0035',
 'COM_ARGUSMEDIA_GASOIL005',
 'COM_ARGUSMEDIA_GASOIL01',
 'COM_ARGUSMEDIA_GASOIL02',
 'COM_ARGUSMEDIA_GASOIL05',
 'COM_ARGUSMEDIA_WINGASOIL',
 'COM_ARGUSMEDIA_GASOLINEA76A80',
 'COM_ARGUSMEDIA_GASOLINEA92',
 'COM_ARGUSMEDIA_GASOLINEA95',
 'COM_ARGUSMEDIA_GASOLINEA98',
 'COM_ARGUSMEDIA_GASOLINEA92PLUS',
 'COM_ARGUSMEDIA_OTHGAS',
 'COM_ARGUSMEDIA_OTHGAS',
 'COM_ARGUSMEDIA_VGO']

In [11]:
ru.mappings['product']

Unnamed: 0_level_0,product_code,detail_code,detail_desc
argus_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CRUDE THROUGHPUT ’000 B/D,CRUDEOIL,TOTAL,Crude Throughput (Kbd)
CRUDE THROUGHPUT '000 B/D,CRUDEOIL,TOTAL,Crude Throughput (Kbd)
FUEL OIL,FUELOIL,TOTAL,Fuel Oil
FUEL OIL 0.5%,FUELOIL,FUELOIL05,Fuel Oil 0.5%
FUEL OIL 1%,FUELOIL,FUELOIL10,Fuel Oil 1.0%
FUEL OIL 1.0%,FUELOIL,FUELOIL10,Fuel Oil 1.0%
FUEL OIL 1.5%,FUELOIL,FUELOIL15,Fuel Oil 1.5%
FUEL OIL 2%,FUELOIL,FUELOIL20,Fuel Oil 2.0%
FUEL OIL 2.0%,FUELOIL,FUELOIL20,Fuel Oil 2.0%
FUEL OIL 2.5%,FUELOIL,FUELOIL25,Fuel Oil 2.5%


In [11]:
s = set([x['product'] for x in ru.data])
s

{'CRUDEOIL',
 'GASDIES',
 'JETKERO',
 'MOTORGAS',
 'NAPHTHA',
 'OTHGASOIL',
 'OTHPRODS',
 'RESFUEL'}

In [49]:
import pandas as pd
df = pd.DataFrame(ru.data)

In [227]:
len(df)

485

In [228]:
df2 = df.drop_duplicates()

In [229]:
len(df2)

485

In [380]:
dfg = df.groupby(['period', 'entity', 'product','product_detail']).filter(lambda g: len(g) > 1).groupby(['period', 'entity', 'product','product_detail']).size().sort_values(ascending=False)

In [381]:
dfg

Series([], dtype: int64)

In [182]:
dfg.index.get_level_values(0).unique()

Index([], dtype='object', name='period')

In [146]:
df.query("period == 'SEP2013' and entity == 'TOTAL' and product == 'GASDIES' ")

Unnamed: 0,area,detail,entity,flow,frequency,original,period,product,provider,source,unit,value
27540,RUSSIA,,TOTAL,REFGROUT,Monthly,True,SEP2013,GASDIES,COM_ARGUSMEDIA,com_argusmedia_ru_ref_output,KBD,5758.4
27574,RUSSIA,COM_ARGUSMEDIA_GASOIL0001,TOTAL,REFGROUT,Monthly,True,SEP2013,GASDIES,COM_ARGUSMEDIA,com_argusmedia_ru_ref_output,KBD,2178.8
27592,RUSSIA,COM_ARGUSMEDIA_GASOIL0005,TOTAL,REFGROUT,Monthly,True,SEP2013,GASDIES,COM_ARGUSMEDIA,com_argusmedia_ru_ref_output,KBD,532.5
27599,RUSSIA,COM_ARGUSMEDIA_GASOIL0035,TOTAL,REFGROUT,Monthly,True,SEP2013,GASDIES,COM_ARGUSMEDIA,com_argusmedia_ru_ref_output,KBD,1438.2
27614,RUSSIA,COM_ARGUSMEDIA_GASOIL005,TOTAL,REFGROUT,Monthly,True,SEP2013,GASDIES,COM_ARGUSMEDIA,com_argusmedia_ru_ref_output,KBD,6.9
27617,RUSSIA,COM_ARGUSMEDIA_GASOIL01,TOTAL,REFGROUT,Monthly,True,SEP2013,GASDIES,COM_ARGUSMEDIA,com_argusmedia_ru_ref_output,KBD,84.1
27621,RUSSIA,COM_ARGUSMEDIA_GASOIL02,TOTAL,REFGROUT,Monthly,True,SEP2013,GASDIES,COM_ARGUSMEDIA,com_argusmedia_ru_ref_output,KBD,705.2
27636,RUSSIA,COM_ARGUSMEDIA_GASOIL0035,TOTAL,REFGROUT,Monthly,True,SEP2013,GASDIES,COM_ARGUSMEDIA,com_argusmedia_ru_ref_output,KBD,139.6
27639,RUSSIA,COM_ARGUSMEDIA_GASOIL005,TOTAL,REFGROUT,Monthly,True,SEP2013,GASDIES,COM_ARGUSMEDIA,com_argusmedia_ru_ref_output,KBD,419.1
27660,RUSSIA,COM_ARGUSMEDIA_GASOIL01,TOTAL,REFGROUT,Monthly,True,SEP2013,GASDIES,COM_ARGUSMEDIA,com_argusmedia_ru_ref_output,KBD,121.3


In [103]:
df.query("period == 'JAN2013' and entity == 'PERM' and product == 'JETKERO' and detail == 'None'")

Unnamed: 0,area,detail,entity,flow,frequency,original,period,product,provider,source,unit,value
24889,RUSSIA,,PERM,REFGROUT,Monthly,True,JAN2013,JETKERO,COM_ARGUSMEDIA,com_argusmedia_ru_ref_output,KBD,43.6
24912,RUSSIA,,PERM,REFGROUT,Monthly,True,JAN2013,JETKERO,COM_ARGUSMEDIA,com_argusmedia_ru_ref_output,KBD,235.3


In [97]:
df.query("period == 'JAN2017' and entity == 'TOTAL' and product == 'RESFUEL' and detail== 'COM_ARGUSMEDIA_FUELOIL25'")

Unnamed: 0,area,detail,entity,flow,frequency,original,period,product,provider,source,unit,value
46988,RUSSIA,COM_ARGUSMEDIA_FUELOIL25,TOTAL,REFGROUT,Monthly,True,JAN2017,RESFUEL,COM_ARGUSMEDIA,com_argusmedia_ru_ref_output,KBD,4993.4
47040,RUSSIA,COM_ARGUSMEDIA_FUELOIL25,TOTAL,REFGROUT,Monthly,True,JAN2017,RESFUEL,COM_ARGUSMEDIA,com_argusmedia_ru_ref_output,KBD,703.2


In [222]:
ru.get_sources()

In [379]:
import os
from scraper.settings import FILE_STORE_PATH

xls_path = os.path.join(FILE_STORE_PATH, ru.sources[0].path[:-1])
# sheets_to_process = ru.select_monthly_worksheets(xls_path, [])
sheets_to_process = ['Jan 2013']
df = ru.process_excel_file(xls_path, sheets_to_process)

2019-05-16 16:47:58,155 - scraper.core.utils - INFO: processing Jan 2013
2019-05-16 16:48:03,129 - scraper.core.utils - DEBUG: Cleaning data...
2019-05-16 16:48:03,130 - scraper.core.utils - DEBUG: Jan 2013: Replacing 'Company/refinery ’000t' by 'Company/refinery'
2019-05-16 16:48:03,136 - scraper.core.utils - DEBUG: breaking into tables ...
2019-05-16 16:48:03,139 - scraper.core.utils - DEBUG: melting and concatenating data ...
2019-05-16 16:48:03,236 - scraper.core.utils - DEBUG: adding period ...
2019-05-16 16:48:03,243 - scraper.core.utils - DEBUG: Filtering out rows where product contains '±%' or is in ['CRUDE THROUGHPUT ’000 B/D', '10PPM DIESEL  JAN 12', 'CRUDE THROUGHPUT'000 B/D']
2019-05-16 16:48:03,285 - scraper.core.utils - DEBUG: columns before join with product: ['period', 'entity', 'product', 'value']
2019-05-16 16:48:03,286 - scraper.core.utils - DEBUG: columns after join with product: ['period', 'entity', 'product', 'value', 'product_code', 'detail_code', 'detail_desc']


In [361]:
ROWS_TO_SKIP=5
df = pd.read_excel(xls_path, sheet_name="Jan 2017", skiprows=ROWS_TO_SKIP)
#df = ru.process_worksheet(xls_path, "Jul 2013")

In [257]:
df.columns

Index(['Company/refinery', 'Crude throughputs', 'Crude throughput ’000 b/d',
       '±% Dec', 'Crude throughput YTD', 'Crude throughput YTD ’000 b/d',
       '±% YTD 2012', 'Naphtha', '±% Dec.1', 'Naphtha YTD', '±% YTD 2012.1',
       'Gasoline', '±% Dec.2', 'Gasoline YTD', '±% YTD 2012.2', 'Unnamed: 15',
       'Unnamed: 16'],
      dtype='object')

In [172]:
df.rename(columns={'Other motor gasoline.1': 'Other motor gasoline YTD'}, inplace=True)

In [292]:
# df.iloc[204, 3]
df.iloc[:,3].replace('Total fuel oil', 'Total fuel oil YTD', inplace=True)

In [293]:
df.iloc[204,]

Company/refinery                   Company/refinery
Crude throughput                     Total fuel oil
Crude throughput '000 b/d                    ±% Sep
±% Sep                           Total fuel oil YTD
Crude throughput YTD                    ±% YTD 2012
Crude throughput YTD '000 b/d         Fuel oil 0.5%
±% YTD 2012                                  ±% Sep
Naphtha                          Fuel oil 0.5%, YTD
±% Sep.1                                ±% YTD 2012
Naphtha YTD                           Fuel oil 1.0%
±% YTD 2012.1                                ±% Sep
Jet-kerosine                     Fuel oil 1.0%, YTD
±% Sep.2                                ±% YTD 2012
Jet-kerosine YTD                      Fuel oil 1.5%
±% YTD 2012.2                                ±% Sep
Unnamed: 15                      Fuel oil 1.5%, YTD
Unnamed: 16                             ±% YTD 2012
Name: 204, dtype: object

In [205]:
df["product"].replace('10PPM DIESEL  JAN 12',"GASOIL 0.001%", inplace=True)

In [375]:
df.query("detail_desc == 'Gasoil 0.1%'")

Unnamed: 0,period,entity,entity_desc,entity_url,product,product_detail,detail_desc,value
740,SEP2013,TOTAL,Total Russia,,GASDIES,GASOIL01,Gasoil 0.1%,84.1
747,SEP2013,LUKOIL,Lukoil,http://www.lukoil.com/,GASDIES,GASOIL01,Gasoil 0.1%,-2.6
751,SEP2013,NNOVGOROD,Lukoil - Nizhny Novgorod Refinery,http://www.lukoil.com/Business/Downstream/OilR...,GASDIES,GASOIL01,Gasoil 0.1%,-2.6
763,SEP2013,ORSK,"Orsk Refinery (former Onaco, Orsknefteorgsintez)",http://www.ornpz.ru/,GASDIES,GASOIL01,Gasoil 0.1%,86.7
740,SEP2013,TOTAL,Total Russia,,GASDIES,GASOIL01,Gasoil 0.1%,121.3
741,SEP2013,BASHNEFTEKHIM,Bashneftekhim,,GASDIES,GASOIL01,Gasoil 0.1%,34.9
744,SEP2013,UFANEFTEKHIM,Bashneft - Ufaneftekhim,,GASDIES,GASOIL01,Gasoil 0.1%,34.9
747,SEP2013,LUKOIL,Lukoil,http://www.lukoil.com/,GASDIES,GASOIL01,Gasoil 0.1%,15.8
748,SEP2013,UKHTA,Lukoil - Ukhta,http://unp.lukoil.ru/ru,GASDIES,GASOIL01,Gasoil 0.1%,-0.2
749,SEP2013,PERM,Lukoil - Perm Refinery,http://www.lukoil.com/Business/Downstream/OilR...,GASDIES,GASOIL01,Gasoil 0.1%,16.0


In [362]:
start_index = df.loc[df.iloc[:, 1] == "Fuel oil 2.5%"].index
end_index = df.loc[df["Company/refinery"] == "Comparisons based on average daily volumes"].index
print (le)

2


In [349]:
import numpy as np
df.iloc[start_index[-1] + 1:end_index[-1] + 1, 1] = np.nan

In [382]:
df

Unnamed: 0,period,entity,entity_desc,entity_url,product,product_detail,detail_desc,value
0,JAN2013,TOTAL,Total Russia,,CRUDEOIL,TOTAL,Crude Throughput,23092.4
1,JAN2013,BASHNEFTEKHIM,Bashneftekhim,,CRUDEOIL,TOTAL,Crude Throughput,1732.3
2,JAN2013,NOVOIL,Bashneft - Novoil Refinery,http://www.bashneft.com/,CRUDEOIL,TOTAL,Crude Throughput,514.4
3,JAN2013,UFA,Bashneft - Ufa Oil Refinery,,CRUDEOIL,TOTAL,Crude Throughput,558.7
4,JAN2013,UFANEFTEKHIM,Bashneft - Ufaneftekhim,,CRUDEOIL,TOTAL,Crude Throughput,659.2
5,JAN2013,MOSCOW,Gazprom neft - Moscow Refinery,https://www.gazprom-neft.com/company/business/...,CRUDEOIL,TOTAL,Crude Throughput,810.4
6,JAN2013,GAZPROM,Gazprom,,CRUDEOIL,TOTAL,Crude Throughput,533.2
7,JAN2013,LUKOIL,Lukoil,http://www.lukoil.com/,CRUDEOIL,TOTAL,Crude Throughput,3935.2
8,JAN2013,UKHTA,Lukoil - Ukhta,http://unp.lukoil.ru/ru,CRUDEOIL,TOTAL,Crude Throughput,342.4
9,JAN2013,PERM,Lukoil - Perm Refinery,http://www.lukoil.com/Business/Downstream/OilR...,CRUDEOIL,TOTAL,Crude Throughput,1128.5


In [386]:
a = None

if a:
    print(a)

## Fix Authentication

Argus media changed recently their authentication schema to their website.
Now it seems that authentication is done through an API whose access is limited to IP address from their servers.

To bypass this limitation, we should try to:

* authenticate to their website through selenium
* recover all cookies from selenium into a requests.session
* download the file in the usual way with s.get()

Requirement for using selenium for automating browser interaction:

install browser driver: https://sites.google.com/a/chromium.org/chromedriver/downloads
have Chrome browser installed
If you have a problem while starting selenium, it's likely your Chrome has been upgraded. Check the link above for new version of the webdriver, put it at ..\drivers and try again.

In [2]:
%cd ..

C:\Users\ROSA_L\PycharmProjects\scraper


In [3]:
%load_ext autoreload
%autoreload 2

In [15]:
from scraper.jobs.utils import get_driver
from scraper.settings import ARGUS_USERNAME, ARGUS_PASSWORD

AUTH_URL = "https://myaccount.argusmedia.com/login"
driver = get_driver(headless=False)
driver.get(AUTH_URL)


In [20]:
from scraper.settings import ARGUS_USERNAME, ARGUS_PASSWORD
driver.find_element_by_id('username').send_keys(ARGUS_USERNAME)
driver.find_element_by_id('password').send_keys(ARGUS_PASSWORD)

In [21]:
driver.find_element_by_class_name('btn').click()

In [22]:
import requests

BASE_URL = "https://direct.argusmedia.com/DataAndDownloads/DownloadFile/3666"

headers = {
"User-Agent":
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
s = requests.session()
# add a header similar to the selenium one
s.headers.update(headers)

# add all cookies from selenium session into this one
s.cookies.update({c['name']:c['value'] for c in driver.get_cookies()})

response = s.get(BASE_URL)

print(response.ok)

True


In [4]:
from scraper.jobs.utils import get_driver, wait_file
from scraper.settings import ARGUS_USERNAME, ARGUS_PASSWORD, FILE_STORE_PATH
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
import requests

AUTH_URL = "https://myaccount.argusmedia.com/login"
BASE_URL = "https://direct.argusmedia.com/DataAndDownloads/DownloadFile/3666"
HEADERS = {
"User-Agent":
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}

with get_driver(headless=False) as driver:
    logger.info(f'Authenticating to {AUTH_URL} with {ARGUS_USERNAME}...')
    driver.get(AUTH_URL)

    logger.debug(f'Entering username and password and clicking the button...')
    driver.find_element_by_id('username').send_keys(ARGUS_USERNAME)
    driver.find_element_by_id('password').send_keys(ARGUS_PASSWORD)
    driver.find_element_by_class_name('btn').click()
      
    file_path = FILE_STORE_PATH / 'Russian refinery output.xlsx'
    try:
        file_path.unlink()
    except FileNotFoundError as e:
        pass
    
    driver.get(BASE_URL)
    wait_file(file_path, 5, 300)
    new_path = file_path.with_name('com_argusmedia_ru_ref_output.xlsx')
    file_path.replace(new_path)
    

File C:\Users\ROSA_L\PycharmProjects\scraper\filestore\Russian refinery output.xlsx available. Waited 25 seconds.


In [None]:
print(BASE_URL)
response = s.get(BASE_URL)
print(response.ok)
response.content

In [15]:
s.get()

NameError: name 'response' is not defined

In [23]:
driver.close()

In [None]:
driver