# Scraper for Annual Refinery Data - Russia

Scraper that extracts annual refinery information on primary and secondary oil products for Russia.

Data source: Argus fundamentals file for Russia.

## How to use

1. Manually convert the Argus file from *.xlsx to *.xls.
2. Run the script: it will generate a CSV file in the same directory.
3. Import the data into an Excel file using Power Query and pivot the *Period* column, using *Value* as the values column.




In [None]:
import pandas as pd
import datetime

# 
# selects the worksheets used for annual purposes from argus excel file:
# + all worksheets starting with 'Dec'
# + the last worksheet (if it does not start with 'Dec')
#
def select_annual_worksheets(excelpath):
    """Return the names of the worksheets used for annual purposes.

    The selection is every worksheet whose name starts with ``'Dec'``, plus
    the last worksheet of the workbook when it does not start with ``'Dec'``.

    Args:
        excelpath: Path of the Excel workbook to inspect.

    Returns:
        list[str]: The selected worksheet names, in workbook order. Empty when
        the workbook exposes no sheet names.
    """
    # The context manager releases the workbook handle as soon as the sheet
    # names have been read.
    with pd.ExcelFile(excelpath) as xl:
        sheet_names = list(xl.sheet_names)

    # select all december months
    selectedWks = [name for name in sheet_names if name.startswith('Dec')]

    # adding latest month of the last year (if not already included)
    if sheet_names and not sheet_names[-1].startswith("Dec"):
        selectedWks.append(sheet_names[-1])

    return selectedWks

#
# Filter out empty rows and non-data rows
#
def clean_data(df):
    """Return a copy of *df* without empty rows and without title rows.

    Rows are removed when every cell is empty, or when the
    ``"Company/refinery"`` cell contains ``"Russian refinery output"`` or
    ``"Comparisons"``. The input frame is left untouched.

    Args:
        df: Raw worksheet data with a ``"Company/refinery"`` column.

    Returns:
        pandas.DataFrame: The filtered rows, keeping their original index.
    """
    name_col = "Company/refinery"

    # drop empty rows (not in place, so the caller's frame is not modified)
    cleaned = df.dropna(how='all')

    # na=False: a row with a missing name but real values is not a title row;
    # without it the NaN produced by str.contains breaks the ~ negation.
    for title in ("Russian refinery output", "Comparisons"):
        is_title = cleaned[name_col].str.contains(title, na=False, regex=False)
        cleaned = cleaned[~is_title]
    return cleaned

# 
# Splits the worksheet into separate data frames at each title line.
# Returns a dictionary of data frames.
#
def break_into_tables(df):
    """Split a worksheet frame into one frame per embedded table.

    A new table starts at every row whose ``"Company/refinery"`` cell equals
    the literal ``"Company/refinery"`` (a repeated header line). The rows
    before the first such line form table ``0`` and keep the columns of *df*;
    every later table takes its column names from its header line, which is
    then removed from the data.

    Args:
        df: Worksheet data with a ``"Company/refinery"`` column.

    Returns:
        dict[int, pandas.DataFrame]: Tables keyed ``0 .. n`` in sheet order,
        where ``n`` is the number of repeated header lines. A sheet without
        any repeated header line yields ``{0: <copy of df>}``.
    """
    header = "Company/refinery"

    # Positions (not labels) of the header lines, so the split does not
    # depend on the index starting at 0 or being sorted.
    hits = df[header].eq(header).tolist()
    break_positions = [pos for pos, hit in enumerate(hits) if hit is True]

    # Table 0: everything above the first repeated header line.
    first_end = break_positions[0] if break_positions else len(df)
    tableDic = {0: df.iloc[:first_end].copy()}

    # Each remaining table runs from its header line to the next one (or EOF).
    bounds = break_positions + [len(df)]
    for num, (start, end) in enumerate(zip(bounds[:-1], bounds[1:]), start=1):
        table = df.iloc[start:end].copy()
        table.columns = table.iloc[0].values  # promote header line to columns
        tableDic[num] = table.iloc[1:]

    return tableDic

#
# Unpivot each data frame of the dictionary and concat them.
# It returns the resulting data frame.
#
def melt_and_concat(tableDic):
    """Unpivot every table of *tableDic* and stack the results.

    For each table, columns that are entirely empty are ignored and all
    remaining columns except ``"Company/refinery"`` are melted into
    ``"Product"`` / ``"Value"`` pairs. The tables passed in are not modified.

    Args:
        tableDic: Mapping whose values are data frames that each have a
            ``"Company/refinery"`` column.

    Returns:
        pandas.DataFrame: Concatenation of the melted tables with columns
        ``"Company/refinery"``, ``"Product"`` and ``"Value"``.
    """
    id_col = "Company/refinery"
    melted_tables = []
    for table in tableDic.values():
        # drop empty columns on a new frame instead of mutating the input
        trimmed = table.dropna(how='all', axis=1)

        # every remaining column but the identifier gets unpivoted
        value_cols = [col for col in trimmed.columns if col != id_col]

        melted_tables.append(
            trimmed.melt(
                id_vars=[id_col],
                value_vars=value_cols,
                var_name="Product",
                value_name="Value",
            )
        )

    # concat all data frames
    return pd.concat(melted_tables)

#
# It returns a data frame with a Period column added as the 1st column
#
def add_period(df, period):
    """Return a copy of *df* with a ``'Period'`` column as the first column.

    Args:
        df: Source data frame; it is not modified.
        period: Value written to every row of the ``'Period'`` column.

    Returns:
        pandas.DataFrame: New frame whose first column is ``'Period'``,
        followed by the other columns of *df* in their original order. An
        existing ``'Period'`` column is replaced.
    """
    # drop() returns a new frame, so the caller's frame (often a filtered
    # slice) is never written to; errors='ignore' covers the usual case where
    # there is no 'Period' column yet.
    result = df.drop(columns=['Period'], errors='ignore')

    # Put 'Period' at first
    result.insert(0, 'Period', period)
    return result

#
# process each selected worksheet of the excel file.
#
def process_worksheet(wks, path=None):
    """Read one worksheet and return its year-to-date figures in long format.

    The sheet is read skipping its first 5 rows, cleaned, split into tables,
    unpivoted, restricted to products whose label ends with ``'YTD'`` and
    tagged with a ``'Period'`` column holding the worksheet name. Leading
    spaces and non-breaking spaces are removed from ``"Company/refinery"``.
    Progress and duration are printed.

    Args:
        wks: Name of the worksheet to process; also used as the period value.
        path: Workbook to read. Defaults to the module-level ``excelpath``.

    Returns:
        pandas.DataFrame: The processed rows for this worksheet.
    """
    start = datetime.datetime.now()
    print('processing ' + wks)

    # Fall back to the notebook-level path so existing one-argument calls
    # keep working.
    source = excelpath if path is None else path
    df = pd.read_excel(source, sheet_name=wks, skiprows=5)

    # apply transformations
    cleanedDf = clean_data(df)
    tableDict = break_into_tables(cleanedDf)
    concatDf = melt_and_concat(tableDict)

    # keep only columns ending with 'YTD'; na=False treats missing or
    # non-string product labels as "not YTD" instead of breaking the mask
    isYTD = concatDf["Product"].str.endswith('YTD', na=False)
    dfYTD = concatDf[isYTD]

    # add period; work on an explicit copy so the assignment below never
    # writes into a slice of another frame
    periodDf = add_period(dfYTD, wks).copy()

    # standardize first column: strip any mix of leading NBSP and spaces
    periodDf["Company/refinery"] = periodDf["Company/refinery"].str.lstrip("\xa0 ")

    end = datetime.datetime.now()
    print(f"Duration for {wks}: {str(end - start)}")
    return periodDf


#
#  Main
#

# Folder and file name of the Argus workbook (already converted to *.xls).
# Alternative locations kept for reference:
#exceldir = 'G:\OMRrefinery\MTOMR\MTOMR 2019\Refining Model\Russia'
#filename = 'Russian refinery output.xlsx'
exceldir = 'C:\\Users\\ROSA_L\\Downloads'
filename = 'Russian refinery output.xls'
excelpath = '\\'.join([exceldir, filename])

print(f'Processing {excelpath}')

# Run the per-worksheet pipeline on every annual worksheet ...
listOfWksDfs = list(map(process_worksheet, select_annual_worksheets(excelpath)))

# ... and stack the per-worksheet results into a single frame.
fullDf = pd.concat(listOfWksDfs)

# The CSV is written next to the workbook, without the index column.
csvFile = "Russia_refinery_annual.csv"
csvPath = '\\'.join([exceldir, csvFile])

fullDf.to_csv(csvPath, index=False)
print(f'Done: file {csvPath} exported.')

In [8]:
import requests

# Download T02_02_02-1.xls from eppo.go.th. The timeout (in seconds) keeps the
# cell from hanging indefinitely when the server does not answer; the status
# code is left for the caller to inspect rather than raised on.
r = requests.get(
    "http://www.eppo.go.th/epposite/images/Energy-Statistics/energyinformation/Energy_Statistics/Petroleum/T02_02_02-1.xls",
    timeout=30,
)

In [12]:
# Display the HTTP status code of the response `r` obtained by the requests.get call.
f"The status code is: {r.status_code}"

'The status code is: 200'

In [97]:
import pandas as pd

# Lookup tables for entities and products, each indexed by its Argus name.
mappings = {
    mapping: pd.read_csv(
        f"C:\\Users\\ROSA_L\\Downloads\\Russia\\201901\\{mapping}_mapping.csv",
        encoding='windows-1252',
    ).set_index('argus_name')
    for mapping in ['entity', 'product']
}

# Load the refinery output, lowercase the column names, rename the
# company/refinery column to 'entity' and attach the product mapping.
df = (
    pd.read_csv(f"C:\\Users\\ROSA_L\\Downloads\\Russia\\201901\\Russian_refinery_output.csv")
    .rename(str.lower, axis='columns')
    .rename(columns={'company/refinery': 'entity'})
    .join(mappings['product'], on='product', rsuffix='_product')
)

# Products with no match in the product mapping (one row per period/product).
df_rej_products = df.loc[df["short_name"].isnull(), ['period', 'product']].drop_duplicates()

# Attach the entity mapping; its clashing columns get the '_entity' suffix.
df = df.join(mappings['entity'], on='entity', rsuffix='_entity')

# Entities with no match in the entity mapping (one row per period/entity).
df_rej_entities = df.loc[df["short_name_entity"].isnull(), ['period', 'entity']].drop_duplicates()

# Keep only the rows that matched both mappings.
df = df[df["short_name"].notnull() & df["short_name_entity"].notnull()]
df

Unnamed: 0,period,entity,product,value,short_name,detail_short,detail_long,short_name_entity,detail_long_name,detail_url
0,Jan-03,TOTAL FOR RUSSIA,CRUDE THROUGHPUT ’000 B/D,3613.03,CRUDE,TOTAL,Crude Throughput (Kbd),TOTAL,Total Russia,
1,Jan-03,BASHNEFT,CRUDE THROUGHPUT ’000 B/D,381.63,CRUDE,TOTAL,Crude Throughput (Kbd),BASHNEFT,Bashneft,http://www.bashneft.com/
2,Jan-03,NOVOIL,CRUDE THROUGHPUT ’000 B/D,110.49,CRUDE,TOTAL,Crude Throughput (Kbd),NOVOIL,Bashneft - Novoil Refinery,http://www.bashneft.com/
3,Jan-03,UFA,CRUDE THROUGHPUT ’000 B/D,143.13,CRUDE,TOTAL,Crude Throughput (Kbd),UFA,Bashneft - Ufa Oil Refinery,
4,Jan-03,UFANEFTEKHIM,CRUDE THROUGHPUT ’000 B/D,128.01,CRUDE,TOTAL,Crude Throughput (Kbd),UFANEFTEKHIM,Bashneft - Ufaneftekhim,
5,Jan-03,CFC,CRUDE THROUGHPUT ’000 B/D,186.50,CRUDE,TOTAL,Crude Throughput (Kbd),CFC,CFC,
6,Jan-03,GAZPROM,CRUDE THROUGHPUT ’000 B/D,140.65,CRUDE,TOTAL,Crude Throughput (Kbd),GAZPROM,Gazprom,
7,Jan-03,ASTRAKHANGAZPROM,CRUDE THROUGHPUT ’000 B/D,49.50,CRUDE,TOTAL,Crude Throughput (Kbd),ASTRAKHANGAZPROM,Astrakhan Gazprom,http://astrakhandobycha.gazprom.ru/
8,Jan-03,NORILSKGAZPROM,CRUDE THROUGHPUT ’000 B/D,0.00,CRUDE,TOTAL,Crude Throughput (Kbd),NORILSKGAZPROM,Norilskgazprom,http://www.tgaz.ru
9,Jan-03,SURGUTGAZDOBYCHA,CRUDE THROUGHPUT ’000 B/D,78.25,CRUDE,TOTAL,Crude Throughput (Kbd),SURGUTGAZPROM,Surgutgazprom (now Gazprom Surgut Transgaz),http://surgut-tr.gazprom.ru/about/


In [103]:
# Tag each rejected row with the mapping it failed against and give both
# frames the same 'value' column so they can be stacked.
df_rej_entities = df_rej_entities.assign(type='entity').rename(columns={'entity': 'value'})
df_rej_products = df_rej_products.assign(type='product').rename(columns={'product': 'value'})

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (original row labels are kept, columns are not sorted).
dfrej = pd.concat([df_rej_entities, df_rej_products], sort=False)

# Show the rejected values per period and type.
dfrej[['period','type', 'value']]

Unnamed: 0,period,type,value
59050,Dec-18,entity,TEST
59052,Dec-18,entity,BB
59050,Dec-18,product,TEST
59051,Dec-18,product,BIDON
