In [1]:
import pandas as pd
from pandas import Series, DataFrame
import string
#import sqlite3
import country_converter as coco

In [2]:
def map_country_code(df):
    """Re-index a frame of country names by two-letter country code.

    Each label in ``df.index`` is treated as a country name. 'USSR' and
    'Netherlands Antilles' are mapped by hand to 'SU' and 'AN'; every other
    name is passed to ``coco.convert(..., to='iso2')``.

    Note that ``df`` itself is modified: it gains a trailing 'Country Code'
    column and a leading 'Country Name' column (a copy of the old index).

    Returns a new frame indexed by 'Country Code', with 'Country Name' as
    its first column.
    """
    # Hard-coded codes for entities that are not sent to country_converter
    # (presumably because it does not resolve them — TODO confirm).
    manual_codes = {'USSR': 'SU', 'Netherlands Antilles': 'AN'}
    codes = [
        manual_codes[name] if name in manual_codes
        else coco.convert(names=name, to='iso2')
        for name in df.index
    ]
    df['Country Code'] = codes
    df.insert(0, 'Country Name', df.index)
    return df.set_index('Country Code')

In [3]:
# https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html
# Statistical Review of World Energy - all data, 1965-2021
# https://www.bp.com/content/dam/bp/business-sites/en/global/corporate/xlsx/energy-economics/statistical-review/bp-stats-review-2022-all-data.xlsx 


In [4]:
def read_bp(sheet, header=2, skipfooter=0):
    """Load one sheet of the BP workbook as a frame indexed by country code.

    The sheet is read from 'raw data/bp-stats-review-2022-all-data.xlsx' with
    its first column as the index; the cell values '-', '^' and '♦' are
    treated as missing. The frame is then cleaned:

    * columns whose label is not an ``int`` are dropped (the int-labelled
      columns are presumably the years — TODO confirm for every sheet);
    * rows that are empty across all remaining columns are dropped;
    * leading/trailing digits are stripped from row labels (presumably
      footnote markers);
    * rows whose label contains one of the aggregate/region markers
      ('Total', 'Other', 'OECD', ...) are removed;
    * 'Unit' (taken from the index header, minus trailing '*' and surrounding
      digits) and 'Data Source' ('BP') columns are added in front.

    Parameters
    ----------
    sheet : str
        Name of the worksheet to read.
    header : int, default 2
        Zero-based row holding the column labels, forwarded to
        ``pd.read_excel``.
    skipfooter : int, default 0
        Number of rows to ignore at the bottom of the sheet, forwarded to
        ``pd.read_excel``.

    Returns
    -------
    The result of ``map_country_code`` applied to the cleaned frame.
    """
    missing_values = ['-', '^', '♦']
    df = pd.read_excel('raw data/bp-stats-review-2022-all-data.xlsx', sheet_name=sheet,
                       header=header, index_col=0, na_values=missing_values,
                       skipfooter=skipfooter)

    # Keep only the columns labelled with a plain int.
    df = df.drop(columns=[col for col in df.columns if type(col) is not int])
    df = df.dropna(how='all')

    unit = df.index.name.rstrip('*').strip(string.digits)
    df.insert(0, 'Unit', unit)
    df.insert(1, 'Data Source', 'BP')
    df['Country Name'] = [country.strip(string.digits) for country in df.index]
    df = df.set_index('Country Name')

    aggregate_markers = ('Total', 'Rest of World', 'Other', 'European Union', 'OECD',
                         'Central America', 'Eastern Africa', 'Middle Africa',
                         'Western Africa', 'OPEC')
    # Filter with one boolean mask instead of dropping labels one by one:
    # the per-label drop raised KeyError when a label matched two markers
    # (or occurred twice), because it was already gone on the second drop.
    keep = [not any(marker in country for marker in aggregate_markers)
            for country in df.index]
    # .copy() so the in-place column writes in map_country_code act on an
    # independent frame rather than on a slice of the filtered one.
    df = df.loc[keep].copy()
    return map_country_code(df)

In [5]:
# Oil production is the first dataset: this write creates (or overwrites)
# bp.csv and is the only one that emits the header row.
bp_oil_production = read_bp(sheet='Oil Production - Tonnes')
bp_oil_production.insert(loc=1, column='Technology Name', value='Oil production')
bp_oil_production.to_csv('cleaned data/bp.csv')

In [6]:
# Prepend an 'ID' column of the form "<Technology Name> - <index label>".
# The insert is in place, so re-running this cell on the same frame fails
# because 'ID' already exists.
bp_oil_production.insert(
    loc=0,
    column='ID',
    value=bp_oil_production['Technology Name'] + ' - ' + bp_oil_production.index,
)


In [9]:
# Promote the 'ID' column to the index and display the resulting frame.
bp_oil_production = bp_oil_production.set_index(keys='ID')
bp_oil_production

Unnamed: 0_level_0,Country Name,Technology Name,Unit,Data Source,1965,1966,1967,1968,1969,1970,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Oil production - CA,Canada,Oil production,Million tonnes,BP,43.874178,48.212154,52.70114,57.119336,62.218043,70.067889,...,182.912943,195.409623,209.77701,216.074424,218.844941,236.623854,257.745067,263.462162,252.022685,267.09647
Oil production - MX,Mexico,Oil production,Million tonnes,BP,18.053894,18.489542,20.463761,21.900666,22.965009,24.179007,...,143.857278,142.091212,137.343922,127.734295,121.516944,109.58459,102.400224,95.000334,95.111225,96.486319
Oil production - US,US,Oil production,Million tonnes,BP,427.694442,454.538861,484.22163,502.880332,511.351724,533.489849,...,395.245087,448.684134,524.659261,567.122838,542.593149,574.134693,669.055938,749.895128,711.550649,711.12469
Oil production - AR,Argentina,Oil production,Million tonnes,BP,13.764759,14.643966,15.962276,17.487586,18.106759,20.000517,...,30.840747,30.185948,29.786305,30.045688,28.630797,27.240149,27.542351,28.834834,27.657269,29.066317
Oil production - BR,Brazil,Oil production,Million tonnes,BP,5.045191,6.15929,7.75086,8.516863,9.289377,8.792737,...,112.366533,110.152868,122.454669,132.192891,136.707063,142.641816,140.603161,151.199171,159.344927,156.792441
Oil production - CO,Colombia,Oil production,Million tonnes,BP,10.654546,10.443867,10.075181,9.257762,11.233911,11.759925,...,49.862574,53.18949,52.162822,52.963156,46.787709,44.981902,45.584496,46.656255,41.251981,38.878026
Oil production - EC,Ecuador,Oil production,Million tonnes,BP,0.428781,0.375184,0.321586,0.268722,0.214391,0.214391,...,27.098114,28.24278,29.829982,29.108609,29.473041,28.477106,27.72283,28.460511,25.763542,25.344872
Oil production - PE,Peru,Oil production,Million tonnes,BP,3.394165,3.394165,3.811905,3.97943,3.864122,3.862692,...,6.865127,7.314465,7.541427,6.535412,5.848813,5.693446,5.857806,6.105302,5.466467,5.305072
Oil production - TT,Trinidad & Tobago,Oil production,Million tonnes,BP,6.67146,7.562912,8.850564,9.48521,7.775441,6.933514,...,5.182263,5.108392,5.054163,4.824142,4.331345,4.379927,3.900726,3.660321,3.431932,3.489987
Oil production - VE,Venezuela,Oil production,Million tonnes,BP,184.113962,178.785909,187.925042,191.762384,190.805531,197.238062,...,139.324997,137.781445,138.510268,147.618822,132.556389,114.095175,83.843239,52.057038,32.701681,33.408471


In [None]:
# Oil refining capacity: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_oil_refining = read_bp(sheet='Oil - Refining capacity')
bp_oil_refining.insert(loc=1, column='Technology Name', value='Oil refining')
bp_oil_refining.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
# Gas production: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_gas = read_bp(sheet='Gas Production - Bcm')
bp_gas.insert(loc=1, column='Technology Name', value='Gas production')
bp_gas.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
# Coal production: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_coal = read_bp(sheet='Coal Production - Tonnes')
bp_coal.insert(loc=1, column='Technology Name', value='Coal production')
bp_coal.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
# Nuclear generation: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_nuclear = read_bp(sheet='Nuclear Generation - TWh')
bp_nuclear.insert(loc=1, column='Technology Name', value='Nuclear generation')
bp_nuclear.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
# Hydroelectricity generation: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_hydro = read_bp(sheet='Hydro Generation - TWh')
bp_hydro.insert(loc=1, column='Technology Name', value='Hydroelectricity generation')
bp_hydro.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
# Renewable power generation: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_renewable = read_bp(sheet='Renewable power - TWh')
bp_renewable.insert(loc=1, column='Technology Name', value='Renewable energy generation')
bp_renewable.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
# Wind generation: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_wind_gen = read_bp(sheet='Wind Generation - TWh')
bp_wind_gen.insert(loc=1, column='Technology Name', value='Wind power generation')
bp_wind_gen.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
# Solar generation: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_solar_gen = read_bp(sheet='Solar Generation - TWh')
bp_solar_gen.insert(loc=1, column='Technology Name', value='Solar power generation')
bp_solar_gen.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
# Electricity generation: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_electricity_gen = read_bp(sheet='Electricity Generation')
bp_electricity_gen.insert(loc=1, column='Technology Name', value='Electricity generation')
bp_electricity_gen.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
# Lithium production: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_lithium = read_bp(sheet='Lithium Production-Reserves')
bp_lithium.insert(loc=1, column='Technology Name', value='Lithium production')
bp_lithium.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
# Cobalt production: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_cobalt = read_bp(sheet='Cobalt Production-Reserves')
bp_cobalt.insert(loc=1, column='Technology Name', value='Cobalt production')
bp_cobalt.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
# Rare earth production: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_rare_earth = read_bp(sheet='Rare Earth Production-Reserves')
bp_rare_earth.insert(loc=1, column='Technology Name', value='Rare earth production')
bp_rare_earth.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
# Graphite production: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_graphite = read_bp(sheet='Graphite Production-Reserves')
bp_graphite.insert(loc=1, column='Technology Name', value='Graphite production')
bp_graphite.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
def read_bp2(sheet):
    """Load a BP workbook sheet whose column labels sit on row index 3.

    The sheet is read from 'raw data/bp-stats-review-2022-all-data.xlsx' with
    ``header=3`` and its first column as the index; the cell values '-', '^'
    and '♦' are treated as missing. The frame is then cleaned:

    * columns whose label is not an ``int`` are dropped (the int-labelled
      columns are presumably the years — TODO confirm);
    * rows that are empty across all remaining columns are dropped;
    * leading/trailing digits are stripped from row labels (presumably
      footnote markers);
    * rows whose label contains one of the aggregate/region markers
      ('Total', 'Other', 'OECD', ...) are removed;
    * 'Unit' (the index header minus trailing '*') and 'Data Source' ('BP')
      columns are added in front.

    Parameters
    ----------
    sheet : str
        Name of the worksheet to read.

    Returns
    -------
    The result of ``map_country_code`` applied to the cleaned frame.
    """
    missing_values = ['-', '^', '♦']
    df = pd.read_excel('raw data/bp-stats-review-2022-all-data.xlsx', sheet_name=sheet,
                       header=3, index_col=0, na_values=missing_values)

    # Keep only the columns labelled with a plain int.
    df = df.drop(columns=[col for col in df.columns if type(col) is not int])
    df = df.dropna(how='all')

    df.insert(0, 'Unit', df.index.name.rstrip('*'))
    df.insert(1, 'Data Source', 'BP')
    df['Country Name'] = [country.strip(string.digits) for country in df.index]
    df = df.set_index('Country Name')

    aggregate_markers = ('Total', 'Rest of World', 'Other', 'European Union', 'OECD',
                         'Central America', 'Eastern Africa', 'Middle Africa',
                         'Western Africa', 'OPEC')
    # Filter with one boolean mask instead of dropping labels one by one:
    # the per-label drop raised KeyError when a label matched two markers
    # (or occurred twice), because it was already gone on the second drop.
    keep = [not any(marker in country for marker in aggregate_markers)
            for country in df.index]
    # .copy() so the in-place column writes in map_country_code act on an
    # independent frame rather than on a slice of the filtered one.
    df = df.loc[keep].copy()
    return map_country_code(df)

In [None]:
# Installed solar capacity: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_solar = read_bp2(sheet='Solar Capacity')
bp_solar.insert(loc=1, column='Technology Name', value='Solar capacity - Installed photovoltaic power')
bp_solar.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
# Installed wind capacity: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_wind = read_bp2(sheet='Wind Capacity')
bp_wind.insert(loc=1, column='Technology Name', value='Wind - Installed turbine capacity')
bp_wind.to_csv('cleaned data/bp.csv', mode='a', header=False)

In [None]:
def read_bp3(sheet):
    """Load a BP workbook sheet while ignoring its last 40 rows.

    The sheet is read from 'raw data/bp-stats-review-2022-all-data.xlsx' with
    ``header=2``, ``skipfooter=40`` and its first column as the index; the
    cell values '-', '^' and '♦' are treated as missing. The frame is then
    cleaned:

    * columns whose label is not an ``int`` are dropped (the int-labelled
      columns are presumably the years — TODO confirm);
    * rows that are empty across all remaining columns are dropped;
    * leading/trailing digits are stripped from row labels (presumably
      footnote markers);
    * rows whose label contains one of the aggregate/region markers
      ('Total', 'Other', 'OECD', ...) are removed;
    * 'Unit' (the index header minus trailing '*') and 'Data Source' ('BP')
      columns are added in front.

    Parameters
    ----------
    sheet : str
        Name of the worksheet to read.

    Returns
    -------
    The result of ``map_country_code`` applied to the cleaned frame.
    """
    missing_values = ['-', '^', '♦']
    # NOTE(review): skipfooter=40 presumably cuts off a trailing section of
    # the sheet — confirm the row count against the workbook.
    df = pd.read_excel('raw data/bp-stats-review-2022-all-data.xlsx', sheet_name=sheet,
                       header=2, index_col=0, na_values=missing_values, skipfooter=40)

    # Keep only the columns labelled with a plain int.
    df = df.drop(columns=[col for col in df.columns if type(col) is not int])
    df = df.dropna(how='all')

    df.insert(0, 'Unit', df.index.name.rstrip('*'))
    df.insert(1, 'Data Source', 'BP')
    df['Country Name'] = [country.strip(string.digits) for country in df.index]
    df = df.set_index('Country Name')

    aggregate_markers = ('Total', 'Rest of World', 'Other', 'European Union', 'OECD',
                         'Central America', 'Eastern Africa', 'Middle Africa',
                         'Western Africa', 'OPEC')
    # Filter with one boolean mask instead of dropping labels one by one:
    # the per-label drop raised KeyError when a label matched two markers
    # (or occurred twice), because it was already gone on the second drop.
    keep = [not any(marker in country for marker in aggregate_markers)
            for country in df.index]
    # .copy() so the in-place column writes in map_country_code act on an
    # independent frame rather than on a slice of the filtered one.
    df = df.loc[keep].copy()
    return map_country_code(df)

In [None]:
# Biofuels production: tag the rows and append them to bp.csv.
# NOTE(review): appended without a header, so this assumes the sheet's columns
# match the header already in bp.csv — confirm.
bp_biofuels = read_bp3(sheet='Biofuels production - PJ')
bp_biofuels.insert(loc=1, column='Technology Name', value='Biofuels production')
bp_biofuels.to_csv('cleaned data/bp.csv', mode='a', header=False)