In [1]:
from __future__ import print_function

import mysql.connector
from mysql.connector import errorcode

import os
import pandas as pd

#### Fetch credentials

In [2]:
d = {}

cwd = os.getcwd()
file_path = cwd + '/../mysql/creds.txt'

with open(file_path, 'r') as f:
    for l in f.readlines(): d[l.split()[0]] = l.split()[1]

### Create project database

In [3]:
conn = mysql.connector.connect(user=d['user'], password=d['pw'])
cur = conn.cursor()

DB_NAME = 'bi_project'

In [4]:
def create_database(cursor):
    try:
        cursor.execute(
            "CREATE DATABASE {} DEFAULT CHARACTER SET 'utf8'".format(DB_NAME))
    except mysql.connector.Error as err:
        print("Failed creating database: {}".format(err))
        exit(1)
try:
    cur.execute("USE {}".format(DB_NAME))
except mysql.connector.Error as err:
    print("Database {} does not exists.".format(DB_NAME))
    if err.errno == errorcode.ER_BAD_DB_ERROR:
        create_database(cur)
        print("Database {} created successfully.".format(DB_NAME))
        conn.database = DB_NAME
    else:
        print(err)
        exit(1)

In [5]:
query = """
    drop table if exists Dim_Municipality
"""
cur.execute(query)

query = """
    create table Dim_Municipality (
        loc_id int not null,
        municipality varchar(20) not null,
        primary key(loc_id)
    )
"""
cur.execute(query)

query = f"""
    load data infile '{cwd}//src//municipalities//loc_id_municipality.csv'
    into table Dim_Municipality
    fields terminated by ',' 
    enclosed by '"'
    lines terminated by '\r\n'
    ignore 1 rows
    """
cur.execute(query) 

In [6]:
query = """
    select *
    from Dim_Municipality
    limit 3
"""
pd.read_sql(query, conn)

Unnamed: 0,loc_id,municipality
0,0,Grimstad
1,1,Arendal
2,2,Birkenes


In [7]:
query = """
    drop table if exists Municipality_Nearest_Station
"""
cur.execute(query)

query = """
    create table Municipality_Nearest_Station (
        municipality varchar(20) not null,
        nearest_station varchar(20) not null,
        primary key(municipality)
    )
"""
cur.execute(query)

query = f"""
    load data infile '{cwd}//src//municipalities//municipality_nearest_station.csv'
    into table Municipality_Nearest_Station
    fields terminated by ',' 
    enclosed by '"'
    lines terminated by '\r\n'
    ignore 1 rows
    """
cur.execute(query) 

In [8]:
query = """
    select *
    from Municipality_Nearest_Station
    limit 3
"""
pd.read_sql(query, conn)

Unnamed: 0,municipality,nearest_station
0,Arendal,SN36200
1,Birkenes,SN39040
2,Bygland,SN39750


Gather all elspotprices

In [34]:
query = """
    drop table if exists Dim_Elspot
"""
cur.execute(query)

query = """
    create table Dim_Elspot (
        time datetime not null,
        price varchar(10),
        
        primary key(time)
    )
"""
cur.execute(query)

In [35]:
folder = f'{cwd}//src//elspot_prices//'
for file in os.listdir(folder):
    if file.split('.')[-1] != 'csv': continue
        
    query = f"""
        load data infile '{folder}{file}'
        into table Dim_Elspot
        fields terminated by ',' 
        enclosed by '"'
        lines terminated by '\r\n'
        ignore 1 rows
    """
    cur.execute(query) 

In [38]:
query = """
    select *
    from Dim_Elspot
    limit 3
"""
pd.read_sql(query, conn)

Unnamed: 0,time,price
0,2018-01-01 00:00:00,25801
1,2018-01-01 01:00:00,25899
2,2018-01-01 02:00:00,25575


In [41]:
query = """
    select 
        extract(year from time) as year,
        count(*) as count
    from Dim_Elspot
    group by year
"""
pd.read_sql(query, conn).T

Unnamed: 0,0,1,2
year,2018,2019,2020
count,8760,8760,3000


In [48]:
query = """
    drop table if exists Dim_Oilspot
"""
cur.execute(query)

query = """
    create table Dim_Oilspot(
        day date not null,
        price varchar(10),
        primary key(day)
    )
"""
cur.execute(query)

In [49]:
query = f"""
    load data infile '{cwd}//src//oilspot_prices//Brent_oilspot_prices.csv'
    into table Dim_Oilspot
    fields terminated by ',' 
    enclosed by '"'
    lines terminated by '\r\n'
    ignore 1 rows
"""
cur.execute(query) 

In [52]:
query = """
    select *
    from Dim_Oilspot
    where day like '%2020-01%'
    limit 5
"""
pd.read_sql(query, conn)

Unnamed: 0,day,price
0,2020-01-01,67.77
1,2020-01-02,67.05
2,2020-01-03,69.08
3,2020-01-06,70.25
4,2020-01-07,68.74


In [54]:
query = """
    select
        extract(year from day) as year,
        count(*) as count
    from Dim_Oilspot
    group by year
"""
pd.read_sql(query, conn).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
year,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
count,160,255,254,256,257,257,252,252,253,254,...,248,250,252,251,257,256,256,254,259,84


In [82]:
query = """
    drop table if exists Exchange_Rates
"""
cur.execute(query)

query = """
    create temporary table Exchange_Rates(
        day varchar(20) not null,
        usd varchar(10),
        eur varchar(10),
        sek varchar(10),
        dkk varchar(10),
        gbp varchar(10),
        chf varchar(10),
        jpy varchar(10),
        cad varchar(10),
        isk varchar(10),
        aud varchar(10),
        primary key(day)
    )
"""
cur.execute(query)

In [83]:
folder = f'{cwd}//src//exchange_rates//'
for file in os.listdir(folder):
    if file.split('.')[-1] != 'csv': continue
    query = f"""
        load data infile '{folder}{file}'
        into table Exchange_Rates
        fields terminated by ',' 
        enclosed by '"'
        lines terminated by '\r\n'
        ignore 1 rows
    """
    cur.execute(query) 

In [84]:
query = """
    select *
    from Exchange_Rates
    limit 3
"""
pd.read_sql(query, conn)

Unnamed: 0,day,usd,eur,sek,dkk,gbp,chf,jpy,cad,isk,aud
0,01.02.2018,7.7079,9.5605,97.61,128.46,10.9483,826.05,7.034,6.2567,7.6612,6.1771
1,01.02.2019,8.436,9.6571,93.2,129.36,11.0588,848.44,7.7451,6.4169,7.0195,6.1136
2,01.03.2018,7.8968,9.6353,95.35,129.39,10.8613,835.64,7.3947,6.1538,7.7889,6.1019


In [103]:
query = """
    drop table if exists Dim_USD_NOK
"""
cur.execute(query)

query = """
    create table Dim_USD_NOK as(
        select
            str_to_date(day, '%d.%m.%Y') as day,
            usd as price_1_USD
        from Exchange_Rates
        order by day asc
    )
"""
cur.execute(query)

In [104]:
query = """
    select *
    from Dim_USD_NOK
    limit 5
"""
pd.read_sql(query, conn)

Unnamed: 0,day,price_1_USD
0,2018-01-02,8.1435
1,2018-01-03,8.1416
2,2018-01-04,8.1022
3,2018-01-05,8.0835
4,2018-01-08,8.0768


In [88]:
query = """
    drop table if exists Belastning_Gathered
"""
cur.execute(query)

query = """
    create temporary table Belastning_Gathered (
        a varchar(10),
        time datetime not null,
        consumption_kvah varchar(20),
        upper_tol_kvah varchar(10),
        upper_tol_p20 varchar(10),
        loc_id int not null,
        primary key(time, loc_id)
    )
"""
cur.execute(query)

In [89]:
folder = f'{cwd}//src//belastning_nettstasjon//'
for file in os.listdir(folder):
    if file.split('.')[-1] != 'csv': continue
    file_path = os.path.join(folder, file)

    query = f"""
        load data infile '{file_path}'
        into table Belastning_Gathered
        fields terminated by ',' 
        enclosed by '"'
        lines terminated by '\r\n'
        ignore 1 rows
    """
    cur.execute(query) 

In [90]:
query = """
    drop table if exists Dim_Belastning
"""
cur.execute(query)

query = """
    create table Dim_Belastning as (
        select 
            time,
            loc_id,
            consumption_kvah,
            upper_tol_kvah
        from Belastning_Gathered
    )
"""
cur.execute(query)

query = """
    alter table Dim_Belastning
    add primary key(time, loc_id)
"""
cur.execute(query)

In [91]:
query = """
    select 
        loc_id,
        count(*) as n_entries
    from Dim_Belastning
    group by loc_id
    order by loc_id
"""
pd.read_sql(query, conn).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
loc_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
n_entries,18357,18282,15084,18257,18329,13950,18259,18292,16061,11733,18357,18333,18237,6086,15296,12395,18285,18237,18357,18261


In [92]:
query = """
    select *
    from Dim_Belastning
    where time like '%2019-12%'
    and loc_id = 2
    limit 10
"""
pd.read_sql(query, conn)

Unnamed: 0,time,loc_id,consumption_kvah,upper_tol_kvah
0,2019-12-01 00:00:00,2,80.0390529679106,100
1,2019-12-01 01:00:00,2,74.5603782179248,100
2,2019-12-01 02:00:00,2,76.5261393250698,100
3,2019-12-01 03:00:00,2,76.5147044691411,100
4,2019-12-01 04:00:00,2,66.0170432539962,100
5,2019-12-01 05:00:00,2,73.5272058492637,100
6,2019-12-01 06:00:00,2,70.528363088902,100
7,2019-12-01 07:00:00,2,70.5443123150265,100
8,2019-12-01 08:00:00,2,71.5157325348765,100
9,2019-12-01 09:00:00,2,80.0390529679106,100


Import weather reports either as `.json` files or run the last bit of preprocessing in `frost_api_to_json.ipynb` to convert them to `.csv` files instead.

The following cell will (on my computer at least) demand of you 33 hours of waiting. The mentioned preprocessing takes about 40 minutes. Better do that and skip this cell.

#### Alternative 1: Load `.json` files into weather table

In [None]:
# from time import time
# 
# query = """
#     drop table if exists jsonTable
# """
# cur.execute(query)
# 
# query = """
#     create temporary table jsonTable ( 
#         jsonFile json
#     )
# """
# cur.execute(query)
# 
# # For every .json file in folder...
# folder = 'D://_bi//src//weather_jsons//'
# for file in os.listdir(folder):
#     if file.split('.')[-1] != 'json': continue
#     file_path = os.path.join(folder, file)
# 
#     # Save the file content in variable @jsonData
#     query = f"""
#         load data infile '{file_path}'
#         into table jsonTable
#         fields terminated by '\0' escaped by ''
#         lines terminated by '\0'
#         (@jsonData)
#     """
#     cur.execute(query)
#     
#     # The data contains a time series of length n
#     query = """
#         select json_length(json_extract(@jsonData, '$.data')) 
#     """
#     cur.execute(query)
#     n = cur.fetchall()[0][0]
#     
#     tic = time()
#     print(f'Rows of {file} loaded into table'.rjust(20), 'Mins elapsed'.rjust(20), 'Estimated minutes left'.rjust(30))
# 
#     # For every i = (0, 1,..., n-1) in @jsonData's data[i] array
#     # Store the ith timestamp, weather_station id, and temperature measurement
#     for i in range(n):
#         query = f"""
#             insert into Dim_Weather (time, weather_station, temp)
# 
#             with a as(
#                 select json_extract(@jsonData, '$.data[{i}].sourceId') as ws
#             ), b as (
#                 select substring(a.ws, 2, char_length(a.ws) - 4) as ws_trimmed
#                 from a
#             ), c as (
#                 select json_extract(@jsonData, '$.data[{i}].referenceTime') as rt
#             ), d as (
#                 select substring(c.rt, 2, char_length(c.rt) - 7) as rt_trimmed
#                 from c
#             ), e as (
#                 select replace(d.rt_trimmed, 'T', ' ') as time
#                 from d
#             ), f as (
#                 select json_extract(@jsonData, '$.data[{i}].observations[0].value') as temp
#             )
#             select
#                 e.time,
#                 b.ws_trimmed,
#                 f.temp
#             from
#                 e, b, f   
#         """
#         cur.execute(query)
#         
#         # Every 10 rows, print a status update
#         if not i % 10: 
#             toc = time()
#             print(f'{i}/{n}'.rjust(0), f'{(toc-tic)/60:.2f}'.rjust(42), f'{(n/(i+1) * (toc-tic) - (toc-tic))/60 :.2f}'.rjust(22), end='\r')
#             
#     print(f'Loaded file {file} into Dim_Weather')

#### Alternative 2: Preprocess `.json` files to `.csv` in `frost_api_to_json.ipynb` and load `.csv` files into weather table

In [93]:
query = """
    drop table if exists Weather_Gathered
"""
cur.execute(query)

query = """
    create temporary table Weather_Gathered (
        a int,
        b int, 
        element_id varchar(30),
        c int,
        level varchar(70),
        perf varchar(2),
        d int, 
        timeoff varchar(10),
        timeres varchar(10),
        times_id varchar(2),
        unit varchar(5),
        value varchar(10),
        referencetime datetime not null,
        source_id varchar(20) not null,
        primary key(referencetime, source_id)
    )
"""
cur.execute(query)

In [94]:
folder = f'{cwd}//src//weather_csvs//'

for file in os.listdir(folder):
    if file.split('.')[-1] != 'csv': continue
    file_path = os.path.join(folder, file)
    
    query = f"""
        load data infile '{file_path}'
        into table Weather_Gathered
        fields terminated by ',' 
        enclosed by '"'
        lines terminated by '\r\n'
        ignore 1 rows  
    """
    cur.execute(query)

In [95]:
query = """
    select *
    from Weather_Gathered
    limit 3
"""
pd.read_sql(query, conn)

Unnamed: 0,a,b,element_id,c,level,perf,d,timeoff,timeres,times_id,unit,value,referencetime,source_id
0,0,0,max(air_temperature PT1H),2,"{'levelType': 'height_above_ground', 'unit': '...",C,0,PT0H,PT1H,0,degC,-0.3,2018-01-01,SN35210:0
1,0,0,max(air_temperature PT1H),2,"{'levelType': 'height_above_ground', 'unit': '...",C,0,PT0H,PT1H,0,degC,6.9,2018-01-01,SN36200:0
2,0,0,max(air_temperature PT1H),2,"{'levelType': 'height_above_ground', 'unit': '...",C,0,PT0H,PT1H,0,degC,4.1,2018-01-01,SN36330:0


In [96]:
query = """
    drop table if exists Dim_Weather
"""
cur.execute(query)

query = """
    create table Dim_Weather as (
        select 
            referencetime as time,
            trim(trailing ':0' from source_id) as weather_station,
            value as temperature
        from Weather_Gathered
    )
"""
cur.execute(query)

query = """
    alter table Dim_Weather
    add primary key(time, weather_station)
"""
cur.execute(query)

In [97]:
query = """
    select 
        weather_station,
        count(*)
    from Dim_Weather
    group by weather_station
"""
pd.read_sql(query, conn).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
weather_station,SN35210,SN36200,SN36330,SN38140,SN39040,SN39750,SN40880,SN41090,SN41770,SN41825,SN42940
count(*),18960,18912,18545,18960,18923,18960,16162,18960,18921,18960,18960


In [98]:
query = """
    select *
    from Dim_Weather
    where time like '%2020-01%'
    limit 3
"""
pd.read_sql(query, conn)

Unnamed: 0,time,weather_station,temperature
0,2020-01-01,SN35210,4.7
1,2020-01-01,SN36200,7.3
2,2020-01-01,SN36330,5.3


In [99]:
query = """
    drop table if exists Fact
"""
cur.execute(query)

query = """
    create table Fact 
    as(
        select 
            bel.time as time,
            date(bel.time) as day,
            bel.loc_id,
            s.nearest_station as weather_station
        from Dim_Belastning bel 
        left join
        (
            select 
                m.loc_id as loc_id,
                mns.nearest_station
            from 
                Dim_Municipality m,
                Municipality_Nearest_Station mns
            where m.municipality = mns.municipality
        ) s
        on bel.loc_id = s.loc_id  
    )
"""
cur.execute(query) 

In [100]:
query = """
    select *
    from Fact
    where loc_id = 3
    limit 3
"""
pd.read_sql(query, conn)

Unnamed: 0,time,day,loc_id,weather_station
0,2018-01-01 00:00:00,2018-01-01,3,SN39750
1,2018-01-01 01:00:00,2018-01-01,3,SN39750
2,2018-01-01 02:00:00,2018-01-01,3,SN39750


In [101]:
query = """
    select count(*)
    from Fact
"""
pd.read_sql(query, conn)

Unnamed: 0,count(*)
0,328448


In [133]:
query = """
    select 
        f.time,
        f.loc_id,
        municipality,
        price_1_USD,
        consumption_kvah,
        temperature
    from
        fact f left join Dim_Belastning b on (f.time = b.time and f.loc_id = b.loc_id)
        left join Dim_Weather w on f.weather_station = w.weather_station
        left join Dim_USD_NOK usd on f.day = usd.day
        left join Dim_Municipality m on f.loc_id = m.loc_id
        
    where f.time like '%2019-04-17 17%'
    limit 5
"""
pd.read_sql(query, conn)

Unnamed: 0,time,loc_id,municipality,price_1_USD,consumption_kvah,temperature
0,2019-04-17 17:00:00,7,Gjerstad,8.4712,266.552058705237,-0.3
1,2019-04-17 17:00:00,19,Vegårshei,8.4712,25.6499512670102,-0.3
2,2019-04-17 17:00:00,1,Arendal,8.4712,85.9069263796581,6.9
3,2019-04-17 17:00:00,9,Arendal,8.4712,144.0,6.9
4,2019-04-17 17:00:00,12,Arendal,8.4712,197.294196569489,6.9


## Sample queries

i.  Find the locations with the highest and lowest average consumption and their municipality.

In [None]:
query = """
    with a as (
        select
            f.loc_id as location_id,
            m.municipality as municipality,
            avg(b.consumption_kvah) as average_consumption
        from Fact f
        inner join Dim_Municipality m on f.loc_id=m.loc_id
        inner join Dim_Belastning b on (f.time=b.time and f.loc_id=b.loc_id)
        group by f.loc_id
    ), b as (
        select *
        from a
        order by average_consumption desc
        limit 1
    ), c as (
        select *
        from a
        order by average_consumption asc
        limit 1
    )
    select *
    from b 
    union all
    select *
    from c
"""
pd.read_sql(query, conn)

ii. For those two extremes, what was the mean temperature, oil price and el spot price at their day in 2019 of maximum consumption?

In [None]:
query = """
    with temp_a as (
        select
            f.day as date,
            f.loc_id as location_id,
            f.weather_station as weather_station,
            avg(b.consumption_kvah) as average_consumption
            
        from Fact f
        inner join Dim_Belastning b on (f.time=b.time and f.loc_id=b.loc_id)
        where f.time like '%2019%'
        and f.loc_id in (10, 13)
        group by 
            location_id,
            date
            
    ), temp_b as (
        select
            location_id,
            max(average_consumption) as max_cons
        from temp_a
        group by location_id
        
    ), temp_c as (
        select 
            temp_a.date,
            temp_a.location_id,
            temp_a.weather_station,
            temp_a.average_consumption
        from
            temp_a,
            temp_b
            
        where temp_a.average_consumption = temp_b.max_cons
        
    )
        select 
            temp_c.date,
            temp_c.location_id,
            temp_c.average_consumption,
            avg(w.temperature) as avg_temp,
            o.price as oil_price,
            avg(e.price) as avg_el_price
        from
            temp_c
            inner join Dim_Elspot e on (date = date(e.time))
            inner join Dim_Oilspot o on (date = o.day)
            inner join Dim_Weather w on (date = date(w.time) and temp_c.weather_station = w.weather_station)
            
        group by date
"""
pd.read_sql(query, conn)

...And whats the mean daily temperature, oil price and el price in 2019 for those stations anyway?

In [None]:
query = """
    select
        f.loc_id,
        avg(w.temperature) as avg_temp,
        avg(o.price) as avg_oil_price,
        avg(e.price) as avg_el_price
    from 
        Fact f
        inner join Dim_Weather w on (f.time=w.time and f.weather_id=w.weather_station)
        inner join Dim_Oilspot o on (f.day=o.day)
        inner join Dim_Elspot e on (f.time=e.time)
    
    where f.time like '%2019%'
    and f.loc_id in (10, 13)
    group by f.loc_id
    order by f.loc_id desc
"""
pd.read_sql(query, conn)

So the day of maximum el consumption were characterized by:
 * For 13: Temperature ~ten degrees C higher than the average. Oil price slightly lower than average. El price a bit higher than usual.
 * For 10: Temperature ~ten degrees C lower than the average. Oil price lower than average. El price quite higher than the usual.

### Join all interesting data to output table

In [135]:
query = """
    select
        f.time as time,
        f.loc_id as location_id,
        b.consumption_kvah as el_consumption,
        w.temperature as degC,
        o.price as oil_price,
        e.price as el_price,
        usd.price_1_USD as price_1_USD
        
    from Fact f
    left join Dim_Belastning b on (f.time=b.time and f.loc_id=b.loc_id)
    left join Dim_Weather w on (f.time=w.time and f.weather_station=w.weather_station) 
    left join Dim_Oilspot o on (f.day=o.day)
    left join Dim_Elspot e on (f.time=e.time)
    left join Dim_USD_NOK usd on (f.day = usd.day)
"""
df = pd.read_sql(query, conn)

In [136]:
df.shape

(328448, 7)

In [137]:
df.head()

Unnamed: 0,time,location_id,el_consumption,degC,oil_price,el_price,price_1_USD
0,2018-01-02,0,23,3.8,66.65,25379,8.1435
1,2018-01-02,1,156,4.4,66.65,25379,8.1435
2,2018-01-02,3,20,1.4,66.65,25379,8.1435
3,2018-01-02,4,84,-1.6,66.65,25379,8.1435
4,2018-01-02,6,41,2.3,66.65,25379,8.1435


In [138]:
df.to_csv(f'{cwd}//sql_output//consumption_metatada_all_locations.csv')

In [None]:
query = """
    select *
    from Dim_Weather
"""
df = pd.read_sql(query, conn)

In [None]:
df.shape

In [None]:
df.to_csv(f'{cwd}//sql_output//Dim_Weather.csv')

In [None]:
query = """
    select 
        f.time,
        f.loc_id,
        f.weather_station,
        b.consumption_kvah
    from Fact f
    inner join Dim_Belastning b on (f.time = b.time and f.loc_id = b.loc_id)
"""
df = pd.read_sql(query, conn)

In [None]:
df.head(10)

In [None]:
df.shape

In [None]:
df.to_csv(f'{cwd}//sql_output//Dim_Belastning_and_weather_station.csv')

In [139]:
query = """
    select *
    from Dim_Oilspot
"""
df = pd.read_sql(query, conn)

In [140]:
df.shape

(8370, 2)

In [141]:
df.to_csv(f'{cwd}//sql_output//Dim_Oilspot.csv')

In [142]:
query = """
    select *
    from Dim_Elspot
"""
df = pd.read_sql(query, conn)

In [143]:
df.shape

(20520, 2)

In [144]:
df.to_csv(f'{cwd}//sql_output//Dim_Elspot.csv')

In [145]:
query = """
    select *
    from Dim_USD_NOK
"""
df = pd.read_sql(query, conn)

In [146]:
df.head()

Unnamed: 0,day,price_1_USD
0,2018-01-02,8.1435
1,2018-01-03,8.1416
2,2018-01-04,8.1022
3,2018-01-05,8.0835
4,2018-01-08,8.0768


In [147]:
df.to_csv((f'{cwd}//sql_output//Dim_USD_NOK.csv'))

In [None]:
query = """
    select
        f.time as time,
        f.weather_id as weather_station,
        w.temperature as degC
        
    from Fact f
    right join Dim_Weather w on (f.time=w.time and f.weather_id=w.weather_station)
"""
df = pd.read_sql(query, conn)

In [None]:
df.shape

In [148]:
conn.close()
cur.close()

True