In [3]:
# imports
import json
import requests
import pandas as pd
import pymongo.errors
from bs4 import BeautifulSoup
from dateutil.parser import parse, ParserError
from io import StringIO
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi


## MongoDB

### Set up remote database

In [15]:
# read
with open("../.nosync/mongoDB.json", "r") as file:
    credentials = json.load(file)

uri = (
    "mongodb+srv://medvetslos:"
    + json.load(open("../.nosync/mongoDB.json"))["pwd"]
    + "@ind320-project.lunku.mongodb.net/?retryWrites=true&w=majority&appName=IND320-project"
)

mdb_client = MongoClient(uri, server_api=ServerApi("1"))

try:
    mdb_client.admin.command("ping")
    print("Pinged your deployment. Successfully connected to MongoDB.")
except Exception as exceptionMsg:
    print(exceptionMsg)

Pinged your deployment. Successfully connected to MongoDB.


In [16]:
# Creating collections for municipality data and gas prices
database = mdb_client["IND320-project"]
collection_names = ["muncipalities", "gas"]

for name in collection_names:
    # Checking if collection exists. If not, create the collection.
    try:
        database.create_collection(name)
        print(f"Collection '{name}' was created successfully.")
    except pymongo.errors.CollectionInvalid:
        print(f"Collection '{name}' already exists.")

municipalities = database["municipalities"]
gas = database["gas"]


Collection 'muncipalities' already exists.
Collection 'gas' already exists.


## Cassandra

In [17]:
from cassandra.cluster import Cluster

cluster = Cluster(["localhost"], port=9042)
session = cluster.connect()
keyspace = "ind320_project"
session.execute(
    "CREATE KEYSPACE IF NOT EXISTS" + 
    " " + 
    keyspace + 
    " " + 
    "WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1};"
)

session.set_keyspace(keyspace)

## Webscraping



In [18]:
webscrape_url = "https://en.wikipedia.org/wiki/List_of_municipalities_of_Denmark"

page = requests.get(webscrape_url)
soup = BeautifulSoup(page.content, "html.parser")
wiki_table = soup.find("table", attrs={"class": "wikitable sortable"})

df_municipalities = pd.read_html(StringIO(str(wiki_table)))[0]
records_municipalities = df_municipalities.to_dict("records")

LAU_1_code = df_municipalities.columns.tolist()[0]


# check if data we are writing already exists
existing_entries = list(
    database["municipalities"].find(
        {
            LAU_1_code: {
                "$in": [record[LAU_1_code] for record in records_municipalities]
            }
        }
    )
)

# if new data,
new_entries = [
    entry for entry in records_municipalities
    if not any(existing_entry[LAU_1_code] == entry[LAU_1_code] for existing_entry in existing_entries)
]

# Writing to MongoDB
if len(new_entries) > 0:
    database["municipalities"].insert_many(new_entries)
    print("Data successfully written into the collection.")
else:
    print("No new data to be inserted into the collection.")

# database["municipalities"].delete_many({}) # delete all records

No new data to be inserted into the collection.


## API

In [4]:
api_url = "https://api.energidataservice.dk/dataset/"
gas = "GasDailyBalancingPrice"  # riktig datasett? data fra 2022-10-01 og utover. ikke 2022-01-01 som oppgaven tilsier
# production = "ProductionMunicipalityHour"
# consumption = "ConsumptionIndustry"
# prodcons = "ProductionConsumptionSettlement"
filter_query = "?offset=0&start=2022-01-01T00:00&end=2022-12-31T00:00&sort=HourUTC%20ASC"

api_datasets = {
    "gas" : "GasDailyBalancingPrice",
    "production" : "ProductionMunicipalityHour",
    "consumption" : "ConsumptionIndustry",
    "prodcons" : "ProductionConsumptionSettlement"
}


# # remote
# gas_json = requests.get(url=url_api+gas+query).content
# gas_data = pd.read_json(gas_json).to_dict("records")
# database["gas"].insert_many(gas_data)


#### local cassandra db

Idé, sende json data til pandas for lettere håndtering?

In [10]:
production_json = requests.get(api_url+api_datasets["production"]+filter_query)
production_records = production_json.json()["records"]

In [45]:
type(production_records[0]["SolarMWh"])

float

In [9]:
session.execute(f"DROP TABLE IF EXISTS {api_datasets["production"]}")

# production
production_table = f"""
    CREATE TABLE IF NOT EXISTS {api_datasets["production"]} (
    id timeuuid PRIMARY KEY,
    HourUTC timestamp,
    HourDK timestamp,
    MunicipalityNo int,
    SolarMWh float,
    OffshoreWindLt100MW_MWh float,
    OffshoreWindGe100MW_MWh float,
    OnshoreWindMWh float,
    ThermalPowerMWh float 
    )
"""

session.execute(production_table)

# inserting into table
insert_production = f"""
    INSERT INTO {api_datasets["production"]} (
    id,
    HourUTC,
    HourDK,
    MunicipalityNo, 
    SolarMWh,
    OffshoreWindLt100MW_MWh,
    OffshoreWindGe100MW_MWh,
    OnshoreWindMWh,
    ThermalPowerMWh 
    )
    VALUES (now(), ?, ?, ?, ?, ?, ?, ?, ?)
"""

prepared_production = session.prepare(insert_production)

Idea: automate what datatype each column of the cassandra database is going to be. use eval() to determine what kind datatype is going to be set
i.e.

- str -> text
- float -> float
- int -> int

for datetime objects, we can use dateutil.parser import parse to recognize timestamps

In [10]:
from cassandra.concurrent import execute_concurrent_with_args

production_data = [
    (
    parse(record['HourUTC']),
    parse(record['HourDK']),
    int(record['MunicipalityNo']),
    record['SolarMWh'],
    record['OffshoreWindLt100MW_MWh'],
    record['OffshoreWindGe100MW_MWh'],
    record['OnshoreWindMWh'],
    record["ThermalPowerMWh"]
    )
    for record in production_records
]

execute_concurrent_with_args(session, prepared_production, production_data,
                             concurrency=100)

NameError: name 'production_records' is not defined

In [26]:
consumption_json = requests.get(api_url+ api_datasets["consumption"] + filter_query)
consumption_records = consumption_json.json()["records"]
consumption_records[:1]

[{'HourUTC': '2021-12-31T23:00:00',
  'HourDK': '2022-01-01T00:00:00',
  'MunicipalityNo': '710',
  'Branche': 'Privat',
  'ConsumptionkWh': 10190.681}]

In [36]:
session.execute(f"DROP TABLE IF EXISTS {api_datasets["consumption"]}")

consumption_table = f""" 
    CREATE TABLE IF NOT EXISTS {api_datasets["consumption"]} (
        id timeuuid PRIMARY KEY,
        HourUTC timestamp, 
        HourDK timestamp, 
        MunicipalityNo int,
        Branche text,
        ConsumptionkWh float
    )
"""

session.execute(consumption_table)

insert_consumption = f"""
    INSERT INTO {api_datasets["consumption"]} (
    id,
    HourUTC,
    HourDK,
    MunicipalityNo, 
    Branche,
    ConsumptionkWh
    )
    VALUES (now(), ?, ?, ?, ?, ?) 
"""

prepare_consumption = session.prepare(insert_consumption)

In [37]:
consumption_data = [
    (
    parse(record['HourUTC']),
    parse(record['HourDK']),
    int(record['MunicipalityNo']),
    record["Branche"],
    record["ConsumptionkWh"]
    )
    for record in consumption_records
]

execute_concurrent_with_args(session, prepare_consumption, consumption_data,
                             concurrency=100)

[ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x40b726a20>),
 ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x763817c80>),
 ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x40b727ad0>),
 ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x7638174a0>),
 ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x763815280>),
 ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x40b727f80>),
 ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x40b727860>),
 ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x763814dd0>),
 ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x40b7273e0>),
 ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x763814fe0>),
 Execution

In [41]:
consumption_rows = session.execute(f"SELECT * FROM {api_datasets["consumption"]} LIMIT 3")
for row in consumption_rows:
    print(row)

Row(id=UUID('5c807598-7f09-11ef-ba87-33401050b895'), branche='Erhverv', consumptionkwh=13543.0595703125, hourdk=datetime.datetime(2022, 8, 28, 6, 0), hourutc=datetime.datetime(2022, 8, 28, 4, 0), municipalityno=706)
Row(id=UUID('5fabc420-7f09-11ef-ba87-33401050b895'), branche='Privat', consumptionkwh=5751.69189453125, hourdk=datetime.datetime(2022, 9, 12, 13, 0), hourutc=datetime.datetime(2022, 9, 12, 11, 0), municipalityno=550)
Row(id=UUID('55024508-7f09-11ef-ba87-33401050b895'), branche='Privat', consumptionkwh=3022.52001953125, hourdk=datetime.datetime(2022, 7, 24, 2, 0), hourutc=datetime.datetime(2022, 7, 24, 0, 0), municipalityno=450)


In [5]:
prodcon_json = requests.get(api_url + api_datasets["prodcons"] + filter_query)
prodcon_records = prodcon_json.json()["records"]
prodcon_records[0]

{'HourUTC': '2021-12-31T23:00:00',
 'HourDK': '2022-01-01T00:00:00',
 'PriceArea': 'DK2',
 'CentralPowerMWh': 448.61849,
 'LocalPowerMWh': 50.807763,
 'CommercialPowerMWh': 274.005694,
 'LocalPowerSelfConMWh': 16.460406,
 'OffshoreWindLt100MW_MWh': 10.5253,
 'OffshoreWindGe100MW_MWh': 808.351601,
 'OnshoreWindLt50kW_MWh': 0.509807,
 'OnshoreWindGe50kW_MWh': 300.038752,
 'HydroPowerMWh': 0.0,
 'SolarPowerLt10kW_MWh': 0.015828,
 'SolarPowerGe10Lt40kW_MWh': 0.008131,
 'SolarPowerGe40kW_MWh': 0.0,
 'SolarPowerSelfConMWh': 0.0,
 'UnknownProdMWh': 0.02617,
 'ExchangeNO_MWh': None,
 'ExchangeSE_MWh': -830.0355,
 'ExchangeGE_MWh': 726.890008,
 'ExchangeNL_MWh': None,
 'ExchangeGB_MWh': None,
 'ExchangeGreatBelt_MWh': -390.7,
 'GrossConsumptionMWh': 1415.52245,
 'GridLossTransmissionMWh': 53.803703,
 'GridLossInterconnectorsMWh': 12.278992,
 'GridLossDistributionMWh': 64.092013,
 'PowerToHeatMWh': 7.07947}

In [11]:
production_records[0]

{'HourUTC': '2021-12-31T23:00:00',
 'HourDK': '2022-01-01T00:00:00',
 'MunicipalityNo': '510',
 'SolarMWh': 0.0,
 'OffshoreWindLt100MW_MWh': None,
 'OffshoreWindGe100MW_MWh': None,
 'OnshoreWindMWh': 12.903433,
 'ThermalPowerMWh': 0.0}

In [20]:
pd.DataFrame(production_records).dtypes

"""
json -> df
finnes det en eval tilsvarende for å bestemme kolonne dtype 
lage kolonner for tabell og bestemme tabelldtype
df -> json
"""

HourUTC                     object
HourDK                      object
MunicipalityNo              object
SolarMWh                   float64
OffshoreWindLt100MW_MWh    float64
OffshoreWindGe100MW_MWh    float64
OnshoreWindMWh             float64
ThermalPowerMWh            float64
dtype: object

In [29]:
def infer_dtype(value):
    try:
        evaluate = eval(value)
        return print(type(evaluate).__name__)
    except (SyntaxError, NameError, TypeError):
        pass
    
    try:
        parse(value)
        return print("datetime")
    except (ValueError, ParserError, TypeError):
        pass

    if value is None:
        return print("float")
    elif isinstance(value, str):
        return print("text")
    else:
        return print(type(value).__name__)
    

for val in production_records[0].values():
    infer_dtype(val)

datetime
datetime
int
float
float
float
float
float


In [26]:
infer_dtype("2021-12-31T23:00:00")

datetime
