In [28]:
# imports
import json
import requests
import pandas as pd
import pymongo.errors
from bs4 import BeautifulSoup
from io import StringIO
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi


## MongoDB

### Set up remote database

In [12]:
# read
with open("../.nosync/mongoDB.json", "r") as file:
    credentials = json.load(file)

uri = (
    "mongodb+srv://medvetslos:"
    + credentials["pwd"]
    + "@ind320-project.lunku.mongodb.net/?retryWrites=true&w=majority&appName=IND320-project"
)

mdb_client = MongoClient(uri, server_api=ServerApi("1"))

try:
    mdb_client.admin.command("ping")
    print("Pinged your deployment. Successfully connected to MongoDB.")
except Exception as exceptionMsg:
    print(exceptionMsg)

Pinged your deployment. Successfully connected to MongoDB.


In [31]:
# Creating collections for municipality data and gas prices
database = mdb_client["IND320-project"]
collection_names = ["muncipalities", "gas"]

for name in collection_names:
    # Checking if collection exists. If not, create the collection.
    try:
        database.create_collection(name)
        print(f"Collection '{name}' was created successfully.")
    except pymongo.errors.CollectionInvalid:
        print(f"Collection '{name}' already exists.")

municipalities = database["municipalities"]
gas = database["gas"]


Collection 'muncipalities' already exists.
Collection 'gas' already exists.


## Cassandra

In [None]:
from cassandra.cluster import Cluster

cluster = Cluster(["localhost"], port=9042)
session = cluster.connect()
keyspace = "new_table"
session.execute(
    "CREATE KEYSPACE IF NOT EXISTS" + 
    " " + 
    keyspace + 
    " " + 
    "WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1};"
)


## Webscraping



In [27]:
webscrape_url = "https://en.wikipedia.org/wiki/List_of_municipalities_of_Denmark"

page = requests.get(webscrape_url)
soup = BeautifulSoup(page.content, "html.parser")
wiki_table = soup.find("table", attrs={"class": "wikitable sortable"})

df_municipalities = pd.read_html(StringIO(str(wiki_table)))[0]
records_municipalities = df_municipalities.to_dict("records")

LAU_1_code = df_municipalities.columns.tolist()[0]


# check if data we are writing already exists
existing_entries = list(
    database["municipalities"].find(
        {
            LAU_1_code: {
                "$in": [record[LAU_1_code] for record in records_municipalities]
            }
        }
    )
)

# if new data,
new_entries = [
    entry for entry in records_municipalities
    if not any(existing_entry[LAU_1_code] == entry[LAU_1_code] for existing_entry in existing_entries)
]

# Writing to MongoDB
if len(new_entries) > 0:
    database["municipalities"].insert_many(new_entries)
    print("Data successfully written into the collection.")
else:
    print("No new data to be inserted into the collection.")

# database["municipalities"].delete_many({}) # delete all records

No new data to be inserted into the collection.


## API

In [34]:
url_api = "https://api.energidataservice.dk/dataset/"
gas = "GasDailyBalancingPrice"  # riktig datasett? data fra 2022-10-01 og utover. ikke 2022-01-01 som oppgaven tilsier
production = "ProductionMunicipalityHour"
consumption = "productionconsumptionsettlement"
prodcons = "consumptionindustry"
query = "?start=2022-01-01&end=2022-12-31"

# # remote
# gas_json = requests.get(url=url_api+gas+query).content
# gas_data = pd.read_json(gas_json).to_dict("records")
# database["gas"].insert_many(gas_data)

# # local
# for table_name in [production, consumption, prodcons]:
#     table_data = requests.get(url=url_api+table_name+query)

#     # Table creation
#     session.execute("CREATE TABLE IF EXISTS" + " " + keyspace + "." +
#                     table_name + ";")


In [9]:
pd.DataFrame.from_dict(requests.get(url_api + gas + query).json()["records"])

In [37]:
requests.get(url_api + gas + query).json()["records"][0].keys())

'GasDay'