## Genetic Programming with Energy Data

Data from the [National Grid ESO API ](https://www.nationalgrideso.com/data-portal/api-guidance). 

In [4]:
# imports and installs

import numpy as np
import pandas as pd
import requests
import json
import random
import itertools
import sys

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import HDBSCAN
from sklearn_extra.cluster import KMedoids

from sklearn.metrics import silhouette_score

## Importing data
The data below is retrieved from the National Grid Energy API, consisting of time series around electricity flow, electricity generation, capacity, pricing, and carbon. 

Each of the calls consist of multiple time series and these are combined into a single dataframe and transposed for the clustering algorithm. A total of 34 time series, each of length 17520 are used to test the genetic clustering algorithm. 

In [5]:
# API Calls to the Britain national grid API. Calling to retrieve historic electricity demand,
# interconnector, wind and solar outturn, and carbon intensity data for 2022 and/or 2023.

URL = 'https://api.nationalgrideso.com/api/3/action/datastore_search_sql?sql=SELECT * FROM "bb44a1b5-75b1-4db2-8491-257f23385006"'
response = requests.get(URL).json()
URL2 = 'https://api.nationalgrideso.com/api/3/action/datastore_search_sql?sql=SELECT * FROM "bf5ab335-9b40-4ea4-b93a-ab4af7bce003"'
response2 = requests.get(URL2).json()
URL3 = 'https://api.nationalgrideso.com/api/3/action/datastore_search_sql?sql=SELECT * FROM "3372646d-419f-4599-97a9-6bb4e7e32862"'
response3 = requests.get(URL3).json()
URL4 = 'https://api.nationalgrideso.com/api/3/action/datastore_search_sql?sql=SELECT * FROM "c16b0e19-c02a-44a8-ba05-4db2c0545a2a"'
response4 = requests.get(URL4).json()


# Converting responses from json into pandas dataframe
df_demand_2022 = pd.json_normalize(
    response["result"]["records"],
    meta=[
        "IFA_FLOW",
        "TSD",
        "VIKING_FLOW",
        "IFA2_FLOW",
        "EMBEDDED_WIND_GENERATION",
        "ND",
        "MOYLE_FLOW",
        "NEMO_FLOW",
        "ELECLINK_FLOW",
        "PUMP_STORAGE_PUMPING",
        "EMBEDDED_WIND_CAPACITY",
        "SETTLEMENT_DATE",
        "ENGLAND_WALES_DEMAND",
        "EMBEDDED_SOLAR_CAPACITY",
        "SCOTTISH_TRANSFER",
        "NON_BM_STOR",
        "_FULL_TEXT",
        "SETTLEMENT_PERIOD",
        "EAST_WEST_FLOW",
        "NSL_FLOW",
        "BRITNED_FLOW",
        "_ID",
        "EMBEDDED_SOLAR_GENERATION",
    ],
)

df_demand_2023 = pd.json_normalize(
    response2["result"]["records"],
    meta=[
        "IFA_FLOW",
        "TSD",
        "VIKING_FLOW",
        "IFA2_FLOW",
        "EMBEDDED_WIND_GENERATION",
        "ND",
        "MOYLE_FLOW",
        "NEMO_FLOW",
        "ELECLINK_FLOW",
        "PUMP_STORAGE_PUMPING",
        "EMBEDDED_WIND_CAPACITY",
        "SETTLEMENT_DATE",
        "ENGLAND_WALES_DEMAND",
        "EMBEDDED_SOLAR_CAPACITY",
        "SCOTTISH_TRANSFER",
        "NON_BM_STOR",
        "_FULL_TEXT",
        "SETTLEMENT_PERIOD",
        "EAST_WEST_FLOW",
        "NSL_FLOW",
        "BRITNED_FLOW",
        "_ID",
        "EMBEDDED_SOLAR_GENERATION",
    ],
)

df_historic_prices_2022 = pd.json_normalize(
    response3["result"]["records"],
    meta=[
        "Settlement Period",
        "Half-hourly Charge",
        "Run Type",
        "Total Daily BSUoS Charge",
        "_full_text",
        "BSUoS Price (£/MWh Hour)",
        "Settlement Day",
        "_id",
    ],
)

df_carbon = pd.json_normalize(
    response4["result"]["records"],
    meta=[
        "East Midlands",
        "East England",
        "West Midlands",
        "North Scotland",
        "South Scotland",
        "_full_text",
        "South West England",
        "datetime",
        "North Wales and Merseyside",
        "North East England",
        "South East England",
        "South Wales",
        "North West England",
        "Yorkshire",
        "London",
        "_id",
        "South England",
    ],
)

# Conversions to datetime for extracting data for specific years
df_historic_prices_2022["Settlement Day"] = pd.to_datetime(
    df_historic_prices_2022["Settlement Day"]
)
df_historic_prices_2022 = df_historic_prices_2022[
    df_historic_prices_2022["Settlement Day"].dt.year == 2022
]

df_carbon["datetime"] = pd.to_datetime(df_carbon["datetime"])
df_carbon_2023 = df_carbon[df_carbon["datetime"].dt.year == 2023]

# Dropping unused columns for future concatenation
df_demand_2022 = df_demand_2022.drop(["_full_text", "NON_BM_STOR"], axis=1)
df_demand_2023 = df_demand_2023.drop(["_full_text", "NON_BM_STOR"], axis=1)
df_historic_prices_2022 = df_historic_prices_2022.drop(
    ["Run Type", "_full_text"], axis=1
)
df_carbon_2023 = df_carbon_2023.drop(["_full_text"], axis=1)

print(" ----- Shape of dataframes ----- ")
print(" --- 2022 Demand: ", df_demand_2022.shape)
print(" --- 2023 Demand: ", df_demand_2023.shape)
print(" --- 2022 Prices:", df_historic_prices_2022.shape)
print(" --- 2023 Carbon:", df_carbon_2023.shape)

 ----- Shape of dataframes ----- 
 --- 2022 Demand:  (17520, 19)
 --- 2023 Demand:  (17520, 21)
 --- 2022 Prices: (17520, 6)
 --- 2023 Carbon: (17520, 16)


In [6]:
# Dropping id columns and now unused date columns.
# Only want the time series that will be clustered - each are of size 17520
# so the "_id" column is able to be dropped.

df_demand_2022_noid = df_demand_2022.drop(
    ["_id", "SETTLEMENT_DATE", "SETTLEMENT_PERIOD"], axis=1
)
df_demand_2023_noid = df_demand_2023.drop(
    ["_id", "SETTLEMENT_DATE", "SETTLEMENT_PERIOD"], axis=1
)
df_demand_2023_noid.columns = [str(col) + "_2" for col in df_demand_2023_noid.columns]
df_historic_prices_2022_noid = df_historic_prices_2022.drop(
    ["Settlement Period", "Settlement Day", "_id"], axis=1
)
df_carbon_2023_noid = df_carbon_2023.drop(["_id", "datetime"], axis=1)

# Concatenating the dataframes.
df_full = pd.concat(
    [
        df_historic_prices_2022_noid.reset_index().drop("index", axis=1, inplace=True),
        df_demand_2022_noid,
        df_demand_2023_noid,
        df_carbon_2023_noid.reset_index().drop("index", axis=1, inplace=True),
    ],
    axis=1,
)

# Must perform scaling since clustering algorithms work on similarity/distance
df_full = StandardScaler().fit_transform(df_full)
df_full_transposed = df_full.transpose()

# Checking for any null values in the time series
null_count_full = 0
for curr_list in df_full_transposed:
    null_count = sum(1 for item in curr_list if item is None)
    null_count_full += null_count

print("Number of nulls:", null_count_full)

Number of nulls: 0


## Dictionaries for Model Generation

We are going to instantiate multiple clustering models as our initial population for the algorithm, and use these similarly to mutate and crossover algorithms. The models used in the algorithm are below, with associated parameters to adjust:
- KMeans: number of clusters, max iteration, tolerance
- KMedoids: number of clusters, metric, method, max iteration
- DBSCAN: epsilon, minimum samples, metric
- HDBSCAN: epsilon, minimum samples, metric

In [7]:
# Defining which parameters are appropriate to adjust for each clustering model

model_list = ["KMeans", "KMedoids", "DBSCAN", "HDBSCAN"]

list_dict_model_params = [
    {"KMeans": ["n_clusters", "max_iter", "tol"]},
    {"KMedoids": ["n_clusters", "metric_1", "method", "max_iter"]},
    {"DBSCAN": ["eps", "min_samples", "metric_1"]},
    {"HDBSCAN": ["metric_2", "min_samples", "eps"]},
]

# Defining which parameter values each model can take
dict_param_values = {
    "n_clusters": list(range(2, 11)),
    "max_iter": list(range(50, 510, 10)),
    "tol": list(np.arange(0.0001, 0.1001, 0.001)),
    "metric_1": [
        "euclidean",
        "cosine",
        "haversine",
        "l2",
        "cityblock",
        "l1",
        "manhattan",
    ],
    "metric_2": [
        "l2",
        "canberra",
        "manhattan",
        "euclidean",
        "braycurtis",
        "chebyshev",
        "hamming",
    ],
    "method": ["alternate", "pam"],
    "eps": list(np.arange(0.1, 4.1, 0.1)),
    "min_samples": list(range(3, 11)),
}

## Functions for the Genetic Algorithm

In [18]:
# The import evolution_fns includes all functions required for evolution
import evolution_fns

In [19]:
# The evolution function is an iterative algorithm that comprises of an initial population,
# fitness evalution, selection, crossover, and mutation. Returns the top 10 models.

evolution_fns.evolution(
    model_params=list_dict_model_params,
    param_values=dict_param_values,
    init_population_num=20,
    df=df_full_transposed,
    selection_param=0.8,
    crossover_repeat=2,
    mutation_repeat=2,
    cutoff_score=0.4,
)

Top model: KMeans(max_iter=50, tol=0.0061), associated score: 0.25340341983699355
Top model: KMeans(max_iter=50, tol=0.041100000000000005), associated score: 0.26062967012871296
Top model: KMeans(tol=0.032100000000000004), associated score: 0.2842059653587293
Top model: KMeans(tol=0.032100000000000004), associated score: 0.27468539890174
Top model: KMeans(max_iter=90, tol=0.0941), associated score: 0.28306130060266455
Top model: KMeans(max_iter=50, tol=0.0061), associated score: 0.28317910424671267
Top model: KMeans(max_iter=50, tol=0.0901), associated score: 0.2842059653587293
Top model: KMeans(max_iter=50, tol=0.041100000000000005), associated score: 0.2868184018099966
Top model: KMeans(max_iter=50, tol=0.0061), associated score: 0.28306130060266455
Top model: KMeans(max_iter=380, tol=0.032100000000000004), associated score: 0.2875367162737374
Top model: KMeans(max_iter=50, tol=0.0751), associated score: 0.28385385434315336
Top model: KMeans(max_iter=160, tol=0.06910000000000001), as

[(KMeans(max_iter=380, n_clusters=10, tol=0.032100000000000004),
  0.3226796662985823),
 (KMeans(max_iter=50, n_clusters=10, tol=0.0731), 0.3226796662985823),
 (KMeans(max_iter=50, n_clusters=10, tol=0.032100000000000004),
  0.31843003087318367),
 (KMeans(max_iter=260, n_clusters=10, tol=0.032100000000000004),
  0.31743999685079555),
 (KMeans(max_iter=50, n_clusters=10, tol=0.032100000000000004),
  0.31743999685079555),
 (KMeans(max_iter=50, n_clusters=10, tol=0.032100000000000004),
  0.31471874124499927),
 (KMeans(max_iter=50, n_clusters=10, tol=0.032100000000000004),
  0.31471874124499927),
 (KMeans(max_iter=50, n_clusters=10, tol=0.032100000000000004),
  0.3138412933519657),
 (KMeans(max_iter=390, n_clusters=10, tol=0.032100000000000004),
  0.31336945757920026),
 (KMeans(max_iter=390, n_clusters=10, tol=0.032100000000000004),
  0.3116664364498815)]