In [2]:
import sys
sys.path.append("..")

In [3]:
from db.duckdb.duckdbhelper import DuckDBDatabaseHelper
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.cluster import KMeans
from pprint import pprint

In [4]:
# Location of the DuckDB file, relative to the notebook's working directory.
DB_PATH = "../meters.db"

# A single helper instance is shared by every query cell in this notebook.
db = DuckDBDatabaseHelper(DB_PATH)
db.connect()
# NOTE(review): this connection is not closed in this cell, and the next query
# cell calls db.connect() again -- confirm the helper tolerates reconnecting
# without leaking the first connection.

Connected to DuckDB database: ../meters.db


In [5]:
# Sum energy_sum per LCLid over all rows whose `day` falls in 2014.
# The result tuples are (SUM(energy_sum), LCLid); the next cell relies on that
# column order when it names the DataFrame columns.
# NOTE(review): there is no ORDER BY, so SQL does not guarantee the row order
# between runs -- row positions in the outputs below may differ on re-run.
sql_query = """

SELECT SUM(energy_sum),LCLid
FROM  meters m 
WHERE 
date_part('year', day) = 2014
GROUP BY LCLid
"""

db.connect()
try:
    records = db.fetch_all(sql_query)
finally:
    # Always release the connection, even if the query raises.
    db.close_connection()

Connected to DuckDB database: ../meters.db
Fetched 5108 rows.
Connection closed.


In [6]:
# Column names are applied positionally to the (sum, LCLid) tuples in `records`.
column_names = ["energy_sum", "LCLid"]
energy_sum = pd.DataFrame(records, columns=column_names)
# Replace missing values in every column with 0 so the clustering input has no NaN.
energy_sum = energy_sum.fillna(0)

In [7]:
# Independent 1-D NumPy copy of the per-LCLid totals, used as the clustering input.
n_energy_sum = energy_sum["energy_sum"].to_numpy(copy=True)

In [8]:
# KMeans needs a 2-D (n_samples, n_features) array; there is one feature here.
features = n_energy_sum.reshape(-1, 1)

# Five clusters with a fixed seed so the label assignment is repeatable.
kmeans = KMeans(n_clusters=5, random_state=0, n_init="auto")
kmeans.fit(features)

# One cluster label per input row, in the same order as `energy_sum`.
labels = list(kmeans.labels_)

In [9]:
# Start from an empty frame; the following cells add the energy totals and labels.
clusters = pd.DataFrame()

In [10]:
# Copy the per-LCLid totals over; `clusters` takes on the index of `energy_sum`.
clusters["energy_sum"] = energy_sum["energy_sum"]

In [11]:
# `labels` is a plain list, so it is assigned positionally (row i gets labels[i]).
clusters["labels"] = labels

In [12]:
# Display each energy total next to its assigned cluster label.
clusters

Unnamed: 0,energy_sum,labels
0,2231.721002,0
1,949.765999,2
2,1036.168000,2
3,1078.617000,2
4,2399.529000,0
...,...,...
5103,1319.622000,2
5104,500.209000,4
5105,616.961000,4
5106,349.471000,1


In [13]:
# Per-cluster summary: median energy total and number of members.
energy_by_label = clusters.groupby("labels")["energy_sum"]
result = energy_by_label.agg(["median", "count"])

In [14]:
# Rank the clusters from highest to lowest median consumption.
result.sort_values("median", ascending=False)

Unnamed: 0_level_0,median,count
labels,Unnamed: 1_level_1,Unnamed: 2_level_1
3,3284.566,53
0,1883.313,239
2,1087.135,776
4,631.936,1761
1,300.092,2279


In [15]:
# Preview the totals frame before the cluster labels are attached.
energy_sum.head()

Unnamed: 0,energy_sum,LCLid
0,2231.721002,MAC000778
1,949.765999,MAC000850
2,1036.168,MAC002924
3,1078.617,MAC002937
4,2399.529,MAC003166


In [16]:
# Both frames must have the same number of rows before labels are copied across.
energy_sum.shape[0], clusters.shape[0]

(5108, 5108)

In [17]:
# Attach the labels to the frame that also carries LCLid. This is a Series
# assignment, so pandas aligns on the index; `clusters` took its index from
# `energy_sum`, so rows match one-to-one.
energy_sum["cluster_label"] = clusters["labels"]

In [18]:
# Preview again, now including the cluster_label column.
energy_sum.head()

Unnamed: 0,energy_sum,LCLid,cluster_label
0,2231.721002,MAC000778,0
1,949.765999,MAC000850,2
2,1036.168,MAC002924,2
3,1078.617,MAC002937,2
4,2399.529,MAC003166,0


In [19]:
# First few rows assigned to cluster 2.
energy_sum.loc[energy_sum["cluster_label"].eq(2)].head()

Unnamed: 0,energy_sum,LCLid,cluster_label
1,949.765999,MAC000850,2
2,1036.168,MAC002924,2
3,1078.617,MAC002937,2
11,913.555,MAC001598,2
35,989.239,MAC001371,2


In [20]:
# First few rows assigned to cluster 4.
energy_sum.loc[energy_sum["cluster_label"].eq(4)].head()

Unnamed: 0,energy_sum,LCLid,cluster_label
5,528.289,MAC004529,4
8,576.595,MAC001546,4
14,661.406,MAC002069,4
19,872.723,MAC000681,4
20,758.442,MAC000709,4


In [21]:
# Remove any previous cluster_energy table so it can be rebuilt from scratch.
sql_query = """
DROP TABLE IF EXISTS cluster_energy 
"""
db.connect()
try:
    # The result of this DDL statement is not stored: binding it to `records`
    # would overwrite the rows fetched earlier and break a re-run of the cell
    # that builds `energy_sum` from `records`.
    db.execute_query(sql_query)
finally:
    # Always release the connection, even if the statement raises.
    db.close_connection()

Connected to DuckDB database: ../meters.db
Query executed successfully.
Connection closed.


In [22]:
# Persist the labelled totals as the cluster_energy table.
# CREATE OR REPLACE is used instead of CREATE ... IF NOT EXISTS so that this
# cell always writes the current `energy_sum` frame; with IF NOT EXISTS an
# already-present table would be kept silently and the new labels discarded.
sql_query = """
CREATE OR REPLACE TABLE cluster_energy AS
SELECT * FROM energy_sum
"""
db.connect()
try:
    # Expose the pandas frame to SQL under the name used in the query above.
    db.register_df("energy_sum", energy_sum)
    # The DDL result is not stored, so `records` keeps the last fetched rows.
    db.execute_query(sql_query)
finally:
    # Always release the connection, even if registration or the query raises.
    db.close_connection()

Connected to DuckDB database: ../meters.db
Query executed successfully.
Connection closed.


In [23]:
# Read the persisted table back to verify what was written.
sql_query = """
SELECT * FROM cluster_energy
"""
db.connect()
try:
    records = db.fetch_all(sql_query)
finally:
    # Always release the connection, even if the query raises.
    db.close_connection()

Connected to DuckDB database: ../meters.db
Fetched 5108 rows.
Connection closed.


In [24]:
# Peek at the first four rows read back from cluster_energy.
records[:4]

[(2231.7210023, 'MAC000778', 0),
 (949.7659987, 'MAC000850', 2),
 (1036.1679999, 'MAC002924', 2),
 (1078.6169998999997, 'MAC002937', 2)]

In [25]:
# List every table and view in the database to confirm cluster_energy exists.
sql_query = """
SELECT table_name, table_type 
FROM information_schema.tables;
"""
db.connect()
try:
    records = db.fetch_all(sql_query)
    pprint(records)
finally:
    # Always release the connection, even if the query or printing raises.
    db.close_connection()

Connected to DuckDB database: ../meters.db
Fetched 6 rows.
[('cluster_energy', 'BASE TABLE'),
 ('info_household', 'VIEW'),
 ('meters', 'VIEW'),
 ('uk_bank_holidays', 'VIEW'),
 ('weather_daily_darksky', 'VIEW'),
 ('weather_daily_darksky_modified', 'VIEW')]
Connection closed.
