In [None]:
from db.duckdb.duckdbhelper import DuckDBDatabaseHelper
from pprint import pprint
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Create the project's DuckDB helper for the "meters.db" file and connect it.
# The resulting `db` handle is the one the query cells below call fetch_all() on.
db = DuckDBDatabaseHelper("meters.db")
db.connect()

In [None]:
# Row-count query for the meters table (plain string: nothing is interpolated).
sql_query = "SELECT COUNT(1) FROM  meters"

In [None]:
# Run the query currently held in sql_query and print the first column of the
# first returned row, scaled down by one million.
records = db.fetch_all(sql_query)
first_row = records[0]
print(first_row[0] / 1e6)

In [None]:
# Count the distinct LCLid values present in the meters table.
sql_query = "SELECT COUNT(DISTINCT(LCLid)) FROM  meters"
records = db.fetch_all(sql_query)
# The label spells the column exactly as it appears in the query (LCLid).
print(f"Unique LCLid count is {records[0][0]}")

In [None]:
# Sum energy_sum over every row of the meters table and print it in millions.
sql_query = "SELECT SUM(energy_sum) FROM  meters"
records = db.fetch_all(sql_query)
# The label describes the value actually printed (a total, scaled by 1e6),
# not a count of unique ids.
print(f"Total energy_sum (millions) is {records[0][0]/1e6}")

In [None]:
# Total energy_sum per calendar year of `day`.
# ORDER BY pins the row order: GROUP BY alone does not guarantee one, so the
# printed list could otherwise change between runs.
sql_query = """
SELECT SUM(energy_sum),datepart('year',day)
FROM  meters GROUP BY datepart('year',day)
ORDER BY datepart('year',day)"""
records = db.fetch_all(sql_query)
print(records)

In [None]:
# Look up the column names of the meters table in the information schema.
sql_query = (
    "SELECT column_name FROM information_schema.columns "
    "WHERE table_name='meters'"
)
records = db.fetch_all(sql_query)
pprint(records)

In [None]:
# Average energy_median per calendar year of `day`.
# ORDER BY pins the row order: GROUP BY alone does not guarantee one, so the
# printed list could otherwise change between runs.
sql_query = """
SELECT AVG(energy_median),datepart('year',day)
FROM  meters GROUP BY datepart('year',day)
ORDER BY datepart('year',day)"""
records = db.fetch_all(sql_query)
pprint(records)

In [None]:
# Average energy_median over the rows whose day equals 2012-12-26.
sql_query = (
    "\n"
    "SELECT AVG(energy_median)\n"
    "FROM  meters WHERE day = '2012-12-26' "
)
records = db.fetch_all(sql_query)
print(records)

In [None]:
# Close the helper's connection now that the meters-only queries above are done;
# later cells call db.connect() again before querying.
db.close_connection()

In [None]:
# Look up the column names of the info_household table.
sql_query ="SELECT column_name FROM information_schema.columns WHERE table_name='info_household'"
db.connect()
try:
    records = db.fetch_all(sql_query)
finally:
    # Close here, like the other query cells do, so the connection is not left
    # open when the next cell calls db.connect() again or when the query raises.
    db.close_connection()
pprint(records)

In [None]:
# Total energy_sum per Acorn_grouped value. The LEFT JOIN keeps meters rows that
# have no info_household match; those are grouped under a NULL Acorn_grouped.
sql_query ="""

SELECT SUM(m.energy_sum),ih.Acorn_grouped
FROM  meters m LEFT JOIN info_household ih
ON 
m.LCLid = ih.LCLid
GROUP BY ih.Acorn_grouped

"""

db.connect()
try:
    records = db.fetch_all(sql_query)
finally:
    # Close even when the query raises, so a failed cell does not leave the
    # connection open for the next db.connect().
    db.close_connection()
print(records[:4])

In [None]:
# Tabulate the (energy_sum, Acorn_grouped) rows, largest energy_sum first.
df_p = (
    pd.DataFrame(records, columns=["energy_sum", "Acorn_grouped"])
    .sort_values(by="energy_sum", ascending=False)
)

In [None]:
def draw_plot(fig_size_x=15,
              fig_size_y=10,
              tick_params_labelsize=14,
              xlabel_name_fontsize=20,
              ylabel_name_fontsize=20,
              title_name_fontsize=20,
              xlabel_name=None,
              ylabel_name=None,
              title_name=None):
    """Resize and label the current matplotlib figure.

    Works on whatever figure/axes are current (``plt.gcf()`` / ``fig.gca()``),
    so call it right after the plotting call it should decorate.

    Parameters
    ----------
    fig_size_x, fig_size_y : figure width and height, in inches.
    tick_params_labelsize : font size of the tick labels on both axes.
    xlabel_name_fontsize, ylabel_name_fontsize, title_name_fontsize :
        font sizes of the x label, y label and title.
    xlabel_name, ylabel_name, title_name : optional label/title texts.
        When one is left as ``None`` the text is read from the module-level
        variable of the same name, which is how existing ``draw_plot()`` calls
        supply it. Passing the text explicitly avoids depending on that
        hidden notebook state.

    Raises
    ------
    NameError
        If a text is neither passed in nor defined as a module-level variable.
    """

    def _from_notebook(name, explicit):
        # Prefer the explicit argument; otherwise fall back to the global of
        # the same name and keep raising NameError when it is missing.
        if explicit is not None:
            return explicit
        try:
            return globals()[name]
        except KeyError:
            raise NameError(f"name '{name}' is not defined") from None

    xlabel_name = _from_notebook("xlabel_name", xlabel_name)
    ylabel_name = _from_notebook("ylabel_name", ylabel_name)
    title_name = _from_notebook("title_name", title_name)

    # Current figure and its current axes
    fig = plt.gcf()
    fig.set_size_inches(fig_size_x, fig_size_y)
    ax = fig.gca()

    # Tick label size applies to both axes
    ax.tick_params(labelsize=tick_params_labelsize)

    # Axis labels and title, each with its own font size
    ax.set_xlabel(xlabel_name, fontsize=xlabel_name_fontsize)
    ax.set_ylabel(ylabel_name, fontsize=ylabel_name_fontsize)
    ax.set_title(title_name, fontsize=title_name_fontsize)


In [None]:
# Texts that draw_plot() picks up from the notebook namespace.
title_name = "Acorn_grouped and energy_sum"
xlabel_name = "Energy Sum"
ylabel_name = "Acorn Group"

# Horizontal bars: one per Acorn group, bar length = energy_sum.
sns.barplot(data=df_p, x="energy_sum", y="Acorn_grouped", color="gray")

draw_plot()

In [None]:
# Count info_household rows per Acorn_grouped value (COUNT over the column, so
# NULL Acorn_grouped entries are not counted).
sql_query ="""

SELECT COUNT(ih.Acorn_grouped),ih.Acorn_grouped
FROM  info_household ih
GROUP BY ih.Acorn_grouped

"""

db.connect()
try:
    records = db.fetch_all(sql_query)
finally:
    # Close even when the query raises, so a failed cell does not leave the
    # connection open for the next db.connect().
    db.close_connection()
print(records[:4])

In [None]:
# Tabulate the (Count, Acorn_grouped) rows, largest Count first.
df_p = (
    pd.DataFrame(records, columns=["Count", "Acorn_grouped"])
    .sort_values(by="Count", ascending=False)
)

In [None]:
# Texts that draw_plot() picks up from the notebook namespace.
title_name = "Acorn_grouped and Count"
xlabel_name = "Count"
ylabel_name = "Acorn Group"

# Horizontal bars: one per Acorn group, bar length = Count.
sns.barplot(data=df_p, x="Count", y="Acorn_grouped", color="gray")

draw_plot()

In [None]:
# Total energy_sum per info_household.file value. The LEFT JOIN keeps meters
# rows that have no info_household match; those are grouped under a NULL file.
sql_query ="""

SELECT SUM(m.energy_sum),ih.file
FROM  meters m LEFT JOIN info_household ih
ON 
m.LCLid = ih.LCLid
GROUP BY ih.file

"""

db.connect()
try:
    records = db.fetch_all(sql_query)
finally:
    # Close even when the query raises, so a failed cell does not leave the
    # connection open for the next db.connect().
    db.close_connection()
print(records[:4])

In [None]:
# Tabulate the (energy_sum, file) rows, largest energy_sum first.
df_p = (
    pd.DataFrame(records, columns=["energy_sum", "file"])
    .sort_values(by="energy_sum", ascending=False)
)

In [None]:
# Texts that draw_plot() picks up from the notebook namespace.
title_name = "Blocks and Energy"
xlabel_name = "Energy"
ylabel_name = "Blocks"

# Horizontal bars for the first 20 rows of df_p, bar length = energy_sum.
sns.barplot(data=df_p.head(20), x="energy_sum", y="file", color="gray")

draw_plot()

In [None]:
# Styled "Blocks and Energy" chart for the first 20 rows of df_p (the frame is
# sorted by energy_sum, descending, where it is built), with the first three
# bars highlighted.
top_blocks = df_p[:20]

# One colour per bar: three highlight colours, then teal for the remainder.
colors = ['#F3BC07', '#A9A9A9', '#8B4513'] + ['#008080'] * (len(top_blocks) - 3)

# Set the style
sns.set_theme(style="whitegrid")

# Create the figure with a specified size
plt.figure(figsize=(12, 8))

# Create the barplot (hue duplicates the y variable so each bar gets its own
# palette entry)
bar_plot = sns.barplot(y='file',
                       x='energy_sum',
                       data=top_blocks,
                       hue='file',
                       palette=colors,
                       dodge=False,
                       width=0.6)

# Set titles and labels with enhanced font sizes and styles
plt.title("Blocks and Energy", fontsize=20, fontweight='bold')
plt.xlabel("Energy", fontsize=16)
plt.ylabel("Blocks", fontsize=16)

# Add a value label just past the tip of every bar
for bar in bar_plot.patches:
    bar_width = bar.get_width()
    # Depending on the seaborn version, hue levels without a bar at a position
    # may show up as NaN-width patches; labelling those would print "nan".
    if not np.isfinite(bar_width):
        continue
    bar_plot.annotate(f'{bar_width:,.1f}',  # thousands separators, 1 decimal
                      (bar_width, bar.get_y() + bar.get_height() / 2),
                      # left-anchored so the text starts after the bar tip
                      # instead of straddling it
                      ha='left', va='center',
                      fontsize=12, color='black',
                      xytext=(5, 0), textcoords='offset points')

# Show gridlines for better readability
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# Number of distinct file values in info_household.
sql_query ="""

SELECT COUNT(DISTINCT(ih.file))
FROM info_household ih

"""

db.connect()
try:
    records = db.fetch_all(sql_query)
finally:
    # Close even when the query raises, so a failed cell does not leave the
    # connection open for the next db.connect().
    db.close_connection()
print(records[:4])

In [None]:
# Number of info_household rows per file value.
sql_query ="""

SELECT COUNT(1),ih.file
FROM info_household ih
GROUP BY (ih.file)

"""

db.connect()
try:
    records = db.fetch_all(sql_query)
finally:
    # Close even when the query raises, so a failed cell does not leave the
    # connection open for the next db.connect().
    db.close_connection()
pprint(records)

In [None]:
# Preview of meters joined to info_household on LCLid.
# The result of this cell is not used by later cells (the next query rebinds
# `records`), so only a handful of rows is fetched instead of materialising
# the whole joined table in Python memory.
sql_query ="""

SELECT *
FROM  meters m LEFT JOIN info_household ih
ON 
m.LCLid = ih.LCLid
LIMIT 5

"""

db.connect()
try:
    records = db.fetch_all(sql_query)
finally:
    # Close even when the query raises, so a failed cell does not leave the
    # connection open for the next db.connect().
    db.close_connection()
pprint(records)

In [None]:
# energy_sum of every meters row whose day falls on 2013-09-10 (year, month and
# day-of-month are matched separately).
sql_query ="""

SELECT energy_sum
FROM  meters m 
WHERE 
date_part('year', day) = 2013
AND date_part('month', day) = 9
AND date_part('day', day) = 10
"""

db.connect()
try:
    records = db.fetch_all(sql_query)
finally:
    # Close even when the query raises, so a failed cell does not leave the
    # connection open for the next db.connect().
    db.close_connection()

In [None]:
# One-column frame of the fetched energy_sum values; missing values become 0.
energy_sum = pd.DataFrame(records, columns=["energy_sum"])
energy_sum = energy_sum.fillna(0)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Copy the energy_sum column into a standalone 1-D NumPy array.
n_energy_sum = np.array(energy_sum.loc[:, "energy_sum"])

In [None]:
# Display the array of energy_sum values as the cell output.
n_energy_sum

In [None]:
# Cluster the energy_sum values into 10 groups. The values are reshaped into a
# single-feature column because the array is 1-D; random_state fixes the seed.
kmeans = KMeans(n_clusters=10, random_state=0, n_init="auto")
kmeans.fit(n_energy_sum.reshape(-1, 1))

# One cluster id per input value, in input order.
labels = [cluster_id for cluster_id in kmeans.labels_]

In [None]:
# Start from an empty frame; the following cells add the energy_sum values and
# their cluster labels as columns.
clusters = pd.DataFrame()

In [None]:
# Select the column explicitly so a Series is assigned, instead of relying on
# pandas coercing the whole one-column DataFrame into a single column.
clusters["energy_sum"] = energy_sum["energy_sum"]

In [None]:
# Attach the KMeans cluster id of each reading (labels comes from
# kmeans.labels_, fitted on the same energy_sum values).
clusters["labels"] = labels

In [None]:
# Display the energy_sum / labels frame as the cell output.
clusters

In [None]:
# Per-cluster summary: mean energy_sum and number of readings for each label.
result = (
    clusters
    .groupby("labels")["energy_sum"]
    .agg(["mean", "count"])
)

In [None]:
# Display the per-cluster mean/count table as the cell output.
result