In [1]:
from db.duckdb.duckdbhelper import DuckDBDatabaseHelper
from pprint import pprint
import pandas as pd

In [2]:
# Number of rows randomly drawn from the query result to form the training sample.
NUM_ROWS = 1_500_000

In [3]:
# Helper object for the DuckDB database file "meters.db".
# The connection itself is opened explicitly with db.connect() right before querying.
db = DuckDBDatabaseHelper("meters.db")

In [4]:
# Build the 2013 modelling dataset: rows of `meters` enriched with household
# info, daily weather and bank-holiday data. All joins are LEFT JOINs, so a
# meters row without a match is kept with NULLs in the joined columns.
# NOTE(review): the selected columns are unqualified, so each name must exist
# in exactly one of the joined tables; presumably `Type` comes from
# uk_bank_holidays - confirm against the schema.
sql_query ="""

SELECT day,energy_sum, 
temperatureMin, 
temperatureMax,
uvIndex,
dewPoint,                  
Type,
date_part('month', day) as month
FROM  meters m LEFT JOIN info_household ih
ON m.LCLid = ih.LCLid
LEFT JOIN weather_daily_darksky_modified wd
ON m.day = wd.formatted_temperatureMaxTime_weather_daily_darksky
LEFT JOIN uk_bank_holidays uk_bh
ON m.day = uk_bh."Bank holidays"

WHERE date_part('year', day) = 2013

"""

db.connect()
try:
    # Rows come back as a list of tuples in SELECT column order.
    records = db.fetch_all(sql_query)
finally:
    # Always release the connection, even if the query fails; otherwise a
    # failed fetch would leave the database connection open in the kernel.
    db.close_connection()

Connected to DuckDB database: meters.db
Fetched 1966727 rows.
Connection closed.


In [5]:
# Peek at the first five raw row tuples returned by the query.
records[:5]

[(datetime.date(2013, 1, 3), 10.074, 9.65, 11.41, 1.0, 9.12, None, 1),
 (datetime.date(2013, 1, 6),
  10.293000000000003,
  6.91,
  8.61,
  1.0,
  7.0,
  None,
  1),
 (datetime.date(2013, 1, 7), 9.439000000000002, 7.1, 9.08, 1.0, 6.78, None, 1),
 (datetime.date(2013, 1, 11),
  10.978999900000002,
  0.89,
  5.7,
  1.0,
  1.68,
  None,
  1),
 (datetime.date(2013, 1, 12),
  10.585999900000003,
  1.75,
  4.53,
  1.0,
  0.79,
  None,
  1)]

In [6]:
# Column labels for the query result.
# The order must mirror the SELECT list of the SQL query, because the rows are
# plain tuples and are labelled positionally.
COLS = [
    "day",
    "energy_sum",
    "temperatureMin",
    "temperatureMax",
    "uvIndex",
    "dewPoint",
    "Type",
    "month",
]

In [7]:
# Wrap the raw row tuples in a DataFrame, labelling the columns with COLS.
df_model = pd.DataFrame(records,columns = COLS)

In [8]:
# Preview the first rows of the assembled frame.
df_model.head()

Unnamed: 0,day,energy_sum,temperatureMin,temperatureMax,uvIndex,dewPoint,Type,month
0,2013-01-03,10.074,9.65,11.41,1.0,9.12,,1
1,2013-01-06,10.293,6.91,8.61,1.0,7.0,,1
2,2013-01-07,9.439,7.1,9.08,1.0,6.78,,1
3,2013-01-11,10.979,0.89,5.7,1.0,1.68,,1
4,2013-01-12,10.586,1.75,4.53,1.0,0.79,,1


In [9]:
# Model inputs: weather readings, the bank-holiday "Type" column and the month.
COLS_FEAT = [
    "temperatureMin",
    "temperatureMax",
    "uvIndex",
    "dewPoint",
    "Type",
    "month",
]

# Prediction target column.
COLS_LABEL = "energy_sum"

In [10]:
# Draw the training subset from the full query result.
# A fixed seed makes the sample (and therefore train.csv and anything fitted
# on it) reproducible across Restart & Run All; unseeded, every run would pick
# a different set of rows.
SAMPLE_SEED = 42
# Cap n at the population size: DataFrame.sample raises ValueError when n
# exceeds the number of rows and replace=False (the default).
df_model_sample = df_model.sample(
    n=min(NUM_ROWS, len(df_model)),
    random_state=SAMPLE_SEED,
)

In [11]:
# Feature matrix: the COLS_FEAT columns of the sampled rows.
X = df_model_sample[COLS_FEAT]

In [12]:
# Target vector: the COLS_LABEL column of the same sampled rows, so it shares X's index.
y = df_model_sample[COLS_LABEL]

In [13]:
# Preview the first sampled feature rows (the index holds the original row positions).
X.head()

Unnamed: 0,temperatureMin,temperatureMax,uvIndex,dewPoint,Type,month
792553,13.18,22.74,4.0,14.34,,8
1790144,5.43,8.21,4.0,4.4,,5
889022,13.12,18.88,2.0,11.82,,9
251036,-0.11,2.23,1.0,-1.81,,1
628270,6.32,17.28,6.0,4.06,Spring bank holiday,5


In [14]:
# Display the target Series; pandas abbreviates the repr for long Series.
y

792553      8.320
1790144    10.491
889022      3.376
251036     18.354
628270      4.079
            ...  
153669      8.028
526654      4.959
580353      1.436
1169145    16.024
1867974     3.982
Name: energy_sum, Length: 1500000, dtype: float64

In [15]:
# Persist the sampled rows (every column of the sample, not only X / y) to
# "train.csv", a path relative to the working directory.
# index=False keeps the DataFrame index out of the file.
df_model_sample.to_csv("train.csv",index = False)