In [1]:
import sys
# Append the parent directory to the module search path.
# NOTE(review): presumably this is what lets the project-local `db` package
# imported in the next cell resolve when the notebook runs from its own
# folder — confirm against the repository layout.
sys.path.append("..")

In [2]:
from db.duckdb.duckdbhelper import DuckDBDatabaseHelper
from pprint import pprint
import pandas as pd

In [3]:
# Number of rows drawn from the full query result when building the sample.
NUM_ROWS = 1_500_000

In [4]:
# Helper wrapping the DuckDB file that holds the meter tables; the path is
# relative to the notebook's working directory.
db_path = "../meters.db"
db = DuckDBDatabaseHelper(db_path)

In [5]:
# Daily energy readings for 2013 from `meters`, left-joined to household info,
# daily weather (matched on the weather table's formatted max-temperature
# date) and UK bank holidays. `month` is derived from `day`.
# NOTE(review): `Type` is unqualified; presumably it comes from
# uk_bank_holidays, which would explain the None values on non-holiday rows —
# confirm against the table schemas.
sql_query = """

SELECT day,energy_sum, 
temperatureMin, 
temperatureMax,
uvIndex,
dewPoint,                  
Type,
date_part('month', day) as month
FROM  meters m LEFT JOIN info_household ih
ON m.LCLid = ih.LCLid
LEFT JOIN weather_daily_darksky_modified wd
ON m.day = wd.formatted_temperatureMaxTime_weather_daily_darksky
LEFT JOIN uk_bank_holidays uk_bh
ON m.day = uk_bh."Bank holidays"

WHERE date_part('year', day) = 2013

"""

# connect() stays outside the try so a failed connect is not followed by a
# close on a connection that was never opened.
db.connect()
try:
    records = db.fetch_all(sql_query)
finally:
    # Always release the database connection, even when the query raises;
    # otherwise the connection opened above would be left open.
    db.close_connection()

Connected to DuckDB database: ../meters.db
Fetched 1966727 rows.
Connection closed.


In [6]:
# Peek at the first five fetched rows to sanity-check the column order and values.
records[:5]

[(datetime.date(2013, 1, 3), 10.074, 9.65, 11.41, 1.0, 9.12, None, 1),
 (datetime.date(2013, 1, 6),
  10.293000000000003,
  6.91,
  8.61,
  1.0,
  7.0,
  None,
  1),
 (datetime.date(2013, 1, 7), 9.439000000000002, 7.1, 9.08, 1.0, 6.78, None, 1),
 (datetime.date(2013, 1, 11),
  10.978999900000002,
  0.89,
  5.7,
  1.0,
  1.68,
  None,
  1),
 (datetime.date(2013, 1, 12),
  10.585999900000003,
  1.75,
  4.53,
  1.0,
  0.79,
  None,
  1)]

In [7]:
# Names for the fetched columns, listed in the same order as the SELECT
# clause of the query so they line up with each result tuple.
COLS = [
    "day",
    "energy_sum",
    "temperatureMin",
    "temperatureMax",
    "uvIndex",
    "dewPoint",
    "Type",
    "month",
]

In [8]:
# Wrap the raw result tuples in a DataFrame, labelling columns via COLS.
df_model = pd.DataFrame(
    data=records,
    columns=COLS,
)

In [9]:
# Preview the first rows of the assembled frame.
df_model.head()

Unnamed: 0,day,energy_sum,temperatureMin,temperatureMax,uvIndex,dewPoint,Type,month
0,2013-01-03,10.074,9.65,11.41,1.0,9.12,,1
1,2013-01-06,10.293,6.91,8.61,1.0,7.0,,1
2,2013-01-07,9.439,7.1,9.08,1.0,6.78,,1
3,2013-01-11,10.979,0.89,5.7,1.0,1.68,,1
4,2013-01-12,10.586,1.75,4.53,1.0,0.79,,1


In [10]:
# Target column to predict.
COLS_LABEL = "energy_sum"

# Input columns used as model features (everything except the date and the target).
COLS_FEAT = [
    "temperatureMin",
    "temperatureMax",
    "uvIndex",
    "dewPoint",
    "Type",
    "month",
]

In [11]:
# Draw NUM_ROWS rows at random (without replacement) from the full frame.
# The seed is fixed so that Restart & Run All selects the same rows each time;
# without it every run would produce a different sample and a different
# exported training file.
SAMPLE_SEED = 42
df_model_sample = df_model.sample(n=NUM_ROWS, random_state=SAMPLE_SEED)

In [12]:
# Feature matrix: the sampled rows restricted to the COLS_FEAT columns.
X = df_model_sample[COLS_FEAT]

In [13]:
# Target vector: the COLS_LABEL column of the same sampled rows.
y = df_model_sample[COLS_LABEL]

In [14]:
# Preview the feature matrix; the index shows the sampled row positions.
X.head()

Unnamed: 0,temperatureMin,temperatureMax,uvIndex,dewPoint,Type,month
909215,10.02,20.25,5.0,8.31,,4
1185187,0.86,2.28,1.0,-5.51,,2
1654340,5.78,10.32,1.0,6.37,,11
1294395,3.53,8.38,1.0,2.1,,11
1919323,7.28,14.28,4.0,5.48,,5


In [15]:
# Display the target series (pandas truncates the output for long series).
y

909215     16.742
1185187    26.462
1654340     6.396
1294395    14.352
1919323     8.797
            ...  
1306628     9.098
34218       3.689
133480      9.428
1222080    11.625
983519      8.481
Name: energy_sum, Length: 1500000, dtype: float64

In [16]:
# Persist the sampled rows (all columns, no index column) for model training.
df_model_sample.to_csv(
    path_or_buf="../model_data/train.csv",
    index=False,
)