In [1]:
import sys

# Put the parent directory on the import path (presumably where the
# project-local `db` package lives -- confirm). The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate entry.
if ".." not in sys.path:
    sys.path.append("..")

In [2]:
from db.duckdb.duckdbhelper import DuckDBDatabaseHelper
from pprint import pprint
import pandas as pd

In [3]:
# Upper bound on how many rows are sampled into the modelling dataset.
NUM_ROWS = 1_500_000

In [4]:
# Location of the DuckDB database file, relative to the working directory.
DB_PATH = "../meters.db"
db = DuckDBDatabaseHelper(DB_PATH)

In [5]:
# Pull the 2014 rows from `meters`, left-joined with household info, daily
# weather and UK bank holidays. `Type` is presumably the bank-holiday label
# and is NULL on days without a matching holiday row -- confirm.
sql_query = """

SELECT day,energy_sum, 
temperatureMin, 
temperatureMax,
uvIndex,
dewPoint,                  
Type,
date_part('month', day) as month
FROM  meters m LEFT JOIN info_household ih
ON m.LCLid = ih.LCLid
LEFT JOIN weather_daily_darksky_modified wd
ON m.day = wd.formatted_temperatureMaxTime_weather_daily_darksky
LEFT JOIN uk_bank_holidays uk_bh
ON m.day = uk_bh."Bank holidays"

WHERE date_part('year', day) = 2014

"""

db.connect()
try:
    records = db.fetch_all(sql_query)
finally:
    # Always release the connection, even when the query raises, so a
    # failed cell does not leave the database handle open.
    db.close_connection()

Connected to DuckDB database: ../meters.db
Fetched 298099 rows.
Connection closed.


In [6]:
# Peek at the first five fetched tuples to sanity-check the column order.
records[:5]

[(datetime.date(2014, 1, 2),
  14.195000000000004,
  6.54,
  10.75,
  1.0,
  6.02,
  None,
  1),
 (datetime.date(2014, 1, 3),
  11.382000000000003,
  7.01,
  10.49,
  1.0,
  5.08,
  None,
  1),
 (datetime.date(2014, 1, 6), 14.302, 9.34, 12.57, 1.0, 7.64, None, 1),
 (datetime.date(2014, 1, 7), 16.492, 8.98, 11.38, 1.0, 6.63, None, 1),
 (datetime.date(2014, 1, 8),
  13.079000000000002,
  7.37,
  11.62,
  1.0,
  7.02,
  None,
  1)]

In [7]:
# Column labels for the fetched tuples, in the same order as the SELECT list.
COLS = [
    "day",
    "energy_sum",
    "temperatureMin",
    "temperatureMax",
    "uvIndex",
    "dewPoint",
    "Type",
    "month",
]

In [8]:
# Build the modelling frame from the raw tuples, labelling columns via COLS.
df_model = pd.DataFrame(data=records, columns=COLS)

In [9]:
# Preview the first five rows of the freshly built frame.
df_model.head(n=5)

Unnamed: 0,day,energy_sum,temperatureMin,temperatureMax,uvIndex,dewPoint,Type,month
0,2014-01-02,14.195,6.54,10.75,1.0,6.02,,1
1,2014-01-03,11.382,7.01,10.49,1.0,5.08,,1
2,2014-01-06,14.302,9.34,12.57,1.0,7.64,,1
3,2014-01-07,16.492,8.98,11.38,1.0,6.63,,1
4,2014-01-08,13.079,7.37,11.62,1.0,7.02,,1


In [10]:
# Flag rows that carry a `Type` value (1) versus rows where it is missing (0).
# `notna()` treats None, NaN and pd.NA alike, whereas an `is not None` check
# would count NaN as present; it is also vectorised instead of row-by-row.
df_model['Type_Binary'] = df_model['Type'].notna().astype("int64")

In [11]:
# Input (feature) columns used for modelling.
COLS_FEAT = [
    "temperatureMin",
    "temperatureMax",
    "uvIndex",
    "dewPoint",
    "Type_Binary",
    "month",
]

# Label (target) column, kept as a list so selection yields a DataFrame.
COLS_LABEL = ["energy_sum"]

In [12]:
# Keep only the feature and label columns, features first.
model_columns = [*COLS_FEAT, *COLS_LABEL]
df_model = df_model[model_columns]

In [13]:
# Discard every row with a missing value in any remaining column,
# then show how many rows are left.
df_model = df_model.dropna(axis=0, how="any")
df_model.shape[0]

293000

In [14]:
# Confirm which columns remain in the frame and in what order.
df_model.columns

Index(['temperatureMin', 'temperatureMax', 'uvIndex', 'dewPoint',
       'Type_Binary', 'month', 'energy_sum'],
      dtype='object')

In [15]:
# Effective sample size: NUM_ROWS, capped at the number of rows available.
NUM_ROWS2 = min(len(df_model), NUM_ROWS)

In [16]:
# Draw the sample with a fixed seed so that Restart & Run All selects the
# same rows (and therefore exports the same dataset) on every run.
df_model_sample = df_model.sample(n=NUM_ROWS2, random_state=42)

In [17]:
# Feature matrix: the COLS_FEAT columns of the sampled frame.
X = df_model_sample.loc[:, COLS_FEAT]

In [18]:
# Target frame: the COLS_LABEL column(s) of the sampled frame, sharing
# the sampled frame's index.
y = df_model_sample.loc[:, COLS_LABEL]

In [19]:
# Preview the first five sampled feature rows.
X.head(n=5)

Unnamed: 0,temperatureMin,temperatureMax,uvIndex,dewPoint,Type_Binary,month
185075,5.85,10.13,1.0,6.23,0,2
214065,4.13,9.81,1.0,4.16,0,2
175440,1.97,8.83,1.0,3.93,0,1
170366,5.38,9.9,1.0,2.95,0,2
82507,5.86,9.81,1.0,4.96,0,2


In [20]:
# Display the target frame (pandas abbreviates long frames in the output).
y

Unnamed: 0,energy_sum
185075,4.879
214065,3.123
175440,7.453
170366,23.695
82507,16.634
...,...
195052,19.858
12466,6.180
94720,8.551
213793,16.411


In [21]:
# Persist the sampled features and label to CSV, leaving out the row index.
df_model_sample.to_csv(path_or_buf="../model_data/test.csv", index=False)