In [67]:
import pandas as pd
import numpy as np

In [41]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [42]:
import warnings
warnings.filterwarnings('ignore')

CONSUMPTION_ID: Unique ID of the "observed consumption", unique for each line thus stands for the determination of observed consumption for a certain period for a certain object
CONSUMPTION_OBJECT_ID: ID of the object (i.e. building) to which the "observed consumption" is linked
CONSUMPTION_START DATE: Start date of the period in which the "observed consumption" has decreased, is equal to the penultimate meter reading recorded
CONSUMPTION_END DATE: End date of the period in which the "observed consumption" was taken, is equal to the last recorded meter reading
CONSUMPTION_ESTRATED_YN: Indication whether the last recorded meter reading (and thus the derived "observed consumption") was passed on by the customer or estimated by our system / employee
CONSUMPTION: "observed consumption" in M3, is calculated by the system using the penultimate meter reading and the last meter reading
POSTCODE: Postal code of the object
CITY: City of the object
OBJECT_TYPE_NAME: Object type the object had at the time of "CONSUMPTION_ENDDATE", theoretically an object type can change but this will generally be quite stable.


Additional information:
- A passed meter reading says something about the consumption of this person between the last meter reading and a previous meter reading.
- In principle, Waternet asks every year to report the meter reading. However, there are not always exactly 365 days between each meter reading.
- Also note that some meter readings are estimated. In some cases, Waternet is wrong with the estimate and it appears when a customer submits the meter reading. An overestimated value can be corrected so that a negative number can be seen at the meter reading.
- This file is anonymized. The original contained postal codes with fewer than 60 objects. For each zip code where this was the case, the zip code has been changed to 0000. We have done this to ensure that customer data cannot be traced.

In [131]:
df = pd.read_csv("pipeline_data/Export_Verbruik_2010-2019_anon.csv")

In [132]:
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [97]:
df.shape

(4203393, 9)

In [133]:
columns_eng = ["CONSUMPTION_ID","CONSUMPTION_OBJECT_ID","CONSUMPTION_START_DATE",
               "CONSUMPTION_END_DATE","CONSUMPTION_ESTRATED_YN","CONSUMPTION","POSTCODE","CITY","OBJECT_TYPE_NAME"]

df.columns = columns_eng

In [134]:
df['CONSUMPTION_START_DATE'] = df['CONSUMPTION_START_DATE'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d'))
df['CONSUMPTION_END_DATE'] = df['CONSUMPTION_END_DATE'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d'))

In [164]:
df['OBJECT_TYPE_NAME'].unique()

array(['HHB'], dtype=object)

### REGIONS

In [168]:
df['POSTCODE']

0    1013
1    1013
2    1013
3    1013
4    1013
5    1013
6    1013
7    1013
8    1013
9    1013
Name: POSTCODE, dtype: int64

In [175]:
postcodes = pd.read_csv("data/postalcodes_ben_edit.csv")

In [176]:
keys = postcodes.postalcode.to_list()
values = postcodes.region.to_list()
postaldict = dict(zip(keys, values))

In [178]:
import pickle
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb+') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [180]:
save_obj(postaldict,"data/postaldict")

In [181]:
df['region'] = df['POSTCODE'].map(postaldict) 

In [182]:
df

Unnamed: 0,CONSUMPTION_ID,CONSUMPTION_OBJECT_ID,CONSUMPTION_ESTRATED_YN,POSTCODE,CITY,OBJECT_TYPE_NAME,consumption_per_day,region
0,0x7E405BC6FDD0BB06E1F7711B39D7C3CB684FAA20,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,1013,AMSTERDAM,HHB,0.131455,West
1,0xD763B3D7286E78B4102C06DF1FE478C899B38743,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,J,1013,AMSTERDAM,HHB,0.131868,West
2,0xD72C616CCDAAA45E049AF4AB3B5F92DD853B394A,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,J,1013,AMSTERDAM,HHB,0.133333,West
3,0x9B04DD93592300582B286C693DE79E64AA474DF5,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,1013,AMSTERDAM,HHB,0.110119,West
4,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,1013,AMSTERDAM,HHB,0.152047,West
5,0xBC291865657B562B624937321FB8BA7E9BCF483C,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,1013,AMSTERDAM,HHB,0.163317,West
6,0xE39CBE2E442FC09AD2C077FB1959390E57AE1819,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,1013,AMSTERDAM,HHB,0.153846,West
7,0x459C21D4206FDB682A2442BCDC84F6BD67A00382,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,1013,AMSTERDAM,HHB,0.071672,West
8,0x3816A647A63C730977442667C30A9EE1959CA600,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,J,1013,AMSTERDAM,HHB,0.138889,West
9,0x054DD295277D36C46768D05C0E49ECDD406D1687,0xB49BB96200F53E67DA4EFE2D64FEAA5B10AD18BF,N,1013,AMSTERDAM,HHB,0.093834,West


### DATES
##### Testing on smaller df

In [141]:
df_smol = df.head(10)

df_smol['CONSUMPTION_START_DATE'] = df_smol['CONSUMPTION_START_DATE'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d'))
df_smol['CONSUMPTION_END_DATE'] = df_smol['CONSUMPTION_END_DATE'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d'))


In [142]:
df_smol.dtypes

CONSUMPTION_ID              object
CONSUMPTION_OBJECT_ID       object
CONSUMPTION_START_DATE      object
CONSUMPTION_END_DATE        object
CONSUMPTION_ESTRATED_YN     object
CONSUMPTION                float64
POSTCODE                     int64
CITY                        object
OBJECT_TYPE_NAME            object
dtype: object

In [145]:
df_smol['period'] = df_smol['CONSUMPTION_END_DATE'] - df_smol['CONSUMPTION_START_DATE']
df_smol['period'] = df_smol['period'].dt.days

In [146]:
df_smol['consumption_per_day'] = df_smol['CONSUMPTION'] / df_smol['period']

In [148]:
## create smaller df for testing
df = df_smol
df.shape
df.columns
df

Unnamed: 0,CONSUMPTION_ID,CONSUMPTION_OBJECT_ID,CONSUMPTION_START_DATE,CONSUMPTION_END_DATE,CONSUMPTION_ESTRATED_YN,CONSUMPTION,POSTCODE,CITY,OBJECT_TYPE_NAME,period,consumption_per_day
0,0x7E405BC6FDD0BB06E1F7711B39D7C3CB684FAA20,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2013-03-08,2013-10-07,N,28.0,1013,AMSTERDAM,HHB,213,0.131455
1,0xD763B3D7286E78B4102C06DF1FE478C899B38743,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2013-10-07,2014-10-06,J,48.0,1013,AMSTERDAM,HHB,364,0.131868
2,0xD72C616CCDAAA45E049AF4AB3B5F92DD853B394A,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2014-10-06,2014-12-05,J,8.0,1013,AMSTERDAM,HHB,60,0.133333
3,0x9B04DD93592300582B286C693DE79E64AA474DF5,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2014-12-05,2015-11-06,N,37.0,1013,AMSTERDAM,HHB,336,0.110119
4,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2015-11-06,2016-10-13,N,52.0,1013,AMSTERDAM,HHB,342,0.152047
5,0xBC291865657B562B624937321FB8BA7E9BCF483C,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2016-10-13,2017-11-15,N,65.0,1013,AMSTERDAM,HHB,398,0.163317
6,0xE39CBE2E442FC09AD2C077FB1959390E57AE1819,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2017-11-15,2018-11-27,N,58.0,1013,AMSTERDAM,HHB,377,0.153846
7,0x459C21D4206FDB682A2442BCDC84F6BD67A00382,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2018-11-27,2019-09-16,N,21.0,1013,AMSTERDAM,HHB,293,0.071672
8,0x3816A647A63C730977442667C30A9EE1959CA600,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2019-09-16,2019-11-27,J,10.0,1013,AMSTERDAM,HHB,72,0.138889
9,0x054DD295277D36C46768D05C0E49ECDD406D1687,0xB49BB96200F53E67DA4EFE2D64FEAA5B10AD18BF,2009-02-15,2010-02-23,N,35.0,1013,AMSTERDAM,HHB,373,0.093834


##### Convert to Monthly 

In [149]:
#added months between date_from and date_to
df1 = pd.concat([pd.Series(r.CONSUMPTION_ID,pd.date_range(r.CONSUMPTION_START_DATE, r.CONSUMPTION_END_DATE, freq='MS')) 
                 for r in df.itertuples()]).reset_index()
df1.columns = ['CONSUMPTION_START_DATE','CONSUMPTION_ID']

#added starts of months - sorting for correct positions
df2 = (pd.concat([df[['CONSUMPTION_ID','CONSUMPTION_START_DATE']], df1], sort=False, ignore_index=True)
         .sort_values(['CONSUMPTION_ID','CONSUMPTION_START_DATE'])
         .reset_index(drop=True))

#added MonthEnd and date_to  to last rows
mask = df2['CONSUMPTION_ID'].duplicated(keep='last')
s = df2['CONSUMPTION_ID'].map(df.set_index('CONSUMPTION_ID')['CONSUMPTION_END_DATE'])
df2['CONSUMPTION_END_DATE'] = np.where(mask, df2['CONSUMPTION_START_DATE'] + pd.offsets.MonthEnd(), s)

In [80]:
df_smol.drop(['CONSUMPTION_START_DATE','CONSUMPTION_END_DATE','period','CONSUMPTION'],axis=1,inplace=True)

consumption_monthly = df2.merge(df_smol,on='CONSUMPTION_ID',how='left')

In [82]:
consumption_monthly['days'] = consumption_monthly['CONSUMPTION_END_DATE'] - consumption_monthly['CONSUMPTION_START_DATE']
consumption_monthly['days'] = consumption_monthly['days'].dt.days
consumption_monthly['CONSUMPTION'] = consumption_monthly['days'] * consumption_monthly['consumption_per_day']

In [83]:
consumption_monthly.columns

Index(['CONSUMPTION_ID', 'CONSUMPTION_START_DATE', 'CONSUMPTION_END_DATE',
       'CONSUMPTION_OBJECT_ID', 'CONSUMPTION_ESTRATED_YN', 'CONSUMPTION',
       'POSTCODE', 'CITY', 'OBJECT_TYPE_NAME', 'consumption_per_day', 'days'],
      dtype='object')

In [86]:
consumption_monthly['CONSUMPTION']

0      3.649123
1      4.561404
2      4.561404
3      4.257310
4      4.561404
         ...   
97     4.615385
98     4.615385
99     4.461538
100    4.615385
101    4.000000
Name: CONSUMPTION, Length: 102, dtype: float64

In [101]:
df_smol

Unnamed: 0,CONSUMPTION_ID,CONSUMPTION_OBJECT_ID,CONSUMPTION_START_DATE,CONSUMPTION_END_DATE,CONSUMPTION_ESTRATED_YN,CONSUMPTION,POSTCODE,CITY,OBJECT_TYPE_NAME
0,0x7E405BC6FDD0BB06E1F7711B39D7C3CB684FAA20,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2013-03-08,2013-10-07,N,28.0,1013,AMSTERDAM,HHB
1,0xD763B3D7286E78B4102C06DF1FE478C899B38743,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2013-10-07,2014-10-06,J,48.0,1013,AMSTERDAM,HHB
2,0xD72C616CCDAAA45E049AF4AB3B5F92DD853B394A,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2014-10-06,2014-12-05,J,8.0,1013,AMSTERDAM,HHB
3,0x9B04DD93592300582B286C693DE79E64AA474DF5,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2014-12-05,2015-11-06,N,37.0,1013,AMSTERDAM,HHB
4,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2015-11-06,2016-10-13,N,52.0,1013,AMSTERDAM,HHB
5,0xBC291865657B562B624937321FB8BA7E9BCF483C,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2016-10-13,2017-11-15,N,65.0,1013,AMSTERDAM,HHB
6,0xE39CBE2E442FC09AD2C077FB1959390E57AE1819,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2017-11-15,2018-11-27,N,58.0,1013,AMSTERDAM,HHB
7,0x459C21D4206FDB682A2442BCDC84F6BD67A00382,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2018-11-27,2019-09-16,N,21.0,1013,AMSTERDAM,HHB
8,0x3816A647A63C730977442667C30A9EE1959CA600,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2019-09-16,2019-11-27,J,10.0,1013,AMSTERDAM,HHB
9,0x054DD295277D36C46768D05C0E49ECDD406D1687,0xB49BB96200F53E67DA4EFE2D64FEAA5B10AD18BF,2009-02-15,2010-02-23,N,35.0,1013,AMSTERDAM,HHB


In [102]:
consumption_monthly

Unnamed: 0,CONSUMPTION_ID,CONSUMPTION_START_DATE,CONSUMPTION_END_DATE,CONSUMPTION_OBJECT_ID,CONSUMPTION_ESTRATED_YN,CONSUMPTION,POSTCODE,CITY,OBJECT_TYPE_NAME,consumption_per_day,days
0,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,2015-11-06,2015-11-30,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,3.649123,1013,AMSTERDAM,HHB,0.152047,24
1,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,2015-12-01,2015-12-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.561404,1013,AMSTERDAM,HHB,0.152047,30
2,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,2016-01-01,2016-01-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.561404,1013,AMSTERDAM,HHB,0.152047,30
3,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,2016-02-01,2016-02-29,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.257310,1013,AMSTERDAM,HHB,0.152047,28
4,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,2016-03-01,2016-03-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.561404,1013,AMSTERDAM,HHB,0.152047,30
...,...,...,...,...,...,...,...,...,...,...,...
97,0xE39CBE2E442FC09AD2C077FB1959390E57AE1819,2018-07-01,2018-07-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.615385,1013,AMSTERDAM,HHB,0.153846,30
98,0xE39CBE2E442FC09AD2C077FB1959390E57AE1819,2018-08-01,2018-08-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.615385,1013,AMSTERDAM,HHB,0.153846,30
99,0xE39CBE2E442FC09AD2C077FB1959390E57AE1819,2018-09-01,2018-09-30,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.461538,1013,AMSTERDAM,HHB,0.153846,29
100,0xE39CBE2E442FC09AD2C077FB1959390E57AE1819,2018-10-01,2018-10-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.615385,1013,AMSTERDAM,HHB,0.153846,30


##### Convert to Yearly

In [158]:
df1 = pd.concat([pd.Series(r.CONSUMPTION_ID,pd.date_range(r.CONSUMPTION_START_DATE, r.CONSUMPTION_END_DATE, freq='YS')) 
                 for r in df.itertuples()]).reset_index()
df1.columns = ['CONSUMPTION_START_DATE','CONSUMPTION_ID']

df2 = (pd.concat([df[['CONSUMPTION_ID','CONSUMPTION_START_DATE']], df1], sort=False, ignore_index=True)
         .sort_values(['CONSUMPTION_ID','CONSUMPTION_START_DATE'])
         .reset_index(drop=True))

mask = df2['CONSUMPTION_ID'].duplicated(keep='last')
s = df2['CONSUMPTION_ID'].map(df.set_index('CONSUMPTION_ID')['CONSUMPTION_END_DATE'])
df2['CONSUMPTION_END_DATE'] = np.where(mask, df2['CONSUMPTION_START_DATE'] + pd.offsets.YearEnd(), s)

In [159]:
df_smol.drop(['CONSUMPTION_START_DATE','CONSUMPTION_END_DATE','period','CONSUMPTION'],axis=1,inplace=True)
consumption_yearly = df2.merge(df_smol,on='CONSUMPTION_ID',how='left')

In [160]:
consumption_yearly['days'] = consumption_yearly['CONSUMPTION_END_DATE'] - consumption_yearly['CONSUMPTION_START_DATE']
consumption_yearly['days'] = consumption_yearly['days'].dt.days
consumption_yearly['CONSUMPTION'] = consumption_yearly['days'] * consumption_yearly['consumption_per_day']

In [162]:
consumption_yearly['CONSUMPTION']

0      8.362573
1     43.485380
2     29.932976
3      4.973190
4     10.000000
5      2.436860
6     18.491468
7     28.000000
8      2.863095
9     34.026786
10    12.902010
11    51.934673
12     8.000000
13    11.208791
14    36.659341
15     7.076923
16    50.769231
Name: CONSUMPTION, dtype: float64

In [163]:
consumption_yearly

Unnamed: 0,CONSUMPTION_ID,CONSUMPTION_START_DATE,CONSUMPTION_END_DATE,CONSUMPTION_OBJECT_ID,CONSUMPTION_ESTRATED_YN,POSTCODE,CITY,OBJECT_TYPE_NAME,consumption_per_day,days,CONSUMPTION
0,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,2015-11-06,2015-12-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,1013,AMSTERDAM,HHB,0.152047,55,8.362573
1,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,2016-01-01,2016-10-13,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,1013,AMSTERDAM,HHB,0.152047,286,43.48538
2,0x054DD295277D36C46768D05C0E49ECDD406D1687,2009-02-15,2009-12-31,0xB49BB96200F53E67DA4EFE2D64FEAA5B10AD18BF,N,1013,AMSTERDAM,HHB,0.093834,319,29.932976
3,0x054DD295277D36C46768D05C0E49ECDD406D1687,2010-01-01,2010-02-23,0xB49BB96200F53E67DA4EFE2D64FEAA5B10AD18BF,N,1013,AMSTERDAM,HHB,0.093834,53,4.97319
4,0x3816A647A63C730977442667C30A9EE1959CA600,2019-09-16,2019-11-27,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,J,1013,AMSTERDAM,HHB,0.138889,72,10.0
5,0x459C21D4206FDB682A2442BCDC84F6BD67A00382,2018-11-27,2018-12-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,1013,AMSTERDAM,HHB,0.071672,34,2.43686
6,0x459C21D4206FDB682A2442BCDC84F6BD67A00382,2019-01-01,2019-09-16,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,1013,AMSTERDAM,HHB,0.071672,258,18.491468
7,0x7E405BC6FDD0BB06E1F7711B39D7C3CB684FAA20,2013-03-08,2013-10-07,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,1013,AMSTERDAM,HHB,0.131455,213,28.0
8,0x9B04DD93592300582B286C693DE79E64AA474DF5,2014-12-05,2014-12-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,1013,AMSTERDAM,HHB,0.110119,26,2.863095
9,0x9B04DD93592300582B286C693DE79E64AA474DF5,2015-01-01,2015-11-06,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,1013,AMSTERDAM,HHB,0.110119,309,34.026786
