In [31]:
import pandas as pd
import numpy as np

In [32]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [33]:
import warnings
warnings.filterwarnings('ignore')

CONSUMPTION_ID: Unique ID of the "observed consumption", unique for each line thus stands for the determination of observed consumption for a certain period for a certain object  
CONSUMPTION_OBJECT_ID: ID of the object (i.e. building) to which the "observed consumption" is linked.  
CONSUMPTION_START DATE: Start date of the period in which the "observed consumption" has decreased, is equal to the penultimate meter reading recorded  
CONSUMPTION_END DATE: End date of the period in which the "observed consumption" was taken, is equal to the last recorded meter reading  
CONSUMPTION_ESTRATED_YN: Indication whether the last recorded meter reading (and thus the derived "observed consumption") was passed on by the customer or estimated by our system / employee  
CONSUMPTION: "observed consumption" in M3, is calculated by the system using the penultimate meter reading and the last meter reading
POSTCODE: Postal code of the object  
CITY: City of the object  
OBJECT_TYPE_NAME: Object type the object had at the time of "CONSUMPTION_ENDDATE", theoretically an object type can change but this will generally be quite stable.  


Additional information:
- A passed meter reading says something about the consumption of this person between the last meter reading and a previous meter reading.
- In principle, Waternet asks every year to report the meter reading. However, there are not always exactly 365 days between each meter reading.
- Also note that some meter readings are estimated. In some cases, Waternet is wrong with the estimate and it appears when a customer submits the meter reading. An overestimated value can be corrected so that a negative number can be seen at the meter reading.
- This file is anonymized. The original contained postal codes with fewer than 60 objects. For each zip code where this was the case, the zip code has been changed to 0000. We have done this to ensure that customer data cannot be traced.

In [44]:
df = pd.read_csv("../pipeline_data/Export_Verbruik_2010-2019_anon.csv")

In [45]:
df.columns

Index(['Unnamed: 0', 'VERBRUIK_ID', 'VERBRUIK_OBJECT_ID', 'VERBRUIK_STARTDAT',
       'VERBRUIK_EINDDATUM', 'VERBRUIK_GESCHAT_JN', 'VERBRUIK', 'POSTCODE',
       'STAD', 'OBJECT_TYPE_NAME'],
      dtype='object')

In [42]:
df[df['CONSUMPTION_END_DATE'].isna()]

Unnamed: 0,CONSUMPTION_ID,CONSUMPTION_OBJECT_ID,CONSUMPTION_START_DATE,CONSUMPTION_END_DATE,CONSUMPTION_ESTRATED_YN,CONSUMPTION,POSTCODE,CITY,OBJECT_TYPE_NAME


In [43]:
df_test = df[df.CONSUMPTION_ID == '0x00007D66B6B07398948077754DB6EFA778CE11ED'] #df.head(100)
#df_test.to_csv("../pipeline_data/100_test.csv",index=False)

In [19]:
#df= df_test
#print(df.shape)
df.head()

Unnamed: 0,CONSUMPTION_ID,CONSUMPTION_OBJECT_ID,CONSUMPTION_START_DATE,CONSUMPTION_END_DATE,CONSUMPTION_ESTRATED_YN,CONSUMPTION,POSTCODE,CITY,OBJECT_TYPE_NAME
0,0x7E405BC6FDD0BB06E1F7711B39D7C3CB684FAA20,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2013-03-08,2013-10-07,N,28.0,1013,AMSTERDAM,HHB
1,0xD763B3D7286E78B4102C06DF1FE478C899B38743,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2013-10-07,2014-10-06,J,48.0,1013,AMSTERDAM,HHB
2,0xD72C616CCDAAA45E049AF4AB3B5F92DD853B394A,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2014-10-06,2014-12-05,J,8.0,1013,AMSTERDAM,HHB
3,0x9B04DD93592300582B286C693DE79E64AA474DF5,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2014-12-05,2015-11-06,N,37.0,1013,AMSTERDAM,HHB
4,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2015-11-06,2016-10-13,N,52.0,1013,AMSTERDAM,HHB


In [40]:
df.drop(['Unnamed: 0'],axis=1,inplace=True)
#df.iloc[:,2:4]

In [30]:
str(df['CONSUMPTION_START_DATE'].isnull().sum() + df['CONSUMPTION_END_DATE'].isnull().sum())

'35'

In [37]:
df.shape

(4203393, 10)

In [41]:
columns_eng = ["CONSUMPTION_ID","CONSUMPTION_OBJECT_ID","CONSUMPTION_START_DATE",
               "CONSUMPTION_END_DATE","CONSUMPTION_ESTRATED_YN","CONSUMPTION","POSTCODE","CITY","OBJECT_TYPE_NAME"]

df.columns = columns_eng

In [8]:
df['CONSUMPTION_START_DATE'] = df['CONSUMPTION_START_DATE'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d'))
df['CONSUMPTION_END_DATE'] = df['CONSUMPTION_END_DATE'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d'))

In [27]:
df['OBJECT_TYPE_NAME'].unique()

array(['HHB', 'KZB', 'GZB', nan, 'Overig', 'OWT',
       'Geen (nvt voor Waternet)', 'IND', 'ENG', 'SIER', 'BRA'],
      dtype=object)

#### create smaller working DF

In [124]:
df_smol = df.head(10)

df_smol['CONSUMPTION_START_DATE'] = df_smol['CONSUMPTION_START_DATE'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d'))
df_smol['CONSUMPTION_END_DATE'] = df_smol['CONSUMPTION_END_DATE'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d'))

### REGIONS

In [12]:
df['POSTCODE'].unique()

array([1013, 1091, 1076, 1053, 1061, 1066, 1069, 1054, 1072, 1071, 1075,
       1051, 1073, 1056, 1059, 1105, 1019, 1097, 1016, 1063, 1018,    0,
       1015, 1052, 1074, 1022, 1017, 1064, 1057, 1034, 1067, 1055, 1065,
       1092, 1021, 1033, 1103, 1095, 1093, 1012, 1027, 1031, 1094, 1077,
       1011, 1023, 1035, 1078, 1058, 1068, 1025, 1041, 1032, 1087, 1079,
       1083, 1098, 1082, 1081, 1047, 1062, 1102, 1036, 1014, 1043, 1398,
       1026, 1024, 1045, 1028, 1108, 1086, 1112, 1106, 1096, 1109, 1046,
       1101, 1115, 1060, 1111, 1107, 1391, 1182, 1181, 1104, 1042, 1114,
       1191, 1399, 1113, 2106, 1037, 1183, 1187, 1186, 1185, 1188, 2103,
       2102, 2101, 1184, 2105, 2104])

In [154]:
postcodes = pd.read_csv("data/postalcodes_ben_edit.csv")

In [156]:
#keys = postcodes.postalcode.to_list()
#values = postcodes.region.to_list()
#postaldict = dict(zip(keys, values))

postcodes.region.unique()

array(['Centrum', 'Westpoort', 'West', 'Nieuw-West', 'Zuid', 'Oost',
       'Noord', 'Zuidoost', 'Amstelveen', 'Diemen', 'Heemstede', 'Muiden',
       'Ouder amstel', 'Schiphol', 'Onbepaalde'], dtype=object)

In [14]:
import pickle
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb+') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [16]:
#save_obj(postaldict,"data/postaldict")
postaldict = load_obj("../pipeline_data/postaldict")

In [25]:
#list(set(postaldict.keys()) - set(df['POSTCODE'].unique()))


for i in df['POSTCODE'].unique():
    if i not in postaldict.keys():
        print(i)

1391


In [125]:

df_smol['region'] = df_smol['POSTCODE'].map(postaldict) 

In [29]:
df['region'] = df['POSTCODE'].map(postaldict) 

In [36]:
df[df['region'].isna()]

Unnamed: 0,CONSUMPTION_ID,CONSUMPTION_OBJECT_ID,CONSUMPTION_START_DATE,CONSUMPTION_END_DATE,CONSUMPTION_ESTRATED_YN,CONSUMPTION,POSTCODE,CITY,OBJECT_TYPE_NAME,region
286227,0xB7C47CFC9D35C5134EE9C0B95B16E6CDFE6C18D8,0x7681AD2D37C6A99D6A5883C2F1845E4B363F8B7A,2009-08-26,2010-09-15,N,330.0,1391,ABCOUDE,KZB,
286228,0x3C3746A9D4EEA382DDA41463A21DCFEF05B76496,0x7681AD2D37C6A99D6A5883C2F1845E4B363F8B7A,2010-09-15,2011-10-01,N,337.0,1391,ABCOUDE,KZB,
286229,0x41DE2FEF7862FC8E75B937A0315A015FD18ECFB3,0x7681AD2D37C6A99D6A5883C2F1845E4B363F8B7A,2011-10-01,2012-09-20,N,285.0,1391,ABCOUDE,KZB,
286230,0x0AEA433C1D94FFD189B6ABD90EC69742367E70C4,0x7681AD2D37C6A99D6A5883C2F1845E4B363F8B7A,2012-09-20,2013-10-11,N,350.0,1391,ABCOUDE,KZB,
286231,0x2260B5984ECB346DD45D643A3B3F57ED7ED8F25A,0x7681AD2D37C6A99D6A5883C2F1845E4B363F8B7A,2013-10-11,2014-09-08,N,319.0,1391,ABCOUDE,KZB,
...,...,...,...,...,...,...,...,...,...,...
4035032,0x6124C277B56303677C1CE18ED4051CD24CF52972,0x4BD98CF152B969F95EDA4CBA7FE979512A3EABF7,2016-11-14,2017-11-27,N,73.0,1391,ABCOUDE,HHB,
4035033,0xCCC669B492E505CB369B8D6CA0E8EC2AE4992AE3,0x4BD98CF152B969F95EDA4CBA7FE979512A3EABF7,2017-11-27,2018-12-01,N,72.0,1391,ABCOUDE,HHB,
4035034,0x7141C65B07CD52355AE166CD213555B3102A4430,0x4BD98CF152B969F95EDA4CBA7FE979512A3EABF7,2018-12-01,2019-11-21,N,65.0,1391,ABCOUDE,HHB,
4175780,0xBDA14CB67D14CC991AEEC79CD6783D829C40CCAD,0xD76DF2B571AAFDBEF346F3CBB03C26FA4F2DFB98,2018-03-22,2018-10-08,N,77.0,1391,ABCOUDE,HHB,


In [39]:
df['region'].value_counts()

Nieuw-West      598731
Oost            578872
West            518421
Zuid            513829
Amstelveen      438518
Zuidoost        397189
Centrum         382882
Noord           374624
Heemstede       145471
Diemen          113533
Ouder amstel     70833
Muiden           31741
Westpoort        23408
Onbepaalde       14513
Name: region, dtype: int64

In [38]:
df['CITY'].value_counts()

AMSTERDAM                  3385320
AMSTELVEEN                  438497
HEEMSTEDE                   145471
DIEMEN                      113533
OUDERKERK AAN DE AMSTEL      41798
DUIVENDRECHT                 27315
MUIDEN                       17370
MUIDERBERG                   14371
AMSTERDAM ZUIDOOST            4059
AMSTERDAM-DUIVENDRECHT        1720
ABCOUDE                        828
LANDSMEER                      742
SCHIPHOL                       609
OOSTZAAN                       426
OUDE MEER                      244
VOGELENZANG                    137
HILLEGOM                       119
HAARLEM                         76
HALFWEG                         22
WEESP                           21
AMSETRDAM                       20
Amsterdam                       18
AERDENHOUT                      13
BROEK IN WATERLAND              11
VIJFHUIZEN                      11
Name: CITY, dtype: int64

### DATES


In [126]:
df_smol.dtypes

CONSUMPTION_ID                     object
CONSUMPTION_OBJECT_ID              object
CONSUMPTION_START_DATE     datetime64[ns]
CONSUMPTION_END_DATE       datetime64[ns]
CONSUMPTION_ESTRATED_YN            object
CONSUMPTION                       float64
POSTCODE                            int64
CITY                               object
OBJECT_TYPE_NAME                   object
region                             object
dtype: object

In [127]:
df_smol['period'] = df_smol['CONSUMPTION_END_DATE'] - df_smol['CONSUMPTION_START_DATE']
df_smol['period'] = df_smol['period'].dt.days

In [128]:
df_smol['consumption_per_day'] = df_smol['CONSUMPTION'] / df_smol['period']

In [129]:
## create smaller df for testing
df = df_smol
df.shape
df.columns
df

Unnamed: 0,CONSUMPTION_ID,CONSUMPTION_OBJECT_ID,CONSUMPTION_START_DATE,CONSUMPTION_END_DATE,CONSUMPTION_ESTRATED_YN,CONSUMPTION,POSTCODE,CITY,OBJECT_TYPE_NAME,region,period,consumption_per_day
0,0x7E405BC6FDD0BB06E1F7711B39D7C3CB684FAA20,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2013-03-08,2013-10-07,N,28.0,1013,AMSTERDAM,HHB,West,213,0.131455
1,0xD763B3D7286E78B4102C06DF1FE478C899B38743,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2013-10-07,2014-10-06,J,48.0,1013,AMSTERDAM,HHB,West,364,0.131868
2,0xD72C616CCDAAA45E049AF4AB3B5F92DD853B394A,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2014-10-06,2014-12-05,J,8.0,1013,AMSTERDAM,HHB,West,60,0.133333
3,0x9B04DD93592300582B286C693DE79E64AA474DF5,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2014-12-05,2015-11-06,N,37.0,1013,AMSTERDAM,HHB,West,336,0.110119
4,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2015-11-06,2016-10-13,N,52.0,1013,AMSTERDAM,HHB,West,342,0.152047
5,0xBC291865657B562B624937321FB8BA7E9BCF483C,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2016-10-13,2017-11-15,N,65.0,1013,AMSTERDAM,HHB,West,398,0.163317
6,0xE39CBE2E442FC09AD2C077FB1959390E57AE1819,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2017-11-15,2018-11-27,N,58.0,1013,AMSTERDAM,HHB,West,377,0.153846
7,0x459C21D4206FDB682A2442BCDC84F6BD67A00382,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2018-11-27,2019-09-16,N,21.0,1013,AMSTERDAM,HHB,West,293,0.071672
8,0x3816A647A63C730977442667C30A9EE1959CA600,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2019-09-16,2019-11-27,J,10.0,1013,AMSTERDAM,HHB,West,72,0.138889
9,0x054DD295277D36C46768D05C0E49ECDD406D1687,0xB49BB96200F53E67DA4EFE2D64FEAA5B10AD18BF,2009-02-15,2010-02-23,N,35.0,1013,AMSTERDAM,HHB,West,373,0.093834


##### Convert to Monthly 

In [149]:
#added months between date_from and date_to
df1 = pd.concat([pd.Series(r.CONSUMPTION_ID,pd.date_range(r.CONSUMPTION_START_DATE, r.CONSUMPTION_END_DATE, freq='MS')) 
                 for r in df.itertuples()]).reset_index()
df1.columns = ['CONSUMPTION_START_DATE','CONSUMPTION_ID']

#added starts of months - sorting for correct positions
df2 = (pd.concat([df[['CONSUMPTION_ID','CONSUMPTION_START_DATE']], df1], sort=False, ignore_index=True)
         .sort_values(['CONSUMPTION_ID','CONSUMPTION_START_DATE'])
         .reset_index(drop=True))

#added MonthEnd and date_to  to last rows
mask = df2['CONSUMPTION_ID'].duplicated(keep='last')
s = df2['CONSUMPTION_ID'].map(df.set_index('CONSUMPTION_ID')['CONSUMPTION_END_DATE'])
df2['CONSUMPTION_END_DATE'] = np.where(mask, df2['CONSUMPTION_START_DATE'] + pd.offsets.MonthEnd(), s)

In [80]:
df_smol.drop(['CONSUMPTION_START_DATE','CONSUMPTION_END_DATE','period','CONSUMPTION'],axis=1,inplace=True)

consumption_monthly = df2.merge(df_smol,on='CONSUMPTION_ID',how='left')

In [82]:
consumption_monthly['days'] = consumption_monthly['CONSUMPTION_END_DATE'] - consumption_monthly['CONSUMPTION_START_DATE']
consumption_monthly['days'] = consumption_monthly['days'].dt.days
consumption_monthly['CONSUMPTION'] = consumption_monthly['days'] * consumption_monthly['consumption_per_day']

In [83]:
consumption_monthly.columns

Index(['CONSUMPTION_ID', 'CONSUMPTION_START_DATE', 'CONSUMPTION_END_DATE',
       'CONSUMPTION_OBJECT_ID', 'CONSUMPTION_ESTRATED_YN', 'CONSUMPTION',
       'POSTCODE', 'CITY', 'OBJECT_TYPE_NAME', 'consumption_per_day', 'days'],
      dtype='object')

In [86]:
consumption_monthly['CONSUMPTION']

0      3.649123
1      4.561404
2      4.561404
3      4.257310
4      4.561404
         ...   
97     4.615385
98     4.615385
99     4.461538
100    4.615385
101    4.000000
Name: CONSUMPTION, Length: 102, dtype: float64

In [101]:
df_smol

Unnamed: 0,CONSUMPTION_ID,CONSUMPTION_OBJECT_ID,CONSUMPTION_START_DATE,CONSUMPTION_END_DATE,CONSUMPTION_ESTRATED_YN,CONSUMPTION,POSTCODE,CITY,OBJECT_TYPE_NAME
0,0x7E405BC6FDD0BB06E1F7711B39D7C3CB684FAA20,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2013-03-08,2013-10-07,N,28.0,1013,AMSTERDAM,HHB
1,0xD763B3D7286E78B4102C06DF1FE478C899B38743,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2013-10-07,2014-10-06,J,48.0,1013,AMSTERDAM,HHB
2,0xD72C616CCDAAA45E049AF4AB3B5F92DD853B394A,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2014-10-06,2014-12-05,J,8.0,1013,AMSTERDAM,HHB
3,0x9B04DD93592300582B286C693DE79E64AA474DF5,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2014-12-05,2015-11-06,N,37.0,1013,AMSTERDAM,HHB
4,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2015-11-06,2016-10-13,N,52.0,1013,AMSTERDAM,HHB
5,0xBC291865657B562B624937321FB8BA7E9BCF483C,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2016-10-13,2017-11-15,N,65.0,1013,AMSTERDAM,HHB
6,0xE39CBE2E442FC09AD2C077FB1959390E57AE1819,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2017-11-15,2018-11-27,N,58.0,1013,AMSTERDAM,HHB
7,0x459C21D4206FDB682A2442BCDC84F6BD67A00382,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2018-11-27,2019-09-16,N,21.0,1013,AMSTERDAM,HHB
8,0x3816A647A63C730977442667C30A9EE1959CA600,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2019-09-16,2019-11-27,J,10.0,1013,AMSTERDAM,HHB
9,0x054DD295277D36C46768D05C0E49ECDD406D1687,0xB49BB96200F53E67DA4EFE2D64FEAA5B10AD18BF,2009-02-15,2010-02-23,N,35.0,1013,AMSTERDAM,HHB


In [102]:
consumption_monthly

Unnamed: 0,CONSUMPTION_ID,CONSUMPTION_START_DATE,CONSUMPTION_END_DATE,CONSUMPTION_OBJECT_ID,CONSUMPTION_ESTRATED_YN,CONSUMPTION,POSTCODE,CITY,OBJECT_TYPE_NAME,consumption_per_day,days
0,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,2015-11-06,2015-11-30,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,3.649123,1013,AMSTERDAM,HHB,0.152047,24
1,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,2015-12-01,2015-12-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.561404,1013,AMSTERDAM,HHB,0.152047,30
2,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,2016-01-01,2016-01-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.561404,1013,AMSTERDAM,HHB,0.152047,30
3,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,2016-02-01,2016-02-29,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.257310,1013,AMSTERDAM,HHB,0.152047,28
4,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,2016-03-01,2016-03-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.561404,1013,AMSTERDAM,HHB,0.152047,30
...,...,...,...,...,...,...,...,...,...,...,...
97,0xE39CBE2E442FC09AD2C077FB1959390E57AE1819,2018-07-01,2018-07-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.615385,1013,AMSTERDAM,HHB,0.153846,30
98,0xE39CBE2E442FC09AD2C077FB1959390E57AE1819,2018-08-01,2018-08-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.615385,1013,AMSTERDAM,HHB,0.153846,30
99,0xE39CBE2E442FC09AD2C077FB1959390E57AE1819,2018-09-01,2018-09-30,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.461538,1013,AMSTERDAM,HHB,0.153846,29
100,0xE39CBE2E442FC09AD2C077FB1959390E57AE1819,2018-10-01,2018-10-31,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,4.615385,1013,AMSTERDAM,HHB,0.153846,30


##### Convert to Yearly

In [130]:
df1 = pd.concat([pd.Series(r.CONSUMPTION_ID,pd.date_range(r.CONSUMPTION_START_DATE, r.CONSUMPTION_END_DATE, freq='YS')) 
                 for r in df.itertuples()]).reset_index()
df1.columns = ['CONSUMPTION_START_DATE','CONSUMPTION_ID']

df2 = (pd.concat([df[['CONSUMPTION_ID','CONSUMPTION_START_DATE']], df1], sort=False, ignore_index=True)
         .sort_values(['CONSUMPTION_ID','CONSUMPTION_START_DATE'])
         .reset_index(drop=True))

mask = df2['CONSUMPTION_ID'].duplicated(keep='last')
s = df2['CONSUMPTION_ID'].map(df.set_index('CONSUMPTION_ID')['CONSUMPTION_END_DATE'])
df2['CONSUMPTION_END_DATE'] = np.where(mask, df2['CONSUMPTION_START_DATE'] + pd.offsets.YearEnd(), s)

In [146]:
df2#.drop(['CONSUMPTION_START_DATE'],axis=1,inplace=True)#.groupby('CONSUMPTION_ID').first()

Unnamed: 0,CONSUMPTION_ID,CONSUMPTION_END_DATE
0,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,2015-12-31
1,0x04F71377870507FA1CFC31EC9D372D241AF1CF80,2016-10-13
2,0x054DD295277D36C46768D05C0E49ECDD406D1687,2009-12-31
3,0x054DD295277D36C46768D05C0E49ECDD406D1687,2010-02-23
4,0x3816A647A63C730977442667C30A9EE1959CA600,2019-11-27
5,0x459C21D4206FDB682A2442BCDC84F6BD67A00382,2018-12-31
6,0x459C21D4206FDB682A2442BCDC84F6BD67A00382,2019-09-16
7,0x7E405BC6FDD0BB06E1F7711B39D7C3CB684FAA20,2013-10-07
8,0x9B04DD93592300582B286C693DE79E64AA474DF5,2014-12-31
9,0x9B04DD93592300582B286C693DE79E64AA474DF5,2015-11-06


In [132]:
df_smol_values = df_smol[['CONSUMPTION_ID','CONSUMPTION_OBJECT_ID','consumption_per_day']]
df_smol.drop(['CONSUMPTION_START_DATE','CONSUMPTION_END_DATE','period','CONSUMPTION','consumption_per_day','CONSUMPTION_ID'],axis=1,inplace=True)
df_smol_categories = df_smol.groupby(['CONSUMPTION_OBJECT_ID']).first().reset_index()

In [133]:
consumption_yearly = df2.merge(df_smol_values,on='CONSUMPTION_ID',how='left')

In [134]:
consumption_yearly['days'] = consumption_yearly['CONSUMPTION_END_DATE'] - consumption_yearly['CONSUMPTION_START_DATE']
consumption_yearly['days'] = consumption_yearly['days'].dt.days
consumption_yearly['CONSUMPTION'] = consumption_yearly['days'] * consumption_yearly['consumption_per_day']

In [135]:
df_smol_categories

Unnamed: 0,CONSUMPTION_OBJECT_ID,CONSUMPTION_ESTRATED_YN,POSTCODE,CITY,OBJECT_TYPE_NAME,region
0,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,N,1013,AMSTERDAM,HHB,West
1,0xB49BB96200F53E67DA4EFE2D64FEAA5B10AD18BF,N,1013,AMSTERDAM,HHB,West


In [138]:
consumption_yearly['year'] = consumption_yearly['CONSUMPTION_START_DATE'].dt.year
consumption_yearly.drop(consumption_yearly[consumption_yearly['CONSUMPTION_START_DATE'].dt.year < 2010].index)



In [140]:
consumption_yearly_grouped

Unnamed: 0,CONSUMPTION_OBJECT_ID,year,consumption_per_day,days,CONSUMPTION
0,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2013,0.263324,298,39.208791
1,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2014,0.375321,364,47.522436
2,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2015,0.262166,364,42.389359
3,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2016,0.315363,365,56.38739
4,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2017,0.317163,364,59.011596
5,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2018,0.225519,364,53.206091
6,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2019,0.210561,330,28.491468
7,0xB49BB96200F53E67DA4EFE2D64FEAA5B10AD18BF,2009,0.093834,319,29.932976
8,0xB49BB96200F53E67DA4EFE2D64FEAA5B10AD18BF,2010,0.093834,53,4.97319


In [139]:
consumption_yearly_grouped = consumption_yearly.groupby(['CONSUMPTION_OBJECT_ID','year']).sum().reset_index()

In [142]:
consumption_yearly_grouped = consumption_yearly_grouped.merge(df_smol_categories, on='CONSUMPTION_OBJECT_ID',how='left')

In [143]:
consumption_yearly_grouped

Unnamed: 0,CONSUMPTION_OBJECT_ID,year,consumption_per_day,days,CONSUMPTION,CONSUMPTION_ESTRATED_YN,POSTCODE,CITY,OBJECT_TYPE_NAME,region
0,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2013,0.263324,298,39.208791,N,1013,AMSTERDAM,HHB,West
1,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2014,0.375321,364,47.522436,N,1013,AMSTERDAM,HHB,West
2,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2015,0.262166,364,42.389359,N,1013,AMSTERDAM,HHB,West
3,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2016,0.315363,365,56.38739,N,1013,AMSTERDAM,HHB,West
4,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2017,0.317163,364,59.011596,N,1013,AMSTERDAM,HHB,West
5,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2018,0.225519,364,53.206091,N,1013,AMSTERDAM,HHB,West
6,0xB3CC642C47DA2964C780FC5590DEAB3FAA791F9D,2019,0.210561,330,28.491468,N,1013,AMSTERDAM,HHB,West
7,0xB49BB96200F53E67DA4EFE2D64FEAA5B10AD18BF,2009,0.093834,319,29.932976,N,1013,AMSTERDAM,HHB,West
8,0xB49BB96200F53E67DA4EFE2D64FEAA5B10AD18BF,2010,0.093834,53,4.97319,N,1013,AMSTERDAM,HHB,West


In [40]:
df['OBJECT_TYPE_NAME'].value_counts()

HHB                         3890843
KZB                          279212
GZB                           28819
IND                            2155
ENG                             396
OWT                             101
SIER                             77
Geen (nvt voor Waternet)         15
Overig                           12
BRA                               2
Name: OBJECT_TYPE_NAME, dtype: int64

#### check for new data

In [44]:
df_smol.CONSUMPTION_ID.unique()

array(['0x7E405BC6FDD0BB06E1F7711B39D7C3CB684FAA20',
       '0xD763B3D7286E78B4102C06DF1FE478C899B38743',
       '0xD72C616CCDAAA45E049AF4AB3B5F92DD853B394A',
       '0x9B04DD93592300582B286C693DE79E64AA474DF5',
       '0x04F71377870507FA1CFC31EC9D372D241AF1CF80',
       '0xBC291865657B562B624937321FB8BA7E9BCF483C',
       '0xE39CBE2E442FC09AD2C077FB1959390E57AE1819',
       '0x459C21D4206FDB682A2442BCDC84F6BD67A00382',
       '0x3816A647A63C730977442667C30A9EE1959CA600',
       '0x054DD295277D36C46768D05C0E49ECDD406D1687'], dtype=object)

In [45]:
df.CONSUMPTION_ID.unique()

array(['0x7E405BC6FDD0BB06E1F7711B39D7C3CB684FAA20',
       '0xD763B3D7286E78B4102C06DF1FE478C899B38743',
       '0xD72C616CCDAAA45E049AF4AB3B5F92DD853B394A', ...,
       '0x760AB4EEBE25BEB7671AFB49952643D155D375F7',
       '0xC6ABFB1F76CE6EC325EF51609573ACC10D4672EE',
       '0xD2F94F239E4175C4985E94CD74DFA4864EAA0BA8'], dtype=object)

In [47]:
new = np.setdiff1d(df_smol.CONSUMPTION_ID.unique(),df.CONSUMPTION_ID.unique())

In [48]:
len(new)

0