# Partnervertrag Analyse: Datenpräparation

In [28]:
%load_ext autoreload
%autoreload

import pandas as pd
import qgrid

from pa_lib.file  import store_bin
from pa_lib.data  import desc_col, as_dtype, as_date, split_date_iso
from pa_lib.util  import obj_size
from pa_lib.types import dtFactor
from pa_lib.sql   import query
from pa_lib.ora   import Connection
from pa_lib.log   import info

# Show long text columns in full (up to 200 characters) instead of truncating them.
pd.options.display.max_colwidth = 200

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Daten einlesen

In [29]:
# Fetch the query definition registered under the name 'pv_2'.
# NOTE(review): presumably this returns the SQL text of a stored query file — confirm in pa_lib.sql.
pv_query = query('pv_2')

In [30]:
# Execute the PV query through the 'APC_PROD_VDWH1' connection and keep the
# result as pv_data_raw. The logged run below took well over a minute, so
# start and end (with result size and shape) are logged explicitly.
info('Starting PV query on APC Prod instance')
with Connection('APC_PROD_VDWH1') as prod_conn:
    pv_data_raw = prod_conn.long_query(pv_query)
info(f'Finished PV query, returned {obj_size(pv_data_raw)} of data: {pv_data_raw.shape}')

2019-05-22 16:59:27 [INFO] Starting PV query on APC Prod instance
2019-05-22 17:01:10 [INFO] Finished query in 102.9s (10.17s CPU)
2019-05-22 17:01:12 [INFO] Finished PV query, returned 611.9 MB of data: (1682716, 11)


In [31]:
# Preview the first rows of the raw query result.
pv_data_raw.head()

Unnamed: 0,PV_NR,JAHR_KW,JAHR,KW,RES_BRUTTO,RES_NETTO_NETTO,AUS_BRUTTO,AUS_NETTO_NETTO,PV_TITEL,PARTNER_NR,PARTNER
0,3547,201629,2016,29,13836.678798,10187.105305,23112.510989,13345.782859,Sion - Convention d'affichage [3547],118107,Municipalité de Sion [118107]
1,310817,201850,2018,50,69841.6,40042.35,49478.154167,36636.811012,ShopVille / Los 2 / eBoard [310817],101350,Stadt Zürich [101350]
2,6851,201904,2019,4,244.0,173.85,1350.5,528.075,"Dällikon, Plakatierungsvertrag [6851]",116164,Gemeinde Dällikon [116164]
3,309655,201807,2018,7,770.0,322.25,263.0,179.875,"Münsterlingen-Scherzingen, Neusatzstrasse 1 / Seestrasse (41) [309655]",567552,Rohema Stürm Verwaltungen [567552]
4,307472,201840,2018,40,11488.0,6017.8,503.307692,391.603846,Tafers - Schwarzseestrasse 29 [307472],504061,Zbinden Willy und Beatrice [504061]


In [5]:
# Per-column summary of the raw data: dtype, null count, unique values,
# memory usage and value range.
desc_col(pv_data_raw, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
PV_NR,int64,0/1682716,7901,12.8 MB,"[26,311501]"
JAHR_KW,int64,0/1682716,418,12.8 MB,"[201401,202152]"
JAHR,object,0/1682716,8,97.9 MB,"[2014,2021]"
KW,object,0/1682716,53,94.7 MB,"[01,53]"
RES_BRUTTO,float64,0/1682716,93094,12.8 MB,"[0.0,4089637.83883066]"
RES_NETTO_NETTO,float64,0/1682716,241465,12.8 MB,"[-25.599412008256,3221578.3203589036]"
AUS_BRUTTO,float64,0/1682716,202621,12.8 MB,"[0.0,1577148.971208175]"
AUS_NETTO_NETTO,float64,0/1682716,427346,12.8 MB,"[-2.864757244483,940441.8088653579]"
PV_TITEL,object,0/1682716,7901,215.3 MB,"[ Bern - Bethlehemstrasse 24 [26973],zb Zentralbahn AG [32060]]"
PARTNER_NR,int64,0/1682716,5792,12.8 MB,"[100035,655693]"


## Leerwerte bereinigen, Datentypen korrigieren

In [6]:
# Discard every row that has a missing value in at least one column.
pv_data_raw = pv_data_raw.dropna(how='any')

In [7]:
# Memory footprint and (rows, columns) shape after dropping incomplete rows.
(obj_size(pv_data_raw), pv_data_raw.shape)

('670.5 MB', (1682716, 11))

In [8]:
# Convert all object-typed columns to dtFactor; the column summary further
# down reports them as 'category', with a much smaller memory footprint.
pv_data_raw = as_dtype(pv_data_raw, dtFactor, incl_dtype='object')

In [9]:
# Memory footprint and (rows, columns) shape after the dtype conversion.
(obj_size(pv_data_raw), pv_data_raw.shape)

('114.4 MB', (1682716, 11))

In [10]:
# Per-column summary after the dtype conversion, to verify the new dtypes.
desc_col(pv_data_raw, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
PV_NR,int64,0/1682716,7901,25.7 MB,"[26,311501]"
JAHR_KW,int64,0/1682716,418,25.7 MB,"[201401,202152]"
JAHR,category,0/1682716,8,14.4 MB,"[2014,2021]"
KW,category,0/1682716,53,14.4 MB,"[01,53]"
RES_BRUTTO,float64,0/1682716,93094,25.7 MB,"[0.0,4089637.83883066]"
RES_NETTO_NETTO,float64,0/1682716,241465,25.7 MB,"[-25.599412008256,3221578.3203589036]"
AUS_BRUTTO,float64,0/1682716,202621,25.7 MB,"[0.0,1577148.971208175]"
AUS_NETTO_NETTO,float64,0/1682716,427346,25.7 MB,"[-2.864757244483,940441.8088653579]"
PV_TITEL,category,0/1682716,7901,17.4 MB,"[ Bern - Bethlehemstrasse 24 [26973],zb Zentralbahn AG [32060]]"
PARTNER_NR,int64,0/1682716,5792,25.7 MB,"[100035,655693]"


In [11]:
# Preview the first rows again to check that the values are unchanged by the conversion.
pv_data_raw.head()

Unnamed: 0,PV_NR,JAHR_KW,JAHR,KW,RES_BRUTTO,RES_NETTO_NETTO,AUS_BRUTTO,AUS_NETTO_NETTO,PV_TITEL,PARTNER_NR,PARTNER
0,3547,201629,2016,29,13836.678798,10187.105305,23112.510989,13345.782859,Sion - Convention d'affichage [3547],118107,Municipalité de Sion [118107]
1,310817,201850,2018,50,69841.6,40042.35,49478.154167,36636.811012,ShopVille / Los 2 / eBoard [310817],101350,Stadt Zürich [101350]
2,6851,201904,2019,4,244.0,173.85,1350.5,528.075,"Dällikon, Plakatierungsvertrag [6851]",116164,Gemeinde Dällikon [116164]
3,309655,201807,2018,7,770.0,322.25,263.0,179.875,"Münsterlingen-Scherzingen, Neusatzstrasse 1 / Seestrasse (41) [309655]",567552,Rohema Stürm Verwaltungen [567552]
4,307472,201840,2018,40,11488.0,6017.8,503.307692,391.603846,Tafers - Schwarzseestrasse 29 [307472],504061,Zbinden Willy und Beatrice [504061]


## Netto ≤ 0 ausfiltern, sortieren

In [13]:
# Keep only rows with a strictly positive AUS_NETTO_NETTO (this removes zero
# and negative values), then order by calendar week and contract number and
# renumber the rows from 0.
has_positive_aus_netto = pv_data_raw['AUS_NETTO_NETTO'] > 0
pv_data = (
    pv_data_raw
    .loc[has_positive_aus_netto]
    .sort_values(by=['JAHR_KW', 'PV_NR'])
    .reset_index(drop=True)
)

In [14]:
# Per-column summary of the filtered and sorted data.
desc_col(pv_data, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
PV_NR,int64,0/1469033,7849,11.2 MB,"[26,311499]"
JAHR_KW,int64,0/1469033,418,11.2 MB,"[201401,202152]"
JAHR,category,0/1469033,8,1.4 MB,"[2014,2021]"
KW,category,0/1469033,53,1.4 MB,"[01,53]"
RES_BRUTTO,float64,0/1469033,84950,11.2 MB,"[0.0,4089637.83883066]"
RES_NETTO_NETTO,float64,0/1469033,214396,11.2 MB,"[-25.599412008256,2566898.6604734533]"
AUS_BRUTTO,float64,0/1469033,202187,11.2 MB,"[0.692307692307,1577148.971208175]"
AUS_NETTO_NETTO,float64,0/1469033,427339,11.2 MB,"[0.409719602665,940441.8088653579]"
PV_TITEL,category,0/1469033,7849,4.1 MB,"[ Bern - Bethlehemstrasse 24 [26973],zb Zentralbahn AG [32060]]"
PARTNER_NR,int64,0/1469033,5765,11.2 MB,"[100035,655693]"


## Vertragsinformationen extrahieren

In [15]:
# Group the weekly rows by contract number; PV_NR becomes the index of
# anything aggregated from this grouping.
pv_idx = pv_data.groupby('PV_NR', as_index=True)

In [16]:
# Build one summary row per contract (PV_NR): title, revenue totals,
# partner, and the first and last calendar week with data.
pv_info = pv_idx.agg({
    'PV_TITEL': 'first',
    'RES_BRUTTO': 'sum',
    'RES_NETTO_NETTO': 'sum',
    'AUS_BRUTTO': 'sum',
    'AUS_NETTO_NETTO': 'sum',
    'PARTNER_NR': 'last',
    'PARTNER': 'last',
    'JAHR_KW': ['min', 'max'],
})
# Replace the aggregated column labels with flat names, in the same order as
# the aggregations above. Written as an explicit list: the previous
# whitespace-split string contained a stray comma, which produced a column
# literally named "totalResBrutto,".
pv_info.columns = [
    'Titel',
    'totalResBrutto',
    'totalResNettoNetto',
    'totalAusBrutto',
    'totalAusNettoNetto',
    'partnerNr',
    'Partner',
    'firstKw',
    'lastKw',
]

In [23]:
# Per-column summary of the per-contract information table.
desc_col(pv_info, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
Titel,object,0/7849,7849,1.1 MB,"[ Bern - Bethlehemstrasse 24 [26973],zb Zentralbahn AG [32060]]"
"totalResBrutto,",float64,0/7849,7650,122.6 KB,"[0.0,200595358.35695997]"
totalResNettoNetto,float64,0/7849,7702,122.6 KB,"[0.0,128880929.3894724]"
totalAusBrutto,float64,0/7849,7818,122.6 KB,"[22.739726027393,201559729.90018842]"
totalAusNettoNetto,float64,0/7849,7836,122.6 KB,"[22.739726027393,129567091.89990559]"
partnerNr,int64,0/7849,5765,122.6 KB,"[100035,655693]"
Partner,object,0/7849,5765,823.5 KB,"[""Zürich"" Versicherungs-Gesellschaft [495776],Özdemir Ökkes [614921]]"
firstKw,int64,0/7849,256,122.6 KB,"[201401,201942]"
lastKw,int64,0/7849,334,122.6 KB,"[201401,202152]"
NettoNetto_Aus_2014,float64,0/7849,6650,122.6 KB,"[0.0,15000520.852361444]"


In [25]:
# Interactive grid of the contract title and the yearly AUS net-net revenue.
# The NettoNetto_Aus_* columns are only merged into pv_info in the
# "Jahres-Nettoumsätze" cells further down, so on a fresh top-to-bottom run
# they do not exist yet and selecting them would raise a KeyError. Show only
# the requested columns that are present.
grid_cols = 'Titel NettoNetto_Aus_2017 NettoNetto_Aus_2018 NettoNetto_Aus_2019'.split()
qgrid.show_grid(pv_info.loc[:, [col for col in grid_cols if col in pv_info.columns]])

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

#### Jahres-Nettoumsätze

In [45]:
# Sum the net-net revenue per contract and year. Both sums use the same
# grouping; observed=True restricts it to category values that occur in the data.
year_groups = pv_data.groupby(['PV_NR', 'JAHR'], observed=True, as_index=False)
pvYearANetto = year_groups[['AUS_NETTO_NETTO']].agg('sum')
pvYearRNetto = year_groups[['RES_NETTO_NETTO']].agg('sum')

# Reshape to one row per contract and one column per year; years without
# revenue for a contract become 0.
pvANetto = (
    pvYearANetto
    .pivot(index='PV_NR', columns='JAHR', values='AUS_NETTO_NETTO')
    .fillna(0)
    .add_prefix('NettoNetto_Aus_')
)
pvRNetto = (
    pvYearRNetto
    .pivot(index='PV_NR', columns='JAHR', values='RES_NETTO_NETTO')
    .fillna(0)
    .add_prefix('NettoNetto_Res_')
)

In [46]:
# Attach the yearly net-net columns to the per-contract information.
# Any yearly columns left over from an earlier run of this cell are dropped
# first; otherwise re-running the cell would merge them a second time and
# pandas would duplicate them with _x / _y suffixes.
yearly_cols = list(pvANetto.columns) + list(pvRNetto.columns)
pv_info = (
    pv_info
    .drop(columns=yearly_cols, errors='ignore')
    .merge(pvANetto, on='PV_NR')
    .merge(pvRNetto, on='PV_NR')
)

## Daten speichern

In [30]:
# Persist the weekly detail rows and the per-contract information, in this order.
for frame, file_name in ((pv_data, 'pv_data.feather'), (pv_info, 'pv_info.feather')):
    store_bin(frame, file_name)

2019-05-22 17:13:40 [INFO] Writing to file C:\Users\kpf\data\pv_data.feather
2019-05-22 17:13:40 [INFO] Written 87.4 MB
2019-05-22 17:13:40 [INFO] Finished storing binary file in 0.07s (0.38s CPU)
2019-05-22 17:13:40 [INFO] Writing to file C:\Users\kpf\data\pv_info.feather
2019-05-22 17:13:40 [INFO] Written 2.1 MB
2019-05-22 17:13:40 [INFO] Finished storing binary file in 0.02s (0.19s CPU)
