# Partnervertrag Analyse: Datenpräparation

In [1]:
# Make imports from pa_lib possible: the package lives in the parent directory
# of the notebook's working directory.
import sys
from pathlib import Path

file_dir = Path.cwd()
parent_dir = file_dir.parent
# Only register the directory once, so re-running this cell does not keep
# appending duplicate entries to sys.path.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

In [2]:
%load_ext autoreload
%autoreload

import pandas as pd
import qgrid

from pa_lib.file  import store_bin, set_project_dir
from pa_lib.data  import desc_col, as_dtype, as_date, split_date_iso
from pa_lib.util  import obj_size
from pa_lib.types import dtFactor
from pa_lib.sql   import query
from pa_lib.ora   import Connection
from pa_lib.log   import info

# Show long text columns without truncation (up to 200 characters per cell)
pd.options.display.max_colwidth = 200

## Daten einlesen

In [2]:
# Look up the query named 'pv_2' via pa_lib.sql.query; the result is handed to
# Connection.long_query in the next cell.
# NOTE(review): presumably this is the SQL text of a stored query file — confirm in pa_lib.sql
pv_query = query('pv_2')

In [3]:
# Execute the PV query on the APC production DWH. The connection is used as a
# context manager, so it is only held for the duration of the query.
info('Starting PV query on APC Prod instance')
with Connection('APC_PROD_VDWH1') as ora_conn:
    pv_data_raw = ora_conn.long_query(pv_query)
# Log memory footprint and shape of the result
info(f'Finished PV query, returned {obj_size(pv_data_raw)} of data: {pv_data_raw.shape}')

2019-06-04 15:10:46 [INFO] Starting PV query on APC Prod instance
2019-06-04 15:11:57 [INFO] Finished query in 70.54s (17.87s CPU)
2019-06-04 15:12:01 [INFO] Finished PV query, returned 615.8 MB of data: (1693371, 11)


In [4]:
# Peek at the first rows of the raw query result
pv_data_raw.head()

Unnamed: 0,PV_NR,JAHR_KW,JAHR,KW,RES_BRUTTO,RES_NETTO_NETTO,AUS_BRUTTO,AUS_NETTO_NETTO,PV_TITEL,PARTNER_NR,PARTNER
0,308945,201918,2019,18,0.0,0.0,228.846154,42.185577,Surcuolm - Uhrentafeln [308945],112060,Bergbahnen Piz Mundaun AG [112060]
1,25191,201429,2014,29,33411.0,20848.902661,53806.077599,24592.008711,Interplakat Privatverträge [25191],103707,Interplakat AG [103707]
2,307690,201436,2014,36,72466.0,29455.679591,100834.058429,54515.924198,Plakatierung_SBB/CFF/FFS: West Bahnhofmanagement (BM) [307690],477164,Schweizerische Bundesbahnen SBB [477164]
3,307690,201447,2014,47,37493.494733,20638.096253,116873.039189,41887.827308,Plakatierung_SBB/CFF/FFS: West Bahnhofmanagement (BM) [307690],477164,Schweizerische Bundesbahnen SBB [477164]
4,27259,201547,2015,47,1430.0,498.192745,4680.211563,2972.215711,"KT Solothurn, Plakat-Standorte Kreisbauamt II [27259]",129877,Baudepartement des Kantons Solothurn [129877]


In [5]:
# Per-column overview of the raw data (recorded output: dtype, nulls, unique count, memory, value range).
# NOTE(review): `det=True` presumably selects the detailed variant — confirm in pa_lib.data
desc_col(pv_data_raw, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
PV_NR,int64,0/1693371,7902,12.9 MB,"[26,311503]"
JAHR_KW,int64,0/1693371,418,12.9 MB,"[201401,202152]"
JAHR,object,0/1693371,8,98.5 MB,"[2014,2021]"
KW,object,0/1693371,53,95.3 MB,"[01,53]"
RES_BRUTTO,float64,0/1693371,93446,12.9 MB,"[0.0,4089637.83883066]"
RES_NETTO_NETTO,float64,0/1693371,242096,12.9 MB,"[-25.599412008256,3221578.3203589036]"
AUS_BRUTTO,float64,0/1693371,203380,12.9 MB,"[0.0,1577148.971208175]"
AUS_NETTO_NETTO,float64,0/1693371,428799,12.9 MB,"[-13326.983333333334,940441.8088653579]"
PV_TITEL,object,0/1693371,7902,216.8 MB,"[ Bern - Bethlehemstrasse 24 [26973],zb Zentralbahn AG [32060]]"
PARTNER_NR,int64,0/1693371,5793,12.9 MB,"[100035,656032]"


## Leerwerte bereinigen, Datentypen korrigieren

In [6]:
# Discard every row that contains at least one missing value.
# NOTE: this rebinds pv_data_raw, so the unfiltered query result is no longer
# available under that name afterwards.
pv_data_raw = pv_data_raw.dropna(axis='index', how='any')

In [7]:
# Memory footprint and shape after dropping rows with missing values
(obj_size(pv_data_raw), pv_data_raw.shape)

('670.5 MB', (1682716, 11))

In [7]:
# Convert all object-dtype columns to dtFactor; the column overview below
# reports these columns as `category` afterwards.
pv_data_raw = as_dtype(pv_data_raw, dtFactor, incl_dtype='object')

In [8]:
# Memory footprint and shape after the dtype conversion
(obj_size(pv_data_raw), pv_data_raw.shape)

('115.1 MB', (1693371, 11))

In [9]:
# Column overview after the conversion: the former object columns are now listed as category
desc_col(pv_data_raw, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
PV_NR,int64,0/1693371,7902,25.8 MB,"[26,311503]"
JAHR_KW,int64,0/1693371,418,25.8 MB,"[201401,202152]"
JAHR,category,0/1693371,8,14.5 MB,"[2014,2021]"
KW,category,0/1693371,53,14.5 MB,"[01,53]"
RES_BRUTTO,float64,0/1693371,93446,25.8 MB,"[0.0,4089637.83883066]"
RES_NETTO_NETTO,float64,0/1693371,242096,25.8 MB,"[-25.599412008256,3221578.3203589036]"
AUS_BRUTTO,float64,0/1693371,203380,25.8 MB,"[0.0,1577148.971208175]"
AUS_NETTO_NETTO,float64,0/1693371,428799,25.8 MB,"[-13326.983333333334,940441.8088653579]"
PV_TITEL,category,0/1693371,7902,17.5 MB,"[ Bern - Bethlehemstrasse 24 [26973],zb Zentralbahn AG [32060]]"
PARTNER_NR,int64,0/1693371,5793,25.8 MB,"[100035,656032]"


In [10]:
# Preview again after NA removal and dtype conversion
pv_data_raw.head()

Unnamed: 0,PV_NR,JAHR_KW,JAHR,KW,RES_BRUTTO,RES_NETTO_NETTO,AUS_BRUTTO,AUS_NETTO_NETTO,PV_TITEL,PARTNER_NR,PARTNER
0,308945,201918,2019,18,0.0,0.0,228.846154,42.185577,Surcuolm - Uhrentafeln [308945],112060,Bergbahnen Piz Mundaun AG [112060]
1,25191,201429,2014,29,33411.0,20848.902661,53806.077599,24592.008711,Interplakat Privatverträge [25191],103707,Interplakat AG [103707]
2,307690,201436,2014,36,72466.0,29455.679591,100834.058429,54515.924198,Plakatierung_SBB/CFF/FFS: West Bahnhofmanagement (BM) [307690],477164,Schweizerische Bundesbahnen SBB [477164]
3,307690,201447,2014,47,37493.494733,20638.096253,116873.039189,41887.827308,Plakatierung_SBB/CFF/FFS: West Bahnhofmanagement (BM) [307690],477164,Schweizerische Bundesbahnen SBB [477164]
4,27259,201547,2015,47,1430.0,498.192745,4680.211563,2972.215711,"KT Solothurn, Plakat-Standorte Kreisbauamt II [27259]",129877,Baudepartement des Kantons Solothurn [129877]


## Netto ≤ 0 ausfiltern, sortieren

In [11]:
# Keep only rows with a strictly positive AUS_NETTO_NETTO (zero and negative
# amounts are both removed), order them by JAHR_KW and PV_NR, and renumber the rows.
has_positive_netto = pv_data_raw['AUS_NETTO_NETTO'] > 0
pv_data = pv_data_raw.loc[has_positive_netto].sort_values(by=['JAHR_KW', 'PV_NR'])
pv_data = pv_data.reset_index(drop=True)

In [12]:
# Column overview of the filtered and sorted data
desc_col(pv_data, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
PV_NR,int64,0/1478359,7851,11.3 MB,"[26,311503]"
JAHR_KW,int64,0/1478359,418,11.3 MB,"[201401,202152]"
JAHR,category,0/1478359,8,1.4 MB,"[2014,2021]"
KW,category,0/1478359,53,1.4 MB,"[01,53]"
RES_BRUTTO,float64,0/1478359,85293,11.3 MB,"[0.0,4089637.83883066]"
RES_NETTO_NETTO,float64,0/1478359,215067,11.3 MB,"[-25.599412008256,2566898.6604734533]"
AUS_BRUTTO,float64,0/1478359,202953,11.3 MB,"[0.692307692307,1577148.971208175]"
AUS_NETTO_NETTO,float64,0/1478359,428790,11.3 MB,"[0.409719602665,940441.8088653579]"
PV_TITEL,category,0/1478359,7851,4.1 MB,"[ Bern - Bethlehemstrasse 24 [26973],zb Zentralbahn AG [32060]]"
PARTNER_NR,int64,0/1478359,5767,11.3 MB,"[100035,656032]"


## Vertragsinformationen extrahieren

In [13]:
# Group all rows by PV_NR; the group key becomes the index of any
# aggregation result (pandas default, as_index=True).
pv_idx = pv_data.groupby(by='PV_NR')

In [14]:
# One row per PV_NR: title, summed Res/Aus amounts, partner, and first/last JAHR_KW.
# Result columns follow the order of the dict below; JAHR_KW contributes two
# columns (min, max), so the aggregation yields nine columns in total.
pv_info = pv_idx.agg({
    'PV_TITEL': 'first',
    'RES_BRUTTO': 'sum',
    'RES_NETTO_NETTO': 'sum',
    'AUS_BRUTTO': 'sum',
    'AUS_NETTO_NETTO': 'sum',
    'PARTNER_NR': 'last',
    'PARTNER': 'last',
    'JAHR_KW': ['min', 'max'],
})
# Replace the (column, function) MultiIndex with flat names, one per aggregated column.
# An explicit list is used instead of a whitespace-split string: a stray comma in that
# string previously produced a column literally named 'totalResBrutto,'.
pv_info.columns = [
    'Titel',
    'totalResBrutto',
    'totalResNettoNetto',
    'totalAusBrutto',
    'totalAusNettoNetto',
    'partnerNr',
    'Partner',
    'firstKw',
    'lastKw',
]

In [20]:
# Column overview of the per-contract info.
# NOTE(review): the recorded output already lists a NettoNetto_Aus_2014 column, i.e. it was
# produced after the yearly columns had been merged into pv_info further down; on a
# top-to-bottom run only the aggregated columns exist at this point.
desc_col(pv_info, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
Titel,object,0/7851,7851,1.1 MB,"[ Bern - Bethlehemstrasse 24 [26973],zb Zentralbahn AG [32060]]"
"totalResBrutto,",float64,0/7851,7658,122.7 KB,"[0.0,200595358.35695997]"
totalResNettoNetto,float64,0/7851,7707,122.7 KB,"[0.0,128880929.3894724]"
totalAusBrutto,float64,0/7851,7828,122.7 KB,"[22.739726027393,201559729.90018842]"
totalAusNettoNetto,float64,0/7851,7838,122.7 KB,"[22.739726027393,129567091.89990559]"
partnerNr,int64,0/7851,5767,122.7 KB,"[100035,656032]"
Partner,object,0/7851,5767,823.9 KB,"[""Zürich"" Versicherungs-Gesellschaft [495776],Özdemir Ökkes [614921]]"
firstKw,int64,0/7851,258,122.7 KB,"[201401,201945]"
lastKw,int64,0/7851,332,122.7 KB,"[201401,202152]"
NettoNetto_Aus_2014,float64,0/7851,6650,122.7 KB,"[0.0,15000520.852361444]"


In [19]:
# Interactive grid with the contract title and the yearly Aus-NettoNetto columns for 2017-2019.
# NOTE(review): the NettoNetto_Aus_* columns are only added to pv_info by the merge in the
# "Jahres-Nettoumsätze" cells further down. On a fresh top-to-bottom run this selection
# raises a KeyError, so this cell has to be executed after those cells.
qgrid.show_grid(pv_info.loc[:,'Titel NettoNetto_Aus_2017 NettoNetto_Aus_2018 NettoNetto_Aus_2019'.split()])

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

#### Jahres-Nettoumsätze

In [17]:
def _year_sums(value_col):
    """Sum `value_col` of pv_data per (PV_NR, JAHR) combination that occurs in the data (long format)."""
    per_year = pv_data.groupby(['PV_NR', 'JAHR'], observed=True, as_index=False)
    return per_year[[value_col]].agg('sum')


def _wide_by_year(year_sums, value_col, prefix):
    """Pivot yearly sums to one row per PV_NR and one `prefix`+year column per JAHR; gaps become 0."""
    wide = year_sums.pivot(index='PV_NR', columns='JAHR', values=value_col)
    return wide.fillna(0).add_prefix(prefix)


# Long-format yearly sums of the Aus and Res NettoNetto amounts
pvYearANetto = _year_sums('AUS_NETTO_NETTO')
pvYearRNetto = _year_sums('RES_NETTO_NETTO')
# Wide format: one column per year, e.g. NettoNetto_Aus_2014
pvANetto = _wide_by_year(pvYearANetto, 'AUS_NETTO_NETTO', 'NettoNetto_Aus_')
pvRNetto = _wide_by_year(pvYearRNetto, 'RES_NETTO_NETTO', 'NettoNetto_Res_')

In [18]:
# Attach the per-year NettoNetto columns (Aus and Res) to the contract info, matching on PV_NR.
# Yearly columns left over from an earlier run of this cell are dropped first: merging them a
# second time would otherwise create duplicated columns with '_x'/'_y' suffixes.
year_cols = [*pvANetto.columns, *pvRNetto.columns]
pv_info = (pv_info
           .drop(columns=year_cols, errors='ignore')
           .merge(pvANetto, on='PV_NR')
           .merge(pvRNetto, on='PV_NR'))

## Daten speichern

In [21]:
# Select the 'pv' project for pa_lib.file, then persist both result frames as feather files.
set_project_dir('pv')

for frame, file_name in ((pv_data, 'pv_data.feather'), (pv_info, 'pv_info.feather')):
    store_bin(frame, file_name)

2019-06-04 15:14:41 [INFO] Writing to file /home/pa/data/pv_data.feather
2019-06-04 15:14:41 [INFO] Written 88.0 MB
2019-06-04 15:14:41 [INFO] Finished storing binary file in 0.37s (1.62s CPU)
2019-06-04 15:14:41 [INFO] Writing to file /home/pa/data/pv_info.feather
2019-06-04 15:14:41 [INFO] Written 2.1 MB
2019-06-04 15:14:41 [INFO] Finished storing binary file in 0.02s (0.02s CPU)
