# Budgetbuchung Analyse: Datenpräparation

In [24]:
from pa_lib.data import data_files, load_csv, store_bin, load_bin
from pa_lib.data import desc_col, as_dtype, select_columns, as_date, split_date_iso, make_isoweek_rd, clean_up_categoricals
from pa_lib.util import obj_size
from pa_lib.types import dtFactor

# display long columns completely
pd.set_option('display.max_colwidth', 200)

## Daten einlesen

In [2]:
data_files()

Unnamed: 0_level_0,size,mtime
name,Unnamed: 1_level_1,Unnamed: 2_level_1
bd_data.feather,181.9 MB,08.04.19 09:27:25
bd_data_vkprog.feather,94.6 MB,08.04.19 09:27:29
crm_data.feather,51.2 MB,08.04.19 09:07:53
crm_data_vkprog.feather,42.5 MB,08.04.19 09:07:55
plz_data.feather,136.0 KB,08.04.19 09:05:19
pv_akw_netto.feather,12.1 MB,11.04.19 14:08:40
pv_bd.feather,22.2 MB,11.04.19 14:08:12
pv_bd.zip,9.0 MB,04.04.19 16:25:03
pv_rkw_netto.feather,12.1 MB,11.04.19 14:08:40


In [15]:
pv_bd = load_csv('pv_bd.zip', delimiter=';', encoding='cp1252', dtype='object')

2019-04-15 09:35:15 [INFO] Reading from file /home/pa/data/pv_bd.zip
2019-04-15 09:35:16 [INFO] Finished loading CSV in 1.22s (1.22s CPU)


In [4]:
pv_bd.head()

Unnamed: 0,Res. Datum,Aushang Beginn,PVPos Nr.,PartnerNr,PartnerName,PvNr,PvTitel,opt Brutto,opt Netto,opt NettoNetto
0,03.05.2016,01.08.2016,11825,100511,KSS Sport- und Freizeitanlagen Schaffhausen,7834,"Schaffhausen, Breitenaustrasse 117 / Parkplatz...",213,213.0,213.0
1,03.05.2016,01.08.2016,40040,101871,Frey Dieter,26732,"Schaffhausen, Spiegelgutstrasse 54 / Gemsgasse",142,142.0,142.0
2,03.05.2016,01.08.2016,71574,103435,Stadt Schaffhausen,9084,"Stadt Schaffhausen, Plakatierung auf offentlic...",3195,3195.0,3195.0
3,04.07.2016,01.08.2016,0,0,,0,,1477,457.87,457.87
4,04.07.2016,01.08.2016,10023,109362,Liegenschaften der Stadt Biel,6529,"Biel - Gemeindevertrag o, Biel",4316,1337.96,1337.96


In [5]:
display(obj_size(pv_bd), pv_bd.shape)

'206.8 MB'

(316984, 10)

In [8]:
desc_col(pv_bd, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE
Res. Datum,object,0/316984,709
Aushang Beginn,object,0/316984,196
PVPos Nr.,object,0/316984,9847
PartnerNr,object,0/316984,5421
PartnerName,object,222/316762,5552
PvNr,object,0/316984,6673
PvTitel,object,222/316762,7374
opt Brutto,object,0/316984,7355
opt Netto,object,0/316984,23857
opt NettoNetto,object,0/316984,28237


## Leerwerte bereinigen, Datentypen korrigieren

In [16]:
pv_bd = pv_bd.dropna(how='any')

In [11]:
(obj_size(pv_bd), pv_bd.shape)

('209.0 MB', (316762, 10))

In [18]:
pv_bd = (pv_bd
         .pipe(as_dtype, 'int', incl_pattern='.*Nr.*')
         .pipe(as_dtype, 'float', incl_pattern='.*tto')
         .pipe(as_date, format='%d.%m.%Y', incl_col=('Res. Datum', 'Aushang Beginn'))
         .pipe(as_dtype, dtFactor, incl_dtype='object'))        

In [13]:
(obj_size(pv_bd), pv_bd.shape)

('24.6 MB', (316762, 10))

In [17]:
desc_col(pv_bd, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
Res. Datum,object,0/316762,709,22.7 MB,"[01.02.2017,31.10.2018]"
Aushang Beginn,object,0/316762,196,22.7 MB,"[01.01.2018,31.12.2018]"
PVPos Nr.,object,0/316762,9846,21.1 MB,"[100,9917]"
PartnerNr,object,0/316762,5420,21.4 MB,"[100035,653959]"
PartnerName,object,0/316762,5552,26.5 MB,"[3C Champ-Colin Centre SA,zb Zentralbahn AG]"
PvNr,object,0/316762,6672,21.2 MB,"[10014,9981]"
PvTitel,object,0/316762,7374,32.0 MB,"[ Bern - Bethlehemstrasse 24,zb Zentralbahn 2 ..."
opt Brutto,object,0/316762,7330,20.6 MB,"[0,9999]"
opt Netto,object,0/316762,23803,21.4 MB,"[0,999.95]"
opt NettoNetto,object,0/316762,28164,21.5 MB,"[0,999.95]"


In [None]:
pv_bd.head()

## Spaltennamen korrigieren

In [20]:
pv_bd.columns = ['ResDatum', 'AushangBeginn', 'PvPosNr', 'PartnerNr', 'PartnerName',
       'PvNr', 'PvTitel', 'optBrutto', 'optNetto', 'optNettoNetto']

In [21]:
pv_bd.head()

Unnamed: 0,ResDatum,AushangBeginn,PvPosNr,PartnerNr,PartnerName,PvNr,PvTitel,optBrutto,optNetto,optNettoNetto
0,2016-05-03,2016-08-01,11825,100511,KSS Sport- und Freizeitanlagen Schaffhausen,7834,"Schaffhausen, Breitenaustrasse 117 / Parkplatz...",213.0,213.0,213.0
1,2016-05-03,2016-08-01,40040,101871,Frey Dieter,26732,"Schaffhausen, Spiegelgutstrasse 54 / Gemsgasse",142.0,142.0,142.0
2,2016-05-03,2016-08-01,71574,103435,Stadt Schaffhausen,9084,"Stadt Schaffhausen, Plakatierung auf offentlic...",3195.0,3195.0,3195.0
4,2016-07-04,2016-08-01,10023,109362,Liegenschaften der Stadt Biel,6529,"Biel - Gemeindevertrag o, Biel",4316.0,1337.96,1337.96
5,2016-07-04,2016-08-01,10444,109362,Liegenschaften der Stadt Biel,6529,"Biel - Gemeindevertrag o, Biel",3269.0,1013.39,1013.39


## Konstellation Partner/Verträge/Positionen prüfen

#### Gibt es Vertragspositionen mit mehr als einem unterschiedlichen Vertrag? (Wäre nicht toll)

In [22]:
pv_bd.groupby(['PvPosNr'], observed=True)[['PvNr']].agg('nunique').query('PvNr > 1')

Unnamed: 0_level_0,PvNr
PvPosNr,Unnamed: 1_level_1


#### Gibt es Verträge mit mehr als einem unterschiedlichen Partner? (Namensänderungen, Handwechsel von Grundstücken, Reorgs...)

In [25]:
multiPvListe = pv_bd.groupby('PvNr')[['PartnerNr']].agg('nunique').query('PartnerNr > 1').index.values

def partners(x):
    return pd.Series({'#partners': x['PartnerNr'].nunique(), 'pNr': set(x['PartnerNr']), 'pNames': ' | '.join(set(x['PartnerName']))})

(pv_bd.loc[pv_bd.PvNr.isin(multiPvListe)]
      .groupby('PvNr').apply(partners)
      .sort_values(['#partners', 'pNames'], ascending=[False, True]))

Unnamed: 0_level_0,#partners,pNr,pNames
PvNr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12065,3,"{640211, 103605, 465910}",Bitterli Rolf und Barbara Roty | Bitterli Rolf | Bitterli Peter
22162,3,"{103466, 632150, 640271}",Gebr. Amberg Generalunternehmung AG | Einfache Gesellschaft Erschliessung Ober-Kapf | Buhlmann-Wigger Otto
29820,3,"{645563, 488301, 110207}",Immoveris Properties AG | Helvetica Swiss Commercial AG | Wincasa AG
27726,3,"{133227, 631293, 647502}","Lavizzari Carlo, R. et B. & Payot O. | Huber Rene | Konig Thierry"
19778,3,"{630336, 630337, 597836}",Penta Jean-Pierre | Stamp Patrick | Cauderay Micheline
10580,3,"{551952, 123941, 513359}",Realstone Swiss Property SA | Zurich Versicherungs-Gesellschaft AG | Realstone SA
5158,3,"{637194, 469996, 642645}",Restaurant Sonne | B+S Invest AG | B+S Invest AG & Albert Nuesch AG
31478,3,"{653672, 167870, 627727}",Zehnder Huguette Armelle | Zehnder Andre | Florey Johanna
309549,2,"{618948, 649766}",3C Champ-Colin Centre SA | Pi Infac SA
26690,2,"{629691, 101325}","A. Gneupel, Prazisionsmechanik | Hurst Song Architekten GmbH"


## Sortieren, Geschäftsjahr und -woche für Aushang und Reservation berechnen

In [26]:
pv_bd = (pv_bd.sort_values('AushangBeginn').reset_index(drop=True)
         .pipe(split_date_iso, dt_col='ResDatum', yr_col='RJahr', kw_col='RKw')
         .pipe(split_date_iso, dt_col='AushangBeginn', yr_col='AJahr', kw_col='AKw')
         .reset_index(drop=True))

In [27]:
desc_col(pv_bd, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
ResDatum,datetime64[ns],0/316762,709,2.4 MB,"[2016-03-09 00:00:00,2019-03-29 00:00:00]"
AushangBeginn,datetime64[ns],0/316762,196,2.4 MB,"[2016-03-21 00:00:00,2019-12-16 00:00:00]"
PvPosNr,int64,0/316762,9846,2.4 MB,"[19,80445]"
PartnerNr,int64,0/316762,5420,2.4 MB,"[100035,653959]"
PartnerName,category,0/316762,5552,1.2 MB,"[3C Champ-Colin Centre SA,zb Zentralbahn AG]"
PvNr,int64,0/316762,6672,2.4 MB,"[26,311490]"
PvTitel,category,0/316762,7374,1.6 MB,"[ Bern - Bethlehemstrasse 24,zb Zentralbahn 2 (ex LSE)]"
optBrutto,float64,0/316762,7330,2.4 MB,"[0.0,266459.5]"
optNetto,float64,0/316762,23803,2.4 MB,"[0.0,57145.5]"
optNettoNetto,float64,0/316762,28164,2.4 MB,"[0.0,54288.22]"


## Daten speichern

In [28]:
store_bin(pv_bd, 'pv_bd_raw.feather')

2019-04-15 09:39:46 [INFO] Writing to file /home/pa/data/pv_bd_raw.feather
2019-04-15 09:39:46 [INFO] Written 22.2 MB
2019-04-15 09:39:46 [INFO] Finished storing binary file in 0.08s (0.53s CPU)
