# Budgetbuchung Analyse: Datenpräparation

In [2]:
%load_ext autoreload
%autoreload

import pandas as pd

from pa_lib.file  import data_files, load_csv, store_bin
from pa_lib.df    import desc_col, as_dtype, as_date, split_date_iso
from pa_lib.util  import obj_size
from pa_lib.types import dtFactor

# Raise the per-cell display width to 200 characters so that long text
# columns (e.g. contract titles) are not cut off in rendered tables
pd.set_option('display.max_colwidth', 200)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Daten einlesen

In [3]:
# List the available data files together with their size and modification time
data_files()

Unnamed: 0_level_0,size,mtime
name,Unnamed: 1_level_1,Unnamed: 2_level_1
bd_data.feather,194.2 MB,23.04.19 08:39:45
bd_data_vkprog.feather,100.5 MB,23.04.19 08:39:49
crm_data.csv.zip,18.1 MB,23.04.19 09:57:51
crm_data.feather,59.0 MB,23.04.19 09:58:00
crm_data_vkprog.feather,47.8 MB,23.04.19 09:58:01
plz_data.feather,135.7 KB,23.04.19 08:20:36
pv_akw_netto.feather,12.1 MB,16.04.19 16:16:31
pv_bd.zip,9.0 MB,04.04.19 16:25:03
pv_bd_raw.feather,20.6 MB,18.04.19 10:21:30
pv_by_week.feather,11.1 MB,23.04.19 15:07:06


In [4]:
# Read the raw export: semicolon-separated, cp1252-encoded. Every column is
# requested as plain `object` so that types can be assigned explicitly later.
pv_bd = load_csv(
    'pv_bd.zip',
    delimiter=';',
    encoding='cp1252',
    dtype='object',
)

2019-04-23 15:12:43 [INFO] Reading from file /home/pa/data/pv_bd.zip
2019-04-23 15:12:44 [INFO] Finished loading CSV in 1.22s (1.22s CPU)


In [33]:
# Preview the first rows of the raw table
pv_bd.head()

Unnamed: 0,Res. Datum,Aushang Beginn,PVPos Nr.,PartnerNr,PartnerName,PvNr,PvTitel,opt Brutto,opt Netto,opt NettoNetto
0,03.05.2016,01.08.2016,11825,100511,KSS Sport- und Freizeitanlagen Schaffhausen,7834,"Schaffhausen, Breitenaustrasse 117 / Parkplatz Schwimmbad KSS",213,213.0,213.0
1,03.05.2016,01.08.2016,40040,101871,Frey Dieter,26732,"Schaffhausen, Spiegelgutstrasse 54 / Gemsgasse",142,142.0,142.0
2,03.05.2016,01.08.2016,71574,103435,Stadt Schaffhausen,9084,"Stadt Schaffhausen, Plakatierung auf offentlichem Grund",3195,3195.0,3195.0
3,04.07.2016,01.08.2016,0,0,,0,,1477,457.87,457.87
4,04.07.2016,01.08.2016,10023,109362,Liegenschaften der Stadt Biel,6529,"Biel - Gemeindevertrag o, Biel",4316,1337.96,1337.96


In [34]:
# Memory footprint and (rows, columns) of the raw table
(obj_size(pv_bd), pv_bd.shape)

('206.8 MB', (316984, 10))

In [5]:
# Per-column overview: dtype, null count and number of distinct values
desc_col(pv_bd)

Unnamed: 0,DTYPE,NULLS,UNIQUE
Res. Datum,object,0/316984,709
Aushang Beginn,object,0/316984,196
PVPos Nr.,object,0/316984,9847
PartnerNr,object,0/316984,5421
PartnerName,object,222/316762,5552
PvNr,object,0/316984,6673
PvTitel,object,222/316762,7374
opt Brutto,object,0/316984,7355
opt Netto,object,0/316984,23857
opt NettoNetto,object,0/316984,28237


## Spalten umbenennen, Leerwerte bereinigen, Datentypen korrigieren

In [6]:
# Identifier-friendly column names; assigned by position, so the order must
# match the column order of the CSV export
clean_names = ('ResDatum AushangBeginn PvPosNr PartnerNr PartnerName '
               'PvNr PvTitel optBrutto optNetto optNettoNetto').split()
pv_bd.columns = clean_names

# Discard every row that has a missing value in any column
pv_bd = pv_bd.dropna(axis='index', how='any')

In [37]:
# Memory footprint and shape after dropping incomplete rows
(obj_size(pv_bd), pv_bd.shape)

('209.0 MB', (316762, 10))

In [7]:
# Convert the all-text columns to proper dtypes, in this order:
#   - columns selected by the name pattern '.*Nr.*'  -> int
#   - columns selected by the name pattern '.*tto'   -> float
#   - ResDatum and AushangBeginn                     -> dates, parsed as DD.MM.YYYY
#   - every column that is still `object` afterwards -> dtFactor
#
# The converted frame is bound back to the name instead of being written
# through `pv_bd.loc[:, :] = ...`: that form assigns *into* the existing object
# columns, and newer pandas versions set the values in place and keep the old
# object dtype, which would silently throw the conversion away.
pv_bd = (pv_bd
         .pipe(as_dtype, 'int', incl_pattern='.*Nr.*')
         .pipe(as_dtype, 'float', incl_pattern='.*tto')
         .pipe(as_date, format_str='%d.%m.%Y', incl_col=('ResDatum', 'AushangBeginn'))
         .pipe(as_dtype, dtFactor, incl_dtype='object'))

In [13]:
# Memory footprint and shape after the dtype conversion
(obj_size(pv_bd), pv_bd.shape)

('24.6 MB', (316762, 10))

In [8]:
# Detailed per-column overview: additionally shows memory use and value range
desc_col(pv_bd, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
ResDatum,datetime64[ns],0/316762,709,4.8 MB,"[2016-03-09 00:00:00,2019-03-29 00:00:00]"
AushangBeginn,datetime64[ns],0/316762,196,4.8 MB,"[2016-03-21 00:00:00,2019-12-16 00:00:00]"
PvPosNr,int64,0/316762,9846,4.8 MB,"[19,80445]"
PartnerNr,int64,0/316762,5420,4.8 MB,"[100035,653959]"
PartnerName,category,0/316762,5552,3.6 MB,"[3C Champ-Colin Centre SA,zb Zentralbahn AG]"
PvNr,int64,0/316762,6672,4.8 MB,"[26,311490]"
PvTitel,category,0/316762,7374,4.1 MB,"[ Bern - Bethlehemstrasse 24,zb Zentralbahn 2 (ex LSE)]"
optBrutto,float64,0/316762,7330,4.8 MB,"[0.0,266459.5]"
optNetto,float64,0/316762,23803,4.8 MB,"[0.0,57145.5]"
optNettoNetto,float64,0/316762,28164,4.8 MB,"[0.0,54288.22]"


In [15]:
# Preview the first rows of the cleaned, typed table
pv_bd.head()

Unnamed: 0,ResDatum,AushangBeginn,PvPosNr,PartnerNr,PartnerName,PvNr,PvTitel,optBrutto,optNetto,optNettoNetto
0,2016-05-03,2016-08-01,11825,100511,KSS Sport- und Freizeitanlagen Schaffhausen,7834,"Schaffhausen, Breitenaustrasse 117 / Parkplatz Schwimmbad KSS",213.0,213.0,213.0
1,2016-05-03,2016-08-01,40040,101871,Frey Dieter,26732,"Schaffhausen, Spiegelgutstrasse 54 / Gemsgasse",142.0,142.0,142.0
2,2016-05-03,2016-08-01,71574,103435,Stadt Schaffhausen,9084,"Stadt Schaffhausen, Plakatierung auf offentlichem Grund",3195.0,3195.0,3195.0
4,2016-07-04,2016-08-01,10023,109362,Liegenschaften der Stadt Biel,6529,"Biel - Gemeindevertrag o, Biel",4316.0,1337.96,1337.96
5,2016-07-04,2016-08-01,10444,109362,Liegenschaften der Stadt Biel,6529,"Biel - Gemeindevertrag o, Biel",3269.0,1013.39,1013.39


## Zeilen mit NettoNetto ≤ 0 ausfiltern, nach Aushangbeginn sortieren, Geschäftsjahr und -woche für Aushang und Reservation berechnen

In [9]:
# Keep only bookings with a positive optNettoNetto, ordered by display start
pv_bd = pv_bd.query('optNettoNetto > 0').sort_values('AushangBeginn')

# Derive a year and a week column from each of the two date columns
# (presumably ISO year / calendar week, judging by the helper's name)
date_parts = (
    ('ResDatum', 'RJahr', 'RKw'),
    ('AushangBeginn', 'AJahr', 'AKw'),
)
for date_col, year_col, week_col in date_parts:
    pv_bd = split_date_iso(pv_bd, dt_col=date_col, yr_col=year_col, kw_col=week_col)

# Replace the index left over from filtering/sorting with a fresh 0..n-1 range
pv_bd = pv_bd.reset_index(drop=True)

In [10]:
# Detailed per-column overview of the filtered and sorted table
desc_col(pv_bd, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
ResDatum,datetime64[ns],0/292922,687,2.2 MB,"[2016-03-09 00:00:00,2019-03-29 00:00:00]"
AushangBeginn,datetime64[ns],0/292922,194,2.2 MB,"[2016-03-28 00:00:00,2019-12-16 00:00:00]"
PvPosNr,int64,0/292922,9821,2.2 MB,"[19,80445]"
PartnerNr,int64,0/292922,5414,2.2 MB,"[100035,653959]"
PartnerName,category,0/292922,5544,1.1 MB,"[3C Champ-Colin Centre SA,zb Zentralbahn AG]"
PvNr,int64,0/292922,6658,2.2 MB,"[26,311490]"
PvTitel,category,0/292922,7356,1.6 MB,"[ Bern - Bethlehemstrasse 24,zb Zentralbahn 2 (ex LSE)]"
optBrutto,float64,0/292922,6984,2.2 MB,"[21.0,97796.0]"
optNetto,float64,0/292922,23802,2.2 MB,"[11.7,57145.5]"
optNettoNetto,float64,0/292922,28163,2.2 MB,"[11.7,54288.22]"


## Vertragsinformationen extrahieren

In [11]:
# Order the bookings by contract and reservation date, then group them by
# contract number; order-dependent aggregations on the groups therefore see
# the rows of each contract in reservation order
pv_by_contract = pv_bd.sort_values(by=['PvNr', 'ResDatum'])
pv_idx = pv_by_contract.groupby('PvNr', as_index=True)

In [12]:
# Aggregations per contract. The order of the entries fixes the order of the
# resulting (column, function) pairs, which the flat names below rely on.
contract_aggs = {
    'PvTitel': 'first',
    'optNettoNetto': 'sum',
    'PartnerNr': 'nunique',
    'PartnerName': 'last',
    'PvPosNr': 'nunique',
    'ResDatum': ['min', 'max'],
    'AushangBeginn': ['min', 'max'],
}
flat_names = ['Titel', 'totalNetto', 'nPartner', 'Partner', 'nPos',
              'firstRes', 'lastRes', 'firstAus', 'lastAus']

pv_info = pv_idx.agg(contract_aggs)
# Replace the two-level (column, function) header by the flat names
pv_info.columns = flat_names

In [13]:
# Detailed per-column overview of the per-contract summary table
desc_col(pv_info, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
Titel,object,0/6658,6529,687.1 KB,"[ Bern - Bethlehemstrasse 24,zb Zentralbahn 2 (ex LSE)]"
totalNetto,float64,0/6658,6631,104.0 KB,"[40.38,6333770.714299988]"
nPartner,int64,0/6658,3,104.0 KB,"[1,3]"
Partner,object,0/6658,5052,558.2 KB,"[A&A Liegenschaften Schweiz AG,zb Zentralbahn AG]"
nPos,int64,0/6658,33,104.0 KB,"[1,401]"
firstRes,datetime64[ns],0/6658,333,104.0 KB,"[2016-03-09 00:00:00,2019-03-21 00:00:00]"
lastRes,datetime64[ns],0/6658,416,104.0 KB,"[2016-03-17 00:00:00,2019-03-29 00:00:00]"
firstAus,datetime64[ns],0/6658,146,104.0 KB,"[2016-03-28 00:00:00,2019-10-14 00:00:00]"
lastAus,datetime64[ns],0/6658,173,104.0 KB,"[2016-05-16 00:00:00,2019-12-16 00:00:00]"


#### Mehrfach-Partner: Namen zusammenfügen (Reihenfolge wie in Daten)

In [14]:
# Default: every contract gets its single partner name in `allPartner`.
# DataFrame.assign returns a NEW frame and has no `inplace` option (the keyword
# would merely create a column called 'inplace'), so the result has to be bound
# back to pv_info. Otherwise `allPartner` only comes into existence through the
# .loc assignment below and stays NaN for all single-partner contracts.
pv_info = pv_info.assign(allPartner=pv_info.Partner)

# Contracts that were booked under more than one partner number
multi_partner = pv_info.nPartner > 1
pv_multi_prtn = pv_info.loc[multi_partner].index.values

# For those contracts, join the distinct partner names in the order in which
# they appear in pv_bd; the resulting Series is indexed by PvNr and aligns
# with pv_info's index on assignment
pv_info.loc[multi_partner, 'allPartner'] = (pv_bd[pv_bd.PvNr.isin(pv_multi_prtn)].groupby('PvNr')['PartnerName']
                                                .apply(lambda x: ' | '.join(x.unique())))

#### Jahres-Nettoumsätze

In [15]:
def _netto_per_year(year_col, prefix):
    """Sum optNettoNetto per contract and year, in long and in wide form.

    Returns a tuple (long, wide): `long` has one row per (PvNr, year) pair,
    `wide` has one row per PvNr and one column per year, named
    `prefix` + year, with 0 where a contract has no bookings in that year.
    """
    long_form = (pv_bd.groupby(['PvNr', year_col], observed=True, as_index=False)
                 [['optNettoNetto']].agg('sum'))
    wide_form = (long_form.pivot(index='PvNr', columns=year_col, values='optNettoNetto')
                 .fillna(0)
                 .add_prefix(prefix))
    return long_form, wide_form


# Net totals by year of display start (A...) and by year of reservation (R...)
pvYearANetto, pvANetto = _netto_per_year('AJahr', 'Netto_Aus_')
pvYearRNetto, pvRNetto = _netto_per_year('RJahr', 'Netto_Res_')

In [16]:
# Attach the yearly net totals to the contract summary, first by display
# year, then by reservation year; all frames are keyed by PvNr
for yearly_netto in (pvANetto, pvRNetto):
    pv_info = pv_info.merge(yearly_netto, on='PvNr')

## Konstellation Verträge/Positionen prüfen

#### Gibt es Vertragspositionen, die zu mehr als einem Vertrag gehören? (Wäre nicht toll)

In [17]:
# Number of distinct contracts each contract position occurs in
contracts_per_pos = pv_bd.groupby(['PvPosNr'], observed=True)[['PvNr']].agg('nunique')
# Show the positions that belong to more than one contract (expected: none)
contracts_per_pos.query('PvNr > 1')

Unnamed: 0_level_0,PvNr
PvPosNr,Unnamed: 1_level_1


## Daten speichern

In [18]:
# Full cleaned booking table
store_bin(pv_bd, 'pv_bd_raw.feather')

# Slim variant without position/partner details and the gross / net amounts
dropped_cols = ['PvPosNr', 'PartnerNr', 'PartnerName', 'PvTitel', 'optBrutto', 'optNetto']
pv_data = pv_bd.drop(columns=dropped_cols)
store_bin(pv_data, 'pv_data.feather')

# Per-contract summary
store_bin(pv_info, 'pv_info.feather')

2019-04-23 15:14:11 [INFO] Writing to file /home/pa/data/pv_bd_raw.feather
2019-04-23 15:14:11 [INFO] Written 20.6 MB
2019-04-23 15:14:11 [INFO] Finished storing binary file in 0.08s (0.58s CPU)
2019-04-23 15:14:11 [INFO] Writing to file /home/pa/data/pv_data.feather
2019-04-23 15:14:11 [INFO] Written 10.1 MB
2019-04-23 15:14:11 [INFO] Finished storing binary file in 0.03s (0.26s CPU)
2019-04-23 15:14:11 [INFO] Writing to file /home/pa/data/pv_info.feather
2019-04-23 15:14:11 [INFO] Written 1.3 MB
2019-04-23 15:14:11 [INFO] Finished storing binary file in 0.02s (0.12s CPU)
