# Budgetbuchung Analyse: Datenpräparation

In [1]:
%load_ext autoreload
%autoreload

import pandas as pd

from pa_lib.file  import data_files, load_csv, store_bin
from pa_lib.data  import desc_col, as_dtype, as_date, split_date_iso
from pa_lib.util  import obj_size
from pa_lib.types import dtFactor

# Show text columns up to 200 characters wide instead of truncating them early
pd.options.display.max_colwidth = 200

## Daten einlesen

In [2]:
# List the data files known to pa_lib (name, size, modification time).
data_files()

Unnamed: 0_level_0,size,mtime
name,Unnamed: 1_level_1,Unnamed: 2_level_1
bd_by_week.feather,85.6 MB,08.05.19 13:54:40
bd_data.feather,201.7 MB,17.06.19 10:59:39
bd_data_raw.feather,334.4 MB,17.06.19 10:59:28
bd_data_vkprog.feather,71.5 MB,17.06.19 10:59:42
bd_ek_minmax.feather,1.5 MB,09.05.19 08:54:54
bd_long_by_week.feather,9.2 MB,08.05.19 13:55:16
crm_data.feather,52.7 MB,17.06.19 10:39:54
crm_data_vkprog.feather,43.1 MB,17.06.19 10:39:55
plz_data.feather,135.7 KB,17.06.19 10:39:20
pv_akw_netto.feather,12.1 MB,16.04.19 16:16:31


In [3]:
# Read the semicolon-separated, cp1252-encoded export. Every column is loaded
# as plain text (dtype='object'); no type inference happens at this stage.
pv_bd = load_csv(
    'pv_bd.zip',
    delimiter=';',
    encoding='cp1252',
    dtype='object',
)

2019-06-17 12:58:13 [INFO] Reading from file /home/pa/data/pv_bd.zip
2019-06-17 12:58:15 [INFO] Finished loading CSV in 1.29s (1.26s CPU)


In [4]:
# Preview the first rows of the freshly loaded frame.
pv_bd.head()

Unnamed: 0,Res. Datum,Aushang Beginn,PVPos Nr.,PartnerNr,PartnerName,PvNr,PvTitel,opt Brutto,opt Netto,opt NettoNetto
0,03.05.2016,01.08.2016,11825,100511,KSS Sport- und Freizeitanlagen Schaffhausen,7834,"Schaffhausen, Breitenaustrasse 117 / Parkplatz Schwimmbad KSS",213,213.0,213.0
1,03.05.2016,01.08.2016,40040,101871,Frey Dieter,26732,"Schaffhausen, Spiegelgutstrasse 54 / Gemsgasse",142,142.0,142.0
2,03.05.2016,01.08.2016,71574,103435,Stadt Schaffhausen,9084,"Stadt Schaffhausen, Plakatierung auf offentlichem Grund",3195,3195.0,3195.0
3,04.07.2016,01.08.2016,0,0,,0,,1477,457.87,457.87
4,04.07.2016,01.08.2016,10023,109362,Liegenschaften der Stadt Biel,6529,"Biel - Gemeindevertrag o, Biel",4316,1337.96,1337.96


In [5]:
# Memory footprint and (rows, columns) of the frame as loaded.
(obj_size(pv_bd), pv_bd.shape)

('206.8 MB', (316984, 10))

In [6]:
# Per-column overview: dtype, null count and number of unique values.
desc_col(pv_bd)

Unnamed: 0,DTYPE,NULLS,UNIQUE
Res. Datum,object,0/316984,709
Aushang Beginn,object,0/316984,196
PVPos Nr.,object,0/316984,9847
PartnerNr,object,0/316984,5421
PartnerName,object,222/316762,5552
PvNr,object,0/316984,6673
PvTitel,object,222/316762,7374
opt Brutto,object,0/316984,7355
opt Netto,object,0/316984,23857
opt NettoNetto,object,0/316984,28237


## Spalten umbenennen, Leerwerte bereinigen, Datentypen korrigieren

In [7]:
# Replace the export's headers with identifier-style names (no spaces or dots);
# the order must match the column order of the loaded file.
pv_bd.columns = (
    'ResDatum AushangBeginn PvPosNr PartnerNr PartnerName '
    'PvNr PvTitel optBrutto optNetto optNettoNetto'
).split()

# Discard every row that has a missing value in any column.
pv_bd = pv_bd.dropna(how='any')

In [8]:
# Memory footprint and (rows, columns) after dropping incomplete rows.
(obj_size(pv_bd), pv_bd.shape)

('209.0 MB', (316762, 10))

In [9]:
# Convert the all-text columns to proper dtypes.
# The converted frame is bound to the name directly. Writing it back through
# `pv_bd.loc[:, :] = ...` sets values into the existing object columns on
# current pandas versions, which silently keeps every column as `object`
# (and it would also write into the result of the preceding `dropna`).
pv_bd = (pv_bd
         # number-like identifier columns (names containing "Nr")
         .pipe(as_dtype, 'int', incl_pattern='.*Nr.*')
         # amount columns (names ending in "tto": Brutto / Netto / NettoNetto)
         .pipe(as_dtype, 'float', incl_pattern='.*tto')
         # dates are given as day.month.year
         .pipe(as_date, format_str='%d.%m.%Y', incl_col=('ResDatum', 'AushangBeginn'))
         # whatever is still text becomes a factor (dtFactor)
         .pipe(as_dtype, dtFactor, incl_dtype='object'))

In [10]:
# Memory footprint and (rows, columns) after the dtype conversion.
(obj_size(pv_bd), pv_bd.shape)

('24.6 MB', (316762, 10))

In [11]:
# Detailed per-column overview (det=True) to verify the converted dtypes.
desc_col(pv_bd, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
ResDatum,datetime64[ns],0/316762,709,4.8 MB,"[2016-03-09 00:00:00,2019-03-29 00:00:00]"
AushangBeginn,datetime64[ns],0/316762,196,4.8 MB,"[2016-03-21 00:00:00,2019-12-16 00:00:00]"
PvPosNr,int64,0/316762,9846,4.8 MB,"[19,80445]"
PartnerNr,int64,0/316762,5420,4.8 MB,"[100035,653959]"
PartnerName,category,0/316762,5552,3.6 MB,"[3C Champ-Colin Centre SA,zb Zentralbahn AG]"
PvNr,int64,0/316762,6672,4.8 MB,"[26,311490]"
PvTitel,category,0/316762,7374,4.1 MB,"[ Bern - Bethlehemstrasse 24,zb Zentralbahn 2 (ex LSE)]"
optBrutto,float64,0/316762,7330,4.8 MB,"[0.0,266459.5]"
optNetto,float64,0/316762,23803,4.8 MB,"[0.0,57145.5]"
optNettoNetto,float64,0/316762,28164,4.8 MB,"[0.0,54288.22]"


In [12]:
# Preview the first rows after cleaning and type conversion.
pv_bd.head()

Unnamed: 0,ResDatum,AushangBeginn,PvPosNr,PartnerNr,PartnerName,PvNr,PvTitel,optBrutto,optNetto,optNettoNetto
0,2016-05-03,2016-08-01,11825,100511,KSS Sport- und Freizeitanlagen Schaffhausen,7834,"Schaffhausen, Breitenaustrasse 117 / Parkplatz Schwimmbad KSS",213.0,213.0,213.0
1,2016-05-03,2016-08-01,40040,101871,Frey Dieter,26732,"Schaffhausen, Spiegelgutstrasse 54 / Gemsgasse",142.0,142.0,142.0
2,2016-05-03,2016-08-01,71574,103435,Stadt Schaffhausen,9084,"Stadt Schaffhausen, Plakatierung auf offentlichem Grund",3195.0,3195.0,3195.0
4,2016-07-04,2016-08-01,10023,109362,Liegenschaften der Stadt Biel,6529,"Biel - Gemeindevertrag o, Biel",4316.0,1337.96,1337.96
5,2016-07-04,2016-08-01,10444,109362,Liegenschaften der Stadt Biel,6529,"Biel - Gemeindevertrag o, Biel",3269.0,1013.39,1013.39


## NettoNetto ≤ 0 ausfiltern, sortieren, Geschäftsjahr und -woche für Aushang und Reservation berechnen

In [13]:
# Keep only bookings with a positive net-net amount, ordered by posting start.
pv_bd = pv_bd.query('optNettoNetto > 0').sort_values(by='AushangBeginn')

# Derive year / week columns for the reservation date (RJahr, RKw) and for the
# posting start (AJahr, AKw).
# NOTE(review): exact year/week definition is whatever split_date_iso implements - confirm.
pv_bd = split_date_iso(pv_bd, dt_col='ResDatum', yr_col='RJahr', kw_col='RKw')
pv_bd = split_date_iso(pv_bd, dt_col='AushangBeginn', yr_col='AJahr', kw_col='AKw')

# Renumber rows 0..n-1 after filtering and sorting.
pv_bd = pv_bd.reset_index(drop=True)

In [14]:
# Detailed per-column overview after filtering out non-positive net-net amounts.
desc_col(pv_bd, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
ResDatum,datetime64[ns],0/292922,687,2.2 MB,"[2016-03-09 00:00:00,2019-03-29 00:00:00]"
AushangBeginn,datetime64[ns],0/292922,194,2.2 MB,"[2016-03-28 00:00:00,2019-12-16 00:00:00]"
PvPosNr,int64,0/292922,9821,2.2 MB,"[19,80445]"
PartnerNr,int64,0/292922,5414,2.2 MB,"[100035,653959]"
PartnerName,category,0/292922,5544,1.1 MB,"[3C Champ-Colin Centre SA,zb Zentralbahn AG]"
PvNr,int64,0/292922,6658,2.2 MB,"[26,311490]"
PvTitel,category,0/292922,7356,1.6 MB,"[ Bern - Bethlehemstrasse 24,zb Zentralbahn 2 (ex LSE)]"
optBrutto,float64,0/292922,6984,2.2 MB,"[21.0,97796.0]"
optNetto,float64,0/292922,23802,2.2 MB,"[11.7,57145.5]"
optNettoNetto,float64,0/292922,28163,2.2 MB,"[11.7,54288.22]"


## Vertragsinformationen extrahieren

In [15]:
# Group bookings per contract (PvNr). Rows are ordered by contract and
# reservation date first, so 'first' / 'last' aggregations on the groups
# follow reservation order.
pv_idx = (
    pv_bd
    .sort_values(by=['PvNr', 'ResDatum'])
    .groupby(by='PvNr', as_index=True)
)

In [16]:
# One summary row per contract. The order of the aggregation spec determines
# the order of the resulting columns, which are then given flat names.
pv_info = pv_idx.agg({
    'PvTitel':       'first',            # -> Titel
    'optNettoNetto': 'sum',              # -> totalNetto
    'PartnerNr':     'nunique',          # -> nPartner
    'PartnerName':   'last',             # -> Partner
    'PvPosNr':       'nunique',          # -> nPos
    'ResDatum':      ['min', 'max'],     # -> firstRes, lastRes
    'AushangBeginn': ['min', 'max'],     # -> firstAus, lastAus
})
pv_info.columns = [
    'Titel', 'totalNetto', 'nPartner', 'Partner', 'nPos',
    'firstRes', 'lastRes', 'firstAus', 'lastAus',
]

In [17]:
# Detailed per-column overview of the per-contract summary table.
desc_col(pv_info, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
Titel,object,0/6658,6529,687.1 KB,"[ Bern - Bethlehemstrasse 24,zb Zentralbahn 2 (ex LSE)]"
totalNetto,float64,0/6658,6631,104.0 KB,"[40.38,6333770.714299988]"
nPartner,int64,0/6658,3,104.0 KB,"[1,3]"
Partner,object,0/6658,5052,558.2 KB,"[A&A Liegenschaften Schweiz AG,zb Zentralbahn AG]"
nPos,int64,0/6658,33,104.0 KB,"[1,401]"
firstRes,datetime64[ns],0/6658,333,104.0 KB,"[2016-03-09 00:00:00,2019-03-21 00:00:00]"
lastRes,datetime64[ns],0/6658,416,104.0 KB,"[2016-03-17 00:00:00,2019-03-29 00:00:00]"
firstAus,datetime64[ns],0/6658,146,104.0 KB,"[2016-03-28 00:00:00,2019-10-14 00:00:00]"
lastAus,datetime64[ns],0/6658,173,104.0 KB,"[2016-05-16 00:00:00,2019-12-16 00:00:00]"


#### Mehrfach-Partner: Namen zusammenfügen (Reihenfolge wie in Daten)

In [18]:
# Default: a contract's partner list is just its single partner name.
# `DataFrame.assign` returns a new frame and has no `inplace` option (the
# keyword would merely become a column called "inplace"), so the result has to
# be bound back to `pv_info`. Otherwise `allPartner` is only created by the
# `.loc` assignment below and stays NaN for all single-partner contracts.
pv_info = pv_info.assign(allPartner=pv_info.Partner)

# Contracts that appear with more than one distinct partner number
multi_partner = pv_info.nPartner > 1
pv_multi_prtn = pv_info.loc[multi_partner].index.values

# For those contracts, join the distinct partner names in order of appearance
# in pv_bd; the result is indexed by PvNr and aligns with pv_info's index.
pv_info.loc[multi_partner, 'allPartner'] = (
    pv_bd[pv_bd.PvNr.isin(pv_multi_prtn)]
    .groupby('PvNr')['PartnerName']
    .apply(lambda x: ' | '.join(x.unique()))
)

#### Jahres-Nettoumsätze

In [19]:
# Net-net revenue per contract and posting year (AJahr), in long form ...
pvYearANetto = (
    pv_bd
    .groupby(['PvNr', 'AJahr'], observed=True, as_index=False)[['optNettoNetto']]
    .agg('sum')
)
# ... and per contract and reservation year (RJahr).
pvYearRNetto = (
    pv_bd
    .groupby(['PvNr', 'RJahr'], observed=True, as_index=False)[['optNettoNetto']]
    .agg('sum')
)

# Reshape to one row per contract and one column per year; years without
# bookings become 0. Column names are prefixed to tell both views apart.
pvANetto = (
    pvYearANetto
    .pivot(index='PvNr', columns='AJahr', values='optNettoNetto')
    .fillna(0)
    .add_prefix('Netto_Aus_')
)
pvRNetto = (
    pvYearRNetto
    .pivot(index='PvNr', columns='RJahr', values='optNettoNetto')
    .fillna(0)
    .add_prefix('Netto_Res_')
)

In [20]:
# Attach the yearly revenue columns (by posting year, then by reservation
# year) to the contract summary, matching on the contract number.
pv_info = pv_info.merge(pvANetto, on='PvNr')
pv_info = pv_info.merge(pvRNetto, on='PvNr')

## Konstellation Verträge/Positionen prüfen

#### Gibt es Vertragspositionen mit mehr als einem unterschiedlichen Vertrag? (Wäre nicht toll)

In [21]:
# Consistency check: list contract positions that occur under more than one
# contract number. An empty result means every position belongs to one contract.
(pv_bd
 .groupby(['PvPosNr'], observed=True)[['PvNr']]
 .agg('nunique')
 .query('PvNr > 1'))

Unnamed: 0_level_0,PvNr
PvPosNr,Unnamed: 1_level_1


## Daten speichern

In [22]:
# 1) Full booking table with all columns
store_bin(pv_bd, 'pv_data_file_raw.feather')

# 2) Slim booking table: descriptive contract/partner columns and the
#    gross / net amounts are left out
pv_data = pv_bd.drop(
    columns=['PvPosNr', 'PartnerNr', 'PartnerName', 'PvTitel', 'optBrutto', 'optNetto']
)
store_bin(pv_data, 'pv_data_file.feather')

# 3) Per-contract summary table
store_bin(pv_info, 'pv_info_file.feather')

2019-06-17 12:59:25 [INFO] Writing to file /home/pa/data/pv_data_file_raw.feather
2019-06-17 12:59:26 [INFO] Written 20.6 MB
2019-06-17 12:59:26 [INFO] Finished storing binary file in 0.1s (0.71s CPU)
2019-06-17 12:59:26 [INFO] Writing to file /home/pa/data/pv_data_file.feather
2019-06-17 12:59:26 [INFO] Written 10.1 MB
2019-06-17 12:59:26 [INFO] Finished storing binary file in 0.03s (0.26s CPU)
2019-06-17 12:59:26 [INFO] Writing to file /home/pa/data/pv_info_file.feather
2019-06-17 12:59:26 [INFO] Written 1.3 MB
2019-06-17 12:59:26 [INFO] Finished storing binary file in 0.02s (0.14s CPU)
