# Budgetbuchung Analyse: Datenpräparation

In [None]:
import pandas as pd

from pa_lib.data import data_files, load_csv, store_bin
from pa_lib.data import desc_col, as_dtype, as_date, split_date_iso
from pa_lib.util import obj_size
from pa_lib.types import dtFactor

# Widen the per-cell text limit so long titles and partner names are shown untruncated
pd.options.display.max_colwidth = 200

## Daten einlesen

In [None]:
# Show what pa_lib's data_files() reports — presumably the files available to load_csv; confirm against pa_lib
data_files()

In [None]:
# Read the semicolon-separated, cp1252-encoded export. Every column is loaded as
# 'object'; the conversion to proper dtypes happens explicitly in a later cell.
pv_bd = load_csv('pv_bd.zip', delimiter=';', encoding='cp1252', dtype='object')

In [None]:
# Preview the first rows of the raw import
pv_bd.head()

In [None]:
# Size as reported by obj_size and (rows, columns) shape right after loading
(obj_size(pv_bd), pv_bd.shape)

In [None]:
# Per-column overview of the raw import (pa_lib helper)
desc_col(pv_bd)

## Spalten umbenennen, Leerwerte bereinigen, Datentypen korrigieren

In [None]:
# Replace the CSV header with short names. The assignment is positional, so the
# order below must match the order of the ten columns in the file.
pv_bd.columns = ('ResDatum AushangBeginn PvPosNr PartnerNr PartnerName '
                 'PvNr PvTitel optBrutto optNetto optNettoNetto').split()
# Discard every row that has a missing value in any column
pv_bd = pv_bd.dropna()

In [None]:
# Size and shape after dropping incomplete rows
(obj_size(pv_bd), pv_bd.shape)

In [None]:
# Convert the all-string columns to proper dtypes.
#
# The converted frame is bound back to the name rather than assigned through
# `pv_bd.loc[:, :] = ...`: on current pandas versions a full-frame `.loc`
# assignment writes the values into the existing object columns in place, so
# the new dtypes would be silently lost.
pv_bd = (
    pv_bd
    # columns whose name contains 'Nr' -> int
    .pipe(as_dtype, 'int', incl_pattern='.*Nr.*')
    # columns whose name ends in 'tto' (optBrutto, optNetto, optNettoNetto) -> float
    .pipe(as_dtype, 'float', incl_pattern='.*tto')
    # day.month.year strings -> dates
    .pipe(as_date, format='%d.%m.%Y', incl_col=('ResDatum', 'AushangBeginn'))
    # whatever is still 'object' afterwards -> dtFactor (pa_lib type)
    .pipe(as_dtype, dtFactor, incl_dtype='object')
)

In [None]:
# Size and shape after the dtype conversion
(obj_size(pv_bd), pv_bd.shape)

In [22]:
# Detailed per-column overview (det=True): dtype, nulls, unique count, memory and value range
desc_col(pv_bd, det=True)

Unnamed: 0,DTYPE,NULLS,UNIQUE,MEM,RANGE
ResDatum,datetime64[ns],0/292922,687,2.2 MB,"[2016-03-09 00:00:00,2019-03-29 00:00:00]"
AushangBeginn,datetime64[ns],0/292922,194,2.2 MB,"[2016-03-28 00:00:00,2019-12-16 00:00:00]"
PvPosNr,int64,0/292922,9821,2.2 MB,"[19,80445]"
PartnerNr,int64,0/292922,5414,2.2 MB,"[100035,653959]"
PartnerName,category,0/292922,5544,1.1 MB,"[3C Champ-Colin Centre SA,zb Zentralbahn AG]"
PvNr,int64,0/292922,6658,2.2 MB,"[26,311490]"
PvTitel,category,0/292922,7356,1.6 MB,"[ Bern - Bethlehemstrasse 24,zb Zentralbahn 2 (ex LSE)]"
optBrutto,float64,0/292922,6984,2.2 MB,"[21.0,97796.0]"
optNetto,float64,0/292922,23802,2.2 MB,"[11.7,57145.5]"
optNettoNetto,float64,0/292922,28163,2.2 MB,"[11.7,54288.22]"


In [None]:
# Preview the first rows after the dtype conversion
pv_bd.head()

## NettoNetto <= 0 ausfiltern, sortieren, Geschäftsjahr und -woche für Aushang und Reservation berechnen

In [None]:
# Keep only bookings with a positive net-net amount, order them by posting start,
# then derive year/week columns for the reservation date (RJahr/RKw) and the
# posting start (AJahr/AKw), and finally renumber the rows from 0.
pv_bd = (
    pv_bd
    .loc[lambda bookings: bookings['optNettoNetto'] > 0]
    .sort_values('AushangBeginn')
    .pipe(split_date_iso, dt_col='ResDatum', yr_col='RJahr', kw_col='RKw')
    .pipe(split_date_iso, dt_col='AushangBeginn', yr_col='AJahr', kw_col='AKw')
    .reset_index(drop=True)
)

In [None]:
# Detailed per-column overview of the filtered, sorted frame with its year/week columns
desc_col(pv_bd, det=True)

## Vertragsinformationen extrahieren

In [None]:
# Group the bookings by contract number; PvNr becomes the index of every aggregate
pv_idx = pv_bd.groupby('PvNr')

In [26]:
# Collapse the bookings to one row per contract (PvNr)
pv_info = pv_idx.agg({
    'PvTitel': 'first',
    'optNettoNetto': 'sum',
    'PartnerNr': 'nunique',
    'PartnerName': 'first',
    'PvPosNr': 'nunique',
    'ResDatum': ['min', 'max'],
    'AushangBeginn': ['min', 'max'],
})
# agg() yields two-level column labels here; replace them with flat names,
# listed in the same order as the aggregations above.
pv_info.columns = ['Titel', 'totalNetto', 'nPartner', 'Partner', 'nPos',
                   'firstRes', 'lastRes', 'firstAus', 'lastAus']

#### Mehrfach-Partner: Namen zusammenfügen (Reihenfolge wie in Daten)

In [27]:
def _join_unique(names):
    """Join the distinct values of *names* with ' | ', in order of first appearance."""
    return ' | '.join(names.unique())


# Contracts that were booked under more than one partner number
multi_partner = pv_info['nPartner'] > 1
pv_multi_prtn = pv_info.index[multi_partner.values].values

# For those contracts, overwrite the single 'first' partner name with all names.
# The right-hand side is indexed by PvNr, so the assignment aligns on the index.
# NOTE(review): PartnerName is categorical in pv_bd; if 'Partner' is categorical too,
# assigning joined names that are not existing categories may fail on some pandas
# versions — confirm.
pv_info.loc[multi_partner, 'Partner'] = (
    pv_bd.loc[pv_bd['PvNr'].isin(pv_multi_prtn)]
    .groupby('PvNr')['PartnerName']
    .apply(_join_unique)
)

#### Jahres-Nettoumsätze

In [28]:
def _netto_by_year(year_col, prefix):
    """Sum optNettoNetto per contract and year, in long and wide form.

    Returns a pair: the long table (one row per PvNr/year) and its pivot with
    one row per PvNr, one column per year named prefix + year, gaps filled with 0.
    """
    per_year = pv_bd.groupby(['PvNr', year_col], observed=True, as_index=False)[['optNettoNetto']].sum()
    wide = (
        per_year
        .pivot(index='PvNr', columns=year_col, values='optNettoNetto')
        .fillna(0)
        .add_prefix(prefix)
    )
    return per_year, wide


# Net totals by posting year (A...) and by reservation year (R...)
pvYearANetto, pvANetto = _netto_by_year('AJahr', 'Netto_Aus_')
pvYearRNetto, pvRNetto = _netto_by_year('RJahr', 'Netto_Res_')

In [29]:
# Attach the per-year net totals to the contract table, matching on 'PvNr'.
# merge() defaults to an inner join, so a contract missing from either pivot would be dropped.
pv_info = pv_info.merge(pvANetto, on='PvNr').merge(pvRNetto, on='PvNr')

## Konstellation Verträge/Positionen prüfen

#### Gibt es Vertragspositionen mit mehr als einem unterschiedlichen Vertrag? (Wäre nicht toll)

In [30]:
# Sanity check: list positions (PvPosNr) that occur under more than one contract (PvNr).
# An empty result means every position belongs to exactly one contract.
pv_bd.groupby(['PvPosNr'], observed=True)[['PvNr']].agg('nunique').query('PvNr > 1')

Unnamed: 0_level_0,PvNr
PvPosNr,Unnamed: 1_level_1


## Daten speichern

In [31]:
# Persist three tables: the full cleaned bookings, a slimmed-down copy without the
# descriptive and gross/net columns, and the per-contract summary.
store_bin(pv_bd, 'pv_bd_raw.feather')
pv_data = pv_bd.drop(
    columns=['PvPosNr', 'PartnerNr', 'PartnerName', 'PvTitel', 'optBrutto', 'optNetto']
)
store_bin(pv_data, 'pv_data.feather')
store_bin(pv_info, 'pv_info.feather')

2019-04-16 14:22:18 [INFO] Writing to file /home/pa/data/pv_bd_raw.feather
2019-04-16 14:22:18 [INFO] Written 20.6 MB
2019-04-16 14:22:18 [INFO] Finished storing binary file in 0.05s (0.4s CPU)
2019-04-16 14:22:18 [INFO] Writing to file /home/pa/data/pv_data.feather
2019-04-16 14:22:18 [INFO] Written 10.1 MB
2019-04-16 14:22:18 [INFO] Finished storing binary file in 0.03s (0.23s CPU)
2019-04-16 14:22:18 [INFO] Writing to file /home/pa/data/pv_info.feather
2019-04-16 14:22:18 [INFO] Written 1.2 MB
2019-04-16 14:22:18 [INFO] Finished storing binary file in 0.02s (0.1s CPU)
