# Libraries & Settings

In [32]:
%load_ext autoreload
%autoreload

import pandas as pd
import numpy as np
import qgrid
#import beakerx as bx
from datetime import datetime as dtt

from pa_lib.file import data_files, load_bin, store_bin, store_excel, load_csv
from pa_lib.data import (calc_col_partitioned, clean_up_categoricals, flatten, 
                         replace_col, cond_col, desc_col, unfactorize, as_dtype)
from pa_lib.util import obj_size, cap_words
from pa_lib.log  import time_log, info
from pa_lib.types import dtFactor
from pa_lib.vis import dive

# Notebook display tuning: show long cell contents in full and list more rows
# before pandas truncates the output.
for option_name, option_value in (('display.max_colwidth', 200),
                                  ('display.max_rows', 200)):
    pd.set_option(option_name, option_value)

def qshow(df, fit_width=False):
    """Display a DataFrame as an interactive qgrid widget.

    Parameters
    ----------
    df : DataFrame to show.
    fit_width : passed on as qgrid's 'forceFitColumns' grid option.

    Returns
    -------
    Whatever ``qgrid.show_grid`` returns (rendered as a widget by the notebook).
    """
    grid_settings = {
        'forceFitColumns': fit_width,
        'fullWidthRows': False,
    }
    return qgrid.show_grid(df, grid_options=grid_settings)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Read data file

In [2]:
# List data files matching the pattern 'PP*' (project helper from pa_lib.file;
# presumably looks in the project's data directory -- verify against pa_lib)
data_files('PP*')

Unnamed: 0_level_0,size,mtime
name,Unnamed: 1_level_1,Unnamed: 2_level_1
PPI_Pivot_2019_AGH.csv,160.4 MB,16.07.19 11:34:00


In [3]:
# Read the semicolon-separated, cp1252-encoded CSV export ...
ppi_data = load_csv('PPI_Pivot_2019_AGH.csv', sep=';', encoding='cp1252')
# ... and pass every column label through cap_words, with '_' as separator
ppi_data = ppi_data.rename(columns=lambda label: cap_words(label, sep='_'))

2019-07-16 18:09:02 [INFO] Reading from file C:\Users\kpf\data\PPI_Pivot_2019_AGH.csv
2019-07-16 18:09:05 [INFO] Finished loading CSV in 3.55s (4.02s CPU)


In [33]:
# Browse the loaded answer-level data in an interactive grid
qshow(ppi_data)

QgridWidget(grid_options={'fullWidthRows': False, 'syncColumnCellResize': True, 'forceFitColumns': False, 'def…

In [43]:
# Per-column overview (the output lists dtype, null count and unique count)
desc_col(ppi_data)

Unnamed: 0,DTYPE,NULLS,UNIQUE
Ppi_NR,int64,0/515036,1217
Kpg_Name,object,0/515036,1089
Jahr,int64,0/515036,17
Kunde,object,0/515036,267
Branche,object,0/515036,124
Gruppe,object,0/515036,58
Kategorie,object,0/515036,22
Std_Publ,object,0/515036,32
Alter_Jahre,int64,0/515036,62
Alter_15_25_35_49,object,0/515036,4


# Add columns
* Sum of answer weights per campaign
* Scaled answers (by sum of answer weights)
* Raw (uncorrected) answer values

In [29]:
# Sum of answer weights per campaign (Ppi_NR), broadcast back to every answer row.
# The string 'sum' selects pandas' own groupby reduction; passing Python's builtin
# `sum` only works through an alias that newer pandas versions deprecate.
ppi_data.loc[:, 'Kamp_Gew_Sum'] = ppi_data.groupby('Ppi_NR')['Gewichtung'].transform('sum')

# Answers scaled by the campaign's total answer weight
ppi_data.loc[:, 'Befr_Erinn_Prz'] = ppi_data['Befr_Erinnerung'] / ppi_data['Kamp_Gew_Sum']
ppi_data.loc[:, 'Befr_Zuord_Prz'] = ppi_data['Befr_Zuordnung'] / ppi_data['Kamp_Gew_Sum']

# Raw (uncorrected) answers: divide each row's own weight out again
ppi_data.loc[:, 'Befr_Erinn_Raw'] = ppi_data['Befr_Erinnerung'] / ppi_data['Gewichtung']
ppi_data.loc[:, 'Befr_Zuord_Raw'] = ppi_data['Befr_Zuordnung'] / ppi_data['Gewichtung']

In [36]:
# Inspect the campaign key next to the new weight-sum and scaled-answer columns
scaled_cols = ['Ppi_NR', 'Kamp_Gew_Sum', 'Befr_Erinn_Prz', 'Befr_Zuord_Prz']
qshow(ppi_data[scaled_cols], fit_width=True)

QgridWidget(grid_options={'fullWidthRows': False, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defa…

# Check distribution of answers

### How many of each? (0 = No, 1 = Yes)

In [44]:
# Frequency of each raw answer value, one column per question
pd.DataFrame({
    'Erinn': ppi_data['Befr_Erinn_Raw'].value_counts(),
    'Zuord': ppi_data['Befr_Zuord_Raw'].value_counts(),
})

Unnamed: 0,Erinn,Zuord
0.0,282789,360456
1.0,232247,154580


### Crosstable

In [45]:
# Count answer rows for every combination of the two raw answers
answer_crosstab = ppi_data.pivot_table(
    index='Befr_Erinn_Raw',
    columns='Befr_Zuord_Raw',
    aggfunc='size',
)
answer_crosstab

Befr_Zuord_Raw,0.0,1.0
Befr_Erinn_Raw,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,282787,2
1.0,77669,154578


### What is going on with the two cases answering "not remembered" (0) but "attributed" (1)?

In [46]:
# Rows where Befr_Zuord_Raw is 1 although Befr_Erinn_Raw is 0
# (transposed so each case reads top to bottom)
not_remembered = ppi_data['Befr_Erinn_Raw'] == 0
but_attributed = ppi_data['Befr_Zuord_Raw'] == 1
ppi_data[not_remembered & but_attributed].T

Unnamed: 0,133601,133884
Ppi_NR,719,719
Kpg_Name,PKZ/FP: My Number One,PKZ/FP: My Number One
Jahr,2010,2010
Kunde,PKZ Burger-Kehl + Co. AG,PKZ Burger-Kehl + Co. AG
Branche,Kleidung (sonstige),Kleidung (sonstige)
Gruppe,Bekleidung / Wäsche,Bekleidung / Wäsche
Kategorie,Bekleidung / persönlicher Bedarf,Bekleidung / persönlicher Bedarf
Std_Publ,vêtements - mode,vêtements - mode
Alter_Jahre,53,36
Alter_15_25_35_49,autre,35-49


# Aggregate on campaign level

In [38]:
# Descriptive campaign attributes: keep the last value seen per campaign
kamp_attr_cols = ['Kunde', 'Kpg_Name', 'Branche', 'Gruppe', 'Kategorie', 'Std_Publ']
# Scaled answers: add up per campaign
kamp_sum_cols = ['Befr_Erinn_Prz', 'Befr_Zuord_Prz']
kamp_agg_spec = {**{col: 'last' for col in kamp_attr_cols},
                 **{col: 'sum' for col in kamp_sum_cols}}

answers_by_kamp = ppi_data.groupby('Ppi_NR')
# Number of answer rows per campaign
kamp_befr_n = answers_by_kamp.size().rename('Befr_N')

ppi_kamp_data = (
    answers_by_kamp
    .agg(kamp_agg_spec)
    # Ratio of the summed scaled Zuordnung answers to the summed scaled Erinnerung answers
    .eval('Nettowirk_Prz = Befr_Zuord_Prz / Befr_Erinn_Prz')
    .join(kamp_befr_n)
)

qshow(ppi_kamp_data, fit_width=True)

QgridWidget(grid_options={'fullWidthRows': False, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defa…

In [42]:
# Number of campaigns per Std_Publ value
ppi_kamp_data['Std_Publ'].value_counts()

Telekommunikation                130
vêtements - mode                 125
Handel - Grossverteiler           96
Fahrzeuge                         74
Gesundheit / Pers. Bedarf         70
Food                              55
Dienstleistungen                  54
Food - Neueinführungen            50
Krankenkassen                     49
Versicherungen                    46
Ferien                            43
Internet Handel (+ Zalando)       43
Milchprodukte / Käse              41
Bank- / Finanz-DL                 37
Alkoholfreie Getränke             37
Bauen, Industrie, Einrichtung     37
Optik                             35
luxueuses                         33
Lose (Glückspiel)                 29
Glacé / Eiskrem                   23
Schokolade                        21
Medienwerbung                     17
Non-food (übrige)                 15
Food-Süsswaren                    12
Loteries / Lotterien              11
Energie - Gas                      9
Shoes                              8
A