# CC

Code samples to aid discussion.

In [1]:
from itertools import chain

import pandas as pd

from utils import more_utils
from utils import dictionaries_rename as naming

## Read raw data

Suggestion: deal with data types explicitly early on, along the lines below.
In general: the further I progress in the pipeline, the less I want to be concerned with low-level details.

In [2]:
df = pd.read_csv('data/10017_da_en_v2_0.tab', sep='\t')

In [3]:
dates = more_utils.find_columns(df, like='date')

In [4]:
dates = ['w1_date', 'w2_date', 'w3_date', 'w4_date', 'w5_date', 'w6_date']

In [5]:
ordinals = list(naming.get_ordinal_names().keys())[:3]
ordinals

['w3_q6x2', 'w3_q6x3', 'w3_q6x4']

In [6]:
binaries = list(naming.get_binary_names().keys())[:3]
binaries

['w2_q4x1', 'w3_q4x1', 'w4_q9x1']

In [7]:
# Columns that are still of dtype ``object`` once the columns above have been
# given explicit dtypes. Grouped by wave for readability; order is unchanged.
strings = [
    'version', 'doi', 'panelpat',
    'w4_q62t', 'w4_q63t', 'w4_q64t', 'w4_q80x5t', 'w4_q84x5t', 'w4f_q56t',
    'w6_q31t', 'w6_q32t', 'w6_q33t', 'w6_q48x5t', 'w6_q52x5t', 'w6f_q42t',
    'w1_q32t', 'w2_q47x5t', 'w2_q51x5t',
]

# Open question for the discussion.
print('w1_q32t should be a numeric value?')

w1_q32t should be a numeric value?


In [8]:
# Declare the dtypes we already know up front.
# * integer-like: ordinal, dummy and binary variables
int_dtypes = {
    column: 'Int64'
    for column in chain(
        naming.get_ordinal_names(),
        naming.get_dummies_names(),
        naming.get_binary_names(),
    )
}
# * string-like
str_dtypes = {column: 'string' for column in strings}
# String entries are applied last, so they win if a column is in both groups.
dtypes = dict(int_dtypes)
dtypes.update(str_dtypes)

In [9]:
df = pd.read_csv('data/10017_da_en_v2_0.tab', sep='\t', dtype=dtypes, parse_dates=dates)

## Sanity checks

In [10]:
df.age.unique()

array([18., 45., 60., 36., 46., 50., 62., 20., 44., nan, 47., 57., 56.,
       69., 41., 28., 31., 51., 64., 37., 33., 25., 53., 68., 24., 67.,
       52., 54., 49., 61., 30., 42., 65., 39., 22., 34., 55., 29., 58.,
       59., 48., 21., 38., 16., 43., 19., 32., 40., 23., 17., 26., 35.,
       27., 66., 63., 70.])

In [11]:
(df == 'refused').any().any()

False

## Preliminary Preprocessing

This includes cases that do not fall nicely into the categories below.
* Invert `dte`
* Add `ratio_dont_knows`
* ...

## Reshaping

We bring the data in the correct form.

We subset waves.

This means:
* row-wise: Select rows, i.e. survey participants that took part in a wave
* column-wise: Select columns, i.e. variables related to a wave

In [12]:
WAVENOS = ['1', '2', '3', '4', '5', '6']

In [13]:
def select_rows_for_wave(df, waveno):
    """Return the rows whose ``panelpat`` value contains the given wave number.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey data with a string-valued column ``panelpat``.
    waveno : str or int
        Wave number to look for inside ``panelpat``. It is converted to
        ``str`` before matching, so ``1`` and ``'1'`` behave the same.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``panelpat`` contains ``waveno``. Rows with a
        missing ``panelpat`` are excluded.
    """
    # na=False: a missing pattern is treated as "no match". Without it the
    # mask itself contains missing values, which boolean indexing rejects for
    # object-dtype columns.
    mask = df['panelpat'].str.contains(str(waveno), na=False)
    return df[mask]

In [14]:
# One frame per wave, keyed by wave number: the participants of that wave.
waves = {waveno: select_rows_for_wave(df, waveno) for waveno in WAVENOS}

In [15]:
# Following a similar procedure
def filter_wave_questions(wave, df):
    """Select the columns of one wave plus the personal features.

    Original implementation, kept verbatim for comparison with
    ``select_columns_for_wave``.

    Parameters
    ----------
    wave : str
        Wave number; it is appended to ``'w'`` to build the column regex.
    df : pandas.DataFrame
        Survey data.
        NOTE(review): the ``age`` recoding below assigns through ``df.loc``,
        i.e. it writes into the frame handed in by the caller.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the wave columns, the ``sd`` columns,
        ``popnum``, ``id``, ``age`` and ``panelpat`` (in that order).
    """
    
    # wave related questions
    # NOTE(review): the pattern is not anchored, so it keeps every column
    # whose name contains 'w<wave>' anywhere, not only as a prefix.
    df_w = df.filter(regex='w' + wave)
    # personal features (coded with 'sd' prefix)
    # NOTE(review): likewise unanchored - matches 'sd' anywhere in the name.
    df_sd = df.filter(regex='(sd)')
    # id, popnum and age are personal features which are not coded with prefixes
    # 'w' or 'sd', hence picked manually
    df_id = df['id']
    # changing string binned value to integer
    df.loc[df.age == '>= 70', 'age'] = 70  # CC: It exists?
    df_age = df['age']
    # panelpat is carried along under the local name ``y``
    y = df['panelpat']
    df_popnum = df['popnum']
    new_df = pd.concat([df_w, df_sd, df_popnum, df_id, df_age, y], axis=1)
    return new_df


# we get
def select_columns_for_wave(df, waveno):
    """Return a copy of ``df`` restricted to the columns relevant for one wave.

    Counterpart of ``filter_wave_questions``. The selection consists of, in
    this order:

    * the columns found by ``more_utils.find_columns`` for the regex
      ``w<waveno>``,
    * the columns found for the regex ``(sd)``,
    * the hand-picked columns ``popnum``, ``id``, ``age`` and ``panelpat``.

    ``df`` itself is left untouched; the result is an independent copy.
    """
    selected = [
        *more_utils.find_columns(df, regex=f'w{waveno}'),
        *more_utils.find_columns(df, regex='(sd)'),
        # not discoverable via a prefix, hence listed by hand
        'popnum', 'id', 'age', 'panelpat',
    ]
    return df[selected].copy()

In [16]:
# Consider adding apply-utilities to wrap application for specific types of operations (cf. dplyr grammar)
# However, keep minimal example to showcase effect of transformation. In this case df_wave_new (too large of course).
# Narrow every wave frame down to its relevant columns. The keys are
# snapshotted first, so the dict is free to be updated inside the loop.
for waveno in tuple(waves):
    df_wave = waves[waveno]
    df_wave_new = select_columns_for_wave(df_wave, waveno)
    waves[waveno] = df_wave_new

## Tidying

We clean the data.

We convert types and handle missing values.

This means:
* Imputation of missing values
* Mapping specific values to other values we consider more appropriate
* Adjusting/transforming variable types

### Ordinal Features

In [17]:
# Similar to above, this one
def prepare_ordinals_to_transform(df):
    """Rename the ordinal columns, recode special responses and impute by mode.

    Original implementation, kept verbatim for comparison with
    ``rename_and_impute_ordinals``.

    Steps:

    1. ``rename_specific_features_set`` is applied with the ordinal name
       mapping (helper not visible in this notebook - presumably it selects
       and renames the ordinal columns; TODO confirm).
    2. The response codes 77, 88, 99 and 12 are replaced by NaN.
    3. Remaining NaN values are filled per column with the first mode.
    4. The ordinal columns are dropped from ``df`` by their original names
       and the processed columns are appended.

    Returns the resulting DataFrame.

    NOTE(review): ``np``, ``rename_specific_features_set`` and the unqualified
    ``get_ordinal_names`` are not imported or defined in the visible part of
    this notebook.
    """

    X_ordinal = rename_specific_features_set(df, naming.get_ordinal_names())
    
    # 77, 88 values are "don't know" and 99 is refused, 12 is "would vote invalid"
    # so we make these values NaN for it not to be considered as ordinal values
    # NOTE(review): the ``np.NaN`` alias was removed in NumPy 2.0 (``np.nan`` remains).
    X_ordinal = X_ordinal.replace(dict.fromkeys([77, 88, 99, 12], np.NaN))

    # replacing NaN by mode  # CC: May call it "imputation".
    # NOTE(review): in-place fillna on a selected column is chained assignment;
    # whether it reaches X_ordinal depends on the pandas version / copy-on-write.
    # ``mode()[0]`` also fails for a column that is entirely NaN.
    for column in X_ordinal.columns:
        X_ordinal[column].fillna(X_ordinal[column].mode()[0], inplace=True)
    # excluding string responses (like open questions)
    # X_ordinal = X_ordinal.select_dtypes(exclude=[object])
    # drop the ordinal columns under their original names; labels missing from df are ignored
    drop_list = list(get_ordinal_names().keys())
    df = df.drop(columns=drop_list, errors='ignore')
    df = pd.concat([df, X_ordinal], axis=1)
    return df


# turns into
def rename_and_impute_ordinals(df):
    """Return the ordinal columns of ``df``, imputed and renamed.

    Steps:

    1. Keep only the columns of ``df`` that are keys of
       ``naming.get_ordinal_names()``.
    2. Turn the response codes 77, 88, 99 and 12 into missing values (per the
       comment in ``prepare_ordinals_to_transform`` these are "don't know",
       "refused" and "would vote invalid").
    3. Fill missing values per column with that column's mode; if there are
       several modes the first one is used.
    4. Rename the columns using the mapping from
       ``naming.get_ordinal_names()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey data; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the processed ordinal columns. A column
        without any valid response has no mode and is returned still missing.
    """
    ordinal_names = naming.get_ordinal_names()

    # Hand ``filter`` an explicit list of labels rather than the mapping itself.
    df_ordinal = df.filter(items=list(ordinal_names))

    # Map specific responses to missing values (NA)
    non_ordinal_responds_to_nan = dict.fromkeys([77, 88, 99, 12], pd.NA)
    df_ordinal = df_ordinal.replace(non_ordinal_responds_to_nan)

    # Impute missing values using the mode
    for column in df_ordinal:
        modes = df_ordinal[column].mode()
        if modes.empty:
            # Entirely missing column: there is no mode to impute with, and
            # indexing the empty result would raise.
            continue
        df_ordinal[column] = df_ordinal[column].fillna(modes.iloc[0])

    return df_ordinal.rename(columns=ordinal_names)

In [18]:
# Same comment as above
# Replace the raw ordinal columns of every wave by their imputed, renamed
# counterparts.
raw_ordinal_columns = list(naming.get_ordinal_names())
for waveno, df_wave in waves.items():
    df_wave_ordinal = rename_and_impute_ordinals(df_wave)
    # Drop the raw columns first, as prepare_ordinals_to_transform did.
    # Otherwise the un-imputed columns (still holding the 77/88/99/12 codes)
    # stay in the frame next to the cleaned ones.
    df_wave_rest = df_wave.drop(columns=raw_ordinal_columns, errors='ignore')
    df_wave_new = pd.concat([df_wave_rest, df_wave_ordinal], axis=1)
    waves[waveno] = df_wave_new

In [19]:
df_wave_ordinal.head()

Unnamed: 0,QUIZ: USE OF RESOURCES -w6f_q40,POLITICAL DISCUSSION (TYPICAL WEEK): NUMBER OF PEOPLE -w6_q55,POLITICAL DISCUSSION: NUMBER OF PEOPLE HOLDING DIFFERENT OPINIONS -w6_q56,INTERVIEW DURATION IN SECONDS -w6_intdur,INTERVIEWDATUM (DAYS TO ELECTION DAY) -w6_dte,HOUSEHOLD SIZE -sd5,MEMBERS OF HOUSEHOLD YOUNGER THAN 18 YEARS -sd6,PARTY UNITED/DIVIDED - IN GENERAL: LIST PETER PILZ -w6_q4x6,PARTY UNITED/DIVIDED - IN GENERAL: SPOE -w6_q4x1,PARTY UNITED/DIVIDED - IN GENERAL: OEVP -w6_q4x2,...,"THE PEOPLE SHOULD TAKE MOST IMPORTANT DECISIONS, NOT POLITICIANS -w6_q34x6",PREFER INDEPENDENT CITIZEN INSTEAD OF A PARTY MEMBER -w6_q34x7,CORPORATIONS AND NOT THE GOVERNMENT DECIDE OVER POLITICS -w6_q34x8,POLITICIANS DO NOT CARE ABOUT WHAT PEOPLE LIKE ME THINK -w6_q34x5,SUBJECTIVE RISK-TAKING PROPENSITY -w6_q29,LIKE-DISLIKE: SPOE -w6_q36x1,LIKE-DISLIKE: OEVP -w6_q36x2,LIKE-DISLIKE: FPOE -w6_q36x3,ASSESSMENT OF POLIT. SITUATION: NERVOUS -w6_q38x7,age_group -sd2x2
2,1,4,2,665,60,2,0,7,9,2,...,5,5,1,3,7,0,8,0,2,6
3,1,1,0,14329,61,4,3,5,5,5,...,3,1,2,1,5,5,5,0,0,3
5,1,10,10,375,60,4,2,5,5,5,...,3,2,3,2,5,0,5,0,5,5
6,1,1,0,739,61,2,0,8,3,2,...,3,2,1,1,3,8,1,1,8,6
7,1,1,1,727,60,4,1,7,6,5,...,2,2,2,2,7,5,4,2,2,2


## Feature Engineering

We derive features to express domain concepts.

We add additional columns based on our considerations.

This means:
* Give examples

In [20]:
# Original
def count_voting_age_awareness(df):
    """Code the age answer as a True/False "answered 16" feature.

    Original implementation, kept verbatim for comparison with
    ``compute_voting_age_awareness``.

    Steps:

    1. In the columns matching ``w1_q31|w4f_q55|w6f_q41`` the value 16 is
       replaced by ``True``.
    2. The first matching column is renamed to ``voting_Age_awareness``.
    3. Every value in that column that does not compare equal to ``True`` is
       set to ``False``.

    Returns the renamed DataFrame.

    NOTE(review): step 1 assigns into the ``df`` passed by the caller, and
    step 3 uses chained assignment, whose effect depends on the pandas
    version / copy-on-write mode.
    """
    
    age_column = df.filter(regex='w1_q31|w4f_q55|w6f_q41').columns
    df[age_column] = df[age_column].replace([16], True)    
    # Age with capital letter because otherwise it's mixed with personal 
    # feature of age and gets to wrong dataset of personal features
    # only the first matching column is renamed
    df = df.rename(columns={age_column[0]: 'voting_Age_awareness'})
    # replace wrong values and NaN by 0
    df['voting_Age_awareness'][df['voting_Age_awareness'] != True] = False
    return df


# could (if I am not mistaken) be rewritten as
def compute_voting_age_awareness(df):
    """Flag, per participant, whether the age answer equals 16.

    The answer column is looked up with ``more_utils.find_column`` using the
    regex ``w1_q31|w4f_q55|w6f_q41`` (presumably the voting-age quiz question
    of the respective wave - TODO confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Survey data of one wave; it is not modified.

    Returns
    -------
    pandas.Series
        Integer series named ``voting_Age_awareness``: 1 where the answer is
        16, 0 otherwise. A missing answer counts as 0.
    """
    age_answer = more_utils.find_column(df, regex='w1_q31|w4f_q55|w6f_q41')
    # On nullable columns a missing answer compares as <NA>, which cannot be
    # cast to int. Treat it as "not aware", as the original implementation did.
    is_aware = df[age_answer].eq(16).fillna(False)
    return is_aware.astype('int').rename('voting_Age_awareness')

# Playground

## Perform sanity checks on return values?

We could be defensive and apply some basic sanity checks to
preprocessing steps where something could go wrong.
Here is an example.

In [21]:
@more_utils.has_non_trivial_return_value
def some_function_with_a_problem(df):
    """Sum every column of ``df`` whose label contains 'VISITED FACEBOOK'.

    Demo function for the ``has_non_trivial_return_value`` decorator, which
    checks the value returned from here.
    """
    facebook_columns = df.filter(like='VISITED FACEBOOK')
    return facebook_columns.sum()

In [22]:
# some_function_with_a_problem(df_wave_new)

Uncomment above block to get

```python
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [28], in <cell line: 1>()
----> 1 some_function_with_a_problem(df_wave_new)

File ~/repo/training/univie-vsm-2022/resources/survey_attrition_for_mooc/utils/more_utils.py:29, in has_non_trivial_return_value.<locals>.wrapper(*args, **kwds)
     27 if is_okay:
     28     return value
---> 29 raise ValueError('Trivial return value encountered.')

ValueError: Trivial return value encountered.


```

## Managing Names

Variable names are important for a variety of reasons:
* Expressing domain concepts
* Readability
* Source of error

At the moment, variable names are *flying* around in different places,
they are oftentimes duplicated and hard to track and check (at least for me:).

There are different options.

In [23]:
# Bare bones
wave_related_names = more_utils.find_columns(df, regex='w1')
wave_related_names[:3]
# Observations
# * Need to convey meaning through variable name
# * There is no place where `regex=w1` is expressed as a concept
# * Most likely used within a function and result then not available, e.g. for sanity checks

['w1_panelist', 'w1_weightd', 'w1_weightp']

In [24]:
# Collect in a namespace
class VariableNames:
    """Namespace answering "which columns belong to concept X?" for a frame.

    Every method delegates to a ``more_utils`` column finder with a fixed
    regex, so callers do not need to know the pattern.
    """
    # No need to be a class, could be functions in a naming module as well

    def personal(self, df):
        """Columns of ``df`` found for the regex ``(sd)``."""
        pattern = '(sd)'
        return more_utils.find_columns(df, regex=pattern)

    def wave_related(self, df, wave):
        """Columns of ``df`` found for the regex ``w<wave>``."""
        pattern = f'w{wave}'
        return more_utils.find_columns(df, regex=pattern)

    def age_answer(self, df):
        """Single column of ``df`` found for ``w1_q31|w4f_q55|w6f_q41``."""
        pattern = 'w1_q31|w4f_q55|w6f_q41'
        return more_utils.find_column(df, regex=pattern)
    
variable_names = VariableNames()
variable_names.wave_related(df, '1')[:3]
# Observations
# * As a user I do not need to know about the underlying mechanism, 
#   I can simply ask for everything wave related.
# * Again, needs data. But we could as well pass variable names in the 
#   constructor and/or import them from elsewhere (happens already anyway).
# * As soon as there is a mechanism to find/define variable names, we can perform
#   sanity checks.

['w1_panelist', 'w1_weightd', 'w1_weightp']

In [25]:
# Collect and define in a namespace
# Similar to what dictionaries_rename.py is doing right now 
# but with more utilities.

In [26]:
# Bind to DataFrame
# Just an experiment.

In [27]:
@pd.api.extensions.register_dataframe_accessor("survey")
class SurveyAccessor:
    """``df.survey`` accessor exposing survey-specific column lookups.

    The lookups mirror ``VariableNames`` but are bound to the DataFrame, so
    no frame needs to be passed in.
    """

    def __init__(self, pandas_obj):
        # the DataFrame this accessor instance is attached to
        self._obj = pandas_obj

    def personal(self):
        """Columns found for the regex ``(sd)``."""
        return more_utils.find_columns(self._obj, regex='(sd)')

    def wave_related(self, wave):
        """Columns found for the regex ``w<wave>``."""
        return more_utils.find_columns(self._obj, regex=f'w{wave}')

    def age_answer(self):
        """Single column found for ``w1_q31|w4f_q55|w6f_q41``.

        Named like ``VariableNames.age_answer``, which performs the same
        lookup.
        """
        return more_utils.find_column(self._obj, regex='w1_q31|w4f_q55|w6f_q41')

    def age_awareness(self):
        """Backward-compatible alias of ``age_answer``.

        The lookup yields the answer column, not the derived awareness flag.
        """
        return self.age_answer()

In [28]:
df.survey.wave_related('1')[:3]
# Observations
# * We could say this *is* survey data and it therefore knows about
#   these things. Not sure but good to know.

['w1_panelist', 'w1_weightd', 'w1_weightp']