given a toy dataframe A with columns: client_id, dash_a_filter, dash_b_filter, dash_c_filter, we want to answer questions like:
1) how many times does each client filter (a,b,c,d) by a each location code? 
2) how many dashboards does each location code occur in? 
3) how many times overall?

Q1 
write a function that accepts:
- such a dataframe,
- the name of the id column,
- and a list of the (categorical) columns names.
then, the function returns a series whose indeces are all the distinct values that occur at least once in any of the categorical columns, and whose corresponding values are the number of distinct ids in the dataframe that the index occurs in.

Q2 
- write another function that accepts the same kind of input, but this time returns a series indexed on the ids, (one row per id) and whose values are the number of distinct values in all three columns the dataframe has.


In [1]:
import numpy as np
import pandas as pd
import random

In [2]:
# make dataframe:

NUM_ROWS = 12
all_distinct_ids  = [10,11,13,14,15,16,17,18,19,25]
all_distinct_values = [None, 'cat', 'dog', 'dogs', 'hamster', 'parrot', None]
id_list = sorted([random.choice(all_distinct_ids) for _ in range(NUM_ROWS)])

column_id = pd.Series(id_list, name='person_id')

column_values_a = pd.Series([random.choice(all_distinct_values[:3]) for _ in range(NUM_ROWS)], name='col_a')

column_values_b = pd.Series([random.choice(all_distinct_values[3:]) for _ in range(NUM_ROWS)], name='col_b')

value_list = [random.choice(all_distinct_values) for _ in range(NUM_ROWS)]
column_values_c = pd.Series(value_list, name='col_c')

word_list = [random.choice(['cut','cot', 'cat']) for _ in range(NUM_ROWS)]
column_values_bull = pd.Series(word_list, name='another_column')

all_columns = [column_id, column_values_a, column_values_b, column_values_c, column_values_bull]
df = pd.concat(all_columns, axis=1)
print(df)

    person_id col_a    col_b    col_c another_column
0          10   cat  hamster   parrot            cot
1          10   dog     None   parrot            cat
2          11   cat     dogs      dog            cat
3          11   cat   parrot  hamster            cot
4          14  None   parrot      cat            cut
5          14   cat  hamster   parrot            cot
6          15  None   parrot     None            cat
7          17  None     None     None            cat
8          18  None   parrot      cat            cut
9          18   dog  hamster     None            cat
10         25   dog     None     None            cat
11         25   dog   parrot      cat            cot


In [3]:
column_names = ['col_a','col_b','col_c',]
id_column = "person_id"

In [4]:
# QUESTION 1:
# count the number of distinct/different IDs in which each value/word occurs in the dataframe 

def count_id_occurances_per_value(dataframe, id_col, list_of_cat_cols):
   
    all_distinct_values = set()
    for acolumn in list_of_cat_cols:
        values_in_col = dataframe[acolumn].unique()
        all_distinct_values.update(values_in_col)
        
    value_series_counts = pd.Series(data=[0 for _ in range(len(all_distinct_values))], index=all_distinct_values)

    for value in all_distinct_values:
        set_of_ids = {}
        for acolumn in list_of_cat_cols:
            subset_df = dataframe[dataframe[acolumn] == value]
            set_of_ids.update(subset_df[id_col])
        value_series_counts[value] = len(set_of_ids)

    return value_series_counts

In [5]:
count_id_occurances_per_value(df, id_column, column_names)

cat        7
dog        5
NaN        0
parrot     8
dogs       1
hamster    4
dtype: int64

In [6]:
# QUESTION 2:
# count the number of times each word occurs per person

def count_distinct_value_occurances_per_id(df, id_col, list_of_cat_cols):
    
    id_series_counts = pd.Series(data=[{} for _ in range(len(all_distinct_ids))], index=all_distinct_ids)

    for person in all_distinct_ids:
        count_dict = {}
        df_subset = df[df[id_column]==person]
        for column in list_of_cat_cols:
            single_col_count = df_subset[column].value_counts()
            for word, count in single_col_count.items():
                old_value = count_dict.get(word,0)
                new_value = old_value + count
                count_dict[word] = new_value
        id_series_counts[person] = count_dict
    
    return id_series_counts

In [7]:
count_distinct_value_occurances_per_id(df, id_column, column_names)

10      {'cat': 1, 'dog': 1, 'hamster': 1, 'parrot': 2}
11    {'cat': 2, 'parrot': 1, 'dogs': 1, 'dog': 1, '...
13                                                   {}
14                {'cat': 2, 'parrot': 2, 'hamster': 1}
15                                        {'parrot': 1}
16                                                   {}
17                                                   {}
18      {'dog': 1, 'parrot': 1, 'hamster': 1, 'cat': 1}
19                                                   {}
25                    {'dog': 2, 'parrot': 1, 'cat': 1}
dtype: object