# Groupby Examples

In [7]:
# basic setup
import datetime
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import pytz
import statsmodels.api as sm
import statsmodels.formula.api as smf
import string
import sys



## Group Each Variable By Mean

In [11]:
# Small example frame: a grouping key plus two numeric variables.
df = pd.DataFrame(
    {
        'group': ['a', 'a', 'b'],
        'var1': [1, 2, 3],
        'var2': [2, 3, 4],
    }
)
# Average every remaining column within each value of 'group'; the group
# labels become the index of the result.
df2 = (
    df
    .groupby(['group'])
    .mean()
)
print(df2)



       var1  var2
group            
a       1.5   2.5
b       3.0   4.0


## Group One Variable By Mean

In [13]:
# Pick a single column out of the grouped frame before aggregating, so the
# result is a Series of per-group means of 'var1' indexed by 'group'.
df2 = (
    df
    .groupby(['group'])['var1']
    .mean()
)
print(df2)

group
a    1.5
b    3.0
Name: var1, dtype: float64


## Group By Custom Function - One Value Per Row

In [16]:
# where custom function defines same number of values as in original group
def f(x):
    """Subtract the group's mean from each of its values.

    x is the Series of 'var1' values belonging to one group. The returned
    Series has the same index and length as x, which is what lets
    transform() line the results back up with the original rows.
    """
    return x - x.mean()


# transform() returns a result aligned with df's own index, and assign()
# builds a new frame instead of writing into df. Mutating the group inside
# groupby.apply() is documented by pandas as unsupported, and on recent pandas
# versions apply() also prepends the group key to the result's index.
df2 = df.assign(var1_demeanedbygroup=df.groupby(['group'])['var1'].transform(f))
print(df2)



  group  var1  var2  var1_demeanedbygroup
0     a     1     2                  -0.5
1     a     2     3                   0.5
2     b     3     4                   0.0


## Group By Custom Function - One Value Per Group

In [15]:
# where custom function collapses each group to a single value
def f(x):
    """Return mean(var1) - max(var1) for the rows of one group (a scalar).

    x is the sub-DataFrame holding one group's rows.
    """
    # Work on the raw array: np.mean/np.max then behave exactly as they would
    # on a plain list (NaN is propagated, not skipped), without building one.
    values = x['var1'].to_numpy()
    return np.mean(values) - np.max(values)


# One scalar per group, so apply() yields a Series indexed by 'group'.
df2 = df.groupby(['group']).apply(f)
print(df2)

group
a   -0.5
b    0.0
dtype: float64


## Groupby Weighted Mean

In [17]:
# Template Functions
def groupby_weighted_mean_singlecol(df, data_col, weight_col, by_col):
    """Return the weighted mean of one column within each group.

    Parameters
    ----------
    df : DataFrame holding the data, weight and grouping columns. It is not
        modified.
    data_col : label of the column to average.
    weight_col : label of the column holding the weights.
    by_col : grouping key(s), passed straight to ``DataFrame.groupby``.

    Returns
    -------
    Unnamed Series indexed by group: sum(data * weight) / sum(weight), where
    the weight of a row only counts if its data value is not null. A group
    whose data values are all null gives 0 / 0 = NaN.
    """
    values = df[data_col]
    weights = df[weight_col]

    # Put the scratch columns on a derived frame rather than on the caller's
    # df: writing them into df and deleting them afterwards would destroy any
    # existing column with the same name and leave stray columns behind if
    # anything raised in between. assign() copies df, which is the price paid.
    work = df.assign(
        _data_times_weight=values * weights,
        # Zero out the weight of rows with missing data so they do not
        # inflate the denominator.
        _weight_where_notnull=weights * values.notna(),
    )
    g = work.groupby(by_col)
    return g['_data_times_weight'].sum() / g['_weight_where_notnull'].sum()


def groupby_weighted_mean_manycol(df, data_col, weight_col, by_col):
    """Return the weighted mean of several columns within each group.

    Parameters
    ----------
    df : DataFrame holding the data, weight and grouping columns. It is not
        modified.
    data_col : list of labels of the columns to average.
    weight_col : either one weight column label (a str) used for every data
        column, or a list giving the weight column for each data column in
        the same order.
    by_col : grouping key(s), passed straight to ``DataFrame.groupby``.

    Returns
    -------
    DataFrame indexed by group with one column per entry of data_col. Each
    value is sum(data * weight) / sum(weight), where a row's weight only
    counts if its data value is not null; a group whose values are all null
    gives NaN. An empty data_col gives a frame with the group index and no
    columns.

    Raises
    ------
    IndexError if weight_col is a list with fewer entries than data_col.
    """
    if isinstance(weight_col, str):
        # if weight_col is not a list then set it to be a list of same length as data_col
        weight_col = [weight_col] * len(data_col)
    elif len(weight_col) < len(data_col):
        # zip() below would otherwise silently drop the unmatched data columns.
        raise IndexError(
            'weight_col has {} entries but data_col has {}'.format(
                len(weight_col), len(data_col)
            )
        )

    # Scratch columns go on a derived frame, never on the caller's df: writing
    # them into df and deleting them afterwards would destroy same-named
    # columns and leave stray ones behind on error. One numbered pair per data
    # column lets a single assign() (one copy of df) serve the whole loop.
    scratch = {}
    for i, (col, wcol) in enumerate(zip(data_col, weight_col)):
        values = df[col]
        weights = df[wcol]
        scratch['_data_times_weight_{}'.format(i)] = values * weights
        # Rows with missing data must not count towards the denominator.
        scratch['_weight_where_notnull_{}'.format(i)] = weights * values.notna()

    g = df.assign(**scratch).groupby(by_col)

    if not scratch:
        # Nothing to average: still report the groups, with no value columns.
        return pd.DataFrame(index=g.size().index)

    frames = []
    for i, col in enumerate(data_col):
        numerator = g['_data_times_weight_{}'.format(i)].sum()
        denominator = g['_weight_where_notnull_{}'.format(i)].sum()
        frames.append((numerator / denominator).to_frame(col))

    return pd.concat(frames, axis=1)

In [18]:
# Application of Template Functions
# Group 'a' has no data at all, group 'b' has one missing value, group 'c' is
# complete, so the three printed results show how missing values are treated.
df = pd.DataFrame(
    {
        'group': ['a', 'a', 'b', 'b', 'c', 'c'],
        'weight1': [0.1, 0.2, 0.1, 0.2, 0.1, 0.3],
        'weight2': [0.01, 0.02, 0.01, 0.02, 0.02, 0.02],
        'val1': [np.nan, np.nan, np.nan, 1, 2, 3],
        'val2': [np.nan, np.nan, np.nan, 1, 2, 4],
    }
)

# One data column, one weight column.
df2 = groupby_weighted_mean_singlecol(
    df, data_col='val1', weight_col='weight1', by_col='group'
)
print(df2)

# Several data columns sharing a single weight column.
df2 = groupby_weighted_mean_manycol(
    df, data_col=['val1', 'val2'], weight_col='weight1', by_col='group'
)
print(df2)

# Several data columns, each with its own weight column.
df2 = groupby_weighted_mean_manycol(
    df, data_col=['val1', 'val2'], weight_col=['weight1', 'weight2'], by_col='group'
)
print(df2)

group
a     NaN
b    1.00
c    2.75
dtype: float64
       val1  val2
group            
a       NaN   NaN
b      1.00   1.0
c      2.75   3.5
       val1  val2
group            
a       NaN   NaN
b      1.00   1.0
c      2.75   3.0
