In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

### GroupBy introductory examples

In [2]:
# Seed a local generator so that re-running this cell (or the whole notebook)
# always rebuilds the same frame; drawing from the unseeded global `np.random`
# state made every displayed value change from run to run.
rng = np.random.default_rng(12345)

# Small demo frame for the groupby examples:
#   key1          - string labels, with missing (None) entries
#   key2          - nullable integer labels ("Int64"), with one missing entry
#   data1 / data2 - seven standard-normal draws each
df = pd.DataFrame(
    {
        "key1": ["a", "a", None, "b", "b", "a", None],
        "key2": pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64"),
        "data1": rng.standard_normal(7),
        "data2": rng.standard_normal(7),
    }
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,0.211351,0.965511
1,a,2.0,-1.904924,-1.199828
2,,1.0,-1.696808,-0.648888
3,b,2.0,0.777448,-0.237527
4,b,1.0,-1.886838,1.339196
5,a,,1.415096,1.14478
6,,1.0,-1.314978,0.781848


In [4]:
# Show the dtype of every column in `df`.
df.dtypes

key1      object
key2       Int64
data1    float64
data2    float64
dtype: object

#### Compute mean of `data1` column using the labels from `key1`

In [5]:
# Group the `data1` values by the labels in `key1`, then average each group.
grouped = df["data1"].groupby(df["key1"])
grouped.mean()

key1
a   -0.092826
b   -0.554695
Name: data1, dtype: float64

In [8]:
# Group by a composite key: passing a list of Series groups `data1` on each
# distinct (key1, key2) combination, giving a result with a two-level index.
grouped_ck = df["data1"].groupby([df["key1"], df["key2"]])
means = grouped_ck.mean()
means

key1  key2
a     1       0.211351
      2      -1.904924
b     1      -1.886838
      2       0.777448
Name: data1, dtype: float64

In [9]:
# Pivot the inner index level of `means` (key2) into columns.
means.unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.211351,-1.904924
b,-1.886838,0.777448


In [10]:
# The keys above were all Series; grouping keys can also be plain arrays or
# lists. Here each key supplies one label per row of `df` (7 entries).
states = np.array(["OH", "CA", "CA", "OH", "OH", "CA", "OH"])

years = [2005, 2005, 2006, 2005, 2006, 2005, 2006]

df["data1"].groupby([states, years]).mean()

CA  2005   -0.244914
    2006   -1.696808
OH  2005    0.494400
    2006   -1.600908
Name: data1, dtype: float64

In [19]:
# Take the grouping key directly from the DataFrame being grouped by passing
# the column name; every remaining column is averaged per key1 group.
df.groupby("key1").mean()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.5,-0.092826,0.303488
b,1.5,-0.554695,0.550834


In [17]:
# `key1` holds strings, so restrict the aggregation to numeric columns:
# pandas 2.x raises TypeError when asked to average an object column, whereas
# older releases silently dropped it. `numeric_only=True` yields the
# data1/data2-only result on both.
df.groupby("key2").mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-1.171818,0.609417
2,-0.563738,-0.718677


In [13]:
# Group on both key columns at once; the result is indexed by (key1, key2).
df.groupby(["key1", "key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0.211351,0.965511
a,2,-1.904924,-1.199828
b,1,-1.886838,1.339196
b,2,0.777448,-0.237527


In [22]:
# Count the rows in each key1 group (rows whose key1 is missing do not appear).
df.groupby("key1").size()

key1
a    3
b    2
dtype: int64

In [25]:
# Iterating over a groupby yields (group label, sub-frame) pairs; show the
# type and value of each half, followed by a blank separator line.
for name, group in df.groupby("key1"):
    print(
        f"Type of `name`: {type(name)}",
        name,
        f"Type of `group`: {type(group)}",
        group,
        "",
        sep="\n",
    )

Type of `name`: <class 'str'>
a
Type of `group`: <class 'pandas.core.frame.DataFrame'>
  key1  key2     data1     data2
0    a     1  0.211351  0.965511
1    a     2 -1.904924 -1.199828
5    a  <NA>  1.415096  1.144780

Type of `name`: <class 'str'>
b
Type of `group`: <class 'pandas.core.frame.DataFrame'>
  key1  key2     data1     data2
3    b     2  0.777448 -0.237527
4    b     1 -1.886838  1.339196



In [30]:
# Select a single column from the grouped frame before aggregating, so only
# `data2` is averaged for each (key1, key2) pair.
df.groupby(["key1", "key2"])["data2"].mean()

key1  key2
a     1       0.965511
      2      -1.199828
b     1       1.339196
      2      -0.237527
Name: data2, dtype: float64

#### Grouping with dictionaries and Series examples

In [31]:
# Seed a local generator so this cell rebuilds the same 5x5 frame on every
# run; the unseeded global `np.random` state made the values irreproducible.
rng = np.random.default_rng(2024)

# Five people (rows) by five standard-normal measurements (columns a-e).
people = pd.DataFrame(
    rng.standard_normal((5, 5)),
    columns=["a", "b", "c", "d", "e"],
    index=["Joe", "Steve", "Wanda", "Jill", "Trey"],
)
people

Unnamed: 0,a,b,c,d,e
Joe,-0.591968,-0.165439,-0.347067,2.143806,0.492916
Steve,-0.146437,-1.84241,1.383051,-1.128612,-0.632699
Wanda,0.689231,0.493003,0.837355,0.946366,-0.575434
Jill,1.209122,-0.096136,-0.270371,0.433531,1.464271
Trey,-0.236423,-0.287423,1.456327,0.12188,-0.97068


In [38]:
# Blank out row position 2 ("Wanda") in column positions 1 and 2 ("b", "c").
# Note: this modifies `people` in place.
people.iloc[2:3,[1,2]] = np.nan

In [39]:
# Display the current contents of `people`.
people

Unnamed: 0,a,b,c,d,e
Joe,-0.591968,-0.165439,-0.347067,2.143806,0.492916
Steve,-0.146437,-1.84241,1.383051,-1.128612,-0.632699
Wanda,0.689231,,,0.946366,-0.575434
Jill,1.209122,-0.096136,-0.270371,0.433531,1.464271
Trey,-0.236423,-0.287423,1.456327,0.12188,-0.97068


In [40]:
# Column label -> color group. "f" has no matching column in `people`
# (its columns are a-e), so that entry is simply an unused key.
mapping = dict(
    a="red",
    b="red",
    c="blue",
    d="blue",
    e="red",
    f="orange",
)

In [41]:
# Group the columns of `people` by the color each column label maps to.
# NOTE(review): the `axis` argument of DataFrame.groupby is deprecated in
# newer pandas releases (2.1+ — confirm against the installed version); the
# documented replacement is to group the transposed frame,
# `people.T.groupby(mapping)`, and transpose the aggregated result back.
by_column = people.groupby(mapping, axis="columns")

In [43]:
# Sum the columns belonging to each color group, separately for every row.
by_column.sum()

Unnamed: 0,blue,red
Joe,1.796739,-0.264491
Steve,0.254439,-2.621546
Wanda,0.946366,0.113798
Jill,0.16316,2.577257
Trey,1.578207,-1.494526


#### Data aggregation with a method that is not optimized for groupby (any method defined on the grouped object works)

In [46]:
# `nsmallest` is an ordinary Series method, not a dedicated groupby
# aggregation; calling it on the grouped column applies it to each group.
# NOTE: this rebinds `grouped`, which an earlier cell bound to a grouping of
# `df["data1"]` alone.
grouped = df.groupby("key1")
grouped["data1"].nsmallest(2)

key1   
a     1   -1.904924
      0    0.211351
b     4   -1.886838
      3    0.777448
Name: data1, dtype: float64

#### Group Weighted Average and Correlation Example

In [2]:
# Load the price table, using the first CSV column as the index and parsing
# it as dates.
close_px = pd.read_csv("examples/stock_px.csv", index_col=0, parse_dates=True)

close_px.tail(4)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [7]:
# Correlate every column of `close_px` with its SPX column.
# NOTE(review): this uses the price frame `close_px`, not the percent-change
# frame `rets` built in the following cell — confirm that is intended.
close_px.corrwith(close_px["SPX"])

AAPL    0.244478
MSFT    0.746871
XOM     0.528731
SPX     1.000000
dtype: float64

In [6]:
# Row-over-row percent change of every column; dropna() then discards any row
# containing NaN (such as the first row, which has no previous value).
rets = close_px.pct_change().dropna()
print(rets.tail(4))

                AAPL      MSFT       XOM       SPX
2011-10-11  0.029526  0.002227 -0.000131  0.000544
2011-10-12  0.004747 -0.001481  0.011669  0.009795
2011-10-13  0.015515  0.008160 -0.010238 -0.002974
2011-10-14  0.033225  0.003311  0.022784  0.017380


In [10]:
def get_year(index_value):
    """Return the ``year`` attribute of a date-like index label."""
    year = index_value.year
    return year

# Passing a function as the key groups rows by the value it returns for each
# index label — here, the year of each date.
by_year = rets.groupby(get_year)

#### Group-Wise Linear Regression Example

In [31]:
def regress(data, yvar=None, xvars=None):
    """Fit an OLS regression of ``data[yvar]`` on ``data[xvars]`` plus an intercept.

    Verbose variant: before fitting, prints the types of the response and of
    the design matrix, and the design matrix values as an array.

    Parameters
    ----------
    data : DataFrame
        Table containing the response and explanatory columns.
    yvar : label
        Column to use as the response.
    xvars : list of labels
        Columns to use as explanatory variables.

    Returns
    -------
    The ``params`` of the fitted ``sm.OLS`` model: one coefficient per
    explanatory column plus ``"intercept"``.
    """
    Y = data[yvar].copy()
    # Work on a copy so adding the constant column never touches `data`.
    X = data[xvars].copy()
    X["intercept"] = 1.
    print(f"type of Y: {type(Y)}")
    print(f"type of X: {type(X)}")
    # `to_numpy()` is the public accessor for the underlying ndarray; the
    # `__array__` dunder is a protocol hook, not meant to be called directly.
    print(X.to_numpy())
    result = sm.OLS(Y, X).fit()
    return result.params

# Regress AAPL returns on SPX returns over the full sample.
regress(rets, yvar="AAPL", xvars=["SPX"])

type of Y: <class 'pandas.core.series.Series'>
type of X: <class 'pandas.core.frame.DataFrame'>
[[-4.84032430e-04  1.00000000e+00]
 [ 2.24743834e-02  1.00000000e+00]
 [-6.54460124e-03  1.00000000e+00]
 ...
 [ 9.79473711e-03  1.00000000e+00]
 [-2.97370056e-03  1.00000000e+00]
 [ 1.73803233e-02  1.00000000e+00]]


SPX          1.025987
intercept    0.001896
dtype: float64

In [32]:
def regress(data, yvar=None, xvars=None):
    """Return the OLS coefficients of ``data[yvar]`` on ``data[xvars]`` and a constant.

    Quiet variant (no printing), suitable for applying to each group.
    The explanatory columns are copied before the ``"intercept"`` column of
    ones is appended, so ``data`` itself is left unchanged.
    """
    response = data[yvar].copy()
    design = data[xvars].copy()
    design["intercept"] = 1.
    return sm.OLS(response, design).fit().params

# Run the regression separately for each year's group; the keyword arguments
# are passed through to `regress`.
by_year.apply(regress, yvar="AAPL", xvars=["SPX"])

Unnamed: 0,SPX,intercept
2003,1.195406,0.00071
2004,1.363463,0.004201
2005,1.766415,0.003246
2006,1.645496,8e-05
2007,1.198761,0.003438
2008,0.968016,-0.00111
2009,0.879103,0.002954
2010,1.052608,0.001261
2011,0.806605,0.001514


#### `transform()` Example

In [33]:
# Twelve rows: the labels a, b, c cycle four times next to the floats 0.0-11.0.
df = pd.DataFrame(
    {
        "key": list("abc") * 4,
        "value": np.arange(12, dtype=float),
    }
)
df

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0


In [35]:
# Group the whole frame first, then narrow the grouping to the "value"
# column, reporting the object type at each step.
by_key = df.groupby("key")
print(f"type of full g: {type(by_key)}")
g = by_key["value"]
print(f"type of g with `value` column selected: {type(g)}")

type of full g: <class 'pandas.core.groupby.generic.DataFrameGroupBy'>
type of g with `value` column selected: <class 'pandas.core.groupby.generic.SeriesGroupBy'>


In [36]:
# Mean of `value` within each key group (one row per key).
g.mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64