#### Pandas Part 88: GroupBy Operations

This notebook explores GroupBy operations in pandas, which allow you to split data into groups, apply functions to each group, and combine the results.

In [None]:
import pandas as pd
import numpy as np

##### 1. Creating a Sample DataFrame

In [None]:
# Seed NumPy's global generator so the random columns below (and any later
# np.random draws executed after this cell) repeat on every Restart & Run All.
np.random.seed(42)

# Create a sample DataFrame: 'A' and 'B' are string keys used for grouping,
# 'C' and 'D' are random floats, and 'E' holds random integers in [0, 10).
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
    'E': np.random.randint(0, 10, 8)
})

print("Sample DataFrame:")
print(df)

##### 2. Basic GroupBy Operations

The `groupby()` method is used to split the data into groups based on some criteria.

In [None]:
# Split the rows of df according to the value in column 'A'
grouped = df.groupby('A')

# Iterating over the GroupBy yields (group key, sub-DataFrame) pairs
print("Groups:")
for name, group in grouped:
    print(f"\nGroup name: {name}", group, sep="\n")

In [None]:
# Mapping from each group key to the index labels of the rows in that group
group_index_map = grouped.groups
print("Group dictionary:")
print(group_index_map)

# Pull out the sub-DataFrame belonging to a single key
foo_rows = grouped.get_group('foo')
print("\nGroup 'foo':")
print(foo_rows)

##### 3. Aggregation Operations

Aggregation operations compute a summary statistic for each group.

In [None]:
# Compute the mean of each group. Column 'B' holds strings, and since
# pandas 2.0 mean() raises TypeError on such columns instead of silently
# dropping them, so restrict the reduction to numeric columns.
print("Mean of each group:")
print(grouped.mean(numeric_only=True))

# Compute the sum of each group; numeric_only keeps 'B' out of the result
# (summing strings would concatenate them, which is not a useful statistic).
print("\nSum of each group:")
print(grouped.sum(numeric_only=True))

# Compute the size of each group (number of rows per key)
print("\nSize of each group:")
print(grouped.size())

In [None]:
# Compute multiple aggregations at once. Select the numeric columns first:
# 'mean' and 'std' are undefined for the strings in column 'B' and would raise.
print("Multiple aggregations:")
print(grouped[['C', 'D', 'E']].agg(['mean', 'sum', 'count', 'std']))

In [None]:
# Per-column aggregation spec: each key is a column, each value is either a
# single function name or a list of function names to apply to that column
column_aggs = {'C': 'sum', 'D': 'mean', 'E': ['min', 'max']}
print("Different aggregations for different columns:")
print(grouped.agg(column_aggs))

##### 4. Transformation Operations

Transformation operations return an object indexed like the input (one output row per input row), with values computed group-wise. The grouping column itself is not part of the result.

In [None]:
# Standardize the data within each group (subtract the group mean, divide by
# the group standard deviation). Only the numeric columns are selected because
# the arithmetic is undefined for the strings in column 'B' and would raise.
print("Standardized data within each group:")
print(grouped[['C', 'D', 'E']].transform(lambda x: (x - x.mean()) / x.std()))

In [None]:
# Work on a copy so the original df keeps its values, then blank out two cells
df_with_na = df.copy()
df_with_na.loc[1, 'C'] = np.nan
df_with_na.loc[3, 'D'] = np.nan
print("DataFrame with NA values:")
print(df_with_na)

# Replace each missing value with the mean of its own 'A' group. The numeric
# columns are selected explicitly: taking the mean of the string column 'B'
# would raise a TypeError.
print("\nFill NA values with group mean:")
print(df_with_na.groupby('A')[['C', 'D', 'E']].transform(lambda x: x.fillna(x.mean())))

##### 5. Filtration Operations

Filtration operations discard entire groups for which a group-level condition is false, and return the rows of the groups that remain.

In [None]:
# Keep only the 'A' groups whose mean of column 'C' is greater than 0;
# the predicate is evaluated once per group on that group's sub-DataFrame
positive_c_groups = df.groupby('A').filter(lambda rows: rows['C'].mean() > 0)
print("Groups where mean of 'C' > 0:")
print(positive_c_groups)

##### 6. The `apply()` Method

The `apply()` method applies a function to each group and combines the results.

In [None]:
# Define a function to apply to each group
def top_n(group, n=2, column='C'):
    """Return the `n` rows of `group` with the largest values in `column`.

    Parameters
    ----------
    group : DataFrame
        Rows to rank (for example, one group passed in by `GroupBy.apply`).
    n : int, default 2
        Number of rows to keep.
    column : str, default 'C'
        Column whose values determine the ranking.

    Returns
    -------
    DataFrame
        The first `n` rows after sorting by `column` in descending order.
    """
    ranked = group.sort_values(by=column, ascending=False)
    return ranked.head(n)

# Run top_n on every 'A' group; apply() combines the per-group results
top_rows = df.groupby('A').apply(top_n)
print("Top 2 rows in each group by 'C' value:")
print(top_rows)

##### 7. Grouping by Multiple Columns

In [None]:
# Group on the combination of columns 'A' and 'B'
grouped_multi = df.groupby(['A', 'B'])

# Mean of the remaining columns for every ('A', 'B') combination
multi_means = grouped_multi.mean()
print("Mean of each group (grouped by 'A' and 'B'):")
print(multi_means)

# Walk the groups; with several keys each group name is a tuple of key values
print("\nGroups:")
for name, group in grouped_multi:
    print(f"\nGroup name: {name}", group, sep="\n")

##### 8. The `Grouper` Object

The `Grouper` object provides a flexible way to specify grouping instructions.

In [None]:
# Create a DataFrame with a daily datetime index: 'A' and 'B' are random
# floats, 'C' is a random label drawn from X/Y/Z
dates = pd.date_range('2023-01-01', periods=10)
df_dates = pd.DataFrame({
    'A': np.random.randn(10),
    'B': np.random.randn(10),
    'C': np.random.choice(['X', 'Y', 'Z'], 10)
}, index=dates)
print("DataFrame with datetime index:")
print(df_dates)

# Group by 2-day frequency. Column 'C' holds strings, which mean() cannot
# average (pandas >= 2.0 raises TypeError), so only numeric columns are reduced.
mean_2d = df_dates.groupby(pd.Grouper(freq='2D')).mean(numeric_only=True)
print("\nGrouped by 2-day frequency:")
print(mean_2d)

# Group by column 'C' and 3-day frequency; 'C' is a grouping key here, so the
# remaining columns ('A', 'B') are all numeric and a plain mean() is safe
mean_by_c_3d = df_dates.groupby(['C', pd.Grouper(freq='3D')]).mean()
print("\nGrouped by column 'C' and 3-day frequency:")
print(mean_by_c_3d)

##### 9. Grouping by Index Levels

You can group by index levels in a MultiIndex DataFrame.

In [None]:
# Build a two-level row index ('first', 'second') and a frame of random values
arrays = [['A', 'A', 'B', 'B'], [1, 2, 1, 2]]
index = pd.MultiIndex.from_arrays(arrays, names=('first', 'second'))
values = {'C': np.random.randn(4), 'D': np.random.randn(4)}
df_multi = pd.DataFrame(values, index=index)
print("MultiIndex DataFrame:")
print(df_multi)

# Each entry pairs the heading to print with the `level` argument for groupby:
# a single level name, or a list of names to group on several levels at once
level_views = [
    ("\nGrouped by level 'first':", 'first'),
    ("\nGrouped by level 'second':", 'second'),
    ("\nGrouped by both levels:", ['first', 'second']),
]
for heading, level in level_views:
    print(heading)
    print(df_multi.groupby(level=level).mean())

##### 10. Other GroupBy Methods

In [None]:
# Small fixed-value frame: key column 'A' plus two numeric columns
df2 = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar'],
    'B': [1, 2, 3, 4, 5, 6],
    'C': [2.0, 5.0, 8.0, 1.0, 2.0, 9.0]
})
print("Sample DataFrame:")
print(df2)

# Rebind `grouped` to the groups of df2 keyed on column 'A'
grouped = df2.groupby('A')

# Row-preserving operations: one output row per input row, computed per group
running_total = grouped.cumsum()
step_change = grouped.diff()
within_group_rank = grouped.rank()

print("\nCumulative sum within each group:")
print(running_total)

print("\nDifference between consecutive rows within each group:")
print(step_change)

print("\nRank within each group:")
print(within_group_rank)

# Reductions: the first and last entry of each column within each group
first_entries = grouped.first()
last_entries = grouped.last()

print("\nFirst row of each group:")
print(first_entries)
print("\nLast row of each group:")
print(last_entries)

##### 11. Groupby with Custom Aggregation Functions

In [None]:
# Define custom aggregation functions
def range_func(x):
    """Return the spread of `x`: its maximum value minus its minimum value."""
    highest, lowest = x.max(), x.min()
    return highest - lowest

def custom_percentile(x):
    """Return the 75th percentile of `x`, as computed by `np.percentile`."""
    upper_quartile = 75
    return np.percentile(x, q=upper_quartile)

# Mix built-in aggregation names with the custom callables defined above;
# each callable receives one column of one group at a time
custom_spec = {
    'B': ['sum', 'mean', range_func],
    'C': ['min', 'max', custom_percentile],
}
print("Custom aggregation functions:")
print(df2.groupby('A').agg(custom_spec))

##### 12. Named Aggregation

In [None]:
# Named aggregation: each keyword is an output column name mapped to a
# (source column, aggregation) pair; the spec is unpacked into keyword arguments
summary_spec = {
    'b_sum': ('B', 'sum'),
    'b_mean': ('B', 'mean'),
    'c_min': ('C', 'min'),
    'c_max': ('C', 'max'),
}
print("Named aggregation:")
print(df2.groupby('A').agg(**summary_spec))

##### 13. The `pipe()` Method

The `pipe()` method allows you to chain together functions that expect a GroupBy object.

In [None]:
# Define a function that expects a GroupBy object
def grouped_mean_plus_1(grouped):
    """Return the per-group mean of `grouped` with 1 added to every value.

    `grouped` is expected to be a GroupBy object (anything exposing `.mean()`).
    """
    group_means = grouped.mean()
    return group_means + 1

# pipe() hands the GroupBy object to the function and returns its result
shifted_means = df2.groupby('A').pipe(grouped_mean_plus_1)
print("Using pipe to chain operations:")
print(shifted_means)