In [4]:
import pandas as pd
import numpy as np

In [6]:
# Build a small demo dataset: eight consecutive daily dates starting
# 2024-01-01, with stores 'A' and 'B' alternating row by row.
data = dict(
    date=pd.date_range(start='2024-01-01', periods=8),
    store=['A', 'B'] * 4,
    sales=[100, 150, 80, 200, 120, 180, 90, 160],
)
df = pd.DataFrame(data)

# Show the untouched frame; the final print adds blank lines after the table.
print("Original DataFrame:")
print(df)
print("\n")

Original DataFrame:
        date store  sales
0 2024-01-01     A    100
1 2024-01-02     B    150
2 2024-01-03     A     80
3 2024-01-04     B    200
4 2024-01-05     A    120
5 2024-01-06     B    180
6 2024-01-07     A     90
7 2024-01-08     B    160




#### 1. SPLIT

In [11]:
# Manual split-apply-combine, step 1 (split):
# iterating the groupby yields (store, sub-frame) pairs, collected here in a dict.
groups = {store_key: store_frame for store_key, store_frame in df.groupby('store')}
print(groups)
print(type(groups['A']))

{'A':         date store  sales
0 2024-01-01     A    100
2 2024-01-03     A     80
4 2024-01-05     A    120
6 2024-01-07     A     90, 'B':         date store  sales
1 2024-01-02     B    150
3 2024-01-04     B    200
5 2024-01-06     B    180
7 2024-01-08     B    160}
<class 'pandas.core.frame.DataFrame'>


#### 2. APPLY
- `assign` adds new columns to a DataFrame and returns a new DataFrame
- `rolling` provides rolling-window calculations; here each window of 2 rows is averaged with `mean()`

In [13]:
# 2. Apply: add a moving average to every store's frame independently
processed_groups = {}     # maps store label -> frame with the extra column
for store, group_df in groups.items():
    # 2-row moving average of 'sales' within this store only; the callable
    # receives the store's own frame, so windows never span two stores.
    processed_groups[store] = group_df.assign(
        moving_avg=lambda frame: frame['sales'].rolling(window=2).mean()
    )
processed_groups

{'A':         date store  sales  moving_avg
 0 2024-01-01     A    100         NaN
 2 2024-01-03     A     80        90.0
 4 2024-01-05     A    120       100.0
 6 2024-01-07     A     90       105.0,
 'B':         date store  sales  moving_avg
 1 2024-01-02     B    150         NaN
 3 2024-01-04     B    200       175.0
 5 2024-01-06     B    180       190.0
 7 2024-01-08     B    160       170.0}

#### 3. COMBINE
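- The combined result has a new column, `moving_avg`, holding the per-store moving average of `sales`; the first row of each store is `NaN` because its 2-row window is not yet full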

In [14]:
# 3. Explicit combine: stack the per-store frames back into a single frame,
# then sort by index label so the rows return to their original order.
combined_df = pd.concat(list(processed_groups.values())).sort_index()

print("Result after explicit combine:")
print(combined_df)
print("\n")

Result after explicit combine:
        date store  sales  moving_avg
0 2024-01-01     A    100         NaN
1 2024-01-02     B    150         NaN
2 2024-01-03     A     80        90.0
3 2024-01-04     B    200       175.0
4 2024-01-05     A    120       100.0
5 2024-01-06     B    180       190.0
6 2024-01-07     A     90       105.0
7 2024-01-08     B    160       170.0


